Commit 55827458 authored by David S. Miller

Merge branch 'mlxsw-Add-one-armed-router-support'

Ido Schimmel says:

====================
mlxsw: Add one-armed router support

Up until now, when a packet was routed by the ASIC through the same
router interface (RIF) from which it ingressed, the ASIC passed the
sole copy of the packet to the kernel. This allowed the kernel to route
the packet and also potentially generate an ICMP redirect.

There are scenarios (e.g., "one-armed router") where packets are
intentionally routed this way and are therefore not deemed as
exceptions. In such scenarios the current method of trapping packets to
the CPU is problematic, as it results in major packet loss.

This patchset solves the problem by having the ASIC forward the packet,
but also send a copy to the CPU, which gives the kernel the opportunity
to generate required exceptions.

To prevent the kernel from forwarding such packets again, the driver
marks them with 'offload_l3_fwd_mark', which causes the kernel to
consume them in ip{,6}_forward_finish().

Patch #1 renames 'offload_mr_fwd_mark' to 'offload_l3_fwd_mark'. When
set, the field indicates that a packet was already forwarded in L3
(unicast / multicast) by a capable device.

Patch #2 teaches the kernel to consume unicast packets that have
'offload_l3_fwd_mark' set.

Patch #3 changes mlxsw to mirror loopbacked (iRIF == eRIF) packets,
instead of trapping them.

Patch #4 adds a test case for the above-mentioned scenario.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents d9bbd6a1 b6f153d3
...@@ -5072,6 +5072,7 @@ enum mlxsw_reg_htgt_trap_group { ...@@ -5072,6 +5072,7 @@ enum mlxsw_reg_htgt_trap_group {
MLXSW_REG_HTGT_TRAP_GROUP_SP_EVENT, MLXSW_REG_HTGT_TRAP_GROUP_SP_EVENT,
MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD, MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_MLD,
MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND, MLXSW_REG_HTGT_TRAP_GROUP_SP_IPV6_ND,
MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR,
}; };
/* reg_htgt_trap_group /* reg_htgt_trap_group
......
...@@ -3554,10 +3554,10 @@ static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port, ...@@ -3554,10 +3554,10 @@ static void mlxsw_sp_rx_listener_mark_func(struct sk_buff *skb, u8 local_port,
return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv); return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv);
} }
static void mlxsw_sp_rx_listener_mr_mark_func(struct sk_buff *skb, static void mlxsw_sp_rx_listener_l3_mark_func(struct sk_buff *skb,
u8 local_port, void *priv) u8 local_port, void *priv)
{ {
skb->offload_mr_fwd_mark = 1; skb->offload_l3_fwd_mark = 1;
skb->offload_fwd_mark = 1; skb->offload_fwd_mark = 1;
return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv); return mlxsw_sp_rx_listener_no_mark_func(skb, local_port, priv);
} }
...@@ -3605,8 +3605,8 @@ static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port, ...@@ -3605,8 +3605,8 @@ static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port,
MLXSW_RXL(mlxsw_sp_rx_listener_mark_func, _trap_id, _action, \ MLXSW_RXL(mlxsw_sp_rx_listener_mark_func, _trap_id, _action, \
_is_ctrl, SP_##_trap_group, DISCARD) _is_ctrl, SP_##_trap_group, DISCARD)
#define MLXSW_SP_RXL_MR_MARK(_trap_id, _action, _trap_group, _is_ctrl) \ #define MLXSW_SP_RXL_L3_MARK(_trap_id, _action, _trap_group, _is_ctrl) \
MLXSW_RXL(mlxsw_sp_rx_listener_mr_mark_func, _trap_id, _action, \ MLXSW_RXL(mlxsw_sp_rx_listener_l3_mark_func, _trap_id, _action, \
_is_ctrl, SP_##_trap_group, DISCARD) _is_ctrl, SP_##_trap_group, DISCARD)
#define MLXSW_SP_EVENTL(_func, _trap_id) \ #define MLXSW_SP_EVENTL(_func, _trap_id) \
...@@ -3639,7 +3639,7 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { ...@@ -3639,7 +3639,7 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = {
/* L3 traps */ /* L3 traps */
MLXSW_SP_RXL_MARK(MTUERROR, TRAP_TO_CPU, ROUTER_EXP, false), MLXSW_SP_RXL_MARK(MTUERROR, TRAP_TO_CPU, ROUTER_EXP, false),
MLXSW_SP_RXL_MARK(TTLERROR, TRAP_TO_CPU, ROUTER_EXP, false), MLXSW_SP_RXL_MARK(TTLERROR, TRAP_TO_CPU, ROUTER_EXP, false),
MLXSW_SP_RXL_MARK(LBERROR, TRAP_TO_CPU, ROUTER_EXP, false), MLXSW_SP_RXL_L3_MARK(LBERROR, MIRROR_TO_CPU, LBERROR, false),
MLXSW_SP_RXL_MARK(IP2ME, TRAP_TO_CPU, IP2ME, false), MLXSW_SP_RXL_MARK(IP2ME, TRAP_TO_CPU, IP2ME, false),
MLXSW_SP_RXL_MARK(IPV6_UNSPECIFIED_ADDRESS, TRAP_TO_CPU, ROUTER_EXP, MLXSW_SP_RXL_MARK(IPV6_UNSPECIFIED_ADDRESS, TRAP_TO_CPU, ROUTER_EXP,
false), false),
...@@ -3683,7 +3683,7 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { ...@@ -3683,7 +3683,7 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = {
MLXSW_SP_RXL_MARK(IPV6_PIM, TRAP_TO_CPU, PIM, false), MLXSW_SP_RXL_MARK(IPV6_PIM, TRAP_TO_CPU, PIM, false),
MLXSW_SP_RXL_MARK(RPF, TRAP_TO_CPU, RPF, false), MLXSW_SP_RXL_MARK(RPF, TRAP_TO_CPU, RPF, false),
MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false), MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false),
MLXSW_SP_RXL_MR_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false), MLXSW_SP_RXL_L3_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false),
/* NVE traps */ /* NVE traps */
MLXSW_SP_RXL_MARK(NVE_ENCAP_ARP, TRAP_TO_CPU, ARP, false), MLXSW_SP_RXL_MARK(NVE_ENCAP_ARP, TRAP_TO_CPU, ARP, false),
}; };
...@@ -3713,6 +3713,7 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core) ...@@ -3713,6 +3713,7 @@ static int mlxsw_sp_cpu_policers_set(struct mlxsw_core *mlxsw_core)
case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF: case MLXSW_REG_HTGT_TRAP_GROUP_SP_OSPF:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM: case MLXSW_REG_HTGT_TRAP_GROUP_SP_PIM:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF: case MLXSW_REG_HTGT_TRAP_GROUP_SP_RPF:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR:
rate = 128; rate = 128;
burst_size = 7; burst_size = 7;
break; break;
...@@ -3798,6 +3799,7 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core) ...@@ -3798,6 +3799,7 @@ static int mlxsw_sp_trap_groups_set(struct mlxsw_core *mlxsw_core)
case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP: case MLXSW_REG_HTGT_TRAP_GROUP_SP_ROUTER_EXP:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE: case MLXSW_REG_HTGT_TRAP_GROUP_SP_REMOTE_ROUTE:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST: case MLXSW_REG_HTGT_TRAP_GROUP_SP_MULTICAST:
case MLXSW_REG_HTGT_TRAP_GROUP_SP_LBERROR:
priority = 1; priority = 1;
tc = 1; tc = 1;
break; break;
......
...@@ -616,6 +616,8 @@ typedef unsigned char *sk_buff_data_t; ...@@ -616,6 +616,8 @@ typedef unsigned char *sk_buff_data_t;
* @pkt_type: Packet class * @pkt_type: Packet class
* @fclone: skbuff clone status * @fclone: skbuff clone status
* @ipvs_property: skbuff is owned by ipvs * @ipvs_property: skbuff is owned by ipvs
* @offload_fwd_mark: Packet was L2-forwarded in hardware
* @offload_l3_fwd_mark: Packet was L3-forwarded in hardware
* @tc_skip_classify: do not classify packet. set by IFB device * @tc_skip_classify: do not classify packet. set by IFB device
* @tc_at_ingress: used within tc_classify to distinguish in/egress * @tc_at_ingress: used within tc_classify to distinguish in/egress
* @tc_redirected: packet was redirected by a tc action * @tc_redirected: packet was redirected by a tc action
...@@ -799,7 +801,7 @@ struct sk_buff { ...@@ -799,7 +801,7 @@ struct sk_buff {
__u8 remcsum_offload:1; __u8 remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV #ifdef CONFIG_NET_SWITCHDEV
__u8 offload_fwd_mark:1; __u8 offload_fwd_mark:1;
__u8 offload_mr_fwd_mark:1; __u8 offload_l3_fwd_mark:1;
#endif #endif
#ifdef CONFIG_NET_CLS_ACT #ifdef CONFIG_NET_CLS_ACT
__u8 tc_skip_classify:1; __u8 tc_skip_classify:1;
......
...@@ -4885,7 +4885,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) ...@@ -4885,7 +4885,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
#ifdef CONFIG_NET_SWITCHDEV #ifdef CONFIG_NET_SWITCHDEV
skb->offload_fwd_mark = 0; skb->offload_fwd_mark = 0;
skb->offload_mr_fwd_mark = 0; skb->offload_l3_fwd_mark = 0;
#endif #endif
if (!xnet) if (!xnet)
......
...@@ -69,6 +69,13 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s ...@@ -69,6 +69,13 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
__IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
__IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
#ifdef CONFIG_NET_SWITCHDEV
if (skb->offload_l3_fwd_mark) {
consume_skb(skb);
return 0;
}
#endif
if (unlikely(opt->optlen)) if (unlikely(opt->optlen))
ip_forward_options(skb); ip_forward_options(skb);
......
...@@ -1802,7 +1802,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, ...@@ -1802,7 +1802,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
struct vif_device *out_vif = &mrt->vif_table[out_vifi]; struct vif_device *out_vif = &mrt->vif_table[out_vifi];
struct vif_device *in_vif = &mrt->vif_table[in_vifi]; struct vif_device *in_vif = &mrt->vif_table[in_vifi];
if (!skb->offload_mr_fwd_mark) if (!skb->offload_l3_fwd_mark)
return false; return false;
if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len)
return false; return false;
......
...@@ -378,6 +378,13 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, ...@@ -378,6 +378,13 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk,
__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
#ifdef CONFIG_NET_SWITCHDEV
if (skb->offload_l3_fwd_mark) {
consume_skb(skb);
return 0;
}
#endif
return dst_output(net, sk, skb); return dst_output(net, sk, skb);
} }
......
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Test a "one-armed router" [1] scenario. Packets forwarded between H1 and H2
# should be forwarded by the ASIC, but also trapped so that ICMP redirect
# packets could be potentially generated.
#
# 1. https://en.wikipedia.org/wiki/One-armed_router
#
# +---------------------------------+
# | H1 (vrf) |
# | + $h1 |
# | | 192.0.2.1/24 |
# | | 2001:db8:1::1/64 |
# | | |
# | | default via 192.0.2.2 |
# | | default via 2001:db8:1::2 |
# +----|----------------------------+
# |
# +----|----------------------------------------------------------------------+
# | SW | |
# | +--|--------------------------------------------------------------------+ |
# | | + $swp1 BR0 (802.1d) | |
# | | | |
# | | 192.0.2.2/24 | |
# | | 2001:db8:1::2/64 | |
# | | 198.51.100.2/24 | |
# | | 2001:db8:2::2/64 | |
# | | | |
# | | + $swp2 | |
# | +--|--------------------------------------------------------------------+ |
# | | |
# +----|----------------------------------------------------------------------+
# |
# +----|----------------------------+
# | | default via 198.51.100.2 |
# | | default via 2001:db8:2::2 |
# | | |
# | | 2001:db8:2::1/64 |
# | | 198.51.100.1/24 |
# | + $h2 |
# | H2 (vrf) |
# +---------------------------------+
# Location of the shared forwarding selftest helpers, resolved relative to
# this script. The expansions are quoted so that a checkout path containing
# whitespace does not get word-split.
lib_dir="$(dirname "$0")/../../../net/forwarding"

# Names of the test functions defined in this file (presumably consumed by
# tests_run from lib.sh -- confirm against lib.sh).
ALL_TESTS="ping_ipv4 ping_ipv6 fwd_mark_ipv4 fwd_mark_ipv6"
# setup_prepare() reads four interfaces: NETIFS[p1] through NETIFS[p4].
NUM_NETIFS=4
source "$lib_dir/tc_common.sh"
source "$lib_dir/lib.sh"
h1_create()
{
	# Configure H1 with its IPv4/IPv6 addresses, then install default
	# routes in H1's VRF via the 192.0.2.2 / 2001:db8:1::2 gateways
	# (the addresses switch_create() assigns to br0).
	local h1_vrf=v$h1

	simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64

	ip -4 route add default vrf $h1_vrf nexthop via 192.0.2.2
	ip -6 route add default vrf $h1_vrf nexthop via 2001:db8:1::2
}
h1_destroy()
{
	# Mirror image of h1_create(): drop the default routes (IPv6 first,
	# then IPv4) before tearing down the interface configuration.
	local h1_vrf=v$h1

	ip -6 route del default vrf $h1_vrf nexthop via 2001:db8:1::2
	ip -4 route del default vrf $h1_vrf nexthop via 192.0.2.2

	simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
}
h2_create()
{
	# Configure H2 with its IPv4/IPv6 addresses, then install default
	# routes in H2's VRF via the 198.51.100.2 / 2001:db8:2::2 gateways
	# (the addresses switch_create() assigns to br0).
	local h2_vrf=v$h2

	simple_if_init $h2 198.51.100.1/24 2001:db8:2::1/64

	ip -4 route add default vrf $h2_vrf nexthop via 198.51.100.2
	ip -6 route add default vrf $h2_vrf nexthop via 2001:db8:2::2
}
h2_destroy()
{
	# Mirror image of h2_create(): drop the default routes (IPv6 first,
	# then IPv4) before tearing down the interface configuration.
	local h2_vrf=v$h2

	ip -6 route del default vrf $h2_vrf nexthop via 2001:db8:2::2
	ip -4 route del default vrf $h2_vrf nexthop via 198.51.100.2

	simple_if_fini $h2 198.51.100.1/24 2001:db8:2::1/64
}
switch_create()
{
	# Build the one-armed router: a single bridge (br0) that enslaves both
	# switch ports and carries the gateway addresses of both subnets.
	local br_port

	ip link add name br0 type bridge mcast_snooping 0
	ip link set dev br0 up

	# Enslave and bring up each port, $swp1 first.
	for br_port in "$swp1" "$swp2"; do
		ip link set dev $br_port master br0
		ip link set dev $br_port up
	done

	# clsact qdiscs give the fwd_mark tests ingress/egress hooks for their
	# flower filters.
	for br_port in "$swp1" "$swp2"; do
		tc qdisc add dev $br_port clsact
	done

	__addr_add_del br0 add 192.0.2.2/24 2001:db8:1::2/64
	__addr_add_del br0 add 198.51.100.2/24 2001:db8:2::2/64
}
switch_destroy()
{
	# Undo switch_create() in reverse: addresses, qdiscs, port
	# enslavement, and finally the bridge itself.
	local br_port

	__addr_add_del br0 del 198.51.100.2/24 2001:db8:2::2/64
	__addr_add_del br0 del 192.0.2.2/24 2001:db8:1::2/64

	for br_port in "$swp2" "$swp1"; do
		tc qdisc del dev $br_port clsact
	done

	# Ports are released in the opposite order to their creation.
	for br_port in "$swp2" "$swp1"; do
		ip link set dev $br_port down
		ip link set dev $br_port nomaster
	done

	ip link set dev br0 down
	ip link del dev br0
}
ping_ipv4()
{
	# Ping H2's IPv4 address (198.51.100.1) from H1.
	local h2_addr=198.51.100.1

	ping_test $h1 $h2_addr ": h1->h2"
}
ping_ipv6()
{
	# Ping H2's IPv6 address (2001:db8:2::1) from H1.
	local h2_addr=2001:db8:2::1

	ping6_test $h1 $h2_addr ": h1->h2"
}
fwd_mark_ipv4()
{
	# Send 10 IPv4 UDP packets from H1 to H2 and verify three things:
	#  - all of them reach the CPU at $swp1 ingress (loopback-error trap),
	#  - none of them is transmitted by software out of $swp2,
	#  - all of them are forwarded by the ASIC out of $swp2.
	local h2_addr=198.51.100.1
	local udp_dport=52768

	# Handle 101 filters are software-only (skip_hw); handle 102 is
	# hardware-only (skip_sw).
	tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 \
		flower skip_hw dst_ip $h2_addr ip_proto udp \
		dst_port $udp_dport action pass
	tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \
		flower skip_hw dst_ip $h2_addr ip_proto udp \
		dst_port $udp_dport action pass
	tc filter add dev $swp2 egress protocol ip pref 2 handle 102 \
		flower skip_sw dst_ip $h2_addr ip_proto udp \
		dst_port $udp_dport action pass

	ip vrf exec v$h1 $MZ $h1 -c 10 -d 100msec -p 64 -A 192.0.2.1 \
		-B $h2_addr -t udp dp=$udp_dport,sp=42768 -q

	# Each check is reported as its own test, so RET is reset every time.
	RET=0
	tc_check_packets "dev $swp1 ingress" 101 10
	check_err $?
	log_test "fwd mark: trapping IPv4 packets due to LBERROR"

	RET=0
	tc_check_packets "dev $swp2 egress" 101 0
	check_err $?
	log_test "fwd mark: forwarding IPv4 packets in software"

	RET=0
	tc_check_packets "dev $swp2 egress" 102 10
	check_err $?
	log_test "fwd mark: forwarding IPv4 packets in hardware"

	# Remove the filters in reverse order of installation.
	tc filter del dev $swp2 egress protocol ip pref 2 handle 102 flower
	tc filter del dev $swp2 egress protocol ip pref 1 handle 101 flower
	tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower
}
fwd_mark_ipv6()
{
	# Send 10 IPv6 UDP packets from H1 to H2 and verify three things:
	#  - all of them reach the CPU at $swp1 ingress (loopback-error trap),
	#  - none of them is transmitted by software out of $swp2,
	#  - all of them are forwarded by the ASIC out of $swp2.
	local h2_addr=2001:db8:2::1
	local udp_dport=52768

	# Handle 101 filters are software-only (skip_hw); handle 102 is
	# hardware-only (skip_sw).
	tc filter add dev $swp1 ingress protocol ipv6 pref 1 handle 101 \
		flower skip_hw dst_ip $h2_addr ip_proto udp \
		dst_port $udp_dport action pass
	tc filter add dev $swp2 egress protocol ipv6 pref 1 handle 101 \
		flower skip_hw dst_ip $h2_addr ip_proto udp \
		dst_port $udp_dport action pass
	tc filter add dev $swp2 egress protocol ipv6 pref 2 handle 102 \
		flower skip_sw dst_ip $h2_addr ip_proto udp \
		dst_port $udp_dport action pass

	ip vrf exec v$h1 $MZ $h1 -6 -c 10 -d 100msec -p 64 -A 2001:db8:1::1 \
		-B $h2_addr -t udp dp=$udp_dport,sp=42768 -q

	# Each check is reported as its own test, so RET is reset every time.
	RET=0
	tc_check_packets "dev $swp1 ingress" 101 10
	check_err $?
	log_test "fwd mark: trapping IPv6 packets due to LBERROR"

	RET=0
	tc_check_packets "dev $swp2 egress" 101 0
	check_err $?
	log_test "fwd mark: forwarding IPv6 packets in software"

	RET=0
	tc_check_packets "dev $swp2 egress" 102 10
	check_err $?
	log_test "fwd mark: forwarding IPv6 packets in hardware"

	# Remove the filters in reverse order of installation.
	tc filter del dev $swp2 egress protocol ipv6 pref 2 handle 102 flower
	tc filter del dev $swp2 egress protocol ipv6 pref 1 handle 101 flower
	tc filter del dev $swp1 ingress protocol ipv6 pref 1 handle 101 flower
}
# Map the four test interfaces to their roles and build the topology:
# H1 -- swp1 [br0] swp2 -- H2.
setup_prepare()
{
# p1/p4 are the host-side ports, p2/p3 the switch ports bridged by
# switch_create().
h1=${NETIFS[p1]}
swp1=${NETIFS[p2]}
swp2=${NETIFS[p3]}
h2=${NETIFS[p4]}
vrf_prepare
forwarding_enable
# Set accept_redirects to 0 for IPv4 and IPv6; presumably so the hosts
# ignore ICMP redirects generated during the test -- TODO confirm.
sysctl_set net.ipv4.conf.all.accept_redirects 0
sysctl_set net.ipv6.conf.all.accept_redirects 0
h1_create
h2_create
switch_create
}
# Tear down everything setup_prepare() created, in reverse order.
# Installed as the EXIT trap, so it runs however the script exits.
cleanup()
{
pre_cleanup
switch_destroy
h2_destroy
h1_destroy
# Restore the sysctls in the opposite order to which they were set.
sysctl_restore net.ipv6.conf.all.accept_redirects
sysctl_restore net.ipv4.conf.all.accept_redirects
forwarding_restore
vrf_cleanup
}
# Register cleanup() before any setup so a failure part-way through
# setup_prepare still triggers the teardown.
trap cleanup EXIT
setup_prepare
setup_wait
tests_run
# EXIT_STATUS is not assigned in this file; presumably maintained by the
# sourced lib.sh helpers -- confirm there.
exit $EXIT_STATUS
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment