Commit eb9da2c1 authored by David S. Miller

Merge branch 'mlxsw-resilient-nh-groups'

Ido Schimmel says:

====================
mlxsw: Add support for resilient nexthop groups

This patchset adds support for resilient nexthop groups in mlxsw. As far
as the hardware is concerned, resilient groups are the same as regular
groups. The differences lie in how mlxsw manages the individual
adjacency entries (nexthop buckets) that make up the group.

The first difference is that, unlike regular groups, the driver needs to
periodically update the kernel about the activity of nexthop buckets, so
that the kernel does not treat the buckets as idle while traffic is
offloaded from the CPU to the ASIC. This is similar to what mlxsw
already does for neighbour entries. The update interval is set to
1 second to allow for short idle timers.
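To give a rough idea of the shape of this periodic update (a sketch, not
code from the patches; the helper name and the way the group identifiers
are passed in are made up for illustration), the 1-second work item could
dump one group's activity with the new RATRAD register and report it via
the kernel's nexthop_res_grp_activity_update() helper:

/* Sketch only: dump and clear the activity bits of one resilient group
 * and report them to the nexthop code.
 */
static int
nh_res_grp_activity_report(struct mlxsw_sp *mlxsw_sp, u32 nh_grp_id,
			   u32 adj_index, u16 num_buckets)
{
	char ratrad_pl[MLXSW_REG_RATRAD_LEN];
	unsigned long *activity;
	int err;
	u16 i;

	activity = bitmap_zalloc(num_buckets, GFP_KERNEL);
	if (!activity)
		return -ENOMEM;

	/* Read-and-clear activity of 'num_buckets' adjacency entries
	 * starting at 'adj_index'.
	 */
	mlxsw_reg_ratrad_pack(ratrad_pl, adj_index, num_buckets);
	err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(ratrad), ratrad_pl);
	if (err)
		goto out;

	for (i = 0; i < num_buckets; i++) {
		if (mlxsw_reg_ratrad_activity_vector_get(ratrad_pl, i))
			__set_bit(i, activity);
	}

	/* Tell the kernel which buckets saw traffic, so that it does
	 * not consider them idle.
	 */
	nexthop_res_grp_activity_update(mlxsw_sp_net(mlxsw_sp), nh_grp_id,
					num_buckets, activity);

out:
	bitmap_free(activity);
	return err;
}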

The second difference is that nexthop buckets that correspond to an
unresolved neighbour must still be programmed to the device, as the size
of the group must remain fixed. This is achieved by programming such
entries with a trap action, so that packets hashed to them are sent to
the CPU and trigger neighbour resolution by the kernel.
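For illustration, such a bucket could be packed roughly as follows (a
sketch under the assumption that a trap action with a router-egress trap
ID is used; the helper name and the exact trap ID are not taken from the
patches):

/* Sketch only: write an adjacency entry whose neighbour is not
 * resolved. The entry stays valid so the group size is preserved, but
 * its action is a trap, so packets hashed to this bucket go to the CPU.
 */
static void nh_bucket_pack_unresolved(char *ratr_pl, u32 adj_index,
				      u16 rif_index)
{
	mlxsw_reg_ratr_pack(ratr_pl, MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY,
			    true, MLXSW_REG_RATR_TYPE_ETHERNET,
			    adj_index, rif_index);
	mlxsw_reg_ratr_trap_action_set(ratr_pl,
				       MLXSW_REG_RATR_TRAP_ACTION_TRAP);
	/* The trap ID chosen here is only an example. */
	mlxsw_reg_ratr_trap_id_set(ratr_pl, MLXSW_TRAP_ID_RTR_EGRESS0);
}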

The third difference is atomic replacement of individual nexthop
buckets. While the driver periodically updates the kernel about the
activity of nexthop buckets, it is possible for a bucket to become
active just before the kernel decides to replace it with a different
nexthop. To avoid such situations, which could reset established
connections, the driver instructs the device to only replace an
adjacency entry if it is inactive. Failures are propagated back to the
nexthop code.
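The operation selection itself can be seen in the
mlxsw_sp_ipip_nexthop_update_gre4() hunk further below; in sketch form
(the helper name is illustrative, not from the patches), the write boils
down to:

/* Sketch only: when 'force' is false, ask the device to overwrite the
 * adjacency entry only if its activity bit is cleared; a failed write
 * is simply returned so it can be reported back to the nexthop code.
 */
static int nh_bucket_write(struct mlxsw_sp *mlxsw_sp, char *ratr_pl,
			   bool force)
{
	enum mlxsw_reg_ratr_op op;

	op = force ? MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY :
		     MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY_ON_ACTIVITY;
	mlxsw_reg_ratr_op_set(ratr_pl, op);

	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl);
}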

Patchset overview:

Patches #1-#7 gradually add support for resilient nexthop groups

Patch #8 finally enables such groups to be programmed to the device

Patches #9-#10 add mlxsw-specific selftests
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3c85a8b8 ffd3e9b0
@@ -8130,6 +8130,60 @@ mlxsw_reg_rtdp_ipip4_pack(char *payload, u16 irif,
	mlxsw_reg_rtdp_ipip_expected_gre_key_set(payload, expected_gre_key);
}
/* RATRAD - Router Adjacency Table Activity Dump Register
 * ------------------------------------------------------
 * The RATRAD register is used to dump and optionally clear activity bits of
 * router adjacency table entries.
 */
#define MLXSW_REG_RATRAD_ID 0x8022
#define MLXSW_REG_RATRAD_LEN 0x210

MLXSW_REG_DEFINE(ratrad, MLXSW_REG_RATRAD_ID, MLXSW_REG_RATRAD_LEN);

enum {
	/* Read activity */
	MLXSW_REG_RATRAD_OP_READ_ACTIVITY,
	/* Read and clear activity */
	MLXSW_REG_RATRAD_OP_READ_CLEAR_ACTIVITY,
};

/* reg_ratrad_op
 * Access: Operation
 */
MLXSW_ITEM32(reg, ratrad, op, 0x00, 30, 2);

/* reg_ratrad_ecmp_size
 * ecmp_size is the amount of sequential entries from adjacency_index. Valid
 * ranges:
 * Spectrum-1: 32-64, 512, 1024, 2048, 4096
 * Spectrum-2/3: 32-128, 256, 512, 1024, 2048, 4096
 * Access: Index
 */
MLXSW_ITEM32(reg, ratrad, ecmp_size, 0x00, 0, 13);

/* reg_ratrad_adjacency_index
 * Index into the adjacency table.
 * Access: Index
 */
MLXSW_ITEM32(reg, ratrad, adjacency_index, 0x04, 0, 24);

/* reg_ratrad_activity_vector
 * Activity bit per adjacency index.
 * Bits higher than ecmp_size are reserved.
 * Access: RO
 */
MLXSW_ITEM_BIT_ARRAY(reg, ratrad, activity_vector, 0x10, 0x200, 1);

static inline void mlxsw_reg_ratrad_pack(char *payload, u32 adjacency_index,
					 u16 ecmp_size)
{
	MLXSW_REG_ZERO(ratrad, payload);
	mlxsw_reg_ratrad_op_set(payload,
				MLXSW_REG_RATRAD_OP_READ_CLEAR_ACTIVITY);
	mlxsw_reg_ratrad_ecmp_size_set(payload, ecmp_size);
	mlxsw_reg_ratrad_adjacency_index_set(payload, adjacency_index);
}
/* RIGR-V2 - Router Interface Group Register Version 2
* ---------------------------------------------------
* The RIGR_V2 register is used to add, remove and query egress interface list
@@ -12114,6 +12168,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
	MLXSW_REG(rtar),
	MLXSW_REG(ratr),
	MLXSW_REG(rtdp),
	MLXSW_REG(ratrad),
	MLXSW_REG(rdpm),
	MLXSW_REG(ricnt),
	MLXSW_REG(rrcr),
@@ -1178,6 +1178,7 @@ mlxsw_sp_dpipe_table_adj_entries_dump(void *priv, bool counters_enabled,
static int mlxsw_sp_dpipe_table_adj_counters_update(void *priv, bool enable)
{
	char ratr_pl[MLXSW_REG_RATR_LEN];
	struct mlxsw_sp *mlxsw_sp = priv;
	struct mlxsw_sp_nexthop *nh;
	u32 adj_hash_index = 0;
@@ -1196,7 +1197,8 @@ static int mlxsw_sp_dpipe_table_adj_counters_update(void *priv, bool enable)
		else
			mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh);
		mlxsw_sp_nexthop_eth_update(mlxsw_sp,
					    adj_index + adj_hash_index, nh);
					    adj_index + adj_hash_index, nh,
					    true, ratr_pl);
	}
	return 0;
}
@@ -127,14 +127,16 @@ bool mlxsw_sp_l3addr_is_zero(union mlxsw_sp_l3addr addr)
static int
mlxsw_sp_ipip_nexthop_update_gre4(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
				  struct mlxsw_sp_ipip_entry *ipip_entry)
				  struct mlxsw_sp_ipip_entry *ipip_entry,
				  bool force, char *ratr_pl)
{
	u16 rif_index = mlxsw_sp_ipip_lb_rif_index(ipip_entry->ol_lb);
	__be32 daddr4 = mlxsw_sp_ipip_netdev_daddr4(ipip_entry->ol_dev);
	char ratr_pl[MLXSW_REG_RATR_LEN];
	enum mlxsw_reg_ratr_op op;

	mlxsw_reg_ratr_pack(ratr_pl, MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY,
			    true, MLXSW_REG_RATR_TYPE_IPIP,
	op = force ? MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY :
		     MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY_ON_ACTIVITY;
	mlxsw_reg_ratr_pack(ratr_pl, op, true, MLXSW_REG_RATR_TYPE_IPIP,
			    adj_index, rif_index);
	mlxsw_reg_ratr_ipip4_entry_pack(ratr_pl, be32_to_cpu(daddr4));
@@ -40,7 +40,8 @@ struct mlxsw_sp_ipip_ops {
	enum mlxsw_sp_l3proto ul_proto; /* Underlay. */

	int (*nexthop_update)(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
			      struct mlxsw_sp_ipip_entry *ipip_entry);
			      struct mlxsw_sp_ipip_entry *ipip_entry,
			      bool force, char *ratr_pl);

	bool (*can_offload)(const struct mlxsw_sp *mlxsw_sp,
			    const struct net_device *ol_dev);
@@ -80,6 +80,8 @@ struct mlxsw_sp_router {
	struct mlxsw_sp_router_xm *xm;
	const struct mlxsw_sp_adj_grp_size_range *adj_grp_size_ranges;
	size_t adj_grp_size_ranges_count;
	struct delayed_work nh_grp_activity_dw;
	struct list_head nh_res_grp_list;
};
struct mlxsw_sp_fib_entry_priv {
@@ -209,7 +211,8 @@ bool mlxsw_sp_nexthop_group_has_ipip(struct mlxsw_sp_nexthop *nh);
int mlxsw_sp_nexthop_counter_get(struct mlxsw_sp *mlxsw_sp,
				 struct mlxsw_sp_nexthop *nh, u64 *p_counter);
int mlxsw_sp_nexthop_eth_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
				struct mlxsw_sp_nexthop *nh);
				struct mlxsw_sp_nexthop *nh, bool force,
				char *ratr_pl);
void mlxsw_sp_nexthop_counter_alloc(struct mlxsw_sp *mlxsw_sp,
				    struct mlxsw_sp_nexthop *nh);
void mlxsw_sp_nexthop_counter_free(struct mlxsw_sp *mlxsw_sp,
@@ -446,6 +446,35 @@ __invalid_nexthop_test()
	log_test "Unresolved neigh: nexthop does not exist: $desc"
}
__invalid_nexthop_bucket_test()
{
	local desc=$1; shift
	local dip=$1; shift
	local via_add=$1; shift
	local trap_name="unresolved_neigh"

	RET=0

	# Check that route to nexthop that does not exist triggers
	# unresolved_neigh
	ip nexthop add id 1 via $via_add dev $rp2
	ip nexthop add id 10 group 1 type resilient buckets 32
	ip route add $dip nhid 10

	t0_packets=$(devlink_trap_rx_packets_get $trap_name)
	ping_do $h1 $dip
	t1_packets=$(devlink_trap_rx_packets_get $trap_name)

	if [[ $t0_packets -eq $t1_packets ]]; then
		check_err 1 "Trap counter did not increase"
	fi

	ip route del $dip nhid 10
	ip nexthop del id 10
	ip nexthop del id 1

	log_test "Unresolved neigh: nexthop bucket does not exist: $desc"
}
unresolved_neigh_test()
{
	__host_miss_test "IPv4" 198.51.100.1
@@ -453,6 +482,8 @@ unresolved_neigh_test()
	__invalid_nexthop_test "IPv4" 198.51.100.1 198.51.100.3 24 198.51.100.4
	__invalid_nexthop_test "IPv6" 2001:db8:2::1 2001:db8:2::3 64 \
		2001:db8:2::4
	__invalid_nexthop_bucket_test "IPv4" 198.51.100.1 198.51.100.4
	__invalid_nexthop_bucket_test "IPv6" 2001:db8:2::1 2001:db8:2::4
}
vrf_without_routes_create()
@@ -33,6 +33,7 @@ ALL_TESTS="
	nexthop_obj_invalid_test
	nexthop_obj_offload_test
	nexthop_obj_group_offload_test
	nexthop_obj_bucket_offload_test
	nexthop_obj_blackhole_offload_test
	nexthop_obj_route_offload_test
	devlink_reload_test
@@ -739,11 +740,28 @@ nexthop_obj_invalid_test()
	ip nexthop add id 1 dev $swp1
	ip nexthop add id 2 dev $swp1
	ip nexthop add id 3 via 192.0.2.3 dev $swp1

	ip nexthop add id 10 group 1/2
	check_fail $? "managed to configure a nexthop group with device-only nexthops when should not"

	ip nexthop add id 10 group 3 type resilient buckets 7
	check_fail $? "managed to configure a too small resilient nexthop group when should not"

	ip nexthop add id 10 group 3 type resilient buckets 129
	check_fail $? "managed to configure a resilient nexthop group with invalid number of buckets when should not"

	ip nexthop add id 10 group 1/2 type resilient buckets 32
	check_fail $? "managed to configure a resilient nexthop group with device-only nexthops when should not"

	ip nexthop add id 10 group 3 type resilient buckets 32
	check_err $? "failed to configure a valid resilient nexthop group"
	ip nexthop replace id 3 dev $swp1
	check_fail $? "managed to populate a nexthop bucket with a device-only nexthop when should not"

	log_test "nexthop objects - invalid configurations"

	ip nexthop del id 10
	ip nexthop del id 3
	ip nexthop del id 2
	ip nexthop del id 1
@@ -858,6 +876,70 @@ nexthop_obj_group_offload_test()
	simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64
}
nexthop_obj_bucket_offload_test()
{
	# Test offload indication of nexthop buckets
	RET=0

	simple_if_init $swp1 192.0.2.1/24 2001:db8:1::1/64
	simple_if_init $swp2
	setup_wait

	ip nexthop add id 1 via 192.0.2.2 dev $swp1
	ip nexthop add id 2 via 2001:db8:1::2 dev $swp1
	ip nexthop add id 10 group 1/2 type resilient buckets 32 idle_timer 0
	ip neigh replace 192.0.2.2 lladdr 00:11:22:33:44:55 nud reachable \
		dev $swp1
	ip neigh replace 192.0.2.3 lladdr 00:11:22:33:44:55 nud reachable \
		dev $swp1
	ip neigh replace 2001:db8:1::2 lladdr 00:11:22:33:44:55 nud reachable \
		dev $swp1

	busywait "$TIMEOUT" wait_for_offload \
		ip nexthop bucket show nhid 1
	check_err $? "IPv4 nexthop buckets not marked as offloaded when should"

	busywait "$TIMEOUT" wait_for_offload \
		ip nexthop bucket show nhid 2
	check_err $? "IPv6 nexthop buckets not marked as offloaded when should"

	# Invalidate nexthop id 1
	ip neigh replace 192.0.2.2 nud failed dev $swp1
	busywait "$TIMEOUT" wait_for_trap \
		ip nexthop bucket show nhid 1
	check_err $? "IPv4 nexthop buckets not marked with trap when should"

	# Invalidate nexthop id 2
	ip neigh replace 2001:db8:1::2 nud failed dev $swp1
	busywait "$TIMEOUT" wait_for_trap \
		ip nexthop bucket show nhid 2
	check_err $? "IPv6 nexthop buckets not marked with trap when should"

	# Revalidate nexthop id 1 by changing its configuration
	ip nexthop replace id 1 via 192.0.2.3 dev $swp1
	busywait "$TIMEOUT" wait_for_offload \
		ip nexthop bucket show nhid 1
	check_err $? "nexthop bucket not marked as offloaded after revalidating nexthop"

	# Revalidate nexthop id 2 by changing its neighbour
	ip neigh replace 2001:db8:1::2 lladdr 00:11:22:33:44:55 nud reachable \
		dev $swp1
	busywait "$TIMEOUT" wait_for_offload \
		ip nexthop bucket show nhid 2
	check_err $? "nexthop bucket not marked as offloaded after revalidating neighbour"

	log_test "nexthop bucket offload indication"

	ip neigh del 2001:db8:1::2 dev $swp1
	ip neigh del 192.0.2.3 dev $swp1
	ip neigh del 192.0.2.2 dev $swp1
	ip nexthop del id 10
	ip nexthop del id 2
	ip nexthop del id 1

	simple_if_fini $swp2
	simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64
}

nexthop_obj_blackhole_offload_test()
{
	# Test offload indication of blackhole nexthop objects
@@ -353,6 +353,11 @@ wait_for_offload()
	"$@" | grep -q offload
}

wait_for_trap()
{
	"$@" | grep -q trap
}

until_counter_is()
{
	local expr=$1; shift