Commit e9e90a70 authored by David S. Miller

Merge branch 'resil-nhgroups-netdevsim-selftests'

Petr Machata says:

====================
net: Resilient NH groups: netdevsim, selftests

Support for resilient next-hop groups was added in a previous patch set.
Resilient next hop groups add a layer of indirection between the SKB hash
and the next hop. Thus the hash is used to reference a hash table bucket,
which is then used to reference a particular next hop. This allows the
system more flexibility when assigning SKB hash space to next hops.
Previously, each next hop had to be assigned a contiguous range of the SKB
hash space. With a hash table as an intermediate layer, next hops can be
reassigned at the granularity of individual hash-table buckets. In turn,
this mitigates the traffic-flow redirection that otherwise results from
next-hop removal or adjustments of next-hop weights.
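
For a quick illustration, a resilient group can be created and reweighted
with the same iproute2 commands that the selftests below use. This is only
a sketch; the nexthop IDs, device names and addresses are made up:

  # Two single-path nexthops and a resilient group spreading the SKB hash
  # space over 512 hash-table buckets.
  ip nexthop add id 1 via 192.0.2.2 dev swp1
  ip nexthop add id 2 via 192.0.2.18 dev swp2
  ip nexthop add id 10 group 1/2 type resilient buckets 512 idle_timer 0

  # Reweighting migrates individual buckets between the two nexthops
  # instead of re-slicing the whole hash range.
  ip nexthop replace id 10 group 1,3/2,1 type resilient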

This patch set introduces mock offloading of resilient next hop groups by
the netdevsim driver, and a suite of selftests.

- Patch #1 adds a netdevsim-specific lock to protect the next-hop hashtable.
  Previously, netdevsim relied on RTNL to maintain mutual exclusion.

- Patch #2 extracts a helper to make the following patches clearer.

- Patch #3 implements support for offloading of resilient next-hop
  groups.

- Patch #4 introduces a new debugfs interface to set activity on a selected
  next-hop bucket. This simulates how HW can periodically report bucket
  activity, and buckets thus marked are expected to be exempt from
  migration to new next hops when the group changes (see the first sketch
  after this list).

- Patches #5 and #6 clean up the fib_nexthops selftest.

- Patches #7, #8 and #9 add tests for resilient next-hop groups. Patch #7
  adds resilient-hashing counterparts to fib_nexthops.sh. Patch #8 adds a
  new traffic test for resilient next-hop groups. Patch #9 adds a new
  traffic test for resilient next-hop groups used for routing traffic
  through GRE tunnels (the script shown below).

- Patch #10 actually leverages the netdevsim offload to implement a suite
  of algorithmic tests that verify how and when buckets are migrated under
  various simulated workload scenarios (see the second sketch after this
  list for the netdevsim debugfs knobs involved).
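
The debugfs interface from patch #4 is driven roughly as follows (first
sketch). The exact path is an assumption based on the usual netdevsim
debugfs layout and the "fib" directory in the diff below; the file takes
"<group ID> <bucket index>" as input:

  DEV=netdevsim10   # example netdevsim instance name
  # Mark bucket 7 of resilient group 10 as active; such a bucket is exempt
  # from migration to a new next hop when the group changes.
  echo "10 7" > /sys/kernel/debug/netdevsim/$DEV/fib/nexthop_bucket_activity

  # With an iproute2 recent enough to know about nexthop buckets, the
  # bucket-to-nexthop mapping can be inspected.
  ip nexthop bucket show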
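
The other netdevsim knobs added by this set can be poked the same way
(second sketch, same path assumptions as above):

  # netdevsim reports offloaded groups and their buckets as trapping to
  # the kernel; the flag shows up in the nexthop and bucket listings.
  ip nexthop get id 10
  ip nexthop bucket show

  # Failure injection: while these are set, resilient group replacements
  # and bucket replacements are refused, exercising the error paths.
  echo 1 > /sys/kernel/debug/netdevsim/$DEV/fib/fail_res_nexthop_group_replace
  echo 1 > /sys/kernel/debug/netdevsim/$DEV/fib/fail_nexthop_bucket_replace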

The overall plan is to contribute approximately the following patchsets:

1) Nexthop policy refactoring (already pushed)
2) Preparations for resilient next hop groups (already pushed)
3) Implementation of resilient next-hop groups (already pushed)
4) Netdevsim offload plus a suite of selftests (this patchset)
5) Preparations for mlxsw offload of resilient next-hop groups
6) mlxsw offload including selftests

Interested parties can look at the complete code at [2].

[1] https://tools.ietf.org/html/rfc2992
[2] https://github.com/idosch/linux/commits/submit/res_integ_v1
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents b202923d b8a07c4c
@@ -14,6 +14,7 @@
* THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
*/
#include <linux/bitmap.h>
#include <linux/in6.h>
#include <linux/kernel.h>
#include <linux/list.h>
@@ -47,15 +48,18 @@ struct nsim_fib_data {
struct nsim_fib_entry nexthops;
struct rhashtable fib_rt_ht;
struct list_head fib_rt_list;
struct mutex fib_lock; /* Protects hashtable and list */
struct mutex fib_lock; /* Protects FIB HT and list */
struct notifier_block nexthop_nb;
struct rhashtable nexthop_ht;
struct devlink *devlink;
struct work_struct fib_event_work;
struct list_head fib_event_queue;
spinlock_t fib_event_queue_lock; /* Protects fib event queue list */
struct mutex nh_lock; /* Protects NH HT */
struct dentry *ddir;
bool fail_route_offload;
bool fail_res_nexthop_group_replace;
bool fail_nexthop_bucket_replace;
};
struct nsim_fib_rt_key {
@@ -116,6 +120,7 @@ struct nsim_nexthop {
struct rhash_head ht_node;
u64 occ;
u32 id;
bool is_resilient;
};
static const struct rhashtable_params nsim_nexthop_ht_params = {
@@ -1114,6 +1119,10 @@ static struct nsim_nexthop *nsim_nexthop_create(struct nsim_fib_data *data,
for (i = 0; i < info->nh_grp->num_nh; i++)
occ += info->nh_grp->nh_entries[i].weight;
break;
case NH_NOTIFIER_INFO_TYPE_RES_TABLE:
occ = info->nh_res_table->num_nh_buckets;
nexthop->is_resilient = true;
break;
default:
NL_SET_ERR_MSG_MOD(info->extack, "Unsupported nexthop type");
kfree(nexthop);
@@ -1156,6 +1165,21 @@ static int nsim_nexthop_account(struct nsim_fib_data *data, u64 occ,
}
static void nsim_nexthop_hw_flags_set(struct net *net,
const struct nsim_nexthop *nexthop,
bool trap)
{
int i;
nexthop_set_hw_flags(net, nexthop->id, false, trap);
if (!nexthop->is_resilient)
return;
for (i = 0; i < nexthop->occ; i++)
nexthop_bucket_set_hw_flags(net, nexthop->id, i, false, trap);
}
static int nsim_nexthop_add(struct nsim_fib_data *data,
struct nsim_nexthop *nexthop,
struct netlink_ext_ack *extack)
@@ -1174,7 +1198,7 @@ static int nsim_nexthop_add(struct nsim_fib_data *data,
goto err_nexthop_dismiss;
}
nexthop_set_hw_flags(net, nexthop->id, false, true);
nsim_nexthop_hw_flags_set(net, nexthop, true);
return 0;
@@ -1203,7 +1227,7 @@ static int nsim_nexthop_replace(struct nsim_fib_data *data,
goto err_nexthop_dismiss;
}
nexthop_set_hw_flags(net, nexthop->id, false, true);
nsim_nexthop_hw_flags_set(net, nexthop, true);
nsim_nexthop_account(data, nexthop_old->occ, false, extack);
nsim_nexthop_destroy(nexthop_old);
@@ -1254,6 +1278,32 @@ static void nsim_nexthop_remove(struct nsim_fib_data *data,
nsim_nexthop_destroy(nexthop);
}
static int nsim_nexthop_res_table_pre_replace(struct nsim_fib_data *data,
struct nh_notifier_info *info)
{
if (data->fail_res_nexthop_group_replace) {
NL_SET_ERR_MSG_MOD(info->extack, "Failed to replace a resilient nexthop group");
return -EINVAL;
}
return 0;
}
static int nsim_nexthop_bucket_replace(struct nsim_fib_data *data,
struct nh_notifier_info *info)
{
if (data->fail_nexthop_bucket_replace) {
NL_SET_ERR_MSG_MOD(info->extack, "Failed to replace nexthop bucket");
return -EINVAL;
}
nexthop_bucket_set_hw_flags(info->net, info->id,
info->nh_res_bucket->bucket_index,
false, true);
return 0;
}
static int nsim_nexthop_event_nb(struct notifier_block *nb, unsigned long event,
void *ptr)
{
@@ -1262,8 +1312,7 @@ static int nsim_nexthop_event_nb(struct notifier_block *nb, unsigned long event,
struct nh_notifier_info *info = ptr;
int err = 0;
ASSERT_RTNL();
mutex_lock(&data->nh_lock);
switch (event) {
case NEXTHOP_EVENT_REPLACE:
err = nsim_nexthop_insert(data, info);
@@ -1271,10 +1320,17 @@ static int nsim_nexthop_event_nb(struct notifier_block *nb, unsigned long event,
case NEXTHOP_EVENT_DEL:
nsim_nexthop_remove(data, info);
break;
case NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE:
err = nsim_nexthop_res_table_pre_replace(data, info);
break;
case NEXTHOP_EVENT_BUCKET_REPLACE:
err = nsim_nexthop_bucket_replace(data, info);
break;
default:
break;
}
mutex_unlock(&data->nh_lock);
return notifier_from_errno(err);
}
@@ -1285,11 +1341,68 @@ static void nsim_nexthop_free(void *ptr, void *arg)
struct net *net;
net = devlink_net(data->devlink);
nexthop_set_hw_flags(net, nexthop->id, false, false);
nsim_nexthop_hw_flags_set(net, nexthop, false);
nsim_nexthop_account(data, nexthop->occ, false, NULL);
nsim_nexthop_destroy(nexthop);
}
static ssize_t nsim_nexthop_bucket_activity_write(struct file *file,
const char __user *user_buf,
size_t size, loff_t *ppos)
{
struct nsim_fib_data *data = file->private_data;
struct net *net = devlink_net(data->devlink);
struct nsim_nexthop *nexthop;
unsigned long *activity;
loff_t pos = *ppos;
u16 bucket_index;
char buf[128];
int err = 0;
u32 nhid;
if (pos != 0)
return -EINVAL;
if (size > sizeof(buf))
return -EINVAL;
if (copy_from_user(buf, user_buf, size))
return -EFAULT;
if (sscanf(buf, "%u %hu", &nhid, &bucket_index) != 2)
return -EINVAL;
rtnl_lock();
nexthop = rhashtable_lookup_fast(&data->nexthop_ht, &nhid,
nsim_nexthop_ht_params);
if (!nexthop || !nexthop->is_resilient ||
bucket_index >= nexthop->occ) {
err = -EINVAL;
goto out;
}
activity = bitmap_zalloc(nexthop->occ, GFP_KERNEL);
if (!activity) {
err = -ENOMEM;
goto out;
}
bitmap_set(activity, bucket_index, 1);
nexthop_res_grp_activity_update(net, nhid, nexthop->occ, activity);
bitmap_free(activity);
out:
rtnl_unlock();
*ppos = size;
return err ?: size;
}
static const struct file_operations nsim_nexthop_bucket_activity_fops = {
.open = simple_open,
.write = nsim_nexthop_bucket_activity_write,
.llseek = no_llseek,
.owner = THIS_MODULE,
};
static u64 nsim_fib_ipv4_resource_occ_get(void *priv)
{
struct nsim_fib_data *data = priv;
@@ -1379,6 +1492,17 @@ nsim_fib_debugfs_init(struct nsim_fib_data *data, struct nsim_dev *nsim_dev)
data->fail_route_offload = false;
debugfs_create_bool("fail_route_offload", 0600, data->ddir,
&data->fail_route_offload);
data->fail_res_nexthop_group_replace = false;
debugfs_create_bool("fail_res_nexthop_group_replace", 0600, data->ddir,
&data->fail_res_nexthop_group_replace);
data->fail_nexthop_bucket_replace = false;
debugfs_create_bool("fail_nexthop_bucket_replace", 0600, data->ddir,
&data->fail_nexthop_bucket_replace);
debugfs_create_file("nexthop_bucket_activity", 0200, data->ddir,
data, &nsim_nexthop_bucket_activity_fops);
return 0;
}
@@ -1404,6 +1528,7 @@ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink,
if (err)
goto err_data_free;
mutex_init(&data->nh_lock);
err = rhashtable_init(&data->nexthop_ht, &nsim_nexthop_ht_params);
if (err)
goto err_debugfs_exit;
@@ -1469,6 +1594,7 @@ struct nsim_fib_data *nsim_fib_create(struct devlink *devlink,
data);
mutex_destroy(&data->fib_lock);
err_debugfs_exit:
mutex_destroy(&data->nh_lock);
nsim_fib_debugfs_exit(data);
err_data_free:
kfree(data);
@@ -1497,6 +1623,7 @@ void nsim_fib_destroy(struct devlink *devlink, struct nsim_fib_data *data)
WARN_ON_ONCE(!list_empty(&data->fib_event_queue));
WARN_ON_ONCE(!list_empty(&data->fib_rt_list));
mutex_destroy(&data->fib_lock);
mutex_destroy(&data->nh_lock);
nsim_fib_debugfs_exit(data);
kfree(data);
}
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Test traffic distribution when a wECMP route forwards traffic to two GRE
# tunnels.
#
# +-------------------------+
# | H1 |
# | $h1 + |
# | 192.0.2.1/28 | |
# | 2001:db8:1::1/64 | |
# +-------------------|-----+
# |
# +-------------------|------------------------+
# | SW1 | |
# | $ol1 + |
# | 192.0.2.2/28 |
# | 2001:db8:1::2/64 |
# | |
# | + g1a (gre) + g1b (gre) |
# | loc=192.0.2.65 loc=192.0.2.81 |
# | rem=192.0.2.66 --. rem=192.0.2.82 --. |
# | tos=inherit | tos=inherit | |
# | .------------------' | |
# | | .------------------' |
# | v v |
# | + $ul1.111 (vlan) + $ul1.222 (vlan) |
# | | 192.0.2.129/28 | 192.0.2.145/28 |
# | \ / |
# | \________________/ |
# | | |
# | + $ul1 |
# +------------|-------------------------------+
# |
# +------------|-------------------------------+
# | SW2 + $ul2 |
# | _______|________ |
# | / \ |
# | / \ |
# | + $ul2.111 (vlan) + $ul2.222 (vlan) |
# | ^ 192.0.2.130/28 ^ 192.0.2.146/28 |
# | | | |
# | | '------------------. |
# | '------------------. | |
# | + g2a (gre) | + g2b (gre) | |
# | loc=192.0.2.66 | loc=192.0.2.82 | |
# | rem=192.0.2.65 --' rem=192.0.2.81 --' |
# | tos=inherit tos=inherit |
# | |
# | $ol2 + |
# | 192.0.2.17/28 | |
# | 2001:db8:2::1/64 | |
# +-------------------|------------------------+
# |
# +-------------------|-----+
# | H2 | |
# | $h2 + |
# | 192.0.2.18/28 |
# | 2001:db8:2::2/64 |
# +-------------------------+
ALL_TESTS="
ping_ipv4
ping_ipv6
multipath_ipv4
multipath_ipv6
multipath_ipv6_l4
"
NUM_NETIFS=6
source lib.sh
h1_create()
{
simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64
ip route add vrf v$h1 192.0.2.16/28 via 192.0.2.2
ip route add vrf v$h1 2001:db8:2::/64 via 2001:db8:1::2
}
h1_destroy()
{
ip route del vrf v$h1 2001:db8:2::/64 via 2001:db8:1::2
ip route del vrf v$h1 192.0.2.16/28 via 192.0.2.2
simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64
}
sw1_create()
{
simple_if_init $ol1 192.0.2.2/28 2001:db8:1::2/64
__simple_if_init $ul1 v$ol1
vlan_create $ul1 111 v$ol1 192.0.2.129/28
vlan_create $ul1 222 v$ol1 192.0.2.145/28
tunnel_create g1a gre 192.0.2.65 192.0.2.66 tos inherit dev v$ol1
__simple_if_init g1a v$ol1 192.0.2.65/32
ip route add vrf v$ol1 192.0.2.66/32 via 192.0.2.130
tunnel_create g1b gre 192.0.2.81 192.0.2.82 tos inherit dev v$ol1
__simple_if_init g1b v$ol1 192.0.2.81/32
ip route add vrf v$ol1 192.0.2.82/32 via 192.0.2.146
ip -6 nexthop add id 101 dev g1a
ip -6 nexthop add id 102 dev g1b
ip nexthop add id 103 group 101/102 type resilient buckets 512 \
idle_timer 0
ip route add vrf v$ol1 192.0.2.16/28 nhid 103
ip route add vrf v$ol1 2001:db8:2::/64 nhid 103
}
sw1_destroy()
{
ip route del vrf v$ol1 2001:db8:2::/64
ip route del vrf v$ol1 192.0.2.16/28
ip nexthop del id 103
ip -6 nexthop del id 102
ip -6 nexthop del id 101
ip route del vrf v$ol1 192.0.2.82/32 via 192.0.2.146
__simple_if_fini g1b 192.0.2.81/32
tunnel_destroy g1b
ip route del vrf v$ol1 192.0.2.66/32 via 192.0.2.130
__simple_if_fini g1a 192.0.2.65/32
tunnel_destroy g1a
vlan_destroy $ul1 222
vlan_destroy $ul1 111
__simple_if_fini $ul1
simple_if_fini $ol1 192.0.2.2/28 2001:db8:1::2/64
}
sw2_create()
{
simple_if_init $ol2 192.0.2.17/28 2001:db8:2::1/64
__simple_if_init $ul2 v$ol2
vlan_create $ul2 111 v$ol2 192.0.2.130/28
vlan_create $ul2 222 v$ol2 192.0.2.146/28
tunnel_create g2a gre 192.0.2.66 192.0.2.65 tos inherit dev v$ol2
__simple_if_init g2a v$ol2 192.0.2.66/32
ip route add vrf v$ol2 192.0.2.65/32 via 192.0.2.129
tunnel_create g2b gre 192.0.2.82 192.0.2.81 tos inherit dev v$ol2
__simple_if_init g2b v$ol2 192.0.2.82/32
ip route add vrf v$ol2 192.0.2.81/32 via 192.0.2.145
ip -6 nexthop add id 201 dev g2a
ip -6 nexthop add id 202 dev g2b
ip nexthop add id 203 group 201/202 type resilient buckets 512 \
idle_timer 0
ip route add vrf v$ol2 192.0.2.0/28 nhid 203
ip route add vrf v$ol2 2001:db8:1::/64 nhid 203
tc qdisc add dev $ul2 clsact
tc filter add dev $ul2 ingress pref 111 prot 802.1Q \
flower vlan_id 111 action pass
tc filter add dev $ul2 ingress pref 222 prot 802.1Q \
flower vlan_id 222 action pass
}
sw2_destroy()
{
tc qdisc del dev $ul2 clsact
ip route del vrf v$ol2 2001:db8:1::/64
ip route del vrf v$ol2 192.0.2.0/28
ip nexthop del id 203
ip -6 nexthop del id 202
ip -6 nexthop del id 201
ip route del vrf v$ol2 192.0.2.81/32 via 192.0.2.145
__simple_if_fini g2b 192.0.2.82/32
tunnel_destroy g2b
ip route del vrf v$ol2 192.0.2.65/32 via 192.0.2.129
__simple_if_fini g2a 192.0.2.66/32
tunnel_destroy g2a
vlan_destroy $ul2 222
vlan_destroy $ul2 111
__simple_if_fini $ul2
simple_if_fini $ol2 192.0.2.17/28 2001:db8:2::1/64
}
h2_create()
{
simple_if_init $h2 192.0.2.18/28 2001:db8:2::2/64
ip route add vrf v$h2 192.0.2.0/28 via 192.0.2.17
ip route add vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1
}
h2_destroy()
{
ip route del vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1
ip route del vrf v$h2 192.0.2.0/28 via 192.0.2.17
simple_if_fini $h2 192.0.2.18/28 2001:db8:2::2/64
}
setup_prepare()
{
h1=${NETIFS[p1]}
ol1=${NETIFS[p2]}
ul1=${NETIFS[p3]}
ul2=${NETIFS[p4]}
ol2=${NETIFS[p5]}
h2=${NETIFS[p6]}
vrf_prepare
h1_create
sw1_create
sw2_create
h2_create
forwarding_enable
}
cleanup()
{
pre_cleanup
forwarding_restore
h2_destroy
sw2_destroy
sw1_destroy
h1_destroy
vrf_cleanup
}
multipath4_test()
{
local what=$1; shift
local weight1=$1; shift
local weight2=$1; shift
sysctl_set net.ipv4.fib_multipath_hash_policy 1
ip nexthop replace id 103 group 101,$weight1/102,$weight2 \
type resilient
local t0_111=$(tc_rule_stats_get $ul2 111 ingress)
local t0_222=$(tc_rule_stats_get $ul2 222 ingress)
ip vrf exec v$h1 \
$MZ $h1 -q -p 64 -A 192.0.2.1 -B 192.0.2.18 \
-d 1msec -t udp "sp=1024,dp=0-32768"
local t1_111=$(tc_rule_stats_get $ul2 111 ingress)
local t1_222=$(tc_rule_stats_get $ul2 222 ingress)
local d111=$((t1_111 - t0_111))
local d222=$((t1_222 - t0_222))
multipath_eval "$what" $weight1 $weight2 $d111 $d222
ip nexthop replace id 103 group 101/102 type resilient
sysctl_restore net.ipv4.fib_multipath_hash_policy
}
multipath6_test()
{
local what=$1; shift
local weight1=$1; shift
local weight2=$1; shift
sysctl_set net.ipv6.fib_multipath_hash_policy 0
ip nexthop replace id 103 group 101,$weight1/102,$weight2 \
type resilient
local t0_111=$(tc_rule_stats_get $ul2 111 ingress)
local t0_222=$(tc_rule_stats_get $ul2 222 ingress)
# Generate 16384 echo requests, each with a random flow label.
for ((i=0; i < 16384; ++i)); do
ip vrf exec v$h1 $PING6 2001:db8:2::2 -F 0 -c 1 -q &> /dev/null
done
local t1_111=$(tc_rule_stats_get $ul2 111 ingress)
local t1_222=$(tc_rule_stats_get $ul2 222 ingress)
local d111=$((t1_111 - t0_111))
local d222=$((t1_222 - t0_222))
multipath_eval "$what" $weight1 $weight2 $d111 $d222
ip nexthop replace id 103 group 101/102 type resilient
sysctl_restore net.ipv6.fib_multipath_hash_policy
}
multipath6_l4_test()
{
local what=$1; shift
local weight1=$1; shift
local weight2=$1; shift
sysctl_set net.ipv6.fib_multipath_hash_policy 1
ip nexthop replace id 103 group 101,$weight1/102,$weight2 \
type resilient
local t0_111=$(tc_rule_stats_get $ul2 111 ingress)
local t0_222=$(tc_rule_stats_get $ul2 222 ingress)
ip vrf exec v$h1 \
$MZ $h1 -6 -q -p 64 -A 2001:db8:1::1 -B 2001:db8:2::2 \
-d 1msec -t udp "sp=1024,dp=0-32768"
local t1_111=$(tc_rule_stats_get $ul2 111 ingress)
local t1_222=$(tc_rule_stats_get $ul2 222 ingress)
local d111=$((t1_111 - t0_111))
local d222=$((t1_222 - t0_222))
multipath_eval "$what" $weight1 $weight2 $d111 $d222
ip nexthop replace id 103 group 101/102 type resilient
sysctl_restore net.ipv6.fib_multipath_hash_policy
}
ping_ipv4()
{
ping_test $h1 192.0.2.18
}
ping_ipv6()
{
ping6_test $h1 2001:db8:2::2
}
multipath_ipv4()
{
log_info "Running IPv4 multipath tests"
multipath4_test "ECMP" 1 1
multipath4_test "Weighted MP 2:1" 2 1
multipath4_test "Weighted MP 11:45" 11 45
}
multipath_ipv6()
{
log_info "Running IPv6 multipath tests"
multipath6_test "ECMP" 1 1
multipath6_test "Weighted MP 2:1" 2 1
multipath6_test "Weighted MP 11:45" 11 45
}
multipath_ipv6_l4()
{
log_info "Running IPv6 L4 hash multipath tests"
multipath6_l4_test "ECMP" 1 1
multipath6_l4_test "Weighted MP 2:1" 2 1
multipath6_l4_test "Weighted MP 11:45" 11 45
}
trap cleanup EXIT
setup_prepare
setup_wait
tests_run
exit $EXIT_STATUS