Commit 4d5c32ec authored by David S. Miller's avatar David S. Miller

Merge branch 'nexthop-group-fixes'

David Ahern says:

====================
nexthops: Fix 2 fundamental flaws with nexthop groups

Nik's torture tests have exposed 2 fundamental mistakes with the initial
nexthop code for groups. First, the nexthops entries and num_nh in the
nh_grp struct should not be modified once the struct is set under rcu.
Doing so has major affects on the datapath seeing valid nexthop entries.

Second, the helpers in the header file were convenient for not repeating
code, but they cause datapath walks to potentially see 2 different group
structs after an rcu replace, disrupting a walk of the path objects.
This second problem applies solely to IPv4 as I re-used too much of the
existing code in walking legs of a multipath route.

Patches 1 is refactoring change to simplify the overhead of reviewing and
understanding the change in patch 2 which fixes the update of nexthop
groups when a compnent leg is removed.

Patches 3-5 address the second problem. Patch 3 inlines the multipath
check such that the mpath lookup and subsequent calls all use the same
nh_grp struct. Patches 4 and 5 fix datapath uses of fib_info_num_path
with iterative calls to fib_info_nhc.

fib_info_num_path can be used in control plane path in a 'for loop' with
subsequent fib_info_nhc calls to get each leg since the nh_grp struct is
only changed while holding the rtnl; the combination can not be used in
the data plane with external nexthops as it involves repeated dereferences
of nh_grp struct which can change between calls.

Similarly, nexthop_is_multipath can be used for branching decisions in
the datapath since the nexthop type can not be changed (a group can not
be converted to standalone and vice versa).

Patch set developed in coordination with Nikolay Aleksandrov. He did a
lot of work creating a good reproducer, discussing options to fix it
and testing iterations.

I have adapted Nik's commands into additional tests in the nexthops
selftest script which I will send against -next.

v2
- fixed whitespace errors
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 963bdfc7 1fd1c768
...@@ -447,6 +447,16 @@ static inline int fib_num_tclassid_users(struct net *net) ...@@ -447,6 +447,16 @@ static inline int fib_num_tclassid_users(struct net *net)
#endif #endif
int fib_unmerge(struct net *net); int fib_unmerge(struct net *net);
static inline bool nhc_l3mdev_matches_dev(const struct fib_nh_common *nhc,
const struct net_device *dev)
{
if (nhc->nhc_dev == dev ||
l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex)
return true;
return false;
}
/* Exported by fib_semantics.c */ /* Exported by fib_semantics.c */
int ip_fib_check_default(__be32 gw, struct net_device *dev); int ip_fib_check_default(__be32 gw, struct net_device *dev);
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
...@@ -479,6 +489,8 @@ void fib_nh_common_release(struct fib_nh_common *nhc); ...@@ -479,6 +489,8 @@ void fib_nh_common_release(struct fib_nh_common *nhc);
void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri); void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri);
void fib_trie_init(void); void fib_trie_init(void);
struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
const struct flowi4 *flp);
static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
{ {
......
...@@ -70,6 +70,7 @@ struct nh_grp_entry { ...@@ -70,6 +70,7 @@ struct nh_grp_entry {
}; };
struct nh_group { struct nh_group {
struct nh_group *spare; /* spare group for removals */
u16 num_nh; u16 num_nh;
bool mpath; bool mpath;
bool has_v4; bool has_v4;
...@@ -136,21 +137,20 @@ static inline unsigned int nexthop_num_path(const struct nexthop *nh) ...@@ -136,21 +137,20 @@ static inline unsigned int nexthop_num_path(const struct nexthop *nh)
{ {
unsigned int rc = 1; unsigned int rc = 1;
if (nexthop_is_multipath(nh)) { if (nh->is_group) {
struct nh_group *nh_grp; struct nh_group *nh_grp;
nh_grp = rcu_dereference_rtnl(nh->nh_grp); nh_grp = rcu_dereference_rtnl(nh->nh_grp);
rc = nh_grp->num_nh; if (nh_grp->mpath)
rc = nh_grp->num_nh;
} }
return rc; return rc;
} }
static inline static inline
struct nexthop *nexthop_mpath_select(const struct nexthop *nh, int nhsel) struct nexthop *nexthop_mpath_select(const struct nh_group *nhg, int nhsel)
{ {
const struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
/* for_nexthops macros in fib_semantics.c grabs a pointer to /* for_nexthops macros in fib_semantics.c grabs a pointer to
* the nexthop before checking nhsel * the nexthop before checking nhsel
*/ */
...@@ -185,12 +185,14 @@ static inline bool nexthop_is_blackhole(const struct nexthop *nh) ...@@ -185,12 +185,14 @@ static inline bool nexthop_is_blackhole(const struct nexthop *nh)
{ {
const struct nh_info *nhi; const struct nh_info *nhi;
if (nexthop_is_multipath(nh)) { if (nh->is_group) {
if (nexthop_num_path(nh) > 1) struct nh_group *nh_grp;
return false;
nh = nexthop_mpath_select(nh, 0); nh_grp = rcu_dereference_rtnl(nh->nh_grp);
if (!nh) if (nh_grp->num_nh > 1)
return false; return false;
nh = nh_grp->nh_entries[0].nh;
} }
nhi = rcu_dereference_rtnl(nh->nh_info); nhi = rcu_dereference_rtnl(nh->nh_info);
...@@ -216,16 +218,79 @@ struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel) ...@@ -216,16 +218,79 @@ struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel)
BUILD_BUG_ON(offsetof(struct fib_nh, nh_common) != 0); BUILD_BUG_ON(offsetof(struct fib_nh, nh_common) != 0);
BUILD_BUG_ON(offsetof(struct fib6_nh, nh_common) != 0); BUILD_BUG_ON(offsetof(struct fib6_nh, nh_common) != 0);
if (nexthop_is_multipath(nh)) { if (nh->is_group) {
nh = nexthop_mpath_select(nh, nhsel); struct nh_group *nh_grp;
if (!nh)
return NULL; nh_grp = rcu_dereference_rtnl(nh->nh_grp);
if (nh_grp->mpath) {
nh = nexthop_mpath_select(nh_grp, nhsel);
if (!nh)
return NULL;
}
} }
nhi = rcu_dereference_rtnl(nh->nh_info); nhi = rcu_dereference_rtnl(nh->nh_info);
return &nhi->fib_nhc; return &nhi->fib_nhc;
} }
/* called from fib_table_lookup with rcu_lock */
static inline
struct fib_nh_common *nexthop_get_nhc_lookup(const struct nexthop *nh,
int fib_flags,
const struct flowi4 *flp,
int *nhsel)
{
struct nh_info *nhi;
if (nh->is_group) {
struct nh_group *nhg = rcu_dereference(nh->nh_grp);
int i;
for (i = 0; i < nhg->num_nh; i++) {
struct nexthop *nhe = nhg->nh_entries[i].nh;
nhi = rcu_dereference(nhe->nh_info);
if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) {
*nhsel = i;
return &nhi->fib_nhc;
}
}
} else {
nhi = rcu_dereference(nh->nh_info);
if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) {
*nhsel = 0;
return &nhi->fib_nhc;
}
}
return NULL;
}
static inline bool nexthop_uses_dev(const struct nexthop *nh,
const struct net_device *dev)
{
struct nh_info *nhi;
if (nh->is_group) {
struct nh_group *nhg = rcu_dereference(nh->nh_grp);
int i;
for (i = 0; i < nhg->num_nh; i++) {
struct nexthop *nhe = nhg->nh_entries[i].nh;
nhi = rcu_dereference(nhe->nh_info);
if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev))
return true;
}
} else {
nhi = rcu_dereference(nh->nh_info);
if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev))
return true;
}
return false;
}
static inline unsigned int fib_info_num_path(const struct fib_info *fi) static inline unsigned int fib_info_num_path(const struct fib_info *fi)
{ {
if (unlikely(fi->nh)) if (unlikely(fi->nh))
...@@ -263,8 +328,11 @@ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) ...@@ -263,8 +328,11 @@ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
{ {
struct nh_info *nhi; struct nh_info *nhi;
if (nexthop_is_multipath(nh)) { if (nh->is_group) {
nh = nexthop_mpath_select(nh, 0); struct nh_group *nh_grp;
nh_grp = rcu_dereference_rtnl(nh->nh_grp);
nh = nexthop_mpath_select(nh_grp, 0);
if (!nh) if (!nh)
return NULL; return NULL;
} }
......
...@@ -309,17 +309,18 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) ...@@ -309,17 +309,18 @@ bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
{ {
bool dev_match = false; bool dev_match = false;
#ifdef CONFIG_IP_ROUTE_MULTIPATH #ifdef CONFIG_IP_ROUTE_MULTIPATH
int ret; if (unlikely(fi->nh)) {
dev_match = nexthop_uses_dev(fi->nh, dev);
} else {
int ret;
for (ret = 0; ret < fib_info_num_path(fi); ret++) { for (ret = 0; ret < fib_info_num_path(fi); ret++) {
const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
if (nhc->nhc_dev == dev) { if (nhc_l3mdev_matches_dev(nhc, dev)) {
dev_match = true; dev_match = true;
break; break;
} else if (l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) { }
dev_match = true;
break;
} }
} }
#else #else
......
...@@ -1371,6 +1371,26 @@ static inline t_key prefix_mismatch(t_key key, struct key_vector *n) ...@@ -1371,6 +1371,26 @@ static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
return (key ^ prefix) & (prefix | -prefix); return (key ^ prefix) & (prefix | -prefix);
} }
bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
const struct flowi4 *flp)
{
if (nhc->nhc_flags & RTNH_F_DEAD)
return false;
if (ip_ignore_linkdown(nhc->nhc_dev) &&
nhc->nhc_flags & RTNH_F_LINKDOWN &&
!(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
return false;
if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
if (flp->flowi4_oif &&
flp->flowi4_oif != nhc->nhc_oif)
return false;
}
return true;
}
/* should be called with rcu_read_lock */ /* should be called with rcu_read_lock */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags) struct fib_result *res, int fib_flags)
...@@ -1503,6 +1523,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, ...@@ -1503,6 +1523,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
/* Step 3: Process the leaf, if that fails fall back to backtracing */ /* Step 3: Process the leaf, if that fails fall back to backtracing */
hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
struct fib_info *fi = fa->fa_info; struct fib_info *fi = fa->fa_info;
struct fib_nh_common *nhc;
int nhsel, err; int nhsel, err;
if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) { if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
...@@ -1528,26 +1549,25 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, ...@@ -1528,26 +1549,25 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
if (fi->fib_flags & RTNH_F_DEAD) if (fi->fib_flags & RTNH_F_DEAD)
continue; continue;
if (unlikely(fi->nh && nexthop_is_blackhole(fi->nh))) { if (unlikely(fi->nh)) {
err = fib_props[RTN_BLACKHOLE].error; if (nexthop_is_blackhole(fi->nh)) {
goto out_reject; err = fib_props[RTN_BLACKHOLE].error;
goto out_reject;
}
nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp,
&nhsel);
if (nhc)
goto set_result;
goto miss;
} }
for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); nhc = fib_info_nhc(fi, nhsel);
if (nhc->nhc_flags & RTNH_F_DEAD) if (!fib_lookup_good_nhc(nhc, fib_flags, flp))
continue; continue;
if (ip_ignore_linkdown(nhc->nhc_dev) && set_result:
nhc->nhc_flags & RTNH_F_LINKDOWN &&
!(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
continue;
if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
if (flp->flowi4_oif &&
flp->flowi4_oif != nhc->nhc_oif)
continue;
}
if (!(fib_flags & FIB_LOOKUP_NOREF)) if (!(fib_flags & FIB_LOOKUP_NOREF))
refcount_inc(&fi->fib_clntref); refcount_inc(&fi->fib_clntref);
...@@ -1568,6 +1588,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, ...@@ -1568,6 +1588,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
return err; return err;
} }
} }
miss:
#ifdef CONFIG_IP_FIB_TRIE_STATS #ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_miss); this_cpu_inc(stats->semantic_match_miss);
#endif #endif
......
...@@ -63,9 +63,16 @@ static void nexthop_free_mpath(struct nexthop *nh) ...@@ -63,9 +63,16 @@ static void nexthop_free_mpath(struct nexthop *nh)
int i; int i;
nhg = rcu_dereference_raw(nh->nh_grp); nhg = rcu_dereference_raw(nh->nh_grp);
for (i = 0; i < nhg->num_nh; ++i) for (i = 0; i < nhg->num_nh; ++i) {
WARN_ON(nhg->nh_entries[i].nh); struct nh_grp_entry *nhge = &nhg->nh_entries[i];
WARN_ON(!list_empty(&nhge->nh_list));
nexthop_put(nhge->nh);
}
WARN_ON(nhg->spare == nhg);
kfree(nhg->spare);
kfree(nhg); kfree(nhg);
} }
...@@ -694,41 +701,56 @@ static void nh_group_rebalance(struct nh_group *nhg) ...@@ -694,41 +701,56 @@ static void nh_group_rebalance(struct nh_group *nhg)
} }
} }
static void remove_nh_grp_entry(struct nh_grp_entry *nhge, static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
struct nh_group *nhg,
struct nl_info *nlinfo) struct nl_info *nlinfo)
{ {
struct nh_grp_entry *nhges, *new_nhges;
struct nexthop *nhp = nhge->nh_parent;
struct nexthop *nh = nhge->nh; struct nexthop *nh = nhge->nh;
struct nh_grp_entry *nhges; struct nh_group *nhg, *newg;
bool found = false; int i, j;
int i;
WARN_ON(!nh); WARN_ON(!nh);
nhges = nhg->nh_entries; nhg = rtnl_dereference(nhp->nh_grp);
for (i = 0; i < nhg->num_nh; ++i) { newg = nhg->spare;
if (found) {
nhges[i-1].nh = nhges[i].nh;
nhges[i-1].weight = nhges[i].weight;
list_del(&nhges[i].nh_list);
list_add(&nhges[i-1].nh_list, &nhges[i-1].nh->grp_list);
} else if (nhg->nh_entries[i].nh == nh) {
found = true;
}
}
if (WARN_ON(!found)) /* last entry, keep it visible and remove the parent */
if (nhg->num_nh == 1) {
remove_nexthop(net, nhp, nlinfo);
return; return;
}
newg->has_v4 = nhg->has_v4;
newg->mpath = nhg->mpath;
newg->num_nh = nhg->num_nh;
nhg->num_nh--; /* copy old entries to new except the one getting removed */
nhg->nh_entries[nhg->num_nh].nh = NULL; nhges = nhg->nh_entries;
new_nhges = newg->nh_entries;
for (i = 0, j = 0; i < nhg->num_nh; ++i) {
/* current nexthop getting removed */
if (nhg->nh_entries[i].nh == nh) {
newg->num_nh--;
continue;
}
nh_group_rebalance(nhg); list_del(&nhges[i].nh_list);
new_nhges[j].nh_parent = nhges[i].nh_parent;
new_nhges[j].nh = nhges[i].nh;
new_nhges[j].weight = nhges[i].weight;
list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
j++;
}
nexthop_put(nh); nh_group_rebalance(newg);
rcu_assign_pointer(nhp->nh_grp, newg);
list_del(&nhge->nh_list);
nexthop_put(nhge->nh);
if (nlinfo) if (nlinfo)
nexthop_notify(RTM_NEWNEXTHOP, nhge->nh_parent, nlinfo); nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
} }
static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
...@@ -736,17 +758,11 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, ...@@ -736,17 +758,11 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
{ {
struct nh_grp_entry *nhge, *tmp; struct nh_grp_entry *nhge, *tmp;
list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) { list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
struct nh_group *nhg; remove_nh_grp_entry(net, nhge, nlinfo);
list_del(&nhge->nh_list);
nhg = rtnl_dereference(nhge->nh_parent->nh_grp);
remove_nh_grp_entry(nhge, nhg, nlinfo);
/* if this group has no more entries then remove it */ /* make sure all see the newly published array before releasing rtnl */
if (!nhg->num_nh) synchronize_rcu();
remove_nexthop(net, nhge->nh_parent, nlinfo);
}
} }
static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
...@@ -760,10 +776,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) ...@@ -760,10 +776,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
if (WARN_ON(!nhge->nh)) if (WARN_ON(!nhge->nh))
continue; continue;
list_del(&nhge->nh_list); list_del_init(&nhge->nh_list);
nexthop_put(nhge->nh);
nhge->nh = NULL;
nhg->num_nh--;
} }
} }
...@@ -1086,6 +1099,7 @@ static struct nexthop *nexthop_create_group(struct net *net, ...@@ -1086,6 +1099,7 @@ static struct nexthop *nexthop_create_group(struct net *net,
{ {
struct nlattr *grps_attr = cfg->nh_grp; struct nlattr *grps_attr = cfg->nh_grp;
struct nexthop_grp *entry = nla_data(grps_attr); struct nexthop_grp *entry = nla_data(grps_attr);
u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
struct nh_group *nhg; struct nh_group *nhg;
struct nexthop *nh; struct nexthop *nh;
int i; int i;
...@@ -1096,12 +1110,21 @@ static struct nexthop *nexthop_create_group(struct net *net, ...@@ -1096,12 +1110,21 @@ static struct nexthop *nexthop_create_group(struct net *net,
nh->is_group = 1; nh->is_group = 1;
nhg = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry)); nhg = nexthop_grp_alloc(num_nh);
if (!nhg) { if (!nhg) {
kfree(nh); kfree(nh);
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
/* spare group used for removals */
nhg->spare = nexthop_grp_alloc(num_nh);
if (!nhg) {
kfree(nhg);
kfree(nh);
return NULL;
}
nhg->spare->spare = nhg;
for (i = 0; i < nhg->num_nh; ++i) { for (i = 0; i < nhg->num_nh; ++i) {
struct nexthop *nhe; struct nexthop *nhe;
struct nh_info *nhi; struct nh_info *nhi;
...@@ -1133,6 +1156,7 @@ static struct nexthop *nexthop_create_group(struct net *net, ...@@ -1133,6 +1156,7 @@ static struct nexthop *nexthop_create_group(struct net *net,
for (; i >= 0; --i) for (; i >= 0; --i)
nexthop_put(nhg->nh_entries[i].nh); nexthop_put(nhg->nh_entries[i].nh);
kfree(nhg->spare);
kfree(nhg); kfree(nhg);
kfree(nh); kfree(nh);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment