Commit 73bf1fc5 authored by David S. Miller's avatar David S. Miller

Merge branch 'net-ipv6-Fix-route-append-and-replace-use-cases'

David Ahern says:

====================
net/ipv6: Fix route append and replace use cases

This patch set fixes a few append and replace uses cases for IPv6 and
adds test cases that codifies the expectations of how append and replace
are expected to work. In paricular it allows a multipath route to have
a dev-only nexthop, something Thomas tried to accomplish with commit
edd7ceb7 ("ipv6: Allow non-gateway ECMP for IPv6") which had to be
reverted because of breakage, and to replace an existing FIB entry
with a reject route.

There are a number of inconsistent and surprising aspects to the Linux
API for adding, deleting, replacing and changing FIB entries. For example,
with IPv4 NLM_F_APPEND means insert the route after any existing entries
with the same key (prefix + priority + TOS for IPv4) and NLM_F_CREATE
without the append flag inserts the new route before any existing entries.

IPv6 on the other hand attempts to guess whether a new route should be
appended to an existing one, possibly creating a multipath route, or to
add a new entry after any existing ones. This applies to both the 'append'
(NLM_F_CREATE + NLM_F_APPEND) and 'prepend' (NLM_F_CREATE only) cases
meaning for IPv6 the NLM_F_APPEND is basically ignored. This guessing
whether the route should be added to a multipath route (gateway routes)
or inserted after existing entries (non-gateway based routes) means a
multipath route can not have a dev only nexthop (potentially required in
some cases - tunnels or VRF route leaking for example) and route 'replace'
is a bit adhoc treating gateway based routes and dev-only / reject routes
differently.

This has led to frustration with developers working on routing suites
such as FRR where workarounds such as delete and add are used instead of
replace.

After this patch set there are 2 differences between IPv4 and IPv6:
1. 'ip ro prepend' = NLM_F_CREATE only
    IPv4 adds the new route before any existing ones
    IPv6 adds new route after any existing ones

2. 'ip ro append' = NLM_F_CREATE|NLM_F_APPEND
   IPv4 adds the new route after any existing ones
   IPv6 adds the nexthop to existing routes converting to multipath

For the former, there are cases where we want same prefix routes added
after existing ones (e.g., multicast, prefix routes for macvlan when used
for virtual router redundancy). Requiring the APPEND flag to add a new
route to an existing one helps here but is a slight change in behavior
since prepend with gateway routes now create a separate entry.

For the latter IPv6 behavior is preferred - appending a route for the same
prefix and metric to make a multipath route, so really IPv4 not allowing an
existing route to be updated is the limiter. This will be fixed when
nexthops become separate objects - a future patch set.

Thank you to Thomas and Ido for testing earlier versions of this set, and
to Ido for providing an update to the mlxsw driver.

Changes since RFC
- cleanup wording in test script; add comments about expected failures
  and why
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents e3bb946c abb1860a
......@@ -5725,6 +5725,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work)
switch (fib_work->event) {
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
case FIB_EVENT_ENTRY_APPEND: /* fall through */
case FIB_EVENT_ENTRY_ADD:
replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE;
err = mlxsw_sp_router_fib6_add(mlxsw_sp,
......@@ -5831,6 +5832,7 @@ static void mlxsw_sp_router_fib6_event(struct mlxsw_sp_fib_event_work *fib_work,
switch (fib_work->event) {
case FIB_EVENT_ENTRY_REPLACE: /* fall through */
case FIB_EVENT_ENTRY_APPEND: /* fall through */
case FIB_EVENT_ENTRY_ADD: /* fall through */
case FIB_EVENT_ENTRY_DEL:
fen6_info = container_of(info, struct fib6_entry_notifier_info,
......
......@@ -66,12 +66,6 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
}
static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
{
return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
RTF_GATEWAY;
}
void ip6_route_input(struct sk_buff *skb);
struct dst_entry *ip6_route_input_lookup(struct net *net,
struct net_device *dev,
......
......@@ -934,19 +934,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
{
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&rt->fib6_table->tb6_lock));
struct fib6_info *iter = NULL;
struct fib6_info *iter = NULL, *match = NULL;
struct fib6_info __rcu **ins;
struct fib6_info __rcu **fallback_ins = NULL;
int replace = (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_REPLACE));
int append = (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_APPEND));
int add = (!info->nlh ||
(info->nlh->nlmsg_flags & NLM_F_CREATE));
int found = 0;
bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
u16 nlflags = NLM_F_EXCL;
int err;
if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
if (append)
nlflags |= NLM_F_APPEND;
ins = &fn->leaf;
......@@ -968,14 +968,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
nlflags &= ~NLM_F_EXCL;
if (replace) {
if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
found++;
break;
}
if (rt_can_ecmp)
fallback_ins = fallback_ins ?: ins;
goto next_iter;
}
if (rt6_duplicate_nexthop(iter, rt)) {
if (rt->fib6_nsiblings)
......@@ -989,86 +984,67 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
return -EEXIST;
}
/* If we have the same destination and the same metric,
* but not the same gateway, then the route we try to
* add is sibling to this route, increment our counter
* of siblings, and later we will add our route to the
* list.
* Only static routes (which don't have flag
* RTF_EXPIRES) are used for ECMPv6.
*
* To avoid long list, we only had siblings if the
* route have a gateway.
*/
if (rt_can_ecmp &&
rt6_qualify_for_ecmp(iter))
rt->fib6_nsiblings++;
/* first route that matches */
if (!match)
match = iter;
}
if (iter->fib6_metric > rt->fib6_metric)
break;
next_iter:
ins = &iter->fib6_next;
}
if (fallback_ins && !found) {
/* No ECMP-able route found, replace first non-ECMP one */
ins = fallback_ins;
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
found++;
}
/* Reset round-robin state, if necessary */
if (ins == &fn->leaf)
fn->rr_ptr = NULL;
/* Link this route to others same route. */
if (rt->fib6_nsiblings) {
unsigned int fib6_nsiblings;
if (append && match) {
struct fib6_info *sibling, *temp_sibling;
/* Find the first route that have the same metric */
sibling = leaf;
while (sibling) {
if (sibling->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(sibling)) {
list_add_tail(&rt->fib6_siblings,
&sibling->fib6_siblings);
break;
}
sibling = rcu_dereference_protected(sibling->fib6_next,
lockdep_is_held(&rt->fib6_table->tb6_lock));
if (rt->fib6_flags & RTF_REJECT) {
NL_SET_ERR_MSG(extack,
"Can not append a REJECT route");
return -EINVAL;
} else if (match->fib6_flags & RTF_REJECT) {
NL_SET_ERR_MSG(extack,
"Can not append to a REJECT route");
return -EINVAL;
}
rt->fib6_nsiblings = match->fib6_nsiblings;
list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
match->fib6_nsiblings++;
/* For each sibling in the list, increment the counter of
* siblings. BUG() if counters does not match, list of siblings
* is broken!
*/
fib6_nsiblings = 0;
list_for_each_entry_safe(sibling, temp_sibling,
&rt->fib6_siblings, fib6_siblings) {
&match->fib6_siblings, fib6_siblings) {
sibling->fib6_nsiblings++;
BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
fib6_nsiblings++;
BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
}
BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
rt6_multipath_rebalance(temp_sibling);
rt6_multipath_rebalance(match);
}
/*
* insert node
*/
if (!replace) {
enum fib_event_type event;
if (!add)
pr_warn("NLM_F_CREATE should be set when creating new route\n");
add:
nlflags |= NLM_F_CREATE;
err = call_fib6_entry_notifiers(info->nl_net,
FIB_EVENT_ENTRY_ADD,
rt, extack);
event = append ? FIB_EVENT_ENTRY_APPEND : FIB_EVENT_ENTRY_ADD;
err = call_fib6_entry_notifiers(info->nl_net, event, rt,
extack);
if (err)
return err;
......@@ -1086,7 +1062,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
}
} else {
int nsiblings;
struct fib6_info *tmp;
if (!found) {
if (add)
......@@ -1101,48 +1077,57 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
if (err)
return err;
/* if route being replaced has siblings, set tmp to
* last one, otherwise tmp is current route. this is
* used to set fib6_next for new route
*/
if (iter->fib6_nsiblings)
tmp = list_last_entry(&iter->fib6_siblings,
struct fib6_info,
fib6_siblings);
else
tmp = iter;
/* insert new route */
atomic_inc(&rt->fib6_ref);
rcu_assign_pointer(rt->fib6_node, fn);
rt->fib6_next = iter->fib6_next;
rt->fib6_next = tmp->fib6_next;
rcu_assign_pointer(*ins, rt);
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
if (!(fn->fn_flags & RTN_RTINFO)) {
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO;
}
nsiblings = iter->fib6_nsiblings;
iter->fib6_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
fib6_info_release(iter);
if (nsiblings) {
/* delete old route */
rt = iter;
if (rt->fib6_nsiblings) {
struct fib6_info *tmp;
/* Replacing an ECMP route, remove all siblings */
ins = &rt->fib6_next;
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
while (iter) {
if (iter->fib6_metric > rt->fib6_metric)
break;
if (rt6_qualify_for_ecmp(iter)) {
*ins = iter->fib6_next;
list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
fib6_siblings) {
iter->fib6_node = NULL;
fib6_purge_rt(iter, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
fib6_info_release(iter);
nsiblings--;
rt->fib6_nsiblings--;
info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
} else {
ins = &iter->fib6_next;
}
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
}
WARN_ON(nsiblings != 0);
}
WARN_ON(rt->fib6_nsiblings != 0);
rt->fib6_node = NULL;
fib6_purge_rt(rt, fn, info->nl_net);
if (rcu_access_pointer(fn->rr_ptr) == rt)
fn->rr_ptr = NULL;
fib6_info_release(rt);
}
return 0;
......
......@@ -3791,7 +3791,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
lockdep_is_held(&rt->fib6_table->tb6_lock));
while (iter) {
if (iter->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(iter))
iter->fib6_nsiblings)
return iter;
iter = rcu_dereference_protected(iter->fib6_next,
lockdep_is_held(&rt->fib6_table->tb6_lock));
......@@ -4381,6 +4381,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
*/
cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
NLM_F_REPLACE);
cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
nhn++;
}
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment