Commit 8448f91f authored by David S. Miller

Merge branch 'ipv6-Add-support-for-non-equal-cost-multipath'

Ido Schimmel says:

====================
ipv6: Add support for non-equal-cost multipath

This set aims to add support for IPv6 non-equal-cost multipath routes.
The first three patches convert multipath selection to use the
hash-threshold method (RFC 2992) instead of modulo-N. The same method is
employed by the IPv4 routing code since commit 0e884c78 ("ipv4: L3
hash-based multipath").

Unlike modulo-N, with hash-threshold only the flows near the region
boundaries are affected when a nexthop is added or removed. In addition,
it allows us to easily add support for non-equal-cost multipath in the
last patch by sizing the different regions according to the provided
weights.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents e2b3b35e 398958ae
...@@ -149,6 +149,7 @@ struct rt6_info { ...@@ -149,6 +149,7 @@ struct rt6_info {
*/ */
struct list_head rt6i_siblings; struct list_head rt6i_siblings;
unsigned int rt6i_nsiblings; unsigned int rt6i_nsiblings;
atomic_t rt6i_nh_upper_bound;
atomic_t rt6i_ref; atomic_t rt6i_ref;
...@@ -170,6 +171,7 @@ struct rt6_info { ...@@ -170,6 +171,7 @@ struct rt6_info {
u32 rt6i_metric; u32 rt6i_metric;
u32 rt6i_pmtu; u32 rt6i_pmtu;
/* more non-fragment space at head required */ /* more non-fragment space at head required */
int rt6i_nh_weight;
unsigned short rt6i_nfheader_len; unsigned short rt6i_nfheader_len;
u8 rt6i_protocol; u8 rt6i_protocol;
u8 exception_bucket_flushed:1, u8 exception_bucket_flushed:1,
......
...@@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) ...@@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
} }
/*
 * rt6_qualify_for_ecmp - test whether a route's flags allow it to be a
 * multipath sibling.
 * @rt: route to inspect (only rt6i_flags is read).
 *
 * Returns true when, out of RTF_GATEWAY, RTF_ADDRCONF and RTF_DYNAMIC,
 * the route has exactly RTF_GATEWAY set; any other combination of these
 * three flags yields false. Flags outside this mask are ignored.
 */
static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt)
{
return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
RTF_GATEWAY;
}
void ip6_route_input(struct sk_buff *skb); void ip6_route_input(struct sk_buff *skb);
struct dst_entry *ip6_route_input_lookup(struct net *net, struct dst_entry *ip6_route_input_lookup(struct net *net,
struct net_device *dev, struct net_device *dev,
...@@ -171,6 +177,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); ...@@ -171,6 +177,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_disable_ip(struct net_device *dev, unsigned long event);
void rt6_sync_down_dev(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
void rt6_multipath_rebalance(struct rt6_info *rt);
static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
{ {
......
...@@ -796,12 +796,6 @@ static struct fib6_node *fib6_add_1(struct net *net, ...@@ -796,12 +796,6 @@ static struct fib6_node *fib6_add_1(struct net *net,
return ln; return ln;
} }
static bool rt6_qualify_for_ecmp(struct rt6_info *rt)
{
return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
RTF_GATEWAY;
}
static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc)
{ {
int i; int i;
...@@ -991,6 +985,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ...@@ -991,6 +985,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
rt6i_nsiblings++; rt6i_nsiblings++;
} }
BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings);
rt6_multipath_rebalance(temp_sibling);
} }
/* /*
...@@ -1672,6 +1667,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, ...@@ -1672,6 +1667,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
sibling->rt6i_nsiblings--; sibling->rt6i_nsiblings--;
rt->rt6i_nsiblings = 0; rt->rt6i_nsiblings = 0;
list_del_init(&rt->rt6i_siblings); list_del_init(&rt->rt6i_siblings);
rt6_multipath_rebalance(next_sibling);
} }
/* Adjust walkers */ /* Adjust walkers */
......
...@@ -455,7 +455,6 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, ...@@ -455,7 +455,6 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
int strict) int strict)
{ {
struct rt6_info *sibling, *next_sibling; struct rt6_info *sibling, *next_sibling;
int route_choosen;
/* We might have already computed the hash for ICMPv6 errors. In such /* We might have already computed the hash for ICMPv6 errors. In such
* case it will always be non-zero. Otherwise now is the time to do it. * case it will always be non-zero. Otherwise now is the time to do it.
...@@ -463,28 +462,19 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, ...@@ -463,28 +462,19 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
if (!fl6->mp_hash) if (!fl6->mp_hash)
fl6->mp_hash = rt6_multipath_hash(fl6, NULL); fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1); if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
/* Don't change the route, if route_choosen == 0 return match;
* (siblings does not include ourself)
*/
if (route_choosen)
list_for_each_entry_safe(sibling, next_sibling,
&match->rt6i_siblings, rt6i_siblings) {
route_choosen--;
if (route_choosen == 0) {
struct inet6_dev *idev = sibling->rt6i_idev;
if (sibling->rt6i_nh_flags & RTNH_F_DEAD) list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
break; rt6i_siblings) {
if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN && if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
idev->cnf.ignore_routes_with_linkdown) continue;
break;
if (rt6_score_route(sibling, oif, strict) < 0) if (rt6_score_route(sibling, oif, strict) < 0)
break; break;
match = sibling; match = sibling;
break; break;
} }
}
return match; return match;
} }
...@@ -1833,10 +1823,10 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb) ...@@ -1833,10 +1823,10 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
if (skb) { if (skb) {
ip6_multipath_l3_keys(skb, &hash_keys); ip6_multipath_l3_keys(skb, &hash_keys);
return flow_hash_from_keys(&hash_keys); return flow_hash_from_keys(&hash_keys) >> 1;
} }
return get_hash_from_flowi6(fl6); return get_hash_from_flowi6(fl6) >> 1;
} }
void ip6_route_input(struct sk_buff *skb) void ip6_route_input(struct sk_buff *skb)
...@@ -2604,6 +2594,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, ...@@ -2604,6 +2594,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
#endif #endif
rt->rt6i_metric = cfg->fc_metric; rt->rt6i_metric = cfg->fc_metric;
rt->rt6i_nh_weight = 1;
/* We cannot add true routes via loopback here, /* We cannot add true routes via loopback here,
they would result in kernel looping; promote them to reject routes they would result in kernel looping; promote them to reject routes
...@@ -3481,6 +3472,99 @@ struct arg_netdev_event { ...@@ -3481,6 +3472,99 @@ struct arg_netdev_event {
}; };
}; };
/*
 * rt6_multipath_first_sibling - find the first ECMP-capable route that
 * shares @rt's metric in @rt's FIB node.
 * @rt: any route attached to the node of interest.
 *
 * Walks the node's route list starting at its leaf and following
 * rt6_next. Returns the first entry whose metric equals @rt's metric and
 * for which rt6_qualify_for_ecmp() is true, or NULL when none is found.
 *
 * Every pointer is fetched with rcu_dereference_protected() against
 * rt->rt6i_table->tb6_lock, so the caller is expected to hold that lock.
 */
static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
{
	struct fib6_node *node;
	struct rt6_info *cur;

	node = rcu_dereference_protected(rt->rt6i_node,
			lockdep_is_held(&rt->rt6i_table->tb6_lock));

	for (cur = rcu_dereference_protected(node->leaf,
			lockdep_is_held(&rt->rt6i_table->tb6_lock));
	     cur;
	     cur = rcu_dereference_protected(cur->rt6_next,
			lockdep_is_held(&rt->rt6i_table->tb6_lock))) {
		/* Only routes with the same metric can be siblings. */
		if (cur->rt6i_metric != rt->rt6i_metric)
			continue;

		if (rt6_qualify_for_ecmp(cur))
			return cur;
	}

	return NULL;
}
/*
 * rt6_is_dead - tell whether a nexthop must be left out of multipath
 * weighting.
 * @rt: route whose nexthop flags are examined.
 *
 * Returns true if RTNH_F_DEAD is set, or if RTNH_F_LINKDOWN is set while
 * the route's inet6 device has ignore_routes_with_linkdown enabled.
 * Returns false otherwise.
 */
static bool rt6_is_dead(const struct rt6_info *rt)
{
	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		return true;

	/* A link-down nexthop only counts as dead when the device says so. */
	return (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) &&
	       rt->rt6i_idev->cnf.ignore_routes_with_linkdown;
}
/*
 * rt6_multipath_total_weight - sum the weights of all live nexthops of a
 * multipath route.
 * @rt: route whose own weight and whose rt6i_siblings list are summed.
 *
 * Nexthops for which rt6_is_dead() is true contribute nothing.
 * Returns the accumulated rt6i_nh_weight of @rt and its siblings.
 */
static int rt6_multipath_total_weight(const struct rt6_info *rt)
{
	int sum = rt6_is_dead(rt) ? 0 : rt->rt6i_nh_weight;
	struct rt6_info *sib;

	list_for_each_entry(sib, &rt->rt6i_siblings, rt6i_siblings) {
		if (rt6_is_dead(sib))
			continue;
		sum += sib->rt6i_nh_weight;
	}

	return sum;
}
/*
 * rt6_upper_bound_set - assign the hash-threshold upper bound of one
 * nexthop.
 * @rt:     route whose rt6i_nh_upper_bound is written.
 * @weight: running sum of the weights of the live nexthops processed so
 *          far; updated in place when @rt is alive.
 * @total:  total weight of all live nexthops of the multipath route.
 *
 * A dead nexthop (see rt6_is_dead()) gets an upper bound of -1 and leaves
 * *@weight untouched. Otherwise the nexthop's weight is added to *@weight
 * and the bound becomes round(*@weight * 2^31 / @total) - 1.
 *
 * NOTE(review): @total is used as a divisor only for a live nexthop; it
 * is presumably non-zero in that case because callers compute it with
 * rt6_multipath_total_weight() -- confirm for any new caller.
 */
static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
{
	int bound;

	if (rt6_is_dead(rt)) {
		atomic_set(&rt->rt6i_nh_upper_bound, -1);
		return;
	}

	*weight += rt->rt6i_nh_weight;
	bound = DIV_ROUND_CLOSEST_ULL((u64)*weight << 31, total) - 1;
	atomic_set(&rt->rt6i_nh_upper_bound, bound);
}
static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
{
struct rt6_info *iter;
int weight = 0;
rt6_upper_bound_set(rt, &weight, total);
list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
rt6_upper_bound_set(iter, &weight, total);
}
/*
 * rt6_multipath_rebalance - recompute the hash-threshold upper bounds of
 * the multipath route that @rt belongs to.
 * @rt: any member of the multipath route.
 *
 * Does nothing when @rt has no siblings or is marked should_flush.
 * Otherwise locates the first sibling, sums the weights of the live
 * nexthops and reassigns every upper bound starting from that first
 * sibling. Warns once and bails out if no first sibling can be found.
 */
void rt6_multipath_rebalance(struct rt6_info *rt)
{
	struct rt6_info *head;

	/* When the whole multipath route is being flushed there is no
	 * point in rebalancing after each individual sibling goes away;
	 * a route without siblings has nothing to balance either.
	 */
	if (rt->should_flush || !rt->rt6i_nsiblings)
		return;

	/* Lookup evaluates routes in order, so the upper bounds have to
	 * be handed out beginning with the first sibling.
	 */
	head = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!head))
		return;

	rt6_multipath_upper_bound_set(head, rt6_multipath_total_weight(head));
}
static int fib6_ifup(struct rt6_info *rt, void *p_arg) static int fib6_ifup(struct rt6_info *rt, void *p_arg)
{ {
const struct arg_netdev_event *arg = p_arg; const struct arg_netdev_event *arg = p_arg;
...@@ -3489,6 +3573,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg) ...@@ -3489,6 +3573,7 @@ static int fib6_ifup(struct rt6_info *rt, void *p_arg)
if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
rt->rt6i_nh_flags &= ~arg->nh_flags; rt->rt6i_nh_flags &= ~arg->nh_flags;
fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
rt6_multipath_rebalance(rt);
} }
return 0; return 0;
...@@ -3588,6 +3673,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) ...@@ -3588,6 +3673,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
RTNH_F_LINKDOWN); RTNH_F_LINKDOWN);
fib6_update_sernum(rt); fib6_update_sernum(rt);
rt6_multipath_rebalance(rt);
} }
return -2; return -2;
case NETDEV_CHANGE: case NETDEV_CHANGE:
...@@ -3595,6 +3681,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg) ...@@ -3595,6 +3681,7 @@ static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
break; break;
rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
rt6_multipath_rebalance(rt);
break; break;
} }
...@@ -3938,6 +4025,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, ...@@ -3938,6 +4025,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
goto cleanup; goto cleanup;
} }
rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
if (err) { if (err) {
dst_release_immediate(&rt->dst); dst_release_immediate(&rt->dst);
...@@ -4160,7 +4249,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) ...@@ -4160,7 +4249,7 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
if (!rtnh) if (!rtnh)
goto nla_put_failure; goto nla_put_failure;
rtnh->rtnh_hops = 0; rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
if (rt6_nexthop_info(skb, rt, &flags, true) < 0) if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment