Commit 45c7ec9d authored by David S. Miller's avatar David S. Miller

Merge branch 'ipv6-Route-ICMPv6-errors-with-the-flow-when-ECMP-in-use'

Jakub Sitnicki says:

====================
ipv6: Route ICMPv6 errors with the flow when ECMP in use

This patch set is another take at making Path MTU Discovery work when
server nodes are behind a router employing multipath routing in a
load-balance or anycast setup (that is, when not every end-node can be
reached by every path). The problem has been well described in RFC 7690
[1], but in short - in such setups ICMPv6 PTB errors are not guaranteed
to be routed back to the server node that sent a reply that exceeds path
MTU.

The proposed solution is two-fold:

 (1) on the server side - reflect the Flow Label [2]. This can be done
     without modifying the application using a new per-netns sysctl knob
     that has been proposed independently of this patchset in the patch
     entitled "ipv6: Add sysctl for per namespace flow label
     reflection" [3].

 (2) on the ECMP router - make the ipv6 routing subsystem look into the
     ICMPv6 error packets and compute the flow-hash from its payload,
     i.e. the offending packet that triggered the error. This is the
     same behavior as ipv4 stack has already.

With both parts in place Path MTU Discovery can work past the ECMP
router when using IPv6.

[1] https://tools.ietf.org/html/rfc7690
[2] https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01
[3] http://patchwork.ozlabs.org/patch/804870/

v1 -> v2:
 - don't use "extern" in external function declaration in header file
 - style change, put as many arguments as possible on the first line of
   a function call, and align consecutive lines to the first argument
 - expand the cover letter based on the feedback

v2 -> v3:
 - switch to computing flow-hash using flow dissector to align with
   recent changes to multipath routing in ipv4 stack
 - add a sysctl knob for enabling flow label reflection per netns

---

Testing has covered multipath routing of ICMPv6 PTB errors in forward
and local output path in a simple use-case of an HTTP server sending a
reply which is over the path MTU size [3]. I have also checked if the
flows get evenly spread over multiple paths (i.e. if there are no
regressions) [4].

[3] https://github.com/jsitnicki/tools/tree/master/net/tests/ecmp/pmtud
[4] https://github.com/jsitnicki/tools/tree/master/net/tests/ecmp/load-balance
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 790c6056 b673d6cc
...@@ -149,6 +149,7 @@ struct flowi6 { ...@@ -149,6 +149,7 @@ struct flowi6 {
#define fl6_ipsec_spi uli.spi #define fl6_ipsec_spi uli.spi
#define fl6_mh_type uli.mht.type #define fl6_mh_type uli.mht.type
#define fl6_gre_key uli.gre_key #define fl6_gre_key uli.gre_key
__u32 mp_hash;
} __attribute__((__aligned__(BITS_PER_LONG/8))); } __attribute__((__aligned__(BITS_PER_LONG/8)));
struct flowidn { struct flowidn {
......
...@@ -115,6 +115,7 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, ...@@ -115,6 +115,7 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
const struct in6_addr *saddr, int oif, int flags); const struct in6_addr *saddr, int oif, int flags);
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb);
struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
......
...@@ -519,6 +519,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, ...@@ -519,6 +519,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
fl6.fl6_icmp_type = type; fl6.fl6_icmp_type = type;
fl6.fl6_icmp_code = code; fl6.fl6_icmp_code = code;
fl6.flowi6_uid = sock_net_uid(net, NULL); fl6.flowi6_uid = sock_net_uid(net, NULL);
fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
sk = icmpv6_xmit_lock(net); sk = icmpv6_xmit_lock(net);
......
...@@ -445,16 +445,6 @@ static bool rt6_check_expired(const struct rt6_info *rt) ...@@ -445,16 +445,6 @@ static bool rt6_check_expired(const struct rt6_info *rt)
return false; return false;
} }
/* Multipath route selection:
* Hash based function using packet header and flowlabel.
* Adapted from fib_info_hashfn()
*/
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
const struct flowi6 *fl6)
{
return get_hash_from_flowi6(fl6) % candidate_count;
}
static struct rt6_info *rt6_multipath_select(struct rt6_info *match, static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
struct flowi6 *fl6, int oif, struct flowi6 *fl6, int oif,
int strict) int strict)
...@@ -462,7 +452,13 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, ...@@ -462,7 +452,13 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
struct rt6_info *sibling, *next_sibling; struct rt6_info *sibling, *next_sibling;
int route_choosen; int route_choosen;
route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6); /* We might have already computed the hash for ICMPv6 errors. In such
* case it will always be non-zero. Otherwise now is the time to do it.
*/
if (!fl6->mp_hash)
fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
/* Don't change the route, if route_choosen == 0 /* Don't change the route, if route_choosen == 0
* (siblings does not include ourself) * (siblings does not include ourself)
*/ */
...@@ -1214,6 +1210,54 @@ struct dst_entry *ip6_route_input_lookup(struct net *net, ...@@ -1214,6 +1210,54 @@ struct dst_entry *ip6_route_input_lookup(struct net *net,
} }
EXPORT_SYMBOL_GPL(ip6_route_input_lookup); EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
struct flow_keys *keys)
{
const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
const struct ipv6hdr *key_iph = outer_iph;
const struct ipv6hdr *inner_iph;
const struct icmp6hdr *icmph;
struct ipv6hdr _inner_iph;
if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
goto out;
icmph = icmp6_hdr(skb);
if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
icmph->icmp6_type != ICMPV6_PARAMPROB)
goto out;
inner_iph = skb_header_pointer(skb,
skb_transport_offset(skb) + sizeof(*icmph),
sizeof(_inner_iph), &_inner_iph);
if (!inner_iph)
goto out;
key_iph = inner_iph;
out:
memset(keys, 0, sizeof(*keys));
keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
keys->addrs.v6addrs.src = key_iph->saddr;
keys->addrs.v6addrs.dst = key_iph->daddr;
keys->tags.flow_label = ip6_flowinfo(key_iph);
keys->basic.ip_proto = key_iph->nexthdr;
}
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
{
struct flow_keys hash_keys;
if (skb) {
ip6_multipath_l3_keys(skb, &hash_keys);
return flow_hash_from_keys(&hash_keys);
}
return get_hash_from_flowi6(fl6);
}
void ip6_route_input(struct sk_buff *skb) void ip6_route_input(struct sk_buff *skb)
{ {
const struct ipv6hdr *iph = ipv6_hdr(skb); const struct ipv6hdr *iph = ipv6_hdr(skb);
...@@ -1232,6 +1276,8 @@ void ip6_route_input(struct sk_buff *skb) ...@@ -1232,6 +1276,8 @@ void ip6_route_input(struct sk_buff *skb)
tun_info = skb_tunnel_info(skb); tun_info = skb_tunnel_info(skb);
if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
skb_dst_drop(skb); skb_dst_drop(skb);
skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment