Commit ad69a730 authored by David S. Miller's avatar David S. Miller

Merge branch 'gro-fixes'

Antoine Tenart says:

====================
gro: various fixes related to UDP tunnels

We found issues when a UDP tunnel endpoint is in a different netns than
where UDP GRO happens. This kind of setup is actually quite diverse,
from having one leg of the tunnel on a remove host, to having a tunnel
between netns (eg. being bridged in another one or on the host). In our
case that UDP tunnel was geneve.

UDP tunnel packets should not be GROed at the UDP level. The fundamental
issue here is such packet can't be detected in a foolproof way: we can't
know by looking at a packet alone and the current logic of looking up
UDP sockets is fragile (socket could be in another netns, packet could
be modified in between, etc). Because there is no way to make the GRO
code to correctly handle those packets in all cases, this series aims at
two things: making the net stack to correctly behave (as in, no crash
and no invalid packet) when such thing happens, and in some cases to
prevent this "early GRO" from happening.

First three patches fix issues when an "UDP tunneled" packet is being
GROed too early by rx-udp-gro-forwarding or rx-gro-list.

Last patch is preventing locally generated UDP tunnel packets from being
GROed. This turns out to be more complex than this patch alone as it
relies on skb->encapsulation which is currently untrusty in some cases
(see iptunnel_handle_offloads); but that should fix things in practice
and is acceptable for a fix. Future work is required to improve things
(prevent all locally generated UDP tunnel packets from being GROed),
such as fixing the misuse of skb->encapsulation in drivers; but that
would be net-next material.

Thanks!
Antoine

Since v3:
  - Fixed the udpgro_fwd selftest in patch 5 (Jakub Kicinski feedback).
  - Improved commit message on patch 3 (Willem de Bruijn feeback).

Since v2:
  - Fixed a build issue with IPv6=m in patch 1 (Jakub Kicinski
    feedback).
  - Fixed typo in patch 1 (Nikolay Aleksandrov feedback).
  - Added Reviewed-by tag on patch 2 (Willem de Bruijn feeback).
  - Added back conversion to CHECKSUM_UNNECESSARY but only from non
    CHECKSUM_PARTIAL in patch 3 (Paolo Abeni & Willem de Bruijn
    feeback).
  - Reworded patch 3 commit msg.

Since v1:
  - Fixed a build issue with IPv6 disabled in patch 1.
  - Reworked commit log in patch 2 (Willem de Bruijn feedback).
  - Added Reviewed-by tags on patches 1 & 4 (Willem de Bruijn feeback).
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 10e52ad5 0fb101be
...@@ -150,6 +150,24 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, ...@@ -150,6 +150,24 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
} }
} }
DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
#if IS_ENABLED(CONFIG_IPV6)
DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
#endif
static inline bool udp_encap_needed(void)
{
if (static_branch_unlikely(&udp_encap_needed_key))
return true;
#if IS_ENABLED(CONFIG_IPV6)
if (static_branch_unlikely(&udpv6_encap_needed_key))
return true;
#endif
return false;
}
static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
{ {
if (!skb_is_gso(skb)) if (!skb_is_gso(skb))
...@@ -163,6 +181,16 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) ...@@ -163,6 +181,16 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
!udp_test_bit(ACCEPT_FRAGLIST, sk)) !udp_test_bit(ACCEPT_FRAGLIST, sk))
return true; return true;
/* GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits might still
* land in a tunnel as the socket check in udp_gro_receive cannot be
* foolproof.
*/
if (udp_encap_needed() &&
READ_ONCE(udp_sk(sk)->encap_rcv) &&
!(skb_shinfo(skb)->gso_type &
(SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM)))
return true;
return false; return false;
} }
......
...@@ -192,8 +192,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) ...@@ -192,8 +192,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
} }
merge: merge:
/* sk owenrship - if any - completely transferred to the aggregated packet */ /* sk ownership - if any - completely transferred to the aggregated packet */
skb->destructor = NULL; skb->destructor = NULL;
skb->sk = NULL;
delta_truesize = skb->truesize; delta_truesize = skb->truesize;
if (offset > headlen) { if (offset > headlen) {
unsigned int eat = offset - headlen; unsigned int eat = offset - headlen;
......
...@@ -582,6 +582,13 @@ static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, ...@@ -582,6 +582,13 @@ static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk,
} }
DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
EXPORT_SYMBOL(udp_encap_needed_key);
#if IS_ENABLED(CONFIG_IPV6)
DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
EXPORT_SYMBOL(udpv6_encap_needed_key);
#endif
void udp_encap_enable(void) void udp_encap_enable(void)
{ {
static_branch_inc(&udp_encap_needed_key); static_branch_inc(&udp_encap_needed_key);
......
...@@ -449,8 +449,9 @@ static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) ...@@ -449,8 +449,9 @@ static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
NAPI_GRO_CB(p)->count++; NAPI_GRO_CB(p)->count++;
p->data_len += skb->len; p->data_len += skb->len;
/* sk owenrship - if any - completely transferred to the aggregated packet */ /* sk ownership - if any - completely transferred to the aggregated packet */
skb->destructor = NULL; skb->destructor = NULL;
skb->sk = NULL;
p->truesize += skb->truesize; p->truesize += skb->truesize;
p->len += skb->len; p->len += skb->len;
...@@ -551,11 +552,19 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, ...@@ -551,11 +552,19 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
unsigned int off = skb_gro_offset(skb); unsigned int off = skb_gro_offset(skb);
int flush = 1; int flush = 1;
/* we can do L4 aggregation only if the packet can't land in a tunnel /* We can do L4 aggregation only if the packet can't land in a tunnel
* otherwise we could corrupt the inner stream * otherwise we could corrupt the inner stream. Detecting such packets
* cannot be foolproof and the aggregation might still happen in some
* cases. Such packets should be caught in udp_unexpected_gso later.
*/ */
NAPI_GRO_CB(skb)->is_flist = 0; NAPI_GRO_CB(skb)->is_flist = 0;
if (!sk || !udp_sk(sk)->gro_receive) { if (!sk || !udp_sk(sk)->gro_receive) {
/* If the packet was locally encapsulated in a UDP tunnel that
* wasn't detected above, do not GRO.
*/
if (skb->encapsulation)
goto out;
if (skb->dev->features & NETIF_F_GRO_FRAGLIST) if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1; NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1;
...@@ -719,13 +728,7 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) ...@@ -719,13 +728,7 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
if (skb->ip_summed == CHECKSUM_UNNECESSARY) { __skb_incr_checksum_unnecessary(skb);
if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
skb->csum_level++;
} else {
skb->ip_summed = CHECKSUM_UNNECESSARY;
skb->csum_level = 0;
}
return 0; return 0;
} }
......
...@@ -447,7 +447,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, ...@@ -447,7 +447,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto try_again; goto try_again;
} }
DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
void udpv6_encap_enable(void) void udpv6_encap_enable(void)
{ {
static_branch_inc(&udpv6_encap_needed_key); static_branch_inc(&udpv6_encap_needed_key);
......
...@@ -174,13 +174,7 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) ...@@ -174,13 +174,7 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
if (skb->ip_summed == CHECKSUM_UNNECESSARY) { __skb_incr_checksum_unnecessary(skb);
if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
skb->csum_level++;
} else {
skb->ip_summed = CHECKSUM_UNNECESSARY;
skb->csum_level = 0;
}
return 0; return 0;
} }
......
...@@ -244,7 +244,7 @@ for family in 4 6; do ...@@ -244,7 +244,7 @@ for family in 4 6; do
create_vxlan_pair create_vxlan_pair
ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on ip netns exec $NS_DST ethtool -K veth$DST generic-receive-offload on
ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on
run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1 run_test "GRO frag list over UDP tunnel" $OL_NET$DST 10 10
cleanup cleanup
# use NAT to circumvent GRO FWD check # use NAT to circumvent GRO FWD check
...@@ -258,13 +258,7 @@ for family in 4 6; do ...@@ -258,13 +258,7 @@ for family in 4 6; do
# load arp cache before running the test to reduce the amount of # load arp cache before running the test to reduce the amount of
# stray traffic on top of the UDP tunnel # stray traffic on top of the UDP tunnel
ip netns exec $NS_SRC $PING -q -c 1 $OL_NET$DST_NAT >/dev/null ip netns exec $NS_SRC $PING -q -c 1 $OL_NET$DST_NAT >/dev/null
run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 10 10 $OL_NET$DST
cleanup
create_vxlan_pair
run_bench "UDP tunnel fwd perf" $OL_NET$DST
ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST
cleanup cleanup
done done
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment