Commit 6106253e authored by David S. Miller

Merge branch 'gudp'

Tom Herbert says:

====================
net: Generic UDP Encapsulation

Generic UDP Encapsulation (GUE) is a UDP encapsulation protocol which
encapsulates packets of various IP protocols. The GUE protocol is
described in http://tools.ietf.org/html/draft-herbert-gue-01.

The receive path of GUE is implemented in the Foo over UDP module (FOU).
This includes a UDP encap receive function for GUE as well as GUE
specific GRO functions. Management and configuration of GUE ports share
most of the same code with FOU.

For the transmit path, the previous FOU support for IPIP, sit, and GRE
was simply extended for GUE (when GUE is enabled insert the GUE
header on transmit in addition to UDP header inserted for FOU).

Semantically GUE is the same as FOU in that the encapsulation headers
(UDP and GUE) are inserted on transmission and removed on reception,
so that the IP packet is processed with the inner header.

This patch set includes:
 - Some fixes to FOU, removal of IPv4,v6 specific GRO functions
 - Support to configure a GUE receive port
 - Implementation of GUE receive path (normal and GRO)
 - Additions to ip_tunnel netlink to configure GUE
 - GUE header insertion in ip_tunnel transmit path

v2:
 - Include net/gue.h in patch set

Testing:

I ran performance numbers using netperf TCP_RR with 200 streams,
comparing encapsulation without GUE, encapsulation with GUE, and
encapsulation with FOU.

 GRE
    TCP_STREAM
      IPv4, FOU, UDP checksum enabled
        14.04% TX CPU utilization
        13.17% RX CPU utilization
        9211 Mbps
      IPv4, GUE, UDP checksum enabled
        14.99% TX CPU utilization
        13.79% RX CPU utilization
        9185 Mbps
      IPv4, FOU, UDP checksum disabled
        13.14% TX CPU utilization
        23.18% RX CPU utilization
        9277 Mbps
      IPv4, GUE, UDP checksum disabled
        13.66% TX CPU utilization
        23.57% RX CPU utilization
        9184 Mbps
    TCP_RR
      IPv4, FOU, UDP checksum enabled
        94.2% CPU utilization
        155/249/460 90/95/99% latencies
        1.17018e+06 tps
      IPv4, GUE, UDP checksum enabled
        93.9% CPU utilization
        158/253/472 90/95/99% latencies
        1.15045e+06 tps

  IPIP
    TCP_STREAM
      FOU, UDP checksum enabled
        15.28% TX CPU utilization
        13.92% RX CPU utilization
        9342 Mbps
      GUE, UDP checksum enabled
        13.99% TX CPU utilization
        13.34% RX CPU utilization
        9210 Mbps
      FOU, UDP checksum disabled
        15.08% TX CPU utilization
        24.64% RX CPU utilization
        9226 Mbps
      GUE, UDP checksum disabled
        15.90% TX CPU utilization
        24.77% RX CPU utilization
        9197 Mbps
    TCP_RR
      FOU, UDP checksum enabled
        94.23% CPU utilization
        149/237/429 90/95/99% latencies
        1.19553e+06 tps
      GUE, UDP checksum enabled
        93.75% CPU utilization
        152/243/442 90/95/99% latencies
        1.17027e+06 tps

  SIT
    TCP_STREAM
      FOU, UDP checksum enabled
        14.47% TX CPU utilization
        14.58% RX CPU utilization
        9106 Mbps
      GUE, UDP checksum enabled
        15.09% TX CPU utilization
        14.84% RX CPU utilization
        9080 Mbps
      FOU, UDP checksum disabled
        15.70% TX CPU utilization
        27.93% RX CPU utilization
        9097 Mbps
      GUE, UDP checksum disabled
        15.04% TX CPU utilization
        27.54% RX CPU utilization
        9073 Mbps
    TCP_RR
      FOU, UDP checksum enabled
        96.9% CPU utilization
        170/281/581 90/95/99% latencies
        1.03372e+06 tps
      GUE, UDP checksum enabled
        97.16% CPU utilization
        172/286/576 90/95/99% latencies
        1.00469e+06 tps
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 01291202 bc1fc390
...@@ -1886,6 +1886,9 @@ struct napi_gro_cb { ...@@ -1886,6 +1886,9 @@ struct napi_gro_cb {
/* Number of checksums via CHECKSUM_UNNECESSARY */ /* Number of checksums via CHECKSUM_UNNECESSARY */
u8 csum_cnt:3; u8 csum_cnt:3;
/* Used in foo-over-udp, set in udp[46]_gro_receive */
u8 is_ipv6:1;
/* used to support CHECKSUM_COMPLETE for tunneling protocols */ /* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum csum; __wsum csum;
......
#ifndef __NET_GUE_H
#define __NET_GUE_H

/*
 * Base Generic UDP Encapsulation (GUE) header, located directly after the
 * UDP header by the GUE receive and transmit paths.
 *
 * @hlen:     length of the data following this base header, counted in
 *            32-bit words (users compute the byte count as hlen << 2)
 * @version:  GUE version; the receive path only accepts 0
 * @next_hdr: IP protocol number of the encapsulated packet
 * @flags:    GUE flags; the receive path drops any packet with a non-zero
 *            value ("No support yet")
 * @word:     the whole base header viewed as one 32-bit value, used to
 *            compare two headers (hlen, version, next_hdr and flags) at once
 *
 * The two 4-bit fields share the first byte; their declaration order is
 * swapped between the bit-field endianness variants.
 */
struct guehdr {
	union {
		struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
			__u8	hlen:4;
			__u8	version:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
			__u8	version:4;
			__u8	hlen:4;
#else
#error "Please fix <asm/byteorder.h>"
#endif
			__u8	next_hdr;
			__u16	flags;
		};
		__u32	word;
	};
};

#endif /* __NET_GUE_H */
...@@ -13,6 +13,7 @@ enum { ...@@ -13,6 +13,7 @@ enum {
FOU_ATTR_PORT, /* u16 */ FOU_ATTR_PORT, /* u16 */
FOU_ATTR_AF, /* u8 */ FOU_ATTR_AF, /* u8 */
FOU_ATTR_IPPROTO, /* u8 */ FOU_ATTR_IPPROTO, /* u8 */
FOU_ATTR_TYPE, /* u8 */
__FOU_ATTR_MAX, __FOU_ATTR_MAX,
}; };
...@@ -27,6 +28,12 @@ enum { ...@@ -27,6 +28,12 @@ enum {
__FOU_CMD_MAX, __FOU_CMD_MAX,
}; };
/* Encapsulation type of a FOU port; the value carried in FOU_ATTR_TYPE. */
enum {
	FOU_ENCAP_UNSPEC = 0,	/* no encapsulation type selected */
	FOU_ENCAP_DIRECT = 1,	/* plain FOU port (fou_encap_init) */
	FOU_ENCAP_GUE    = 2,	/* GUE port (gue_encap_init) */
};
#define FOU_CMD_MAX (__FOU_CMD_MAX - 1) #define FOU_CMD_MAX (__FOU_CMD_MAX - 1)
#endif /* _UAPI_LINUX_FOU_H */ #endif /* _UAPI_LINUX_FOU_H */
...@@ -64,6 +64,7 @@ enum { ...@@ -64,6 +64,7 @@ enum {
enum tunnel_encap_types { enum tunnel_encap_types {
TUNNEL_ENCAP_NONE, TUNNEL_ENCAP_NONE,
TUNNEL_ENCAP_FOU, TUNNEL_ENCAP_FOU,
TUNNEL_ENCAP_GUE,
}; };
#define TUNNEL_ENCAP_FLAG_CSUM (1<<0) #define TUNNEL_ENCAP_FLAG_CSUM (1<<0)
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <net/genetlink.h> #include <net/genetlink.h>
#include <net/gue.h>
#include <net/ip.h> #include <net/ip.h>
#include <net/protocol.h> #include <net/protocol.h>
#include <net/udp.h> #include <net/udp.h>
...@@ -27,6 +28,7 @@ struct fou { ...@@ -27,6 +28,7 @@ struct fou {
}; };
struct fou_cfg { struct fou_cfg {
u16 type;
u8 protocol; u8 protocol;
struct udp_port_cfg udp_config; struct udp_port_cfg udp_config;
}; };
...@@ -64,15 +66,51 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) ...@@ -64,15 +66,51 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
sizeof(struct udphdr)); sizeof(struct udphdr));
} }
/*
 * gue_udp_recv - UDP encapsulation receive handler for a GUE port.
 * @sk:  UDP socket the packet arrived on
 * @skb: received packet; the UDP header is at udp_hdr(skb)
 *
 * Validates the GUE header that follows the UDP header and hands the
 * packet on for processing as the protocol named by guehdr->next_hdr.
 *
 * Return: 1 if the socket has no fou state attached (the skb is left
 * untouched); 0 after freeing the skb when the packet is too short, has a
 * non-zero GUE version, or carries any GUE flags; otherwise whatever
 * fou_udp_encap_recv_deliver() returns.
 */
static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
{
	struct fou *fou = fou_from_sock(sk);
	size_t len;
	struct guehdr *guehdr;
	struct udphdr *uh;

	if (!fou)
		return 1;

	/* Make sure the UDP header and the base GUE header are readable. */
	len = sizeof(struct udphdr) + sizeof(struct guehdr);
	if (!pskb_may_pull(skb, len))
		goto drop;

	uh = udp_hdr(skb);
	guehdr = (struct guehdr *)&uh[1];

	/* hlen counts the optional data after the base header in 32-bit words. */
	len += guehdr->hlen << 2;
	if (!pskb_may_pull(skb, len))
		goto drop;

	/*
	 * pskb_may_pull() may have reallocated the skb head, so the header
	 * pointers obtained before the call can be stale. Look them up again
	 * before dereferencing guehdr.
	 */
	uh = udp_hdr(skb);
	guehdr = (struct guehdr *)&uh[1];

	if (guehdr->version != 0)
		goto drop;

	if (guehdr->flags) {
		/* No support yet */
		goto drop;
	}

	return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len);
drop:
	kfree_skb(skb);
	return 0;
}
static struct sk_buff **fou_gro_receive(struct sk_buff **head, static struct sk_buff **fou_gro_receive(struct sk_buff **head,
struct sk_buff *skb, struct sk_buff *skb)
const struct net_offload **offloads)
{ {
const struct net_offload *ops; const struct net_offload *ops;
struct sk_buff **pp = NULL; struct sk_buff **pp = NULL;
u8 proto = NAPI_GRO_CB(skb)->proto; u8 proto = NAPI_GRO_CB(skb)->proto;
const struct net_offload **offloads;
rcu_read_lock(); rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]); ops = rcu_dereference(offloads[proto]);
if (!ops || !ops->callbacks.gro_receive) if (!ops || !ops->callbacks.gro_receive)
goto out_unlock; goto out_unlock;
...@@ -85,14 +123,15 @@ static struct sk_buff **fou_gro_receive(struct sk_buff **head, ...@@ -85,14 +123,15 @@ static struct sk_buff **fou_gro_receive(struct sk_buff **head,
return pp; return pp;
} }
static int fou_gro_complete(struct sk_buff *skb, int nhoff, static int fou_gro_complete(struct sk_buff *skb, int nhoff)
const struct net_offload **offloads)
{ {
const struct net_offload *ops; const struct net_offload *ops;
u8 proto = NAPI_GRO_CB(skb)->proto; u8 proto = NAPI_GRO_CB(skb)->proto;
int err = -ENOSYS; int err = -ENOSYS;
const struct net_offload **offloads;
rcu_read_lock(); rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]); ops = rcu_dereference(offloads[proto]);
if (WARN_ON(!ops || !ops->callbacks.gro_complete)) if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out_unlock; goto out_unlock;
...@@ -105,26 +144,110 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff, ...@@ -105,26 +144,110 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff,
return err; return err;
} }
static struct sk_buff **fou4_gro_receive(struct sk_buff **head, static struct sk_buff **gue_gro_receive(struct sk_buff **head,
struct sk_buff *skb) struct sk_buff *skb)
{ {
return fou_gro_receive(head, skb, inet_offloads); const struct net_offload **offloads;
} const struct net_offload *ops;
struct sk_buff **pp = NULL;
struct sk_buff *p;
u8 proto;
struct guehdr *guehdr;
unsigned int hlen, guehlen;
unsigned int off;
int flush = 1;
off = skb_gro_offset(skb);
hlen = off + sizeof(*guehdr);
guehdr = skb_gro_header_fast(skb, off);
if (skb_gro_header_hard(skb, hlen)) {
guehdr = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!guehdr))
goto out;
}
static int fou4_gro_complete(struct sk_buff *skb, int nhoff) proto = guehdr->next_hdr;
{
return fou_gro_complete(skb, nhoff, inet_offloads);
}
static struct sk_buff **fou6_gro_receive(struct sk_buff **head, rcu_read_lock();
struct sk_buff *skb) offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
{ ops = rcu_dereference(offloads[proto]);
return fou_gro_receive(head, skb, inet6_offloads); if (WARN_ON(!ops || !ops->callbacks.gro_receive))
goto out_unlock;
guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
hlen = off + guehlen;
if (skb_gro_header_hard(skb, hlen)) {
guehdr = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!guehdr))
goto out_unlock;
}
flush = 0;
for (p = *head; p; p = p->next) {
const struct guehdr *guehdr2;
if (!NAPI_GRO_CB(p)->same_flow)
continue;
guehdr2 = (struct guehdr *)(p->data + off);
/* Compare base GUE header to be equal (covers
* hlen, version, next_hdr, and flags.
*/
if (guehdr->word != guehdr2->word) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
/* Compare optional fields are the same. */
if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1],
guehdr->hlen << 2)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
}
skb_gro_pull(skb, guehlen);
/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
skb_gro_postpull_rcsum(skb, guehdr, guehlen);
pp = ops->callbacks.gro_receive(head, skb);
out_unlock:
rcu_read_unlock();
out:
NAPI_GRO_CB(skb)->flush |= flush;
return pp;
} }
static int fou6_gro_complete(struct sk_buff *skb, int nhoff) static int gue_gro_complete(struct sk_buff *skb, int nhoff)
{ {
return fou_gro_complete(skb, nhoff, inet6_offloads); const struct net_offload **offloads;
struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
const struct net_offload *ops;
unsigned int guehlen;
u8 proto;
int err = -ENOENT;
proto = guehdr->next_hdr;
guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out_unlock;
err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
out_unlock:
rcu_read_unlock();
return err;
} }
static int fou_add_to_port_list(struct fou *fou) static int fou_add_to_port_list(struct fou *fou)
...@@ -162,6 +285,28 @@ static void fou_release(struct fou *fou) ...@@ -162,6 +285,28 @@ static void fou_release(struct fou *fou)
kfree(fou); kfree(fou);
} }
static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
{
udp_sk(sk)->encap_rcv = fou_udp_recv;
fou->protocol = cfg->protocol;
fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
fou->udp_offloads.port = cfg->udp_config.local_udp_port;
fou->udp_offloads.ipproto = cfg->protocol;
return 0;
}
/*
 * gue_encap_init - configure a socket and its fou state for GUE.
 * @sk:  UDP socket that will receive the encapsulated packets
 * @fou: per-port state to fill in
 * @cfg: requested configuration (only the local UDP port is used here)
 *
 * Installs gue_udp_recv as the socket's encap receive handler and fills in
 * the GUE GRO offload callbacks and port. Unlike fou_encap_init(), no inner
 * protocol is stored: the GUE handlers read it from each packet's
 * guehdr->next_hdr. Always returns 0.
 */
static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
{
	fou->udp_offloads.port = cfg->udp_config.local_udp_port;

	fou->udp_offloads.callbacks.gro_complete = gue_gro_complete;
	fou->udp_offloads.callbacks.gro_receive = gue_gro_receive;

	udp_sk(sk)->encap_rcv = gue_udp_recv;

	return 0;
}
static int fou_create(struct net *net, struct fou_cfg *cfg, static int fou_create(struct net *net, struct fou_cfg *cfg,
struct socket **sockp) struct socket **sockp)
{ {
...@@ -184,10 +329,24 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, ...@@ -184,10 +329,24 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
sk = sock->sk; sk = sock->sk;
/* Mark socket as an encapsulation socket. See net/ipv4/udp.c */
fou->protocol = cfg->protocol;
fou->port = cfg->udp_config.local_udp_port; fou->port = cfg->udp_config.local_udp_port;
udp_sk(sk)->encap_rcv = fou_udp_recv;
/* Initial for fou type */
switch (cfg->type) {
case FOU_ENCAP_DIRECT:
err = fou_encap_init(sk, fou, cfg);
if (err)
goto error;
break;
case FOU_ENCAP_GUE:
err = gue_encap_init(sk, fou, cfg);
if (err)
goto error;
break;
default:
err = -EINVAL;
goto error;
}
udp_sk(sk)->encap_type = 1; udp_sk(sk)->encap_type = 1;
udp_encap_enable(); udp_encap_enable();
...@@ -199,23 +358,6 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, ...@@ -199,23 +358,6 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
sk->sk_allocation = GFP_ATOMIC; sk->sk_allocation = GFP_ATOMIC;
switch (cfg->udp_config.family) {
case AF_INET:
fou->udp_offloads.callbacks.gro_receive = fou4_gro_receive;
fou->udp_offloads.callbacks.gro_complete = fou4_gro_complete;
break;
case AF_INET6:
fou->udp_offloads.callbacks.gro_receive = fou6_gro_receive;
fou->udp_offloads.callbacks.gro_complete = fou6_gro_complete;
break;
default:
err = -EPFNOSUPPORT;
goto error;
}
fou->udp_offloads.port = cfg->udp_config.local_udp_port;
fou->udp_offloads.ipproto = cfg->protocol;
if (cfg->udp_config.family == AF_INET) { if (cfg->udp_config.family == AF_INET) {
err = udp_add_offload(&fou->udp_offloads); err = udp_add_offload(&fou->udp_offloads);
if (err) if (err)
...@@ -272,6 +414,7 @@ static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { ...@@ -272,6 +414,7 @@ static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
[FOU_ATTR_PORT] = { .type = NLA_U16, }, [FOU_ATTR_PORT] = { .type = NLA_U16, },
[FOU_ATTR_AF] = { .type = NLA_U8, }, [FOU_ATTR_AF] = { .type = NLA_U8, },
[FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, [FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
[FOU_ATTR_TYPE] = { .type = NLA_U8, },
}; };
static int parse_nl_config(struct genl_info *info, static int parse_nl_config(struct genl_info *info,
...@@ -299,6 +442,9 @@ static int parse_nl_config(struct genl_info *info, ...@@ -299,6 +442,9 @@ static int parse_nl_config(struct genl_info *info,
if (info->attrs[FOU_ATTR_IPPROTO]) if (info->attrs[FOU_ATTR_IPPROTO])
cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]);
if (info->attrs[FOU_ATTR_TYPE])
cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]);
return 0; return 0;
} }
......
...@@ -56,6 +56,7 @@ ...@@ -56,6 +56,7 @@
#include <net/netns/generic.h> #include <net/netns/generic.h>
#include <net/rtnetlink.h> #include <net/rtnetlink.h>
#include <net/udp.h> #include <net/udp.h>
#include <net/gue.h>
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h> #include <net/ipv6.h>
...@@ -495,6 +496,8 @@ static int ip_encap_hlen(struct ip_tunnel_encap *e) ...@@ -495,6 +496,8 @@ static int ip_encap_hlen(struct ip_tunnel_encap *e)
return 0; return 0;
case TUNNEL_ENCAP_FOU: case TUNNEL_ENCAP_FOU:
return sizeof(struct udphdr); return sizeof(struct udphdr);
case TUNNEL_ENCAP_GUE:
return sizeof(struct udphdr) + sizeof(struct guehdr);
default: default:
return -EINVAL; return -EINVAL;
} }
...@@ -546,6 +549,15 @@ static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, ...@@ -546,6 +549,15 @@ static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
skb_reset_transport_header(skb); skb_reset_transport_header(skb);
uh = udp_hdr(skb); uh = udp_hdr(skb);
if (e->type == TUNNEL_ENCAP_GUE) {
struct guehdr *guehdr = (struct guehdr *)&uh[1];
guehdr->version = 0;
guehdr->hlen = 0;
guehdr->flags = 0;
guehdr->next_hdr = *protocol;
}
uh->dest = e->dport; uh->dest = e->dport;
uh->source = sport; uh->source = sport;
uh->len = htons(skb->len); uh->len = htons(skb->len);
...@@ -565,6 +577,7 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, ...@@ -565,6 +577,7 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
case TUNNEL_ENCAP_NONE: case TUNNEL_ENCAP_NONE:
return 0; return 0;
case TUNNEL_ENCAP_FOU: case TUNNEL_ENCAP_FOU:
case TUNNEL_ENCAP_GUE:
return fou_build_header(skb, &t->encap, t->encap_hlen, return fou_build_header(skb, &t->encap, t->encap_hlen,
protocol, fl4); protocol, fl4);
default: default:
...@@ -759,7 +772,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ...@@ -759,7 +772,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
df |= (inner_iph->frag_off&htons(IP_DF)); df |= (inner_iph->frag_off&htons(IP_DF));
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ rt->dst.header_len; + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
if (max_headroom > dev->needed_headroom) if (max_headroom > dev->needed_headroom)
dev->needed_headroom = max_headroom; dev->needed_headroom = max_headroom;
......
...@@ -334,6 +334,7 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head, ...@@ -334,6 +334,7 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
inet_gro_compute_pseudo); inet_gro_compute_pseudo);
skip: skip:
NAPI_GRO_CB(skb)->is_ipv6 = 0;
return udp_gro_receive(head, skb, uh); return udp_gro_receive(head, skb, uh);
flush: flush:
......
...@@ -140,6 +140,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head, ...@@ -140,6 +140,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
ip6_gro_compute_pseudo); ip6_gro_compute_pseudo);
skip: skip:
NAPI_GRO_CB(skb)->is_ipv6 = 1;
return udp_gro_receive(head, skb, uh); return udp_gro_receive(head, skb, uh);
flush: flush:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment