Commit b5947e5d authored by Willem de Bruijn, committed by David S. Miller

udp: msg_zerocopy

Extend zerocopy to udp sockets. Allow setting sockopt SO_ZEROCOPY and
interpret flag MSG_ZEROCOPY.

This patch was previously part of the zerocopy RFC patchsets. Zerocopy
is not effective at small MTU. With segmentation offload building
larger datagrams, the benefit of page flipping outweighs the cost of
generating a completion notification.

Results of tools/testing/selftests/net/msg_zerocopy.sh after applying the
follow-on test patch and making skb_orphan_frags_rx behave the same as
skb_orphan_frags:

    ipv4 udp -t 1
    tx=191312 (11938 MB) txc=0 zc=n
    rx=191312 (11938 MB)
    ipv4 udp -z -t 1
    tx=304507 (19002 MB) txc=304507 zc=y
    rx=304507 (19002 MB)
    ok
    ipv6 udp -t 1
    tx=174485 (10888 MB) txc=0 zc=n
    rx=174485 (10888 MB)
    ipv6 udp -z -t 1
    tx=294801 (18396 MB) txc=294801 zc=y
    rx=294801 (18396 MB)
    ok

Changes
  v1 -> v2
    - Fixup reverse christmas tree violation
  v2 -> v3
    - Split refcount avoidance optimization into separate patch
      - Fix refcount leak on error in fragmented case
        (thanks to Paolo Abeni for pointing this one out!)
      - Fix refcount inc on zero
      - Test sock_flag SOCK_ZEROCOPY directly in __ip_append_data.
        This is needed since commit 5cf4a853 ("tcp: really ignore
	MSG_ZEROCOPY if no SO_ZEROCOPY") did the same for tcp.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent ce01a56b
...@@ -485,6 +485,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg); ...@@ -485,6 +485,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg);
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len, struct msghdr *msg, int len,
struct ubuf_info *uarg); struct ubuf_info *uarg);
......
...@@ -1105,6 +1105,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); ...@@ -1105,6 +1105,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length); struct iov_iter *from, size_t length);
/*
 * skb_zerocopy_iter_dgram - hand a message's iterator to __zerocopy_sg_from_iter()
 * @skb: buffer passed through to the helper; its ->sk is used as the socket
 * @msg: message whose msg_iter is passed as the source iterator
 * @len: length argument forwarded to the helper
 *
 * Returns whatever __zerocopy_sg_from_iter() returns, unmodified.
 */
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
{
	struct iov_iter *iter = &msg->msg_iter;
	struct sock *owner = skb->sk;

	return __zerocopy_sg_from_iter(owner, skb, iter, len);
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len, struct msghdr *msg, int len,
struct ubuf_info *uarg) struct ubuf_info *uarg)
......
...@@ -1018,7 +1018,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname, ...@@ -1018,7 +1018,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
case SO_ZEROCOPY: case SO_ZEROCOPY:
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
if (sk->sk_protocol != IPPROTO_TCP) if (!((sk->sk_type == SOCK_STREAM &&
sk->sk_protocol == IPPROTO_TCP) ||
(sk->sk_type == SOCK_DGRAM &&
sk->sk_protocol == IPPROTO_UDP)))
ret = -ENOTSUPP; ret = -ENOTSUPP;
} else if (sk->sk_family != PF_RDS) { } else if (sk->sk_family != PF_RDS) {
ret = -ENOTSUPP; ret = -ENOTSUPP;
......
...@@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk, ...@@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int flags) unsigned int flags)
{ {
struct inet_sock *inet = inet_sk(sk); struct inet_sock *inet = inet_sk(sk);
struct ubuf_info *uarg = NULL;
struct sk_buff *skb; struct sk_buff *skb;
struct ip_options *opt = cork->opt; struct ip_options *opt = cork->opt;
...@@ -916,6 +917,19 @@ static int __ip_append_data(struct sock *sk, ...@@ -916,6 +917,19 @@ static int __ip_append_data(struct sock *sk,
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
csummode = CHECKSUM_PARTIAL; csummode = CHECKSUM_PARTIAL;
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
if (!uarg)
return -ENOBUFS;
if (rt->dst.dev->features & NETIF_F_SG &&
csummode == CHECKSUM_PARTIAL) {
paged = true;
} else {
uarg->zerocopy = 0;
skb_zcopy_set(skb, uarg);
}
}
cork->length += length; cork->length += length;
/* So, what's going on in the loop below? /* So, what's going on in the loop below?
...@@ -1006,6 +1020,7 @@ static int __ip_append_data(struct sock *sk, ...@@ -1006,6 +1020,7 @@ static int __ip_append_data(struct sock *sk,
cork->tx_flags = 0; cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey; skb_shinfo(skb)->tskey = tskey;
tskey = 0; tskey = 0;
skb_zcopy_set(skb, uarg);
/* /*
* Find where to start putting bytes. * Find where to start putting bytes.
...@@ -1068,7 +1083,7 @@ static int __ip_append_data(struct sock *sk, ...@@ -1068,7 +1083,7 @@ static int __ip_append_data(struct sock *sk,
err = -EFAULT; err = -EFAULT;
goto error; goto error;
} }
} else { } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags; int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM; err = -ENOMEM;
...@@ -1098,6 +1113,10 @@ static int __ip_append_data(struct sock *sk, ...@@ -1098,6 +1113,10 @@ static int __ip_append_data(struct sock *sk,
skb->data_len += copy; skb->data_len += copy;
skb->truesize += copy; skb->truesize += copy;
wmem_alloc_delta += copy; wmem_alloc_delta += copy;
} else {
err = skb_zerocopy_iter_dgram(skb, from, copy);
if (err < 0)
goto error;
} }
offset += copy; offset += copy;
length -= copy; length -= copy;
...@@ -1105,11 +1124,13 @@ static int __ip_append_data(struct sock *sk, ...@@ -1105,11 +1124,13 @@ static int __ip_append_data(struct sock *sk,
if (wmem_alloc_delta) if (wmem_alloc_delta)
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
sock_zerocopy_put(uarg);
return 0; return 0;
error_efault: error_efault:
err = -EFAULT; err = -EFAULT;
error: error:
sock_zerocopy_put_abort(uarg);
cork->length -= length; cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
......
...@@ -1245,6 +1245,7 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1245,6 +1245,7 @@ static int __ip6_append_data(struct sock *sk,
{ {
struct sk_buff *skb, *skb_prev = NULL; struct sk_buff *skb, *skb_prev = NULL;
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
struct ubuf_info *uarg = NULL;
int exthdrlen = 0; int exthdrlen = 0;
int dst_exthdrlen = 0; int dst_exthdrlen = 0;
int hh_len; int hh_len;
...@@ -1322,6 +1323,19 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1322,6 +1323,19 @@ static int __ip6_append_data(struct sock *sk,
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL; csummode = CHECKSUM_PARTIAL;
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
if (!uarg)
return -ENOBUFS;
if (rt->dst.dev->features & NETIF_F_SG &&
csummode == CHECKSUM_PARTIAL) {
paged = true;
} else {
uarg->zerocopy = 0;
skb_zcopy_set(skb, uarg);
}
}
/* /*
* Let's try using as much space as possible. * Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU. * Use MTU if total length of the message fits into the MTU.
...@@ -1445,6 +1459,7 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1445,6 +1459,7 @@ static int __ip6_append_data(struct sock *sk,
cork->tx_flags = 0; cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey; skb_shinfo(skb)->tskey = tskey;
tskey = 0; tskey = 0;
skb_zcopy_set(skb, uarg);
/* /*
* Find where to start putting bytes * Find where to start putting bytes
...@@ -1506,7 +1521,7 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1506,7 +1521,7 @@ static int __ip6_append_data(struct sock *sk,
err = -EFAULT; err = -EFAULT;
goto error; goto error;
} }
} else { } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags; int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM; err = -ENOMEM;
...@@ -1536,6 +1551,10 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1536,6 +1551,10 @@ static int __ip6_append_data(struct sock *sk,
skb->data_len += copy; skb->data_len += copy;
skb->truesize += copy; skb->truesize += copy;
wmem_alloc_delta += copy; wmem_alloc_delta += copy;
} else {
err = skb_zerocopy_iter_dgram(skb, from, copy);
if (err < 0)
goto error;
} }
offset += copy; offset += copy;
length -= copy; length -= copy;
...@@ -1543,11 +1562,13 @@ static int __ip6_append_data(struct sock *sk, ...@@ -1543,11 +1562,13 @@ static int __ip6_append_data(struct sock *sk,
if (wmem_alloc_delta) if (wmem_alloc_delta)
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
sock_zerocopy_put(uarg);
return 0; return 0;
error_efault: error_efault:
err = -EFAULT; err = -EFAULT;
error: error:
sock_zerocopy_put_abort(uarg);
cork->length -= length; cork->length -= length;
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment