Commit 32e09298 authored by Jens Axboe's avatar Jens Axboe

Merge branch 'io_uring-zerocopy-send' of...

Merge branch 'io_uring-zerocopy-send' of git://git.kernel.org/pub/scm/linux/kernel/git/kuba/linux into for-5.20/io_uring-zerocopy-send

Merge prep net series for io_uring tx zc from the Jakub's tree.

* 'io_uring-zerocopy-send' of git://git.kernel.org/pub/scm/linux/kernel/git/kuba/linux:
  net: fix uninitialised msghdr->sg_from_iter
  tcp: support externally provided ubufs
  ipv6/udp: support externally provided ubufs
  ipv4/udp: support externally provided ubufs
  net: introduce __skb_fill_page_desc_noacc
  net: introduce managed frags infrastructure
  net: Allow custom iter handler in msghdr
  skbuff: carry external ubuf_info in msghdr
  skbuff: add SKBFL_DONT_ORPHAN flag
  skbuff: don't mix ubuf_info from different sources
  ipv6: avoid partial copy for zc
  ipv4: avoid partial copy for zc
parents 32346491 2829a267
......@@ -686,10 +686,18 @@ enum {
* charged to the kernel memory.
*/
SKBFL_PURE_ZEROCOPY = BIT(2),
SKBFL_DONT_ORPHAN = BIT(3),
/* page references are managed by the ubuf_info, so it's safe to
* use frags only up until ubuf_info is released
*/
SKBFL_MANAGED_FRAG_REFS = BIT(4),
};
#define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)
/*
* The callback notifies userspace to release buffers when skb DMA is done in
......@@ -1773,13 +1781,14 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
bool success);
int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length);
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, struct iov_iter *from,
size_t length);
static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
struct msghdr *msg, int len)
{
return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
}
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
......@@ -1806,6 +1815,11 @@ static inline bool skb_zcopy_pure(const struct sk_buff *skb)
return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
}
static inline bool skb_zcopy_managed(const struct sk_buff *skb)
{
return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS;
}
static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
const struct sk_buff *skb2)
{
......@@ -1880,6 +1894,14 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
}
}
void __skb_zcopy_downgrade_managed(struct sk_buff *skb);
static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
if (unlikely(skb_zcopy_managed(skb)))
__skb_zcopy_downgrade_managed(skb);
}
static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
skb->next = NULL;
......@@ -2528,6 +2550,22 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
return skb_headlen(skb) + __skb_pagelen(skb);
}
static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
int i, struct page *page,
int off, int size)
{
skb_frag_t *frag = &shinfo->frags[i];
/*
* Propagate page pfmemalloc to the skb if we can. The problem is
* that not all callers have unique ownership of the page but rely
* on page_is_pfmemalloc doing the right thing(tm).
*/
frag->bv_page = page;
frag->bv_offset = off;
skb_frag_size_set(frag, size);
}
/**
* __skb_fill_page_desc - initialise a paged fragment in an skb
* @skb: buffer containing fragment to be initialised
......@@ -2544,17 +2582,7 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
struct page *page, int off, int size)
{
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
/*
* Propagate page pfmemalloc to the skb if we can. The problem is
* that not all callers have unique ownership of the page but rely
* on page_is_pfmemalloc doing the right thing(tm).
*/
frag->bv_page = page;
frag->bv_offset = off;
skb_frag_size_set(frag, size);
__skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
page = compound_head(page);
if (page_is_pfmemalloc(page))
skb->pfmemalloc = true;
......@@ -3182,8 +3210,7 @@ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
if (likely(!skb_zcopy(skb)))
return 0;
if (!skb_zcopy_is_nouarg(skb) &&
skb_uarg(skb)->callback == msg_zerocopy_callback)
if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN)
return 0;
return skb_copy_ubufs(skb, gfp_mask);
}
......@@ -3496,7 +3523,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
*/
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
__skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
struct skb_shared_info *shinfo = skb_shinfo(skb);
if (!skb_zcopy_managed(skb))
__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
}
/**
......
......@@ -14,6 +14,8 @@ struct file;
struct pid;
struct cred;
struct socket;
struct sock;
struct sk_buff;
#define __sockaddr_check_size(size) \
BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
......@@ -69,6 +71,9 @@ struct msghdr {
unsigned int msg_flags; /* flags on received message */
__kernel_size_t msg_controllen; /* ancillary data buffer length */
struct kiocb *msg_iocb; /* ptr to iocb for async requests */
struct ubuf_info *msg_ubuf;
int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length);
};
struct user_msghdr {
......
......@@ -80,6 +80,7 @@ int __get_compat_msghdr(struct msghdr *kmsg,
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
kmsg->msg_ubuf = NULL;
*ptr = msg.msg_iov;
*len = msg.msg_iovlen;
return 0;
......
......@@ -613,10 +613,16 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length)
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, struct iov_iter *from,
size_t length)
{
int frag = skb_shinfo(skb)->nr_frags;
int frag;
if (msg && msg->msg_ubuf && msg->sg_from_iter)
return msg->sg_from_iter(sk, skb, from, length);
frag = skb_shinfo(skb)->nr_frags;
while (length && iov_iter_count(from)) {
struct page *pages[MAX_SKB_FRAGS];
......@@ -702,7 +708,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
return -EFAULT;
return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
......
......@@ -666,11 +666,18 @@ static void skb_release_data(struct sk_buff *skb)
&shinfo->dataref))
goto exit;
skb_zcopy_clear(skb, true);
if (skb_zcopy(skb)) {
bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
skb_zcopy_clear(skb, true);
if (skip_unref)
goto free_head;
}
for (i = 0; i < shinfo->nr_frags; i++)
__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
free_head:
if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
......@@ -895,7 +902,10 @@ EXPORT_SYMBOL(skb_dump);
*/
void skb_tx_error(struct sk_buff *skb)
{
skb_zcopy_clear(skb, true);
if (skb) {
skb_zcopy_downgrade_managed(skb);
skb_zcopy_clear(skb, true);
}
}
EXPORT_SYMBOL(skb_tx_error);
......@@ -1193,7 +1203,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
uarg->flags = SKBFL_ZEROCOPY_FRAG;
uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
refcount_set(&uarg->refcnt, 1);
sock_hold(sk);
......@@ -1212,6 +1222,10 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
const u32 byte_limit = 1 << 19; /* limit to a few TSO */
u32 bytelen, next;
/* there might be non MSG_ZEROCOPY users */
if (uarg->callback != msg_zerocopy_callback)
return NULL;
/* realloc only when socket is locked (TCP, UDP cork),
* so uarg->len and sk_zckey access is serialized
*/
......@@ -1354,7 +1368,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
if (orig_uarg && uarg != orig_uarg)
return -EEXIST;
err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
......@@ -1371,6 +1385,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
int i;
skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
}
EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
gfp_t gfp_mask)
{
......@@ -1688,6 +1712,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
BUG_ON(skb_shared(skb));
skb_zcopy_downgrade_managed(skb);
size = SKB_DATA_ALIGN(size);
if (skb_pfmemalloc(skb))
......@@ -3484,6 +3510,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
int pos = skb_headlen(skb);
const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
skb_zcopy_downgrade_managed(skb);
skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
skb_zerocopy_clone(skb1, skb, 0);
if (len < pos) /* Split line is inside header. */
......@@ -3837,6 +3865,7 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
if (skb_can_coalesce(skb, i, page, offset)) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
} else if (i < MAX_SKB_FRAGS) {
skb_zcopy_downgrade_managed(skb);
get_page(page);
skb_fill_page_desc(skb, i, page, offset, size);
} else {
......
......@@ -969,7 +969,6 @@ static int __ip_append_data(struct sock *sk,
struct inet_sock *inet = inet_sk(sk);
struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
struct ip_options *opt = cork->opt;
int hh_len;
int exthdrlen;
......@@ -977,6 +976,7 @@ static int __ip_append_data(struct sock *sk,
int copy;
int err;
int offset = 0;
bool zc = false;
unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
int csummode = CHECKSUM_NONE;
struct rtable *rt = (struct rtable *)cork->dst;
......@@ -1017,17 +1017,35 @@ static int __ip_append_data(struct sock *sk,
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
csummode = CHECKSUM_PARTIAL;
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
if (!uarg)
return -ENOBUFS;
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
if (rt->dst.dev->features & NETIF_F_SG &&
csummode == CHECKSUM_PARTIAL) {
paged = true;
} else {
uarg->zerocopy = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
if ((flags & MSG_ZEROCOPY) && length) {
struct msghdr *msg = from;
if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
return -EINVAL;
/* Leave uarg NULL if can't zerocopy, callers should
* be able to handle it.
*/
if ((rt->dst.dev->features & NETIF_F_SG) &&
csummode == CHECKSUM_PARTIAL) {
paged = true;
zc = true;
uarg = msg->msg_ubuf;
}
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
if (!uarg)
return -ENOBUFS;
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
if (rt->dst.dev->features & NETIF_F_SG &&
csummode == CHECKSUM_PARTIAL) {
paged = true;
zc = true;
} else {
uarg->zerocopy = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
}
}
}
......@@ -1091,9 +1109,12 @@ static int __ip_append_data(struct sock *sk,
(fraglen + alloc_extra < SKB_MAX_ALLOC ||
!(rt->dst.dev->features & NETIF_F_SG)))
alloclen = fraglen;
else {
else if (!zc) {
alloclen = min_t(int, fraglen, MAX_HEADER);
pagedlen = fraglen - alloclen;
} else {
alloclen = fragheaderlen + transhdrlen;
pagedlen = datalen - transhdrlen;
}
alloclen += alloc_extra;
......@@ -1188,13 +1209,14 @@ static int __ip_append_data(struct sock *sk,
err = -EFAULT;
goto error;
}
} else if (!uarg || !uarg->zerocopy) {
} else if (!zc) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
if (!sk_page_frag_refill(sk, pfrag))
goto error;
skb_zcopy_downgrade_managed(skb);
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
err = -EMSGSIZE;
......
......@@ -1202,17 +1202,23 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
flags = msg->msg_flags;
if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
if ((flags & MSG_ZEROCOPY) && size) {
skb = tcp_write_queue_tail(sk);
uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
if (!uarg) {
err = -ENOBUFS;
goto out_err;
}
zc = sk->sk_route_caps & NETIF_F_SG;
if (!zc)
uarg->zerocopy = 0;
if (msg->msg_ubuf) {
uarg = msg->msg_ubuf;
net_zcopy_get(uarg);
zc = sk->sk_route_caps & NETIF_F_SG;
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
if (!uarg) {
err = -ENOBUFS;
goto out_err;
}
zc = sk->sk_route_caps & NETIF_F_SG;
if (!zc)
uarg->zerocopy = 0;
}
}
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
......@@ -1335,8 +1341,13 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
copy = min_t(int, copy, pfrag->size - pfrag->offset);
if (tcp_downgrade_zcopy_pure(sk, skb) ||
!sk_wmem_schedule(sk, copy))
if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) {
if (tcp_downgrade_zcopy_pure(sk, skb))
goto wait_for_space;
skb_zcopy_downgrade_managed(skb);
}
if (!sk_wmem_schedule(sk, copy))
goto wait_for_space;
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
......
......@@ -1464,6 +1464,7 @@ static int __ip6_append_data(struct sock *sk,
int copy;
int err;
int offset = 0;
bool zc = false;
u32 tskey = 0;
struct rt6_info *rt = (struct rt6_info *)cork->dst;
struct ipv6_txoptions *opt = v6_cork->opt;
......@@ -1541,17 +1542,35 @@ static int __ip6_append_data(struct sock *sk,
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
if (!uarg)
return -ENOBUFS;
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
if (rt->dst.dev->features & NETIF_F_SG &&
csummode == CHECKSUM_PARTIAL) {
paged = true;
} else {
uarg->zerocopy = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
if ((flags & MSG_ZEROCOPY) && length) {
struct msghdr *msg = from;
if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
return -EINVAL;
/* Leave uarg NULL if can't zerocopy, callers should
* be able to handle it.
*/
if ((rt->dst.dev->features & NETIF_F_SG) &&
csummode == CHECKSUM_PARTIAL) {
paged = true;
zc = true;
uarg = msg->msg_ubuf;
}
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
if (!uarg)
return -ENOBUFS;
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
if (rt->dst.dev->features & NETIF_F_SG &&
csummode == CHECKSUM_PARTIAL) {
paged = true;
zc = true;
} else {
uarg->zerocopy = 0;
skb_zcopy_set(skb, uarg, &extra_uref);
}
}
}
......@@ -1630,9 +1649,12 @@ static int __ip6_append_data(struct sock *sk,
(fraglen + alloc_extra < SKB_MAX_ALLOC ||
!(rt->dst.dev->features & NETIF_F_SG)))
alloclen = fraglen;
else {
else if (!zc) {
alloclen = min_t(int, fraglen, MAX_HEADER);
pagedlen = fraglen - alloclen;
} else {
alloclen = fragheaderlen + transhdrlen;
pagedlen = datalen - transhdrlen;
}
alloclen += alloc_extra;
......@@ -1742,13 +1764,14 @@ static int __ip6_append_data(struct sock *sk,
err = -EFAULT;
goto error;
}
} else if (!uarg || !uarg->zerocopy) {
} else if (!zc) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
if (!sk_page_frag_refill(sk, pfrag))
goto error;
skb_zcopy_downgrade_managed(skb);
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
err = -EMSGSIZE;
......
......@@ -2106,6 +2106,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_namelen = 0;
msg.msg_ubuf = NULL;
if (addr) {
err = move_addr_to_kernel(addr, addr_len, &address);
if (err < 0)
......@@ -2405,6 +2406,7 @@ int __copy_msghdr_from_user(struct msghdr *kmsg,
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
kmsg->msg_ubuf = NULL;
*uiov = msg.msg_iov;
*nsegs = msg.msg_iovlen;
return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment