Commit 0690899b authored by Michael S. Tsirkin, committed by David S. Miller

tun: experimental zero copy tx support

Let vhost-net utilize zero copy tx when used with tun.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent dcc0fb78
...@@ -100,6 +100,8 @@ do { \ ...@@ -100,6 +100,8 @@ do { \
} while (0) } while (0)
#endif #endif
#define GOODCOPY_LEN 128
#define FLT_EXACT_COUNT 8 #define FLT_EXACT_COUNT 8
struct tap_filter { struct tap_filter {
unsigned int count; /* Number of addrs. Zero means disabled */ unsigned int count; /* Number of addrs. Zero means disabled */
...@@ -604,19 +606,100 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun, ...@@ -604,19 +606,100 @@ static struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
return skb; return skb;
} }
/* set skb frags from iovec, this can move to core network code for reuse */
/*
 * zerocopy_sg_from_iovec - map userspace iovec pages into skb frags
 * @skb:    skb whose linear area and frags are filled
 * @from:   source iovec array
 * @offset: byte offset into the iovec to start from
 * @count:  number of iovec entries
 *
 * Copies up to skb_headlen() bytes into the skb linear area, then pins
 * the remaining user pages with get_user_pages_fast() and attaches them
 * as skb frags (zero copy). Pages attached to frags are released by the
 * skb destructor; the caller must kfree_skb() on error so those are not
 * leaked.
 *
 * Returns 0 on success, -EFAULT on a bad user address, -EMSGSIZE when
 * the data would need more than MAX_SKB_FRAGS pages.
 */
static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
				  int offset, size_t count)
{
	int len = iov_length(from, count) - offset;
	int copy = skb_headlen(skb);
	int size, offset1 = 0;
	int i = 0;

	/* Skip over from offset */
	while (count && (offset >= from->iov_len)) {
		offset -= from->iov_len;
		++from;
		--count;
	}

	/* copy up to skb headlen */
	while (count && (copy > 0)) {
		size = min_t(unsigned int, copy, from->iov_len - offset);
		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
				   size))
			return -EFAULT;
		if (copy > size) {
			++from;
			--count;
			offset = 0;
		} else
			offset += size;
		copy -= size;
		offset1 += size;
	}

	if (len == offset1)
		return 0;

	while (count--) {
		struct page *page[MAX_SKB_FRAGS];
		int num_pages;
		unsigned long base;
		unsigned long truesize;

		len = from->iov_len - offset;
		if (!len) {
			offset = 0;
			++from;
			continue;
		}
		base = (unsigned long)from->iov_base + offset;
		/* pages spanned by [base, base + len) */
		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
		if (i + size > MAX_SKB_FRAGS)
			return -EMSGSIZE;
		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
		if (num_pages != size) {
			int j;

			/* Fix: this call stored its pages starting at
			 * page[i], not page[0]; release exactly the pages
			 * pinned here. Pages from earlier iterations are
			 * already owned by skb frags and will be freed when
			 * the caller kfree_skb()s on this error.
			 */
			for (j = 0; j < num_pages; j++)
				put_page(page[i + j]);
			return -EFAULT;
		}
		truesize = size * PAGE_SIZE;
		skb->data_len += len;
		skb->len += len;
		skb->truesize += truesize;
		/* account the mapped pages against the socket send buffer */
		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
		while (len) {
			int off = base & ~PAGE_MASK;
			/* renamed from 'size' to avoid shadowing the outer
			 * page-count variable */
			int plen = min_t(int, len, PAGE_SIZE - off);

			__skb_fill_page_desc(skb, i, page[i], off, plen);
			skb_shinfo(skb)->nr_frags++;
			/* increase sk_wmem_alloc */
			base += plen;
			len -= plen;
			i++;
		}
		offset = 0;
		++from;
	}
	return 0;
}
/* Get packet from user space buffer */ /* Get packet from user space buffer */
static ssize_t tun_get_user(struct tun_struct *tun, static ssize_t tun_get_user(struct tun_struct *tun, void *msg_control,
const struct iovec *iv, size_t count, const struct iovec *iv, size_t total_len,
int noblock) size_t count, int noblock)
{ {
struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
struct sk_buff *skb; struct sk_buff *skb;
size_t len = count, align = NET_SKB_PAD; size_t len = total_len, align = NET_SKB_PAD;
struct virtio_net_hdr gso = { 0 }; struct virtio_net_hdr gso = { 0 };
int offset = 0; int offset = 0;
int copylen;
bool zerocopy = false;
int err;
if (!(tun->flags & TUN_NO_PI)) { if (!(tun->flags & TUN_NO_PI)) {
if ((len -= sizeof(pi)) > count) if ((len -= sizeof(pi)) > total_len)
return -EINVAL; return -EINVAL;
if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi))) if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi)))
...@@ -625,7 +708,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, ...@@ -625,7 +708,7 @@ static ssize_t tun_get_user(struct tun_struct *tun,
} }
if (tun->flags & TUN_VNET_HDR) { if (tun->flags & TUN_VNET_HDR) {
if ((len -= tun->vnet_hdr_sz) > count) if ((len -= tun->vnet_hdr_sz) > total_len)
return -EINVAL; return -EINVAL;
if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso))) if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso)))
...@@ -647,14 +730,46 @@ static ssize_t tun_get_user(struct tun_struct *tun, ...@@ -647,14 +730,46 @@ static ssize_t tun_get_user(struct tun_struct *tun,
return -EINVAL; return -EINVAL;
} }
skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock); if (msg_control)
zerocopy = true;
if (zerocopy) {
/* Userspace may produce vectors with count greater than
* MAX_SKB_FRAGS, so we need to linearize parts of the skb
* to let the rest of data to be fit in the frags.
*/
if (count > MAX_SKB_FRAGS) {
copylen = iov_length(iv, count - MAX_SKB_FRAGS);
if (copylen < offset)
copylen = 0;
else
copylen -= offset;
} else
copylen = 0;
/* There are 256 bytes to be copied in skb, so there is enough
* room for skb expand head in case it is used.
* The rest of the buffer is mapped from userspace.
*/
if (copylen < gso.hdr_len)
copylen = gso.hdr_len;
if (!copylen)
copylen = GOODCOPY_LEN;
} else
copylen = len;
skb = tun_alloc_skb(tun, align, copylen, gso.hdr_len, noblock);
if (IS_ERR(skb)) { if (IS_ERR(skb)) {
if (PTR_ERR(skb) != -EAGAIN) if (PTR_ERR(skb) != -EAGAIN)
tun->dev->stats.rx_dropped++; tun->dev->stats.rx_dropped++;
return PTR_ERR(skb); return PTR_ERR(skb);
} }
if (skb_copy_datagram_from_iovec(skb, 0, iv, offset, len)) { if (zerocopy)
err = zerocopy_sg_from_iovec(skb, iv, offset, count);
else
err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len);
if (err) {
tun->dev->stats.rx_dropped++; tun->dev->stats.rx_dropped++;
kfree_skb(skb); kfree_skb(skb);
return -EFAULT; return -EFAULT;
...@@ -728,12 +843,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, ...@@ -728,12 +843,18 @@ static ssize_t tun_get_user(struct tun_struct *tun,
skb_shinfo(skb)->gso_segs = 0; skb_shinfo(skb)->gso_segs = 0;
} }
/* copy skb_ubuf_info for callback when skb has no error */
if (zerocopy) {
skb_shinfo(skb)->destructor_arg = msg_control;
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
}
netif_rx_ni(skb); netif_rx_ni(skb);
tun->dev->stats.rx_packets++; tun->dev->stats.rx_packets++;
tun->dev->stats.rx_bytes += len; tun->dev->stats.rx_bytes += len;
return count; return total_len;
} }
static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
...@@ -748,7 +869,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, ...@@ -748,7 +869,7 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count); tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
result = tun_get_user(tun, iv, iov_length(iv, count), result = tun_get_user(tun, NULL, iv, iov_length(iv, count), count,
file->f_flags & O_NONBLOCK); file->f_flags & O_NONBLOCK);
tun_put(tun); tun_put(tun);
...@@ -962,8 +1083,8 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock, ...@@ -962,8 +1083,8 @@ static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len) struct msghdr *m, size_t total_len)
{ {
struct tun_struct *tun = container_of(sock, struct tun_struct, socket); struct tun_struct *tun = container_of(sock, struct tun_struct, socket);
return tun_get_user(tun, m->msg_iov, total_len, return tun_get_user(tun, m->msg_control, m->msg_iov, total_len,
m->msg_flags & MSG_DONTWAIT); m->msg_iovlen, m->msg_flags & MSG_DONTWAIT);
} }
static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
...@@ -1133,6 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) ...@@ -1133,6 +1254,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
sock_init_data(&tun->socket, sk); sock_init_data(&tun->socket, sk);
sk->sk_write_space = tun_sock_write_space; sk->sk_write_space = tun_sock_write_space;
sk->sk_sndbuf = INT_MAX; sk->sk_sndbuf = INT_MAX;
sock_set_flag(sk, SOCK_ZEROCOPY);
tun_sk(sk)->tun = tun; tun_sk(sk)->tun = tun;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment