Commit fb3f9037 authored by Harold Huang, committed by Jakub Kicinski

tun: support NAPI for packets received from batched XDP buffs

In tun, NAPI is supported and we can also use NAPI in the path of
batched XDP buffs to accelerate packet processing. What is more, after
we use NAPI, GRO is also supported. iperf shows that the throughput of
a single stream could be improved from 4.5Gbps to 9.2Gbps. Additionally, 9.2
Gbps nearly reaches the line speed of the phy nic and there is still about
15% idle cpu core remaining on the vhost thread.

Test topology:
[iperf server]<--->tap<--->dpdk testpmd<--->phy nic<--->[iperf client]

Iperf stream:
iperf3 -c 10.0.0.2  -i 1 -t 10

Before:
...
[  5]   5.00-6.00   sec   558 MBytes  4.68 Gbits/sec    0   1.50 MBytes
[  5]   6.00-7.00   sec   556 MBytes  4.67 Gbits/sec    1   1.35 MBytes
[  5]   7.00-8.00   sec   556 MBytes  4.67 Gbits/sec    2   1.18 MBytes
[  5]   8.00-9.00   sec   559 MBytes  4.69 Gbits/sec    0   1.48 MBytes
[  5]   9.00-10.00  sec   556 MBytes  4.67 Gbits/sec    1   1.33 MBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec  5.39 GBytes  4.63 Gbits/sec   72          sender
[  5]   0.00-10.04  sec  5.39 GBytes  4.61 Gbits/sec               receiver

After:
...
[  5]   5.00-6.00   sec  1.07 GBytes  9.19 Gbits/sec    0   1.55 MBytes
[  5]   6.00-7.00   sec  1.08 GBytes  9.30 Gbits/sec    0   1.63 MBytes
[  5]   7.00-8.00   sec  1.08 GBytes  9.25 Gbits/sec    0   1.72 MBytes
[  5]   8.00-9.00   sec  1.08 GBytes  9.25 Gbits/sec   77   1.31 MBytes
[  5]   9.00-10.00  sec  1.08 GBytes  9.24 Gbits/sec    0   1.48 MBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec  10.8 GBytes  9.28 Gbits/sec  166          sender
[  5]   0.00-10.04  sec  10.8 GBytes  9.24 Gbits/sec               receiver

Reported-at: https://lore.kernel.org/all/CACGkMEvTLG0Ayg+TtbN4q4pPW-ycgCCs3sC3-TF8cuRTf7Pp1A@mail.gmail.com
Signed-off-by: Harold Huang <baymaxhuang@gmail.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Link: https://lore.kernel.org/r/20220228033805.1579435-1-baymaxhuang@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 422ce836
...@@ -2388,9 +2388,10 @@ static int tun_xdp_one(struct tun_struct *tun, ...@@ -2388,9 +2388,10 @@ static int tun_xdp_one(struct tun_struct *tun,
struct virtio_net_hdr *gso = &hdr->gso; struct virtio_net_hdr *gso = &hdr->gso;
struct bpf_prog *xdp_prog; struct bpf_prog *xdp_prog;
struct sk_buff *skb = NULL; struct sk_buff *skb = NULL;
struct sk_buff_head *queue;
u32 rxhash = 0, act; u32 rxhash = 0, act;
int buflen = hdr->buflen; int buflen = hdr->buflen;
int err = 0; int ret = 0;
bool skb_xdp = false; bool skb_xdp = false;
struct page *page; struct page *page;
...@@ -2405,13 +2406,13 @@ static int tun_xdp_one(struct tun_struct *tun, ...@@ -2405,13 +2406,13 @@ static int tun_xdp_one(struct tun_struct *tun,
xdp_set_data_meta_invalid(xdp); xdp_set_data_meta_invalid(xdp);
act = bpf_prog_run_xdp(xdp_prog, xdp); act = bpf_prog_run_xdp(xdp_prog, xdp);
err = tun_xdp_act(tun, xdp_prog, xdp, act); ret = tun_xdp_act(tun, xdp_prog, xdp, act);
if (err < 0) { if (ret < 0) {
put_page(virt_to_head_page(xdp->data)); put_page(virt_to_head_page(xdp->data));
return err; return ret;
} }
switch (err) { switch (ret) {
case XDP_REDIRECT: case XDP_REDIRECT:
*flush = true; *flush = true;
fallthrough; fallthrough;
...@@ -2435,7 +2436,7 @@ static int tun_xdp_one(struct tun_struct *tun, ...@@ -2435,7 +2436,7 @@ static int tun_xdp_one(struct tun_struct *tun,
build: build:
skb = build_skb(xdp->data_hard_start, buflen); skb = build_skb(xdp->data_hard_start, buflen);
if (!skb) { if (!skb) {
err = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
...@@ -2445,7 +2446,7 @@ static int tun_xdp_one(struct tun_struct *tun, ...@@ -2445,7 +2446,7 @@ static int tun_xdp_one(struct tun_struct *tun,
if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) { if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
atomic_long_inc(&tun->rx_frame_errors); atomic_long_inc(&tun->rx_frame_errors);
kfree_skb(skb); kfree_skb(skb);
err = -EINVAL; ret = -EINVAL;
goto out; goto out;
} }
...@@ -2455,16 +2456,27 @@ static int tun_xdp_one(struct tun_struct *tun, ...@@ -2455,16 +2456,27 @@ static int tun_xdp_one(struct tun_struct *tun,
skb_record_rx_queue(skb, tfile->queue_index); skb_record_rx_queue(skb, tfile->queue_index);
if (skb_xdp) { if (skb_xdp) {
err = do_xdp_generic(xdp_prog, skb); ret = do_xdp_generic(xdp_prog, skb);
if (err != XDP_PASS) if (ret != XDP_PASS) {
ret = 0;
goto out; goto out;
} }
}
if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 && if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 &&
!tfile->detached) !tfile->detached)
rxhash = __skb_get_hash_symmetric(skb); rxhash = __skb_get_hash_symmetric(skb);
if (tfile->napi_enabled) {
queue = &tfile->sk.sk_write_queue;
spin_lock(&queue->lock);
__skb_queue_tail(queue, skb);
spin_unlock(&queue->lock);
ret = 1;
} else {
netif_receive_skb(skb); netif_receive_skb(skb);
ret = 0;
}
/* No need to disable preemption here since this function is /* No need to disable preemption here since this function is
* always called with bh disabled * always called with bh disabled
...@@ -2475,7 +2487,7 @@ static int tun_xdp_one(struct tun_struct *tun, ...@@ -2475,7 +2487,7 @@ static int tun_xdp_one(struct tun_struct *tun,
tun_flow_update(tun, rxhash, tfile); tun_flow_update(tun, rxhash, tfile);
out: out:
return err; return ret;
} }
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
...@@ -2492,7 +2504,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) ...@@ -2492,7 +2504,7 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
if (ctl && (ctl->type == TUN_MSG_PTR)) { if (ctl && (ctl->type == TUN_MSG_PTR)) {
struct tun_page tpage; struct tun_page tpage;
int n = ctl->num; int n = ctl->num;
int flush = 0; int flush = 0, queued = 0;
memset(&tpage, 0, sizeof(tpage)); memset(&tpage, 0, sizeof(tpage));
...@@ -2501,12 +2513,17 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) ...@@ -2501,12 +2513,17 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
for (i = 0; i < n; i++) { for (i = 0; i < n; i++) {
xdp = &((struct xdp_buff *)ctl->ptr)[i]; xdp = &((struct xdp_buff *)ctl->ptr)[i];
tun_xdp_one(tun, tfile, xdp, &flush, &tpage); ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage);
if (ret > 0)
queued += ret;
} }
if (flush) if (flush)
xdp_do_flush(); xdp_do_flush();
if (tfile->napi_enabled && queued > 0)
napi_schedule(&tfile->napi);
rcu_read_unlock(); rcu_read_unlock();
local_bh_enable(); local_bh_enable();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment