Commit cd333e37 authored by David S. Miller's avatar David S. Miller

Merge branch 'virtio_net-XDP'

John Fastabend says:

====================
XDP for virtio_net

This implements virtio_net for the mergeable buffers and big_packet
modes. I tested this with vhost_net running on qemu and did not see
any issues. For testing num_buf > 1 I added a hack to vhost driver
to only but 100 bytes per buffer.

There are some restrictions for XDP to be enabled and work well
(see patch 3) for more details.

  1. GUEST_TSO{4|6} must be off
  2. MTU must be less than PAGE_SIZE
  3. queues must be available to dedicate to XDP
  4. num_bufs received in mergeable buffers must be 1
  5. big_packet mode must have all data on single page

To test this I used pktgen in the hypervisor and ran the XDP sample
programs xdp1 and xdp2 from ./samples/bpf in the host. The default
mode that is used with these patches with Linux guest and QEMU/Linux
hypervisor is the mergeable buffers mode. I tested this mode for 2+
days running xdp2 without issues. Additionally I did a series of
driver unload/load tests to check the allocate/release paths.

To test the big_packets path I applied the following simple patch against
the virtio driver forcing big_packets mode,

--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2242,7 +2242,7 @@ static int virtnet_probe(struct virtio_device *vdev)
                vi->big_packets = true;

        if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
-               vi->mergeable_rx_bufs = true;
+               vi->mergeable_rx_bufs = false;

        if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
            virtio_has_feature(vdev, VIRTIO_F_VERSION_1))

I then repeated the tests with xdp1 and xdp2. After letting them run
for a few hours I called it good enough.

Testing the unexpected case where virtio receives a packet across
multiple buffers required patching the hypervisor vhost driver to
convince it to send these unexpected packets. Then I used ping with
the -s option to trigger the case with multiple buffers. This mode
is not expected to be used but as MST pointed out per spec it is
not strictly speaking illegal to generate multi-buffer packets so we
need someway to handle these. The following patch can be used to
generate multiple buffers,

--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1777,7 +1777,8 @@ static int translate_desc(struct vhost_virtqueue
*vq, u64

                _iov = iov + ret;
                size = node->size - addr + node->start;
-               _iov->iov_len = min((u64)len - s, size);
+               printk("%s: build 100 length headers!\n", __func__);
+               _iov->iov_len = min((u64)len - s, (u64)100);//size);
                _iov->iov_base = (void __user *)(unsigned long)
                        (node->userspace_addr + addr - node->start);
                s += size;

The qemu command I most frequently used for testing (although I did test
various other combinations of devices) is the following,

 ./x86_64-softmmu/qemu-system-x86_64              \
    -hda /var/lib/libvirt/images/Fedora-test0.img \
    -m 4096  -enable-kvm -smp 2                   \
    -netdev tap,id=hn0,queues=4,vhost=on          \
    -device virtio-net-pci,netdev=hn0,mq=on,vectors=9,guest_tso4=off,guest_tso6=off \
    -serial stdio

The options 'guest_tso4=off,guest_tso6=off' are required because we
do not support LRO with XDP at the moment.

Please review any comments/feedback welcome as always.
====================
Acked-by: default avatarMichael S. Tsirkin <mst@redhat.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 08abb795 72979a6c
......@@ -22,6 +22,7 @@
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
......@@ -81,6 +82,8 @@ struct receive_queue {
struct napi_struct napi;
struct bpf_prog __rcu *xdp_prog;
/* Chain pages by the private ptr. */
struct page *pages;
......@@ -111,6 +114,9 @@ struct virtnet_info {
/* # of queue pairs currently used by the driver */
u16 curr_queue_pairs;
/* # of XDP queue pairs currently used by the driver */
u16 xdp_queue_pairs;
/* I like... big packets and I cannot lie! */
bool big_packets;
......@@ -324,6 +330,90 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
return skb;
}
static void virtnet_xdp_xmit(struct virtnet_info *vi,
struct receive_queue *rq,
struct send_queue *sq,
struct xdp_buff *xdp)
{
struct page *page = virt_to_head_page(xdp->data);
struct virtio_net_hdr_mrg_rxbuf *hdr;
unsigned int num_sg, len;
void *xdp_sent;
int err;
/* Free up any pending old buffers before queueing new ones. */
while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
struct page *sent_page = virt_to_head_page(xdp_sent);
if (vi->mergeable_rx_bufs)
put_page(sent_page);
else
give_pages(rq, sent_page);
}
/* Zero header and leave csum up to XDP layers */
hdr = xdp->data;
memset(hdr, 0, vi->hdr_len);
num_sg = 1;
sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
xdp->data, GFP_ATOMIC);
if (unlikely(err)) {
if (vi->mergeable_rx_bufs)
put_page(page);
else
give_pages(rq, page);
return; // On error abort to avoid unnecessary kick
} else if (!vi->mergeable_rx_bufs) {
/* If not mergeable bufs must be big packets so cleanup pages */
give_pages(rq, (struct page *)page->private);
page->private = 0;
}
virtqueue_kick(sq->vq);
}
static u32 do_xdp_prog(struct virtnet_info *vi,
struct receive_queue *rq,
struct bpf_prog *xdp_prog,
struct page *page, int offset, int len)
{
int hdr_padded_len;
struct xdp_buff xdp;
unsigned int qp;
u32 act;
u8 *buf;
buf = page_address(page) + offset;
if (vi->mergeable_rx_bufs)
hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
else
hdr_padded_len = sizeof(struct padded_vnet_hdr);
xdp.data = buf + hdr_padded_len;
xdp.data_end = xdp.data + (len - vi->hdr_len);
act = bpf_prog_run_xdp(xdp_prog, &xdp);
switch (act) {
case XDP_PASS:
return XDP_PASS;
case XDP_TX:
qp = vi->curr_queue_pairs -
vi->xdp_queue_pairs +
smp_processor_id();
xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4);
virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp);
return XDP_TX;
default:
bpf_warn_invalid_xdp_action(act);
case XDP_ABORTED:
case XDP_DROP:
return XDP_DROP;
}
}
static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
{
struct sk_buff * skb = buf;
......@@ -340,17 +430,102 @@ static struct sk_buff *receive_big(struct net_device *dev,
void *buf,
unsigned int len)
{
struct bpf_prog *xdp_prog;
struct page *page = buf;
struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
struct sk_buff *skb;
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (xdp_prog) {
struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
u32 act;
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
goto err_xdp;
act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len);
switch (act) {
case XDP_PASS:
break;
case XDP_TX:
rcu_read_unlock();
goto xdp_xmit;
case XDP_DROP:
default:
goto err_xdp;
}
}
rcu_read_unlock();
skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
if (unlikely(!skb))
goto err;
return skb;
err_xdp:
rcu_read_unlock();
err:
dev->stats.rx_dropped++;
give_pages(rq, page);
xdp_xmit:
return NULL;
}
/* The conditions to enable XDP should preclude the underlying device from
* sending packets across multiple buffers (num_buf > 1). However per spec
* it does not appear to be illegal to do so but rather just against convention.
* So in order to avoid making a system unresponsive the packets are pushed
* into a page and the XDP program is run. This will be extremely slow and we
* push a warning to the user to fix this as soon as possible. Fixing this may
* require resolving the underlying hardware to determine why multiple buffers
* are being received or simply loading the XDP program in the ingress stack
* after the skb is built because there is no advantage to running it here
* anymore.
*/
static struct page *xdp_linearize_page(struct receive_queue *rq,
u16 num_buf,
struct page *p,
int offset,
unsigned int *len)
{
struct page *page = alloc_page(GFP_ATOMIC);
unsigned int page_off = 0;
if (!page)
return NULL;
memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
page_off += *len;
while (--num_buf) {
unsigned int buflen;
unsigned long ctx;
void *buf;
int off;
ctx = (unsigned long)virtqueue_get_buf(rq->vq, &buflen);
if (unlikely(!ctx))
goto err_buf;
/* guard against a misconfigured or uncooperative backend that
* is sending packet larger than the MTU.
*/
if ((page_off + buflen) > PAGE_SIZE)
goto err_buf;
buf = mergeable_ctx_to_buf_address(ctx);
p = virt_to_head_page(buf);
off = buf - page_address(p);
memcpy(page_address(page) + page_off,
page_address(p) + off, buflen);
page_off += buflen;
}
*len = page_off;
return page;
err_buf:
__free_pages(page, 0);
return NULL;
}
......@@ -365,11 +540,67 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
struct page *page = virt_to_head_page(buf);
int offset = buf - page_address(page);
unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
struct sk_buff *head_skb, *curr_skb;
struct bpf_prog *xdp_prog;
unsigned int truesize;
head_skb = NULL;
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
if (xdp_prog) {
struct page *xdp_page;
u32 act;
/* No known backend devices should send packets with
* more than a single buffer when XDP conditions are
* met. However it is not strictly illegal so the case
* is handled as an exception and a warning is thrown.
*/
if (unlikely(num_buf > 1)) {
bpf_warn_invalid_xdp_buffer();
/* linearize data for XDP */
xdp_page = xdp_linearize_page(rq, num_buf,
page, offset, &len);
if (!xdp_page)
goto err_xdp;
offset = 0;
} else {
xdp_page = page;
}
struct sk_buff *head_skb = page_to_skb(vi, rq, page, offset, len,
truesize);
struct sk_buff *curr_skb = head_skb;
/* Transient failure which in theory could occur if
* in-flight packets from before XDP was enabled reach
* the receive path after XDP is loaded. In practice I
* was not able to create this condition.
*/
if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
goto err_xdp;
act = do_xdp_prog(vi, rq, xdp_prog, page, offset, len);
switch (act) {
case XDP_PASS:
if (unlikely(xdp_page != page))
__free_pages(xdp_page, 0);
break;
case XDP_TX:
if (unlikely(xdp_page != page))
goto err_xdp;
rcu_read_unlock();
goto xdp_xmit;
case XDP_DROP:
default:
if (unlikely(xdp_page != page))
__free_pages(xdp_page, 0);
goto err_xdp;
}
}
rcu_read_unlock();
truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
curr_skb = head_skb;
if (unlikely(!curr_skb))
goto err_skb;
......@@ -423,6 +654,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
return head_skb;
err_xdp:
rcu_read_unlock();
err_skb:
put_page(page);
while (--num_buf) {
......@@ -439,6 +672,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
err_buf:
dev->stats.rx_dropped++;
dev_kfree_skb(head_skb);
xdp_xmit:
return NULL;
}
......@@ -1337,6 +1571,13 @@ static int virtnet_set_channels(struct net_device *dev,
if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
return -EINVAL;
/* For now we don't support modifying channels while XDP is loaded
* also when XDP is loaded all RX queues have XDP programs so we only
* need to check a single RX queue.
*/
if (vi->rq[0].xdp_prog)
return -EINVAL;
get_online_cpus();
err = virtnet_set_queues(vi, queue_pairs);
if (!err) {
......@@ -1428,6 +1669,93 @@ static const struct ethtool_ops virtnet_ethtool_ops = {
.set_settings = virtnet_set_settings,
};
static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
{
unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
struct virtnet_info *vi = netdev_priv(dev);
struct bpf_prog *old_prog;
u16 xdp_qp = 0, curr_qp;
int i, err;
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6)) {
netdev_warn(dev, "can't set XDP while host is implementing LRO, disable LRO first\n");
return -EOPNOTSUPP;
}
if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
netdev_warn(dev, "XDP expects header/data in single page, any_header_sg required\n");
return -EINVAL;
}
if (dev->mtu > max_sz) {
netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
return -EINVAL;
}
curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
if (prog)
xdp_qp = nr_cpu_ids;
/* XDP requires extra queues for XDP_TX */
if (curr_qp + xdp_qp > vi->max_queue_pairs) {
netdev_warn(dev, "request %i queues but max is %i\n",
curr_qp + xdp_qp, vi->max_queue_pairs);
return -ENOMEM;
}
err = virtnet_set_queues(vi, curr_qp + xdp_qp);
if (err) {
dev_warn(&dev->dev, "XDP Device queue allocation failure.\n");
return err;
}
if (prog) {
prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
if (IS_ERR(prog)) {
virtnet_set_queues(vi, curr_qp);
return PTR_ERR(prog);
}
}
vi->xdp_queue_pairs = xdp_qp;
netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
for (i = 0; i < vi->max_queue_pairs; i++) {
old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
if (old_prog)
bpf_prog_put(old_prog);
}
return 0;
}
static bool virtnet_xdp_query(struct net_device *dev)
{
struct virtnet_info *vi = netdev_priv(dev);
int i;
for (i = 0; i < vi->max_queue_pairs; i++) {
if (vi->rq[i].xdp_prog)
return true;
}
return false;
}
static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
{
switch (xdp->command) {
case XDP_SETUP_PROG:
return virtnet_xdp_set(dev, xdp->prog);
case XDP_QUERY_PROG:
xdp->prog_attached = virtnet_xdp_query(dev);
return 0;
default:
return -EINVAL;
}
}
static const struct net_device_ops virtnet_netdev = {
.ndo_open = virtnet_open,
.ndo_stop = virtnet_close,
......@@ -1444,6 +1772,7 @@ static const struct net_device_ops virtnet_netdev = {
#ifdef CONFIG_NET_RX_BUSY_POLL
.ndo_busy_poll = virtnet_busy_poll,
#endif
.ndo_xdp = virtnet_xdp,
};
static void virtnet_config_changed_work(struct work_struct *work)
......@@ -1505,12 +1834,20 @@ static void virtnet_free_queues(struct virtnet_info *vi)
static void free_receive_bufs(struct virtnet_info *vi)
{
struct bpf_prog *old_prog;
int i;
rtnl_lock();
for (i = 0; i < vi->max_queue_pairs; i++) {
while (vi->rq[i].pages)
__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
if (old_prog)
bpf_prog_put(old_prog);
}
rtnl_unlock();
}
static void free_receive_page_frags(struct virtnet_info *vi)
......@@ -1521,6 +1858,16 @@ static void free_receive_page_frags(struct virtnet_info *vi)
put_page(vi->rq[i].alloc_frag.page);
}
static bool is_xdp_queue(struct virtnet_info *vi, int q)
{
if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
return false;
else if (q < vi->curr_queue_pairs)
return true;
else
return false;
}
static void free_unused_bufs(struct virtnet_info *vi)
{
void *buf;
......@@ -1528,8 +1875,12 @@ static void free_unused_bufs(struct virtnet_info *vi)
for (i = 0; i < vi->max_queue_pairs; i++) {
struct virtqueue *vq = vi->sq[i].vq;
while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
if (!is_xdp_queue(vi, i))
dev_kfree_skb(buf);
else
put_page(virt_to_head_page(buf));
}
}
for (i = 0; i < vi->max_queue_pairs; i++) {
......
......@@ -602,6 +602,7 @@ bool bpf_helper_changes_pkt_data(void *func);
struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
const struct bpf_insn *patch, u32 len);
void bpf_warn_invalid_xdp_action(u32 act);
void bpf_warn_invalid_xdp_buffer(void);
#ifdef CONFIG_BPF_JIT
extern int bpf_jit_enable;
......
......@@ -2972,6 +2972,12 @@ void bpf_warn_invalid_xdp_action(u32 act)
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
void bpf_warn_invalid_xdp_buffer(void)
{
WARN_ONCE(1, "Illegal XDP buffer encountered, expect throughput degradation\n");
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_buffer);
static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
int src_reg, int ctx_off,
struct bpf_insn *insn_buf,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment