Commit ac98d8aa authored by Magnus Karlsson, committed by Daniel Borkmann

xsk: wire up Tx zero-copy functions

Here we add the functionality required to support zero-copy Tx, and
also expose various zero-copy related functions to the netdevs.
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parent e3760c7e
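
The example below is not part of the commit; it is a minimal, hypothetical sketch of how a netdev driver might use the Tx functions exposed by this patch (xsk_umem_consume_tx(), xsk_umem_consume_tx_done() and xsk_umem_complete_tx()). The struct my_ring type and the my_* helpers are made-up stand-ins for driver-private code.

/* Hypothetical driver Tx path using the API added in this patch.
 * struct my_ring, my_tx_free_slots(), my_tx_fill_hw_desc() and
 * my_tx_reap_hw_completions() are illustrative only.
 */
static void my_xsk_xmit(struct my_ring *ring, struct xdp_umem *umem)
{
        dma_addr_t dma;
        u32 len;

        /* Pull descriptors from the sockets' Tx rings while the hardware
         * ring has room; each accepted descriptor gets a completion entry
         * reserved lazily in the umem completion queue.
         */
        while (my_tx_free_slots(ring) &&
               xsk_umem_consume_tx(umem, &dma, &len))
                my_tx_fill_hw_desc(ring, dma, len);

        /* Wake any socket waiting for Tx ring space. */
        xsk_umem_consume_tx_done(umem);
}

static void my_xsk_clean_tx(struct my_ring *ring, struct xdp_umem *umem)
{
        u32 done = my_tx_reap_hw_completions(ring);

        /* Flush the lazily reserved completion entries so user space
         * can reuse the buffers.
         */
        if (done)
                xsk_umem_complete_tx(umem, done);
}
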
@@ -9,6 +9,7 @@
 #include <linux/workqueue.h>
 #include <linux/if_xdp.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <net/sock.h>
@@ -42,6 +43,8 @@ struct xdp_umem {
        struct net_device *dev;
        u16 queue_id;
        bool zc;
+       spinlock_t xsk_list_lock;
+       struct list_head xsk_list;
 };
 struct xdp_sock {
@@ -53,6 +56,8 @@ struct xdp_sock {
        struct list_head flush_node;
        u16 queue_id;
        struct xsk_queue *tx ____cacheline_aligned_in_smp;
+       struct list_head list;
+       bool zc;
        /* Protects multiple processes in the control path */
        struct mutex mutex;
        u64 rx_dropped;
@@ -64,8 +69,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
 void xsk_flush(struct xdp_sock *xs);
 bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
+/* Used from netdev driver */
 u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
 void xsk_umem_discard_addr(struct xdp_umem *umem);
+void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
+bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
+void xsk_umem_consume_tx_done(struct xdp_umem *umem);
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
......
@@ -17,6 +17,29 @@
 #define XDP_UMEM_MIN_CHUNK_SIZE 2048
+void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&umem->xsk_list_lock, flags);
+       list_add_rcu(&xs->list, &umem->xsk_list);
+       spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+}
+
+void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
+{
+       unsigned long flags;
+
+       if (xs->dev) {
+               spin_lock_irqsave(&umem->xsk_list_lock, flags);
+               list_del_rcu(&xs->list);
+               spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
+
+               if (umem->zc)
+                       synchronize_net();
+       }
+}
+
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
                         u32 queue_id, u16 flags)
 {
@@ -35,7 +58,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
        dev_hold(dev);
-       if (dev->netdev_ops->ndo_bpf) {
+       if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) {
                bpf.command = XDP_QUERY_XSK_UMEM;
                rtnl_lock();
@@ -70,7 +93,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
        return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
 }
-void xdp_umem_clear_dev(struct xdp_umem *umem)
+static void xdp_umem_clear_dev(struct xdp_umem *umem)
 {
        struct netdev_bpf bpf;
        int err;
@@ -283,6 +306,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
        umem->npgs = size / PAGE_SIZE;
        umem->pgs = NULL;
        umem->user = NULL;
+       INIT_LIST_HEAD(&umem->xsk_list);
+       spin_lock_init(&umem->xsk_list_lock);
        refcount_set(&umem->users, 1);
......
@@ -13,12 +13,18 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
        return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
 }
+static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
+{
+       return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
+}
+
 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
                        u32 queue_id, u16 flags);
-void xdp_umem_clear_dev(struct xdp_umem *umem);
 bool xdp_umem_validate_queues(struct xdp_umem *umem);
 void xdp_get_umem(struct xdp_umem *umem);
 void xdp_put_umem(struct xdp_umem *umem);
+void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
+void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs);
 struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr);
 #endif /* XDP_UMEM_H_ */
@@ -21,6 +21,7 @@
 #include <linux/uaccess.h>
 #include <linux/net.h>
 #include <linux/netdevice.h>
+#include <linux/rculist.h>
 #include <net/xdp_sock.h>
 #include <net/xdp.h>
@@ -138,6 +139,59 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
        return err;
 }
+void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
+{
+       xskq_produce_flush_addr_n(umem->cq, nb_entries);
+}
+EXPORT_SYMBOL(xsk_umem_complete_tx);
+
+void xsk_umem_consume_tx_done(struct xdp_umem *umem)
+{
+       struct xdp_sock *xs;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+               xs->sk.sk_write_space(&xs->sk);
+       }
+       rcu_read_unlock();
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx_done);
+
+bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
+{
+       struct xdp_desc desc;
+       struct xdp_sock *xs;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
+               if (!xskq_peek_desc(xs->tx, &desc))
+                       continue;
+
+               if (xskq_produce_addr_lazy(umem->cq, desc.addr))
+                       goto out;
+
+               *dma = xdp_umem_get_dma(umem, desc.addr);
+               *len = desc.len;
+
+               xskq_discard_desc(xs->tx);
+               rcu_read_unlock();
+               return true;
+       }
+
+out:
+       rcu_read_unlock();
+       return false;
+}
+EXPORT_SYMBOL(xsk_umem_consume_tx);
+
+static int xsk_zc_xmit(struct sock *sk)
+{
+       struct xdp_sock *xs = xdp_sk(sk);
+       struct net_device *dev = xs->dev;
+
+       return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
+}
+
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
        u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
@@ -151,7 +205,6 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
                            size_t total_len)
 {
-       bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
        u32 max_batch = TX_BATCH_SIZE;
        struct xdp_sock *xs = xdp_sk(sk);
        bool sent_frame = false;
@@ -161,8 +214,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
        if (unlikely(!xs->tx))
                return -ENOBUFS;
-       if (need_wait)
-               return -EOPNOTSUPP;
        mutex_lock(&xs->mutex);
@@ -192,7 +243,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
                        goto out;
                }
-               skb = sock_alloc_send_skb(sk, len, !need_wait, &err);
+               skb = sock_alloc_send_skb(sk, len, 1, &err);
                if (unlikely(!skb)) {
                        err = -EAGAIN;
                        goto out;
@@ -235,6 +286,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 {
+       bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
@@ -242,8 +294,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
                return -ENXIO;
        if (unlikely(!(xs->dev->flags & IFF_UP)))
                return -ENETDOWN;
+       if (need_wait)
+               return -EOPNOTSUPP;
-       return xsk_generic_xmit(sk, m, total_len);
+       return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
 }
 static unsigned int xsk_poll(struct file *file, struct socket *sock,
@@ -419,10 +473,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
        }
        xs->dev = dev;
-       xs->queue_id = sxdp->sxdp_queue_id;
+       xs->zc = xs->umem->zc;
+       xs->queue_id = qid;
        xskq_set_umem(xs->rx, &xs->umem->props);
        xskq_set_umem(xs->tx, &xs->umem->props);
+       xdp_add_sk_umem(xs->umem, xs);
 out_unlock:
        if (err)
@@ -660,6 +715,7 @@ static void xsk_destruct(struct sock *sk)
        xskq_destroy(xs->rx);
        xskq_destroy(xs->tx);
+       xdp_del_sk_umem(xs->umem, xs);
        xdp_put_umem(xs->umem);
        sk_refcnt_debug_dec(sk);
......
@@ -11,6 +11,7 @@
 #include <net/xdp_sock.h>
 #define RX_BATCH_SIZE 16
+#define LAZY_UPDATE_THRESHOLD 128
 struct xdp_ring {
        u32 producer ____cacheline_aligned_in_smp;
@@ -61,9 +62,14 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
        return (entries > dcnt) ? dcnt : entries;
 }
+static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer)
+{
+       return q->nentries - (producer - q->cons_tail);
+}
+
 static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
 {
-       u32 free_entries = q->nentries - (producer - q->cons_tail);
+       u32 free_entries = xskq_nb_free_lazy(q, producer);
        if (free_entries >= dcnt)
                return free_entries;
@@ -123,6 +129,9 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
 {
        struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+       if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0)
+               return -ENOSPC;
+
        ring->desc[q->prod_tail++ & q->ring_mask] = addr;
        /* Order producer and data */
@@ -132,6 +141,27 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
        return 0;
 }
+static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr)
+{
+       struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+       if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0)
+               return -ENOSPC;
+
+       ring->desc[q->prod_head++ & q->ring_mask] = addr;
+       return 0;
+}
+
+static inline void xskq_produce_flush_addr_n(struct xsk_queue *q,
+                                            u32 nb_entries)
+{
+       /* Order producer and data */
+       smp_wmb();
+
+       q->prod_tail += nb_entries;
+       WRITE_ONCE(q->ring->producer, q->prod_tail);
+}
+
 static inline int xskq_reserve_addr(struct xsk_queue *q)
 {
        if (xskq_nb_free(q, q->prod_head, 1) == 0)
......
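
Not part of the commit: for context on the send-path changes above, a blocking sendmsg() on an AF_XDP socket is still rejected with -EOPNOTSUPP, and with zero-copy enabled xsk_sendmsg() only kicks the driver via ndo_xsk_async_xmit(). Below is a minimal user-space sketch of the usual Tx kick, assuming xsk_fd is an already bound AF_XDP socket whose Tx ring the application has filled; the set of errno values treated as transient is an assumption, not something mandated by this patch.

#include <errno.h>
#include <sys/socket.h>

/* Kick the kernel Tx path of an AF_XDP socket. MSG_DONTWAIT is
 * required; a blocking send returns EOPNOTSUPP (see xsk_sendmsg()
 * in the diff above).
 */
static int kick_tx(int xsk_fd)
{
        if (sendto(xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0) >= 0)
                return 0;

        /* Treat "ring busy / try again later" conditions as success. */
        if (errno == EAGAIN || errno == EBUSY || errno == ENOBUFS)
                return 0;

        return -errno;
}
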