Commit 6402528b authored by Niklas Söderlund's avatar Niklas Söderlund Committed by David S. Miller

nfp: xsk: add AF_XDP zero-copy Rx and Tx support

This patch adds zero-copy Rx and Tx support for AF_XDP sockets. It do so
by adding a separate NAPI poll function that is attached to a each
channel when the XSK socket is attached with XDP_SETUP_XSK_POOL, and
restored when the XSK socket is terminated, this is done per channel.

Support for XDP_TX is implemented and the XDP buffer can safely be moved
from the Rx to the Tx queue and correctly freed and returned to the XSK
pool once it's transmitted.

Note that when AF_XDP zero-copy is enabled, the XDP action XDP_PASS
will allocate a new buffer and copy the zero-copy frame prior
passing it to the kernel stack.

This patch is based on previous work by Jakub Kicinski.
Signed-off-by: default avatarNiklas Söderlund <niklas.soderlund@corigine.com>
Signed-off-by: default avatarSimon Horman <simon.horman@corigine.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 9c91a365
......@@ -31,6 +31,7 @@ nfp-objs := \
nfp_net_main.o \
nfp_net_repr.o \
nfp_net_sriov.o \
nfp_net_xsk.o \
nfp_netvf_main.o \
nfp_port.o \
nfp_shared_buf.o \
......
......@@ -171,11 +171,14 @@ struct nfp_net_tx_desc {
* struct nfp_net_tx_buf - software TX buffer descriptor
* @skb: normal ring, sk_buff associated with this buffer
* @frag: XDP ring, page frag associated with this buffer
* @xdp: XSK buffer pool handle (for AF_XDP)
* @dma_addr: DMA mapping address of the buffer
* @fidx: Fragment index (-1 for the head and [0..nr_frags-1] for frags)
* @pkt_cnt: Number of packets to be produced out of the skb associated
* with this buffer (valid only on the head's buffer).
* Will be 1 for all non-TSO packets.
* @is_xsk_tx: Flag if buffer is a RX buffer after a XDP_TX action and not a
* buffer from the TX queue (for AF_XDP).
* @real_len: Number of bytes which to be produced out of the skb (valid only
* on the head's buffer). Equal to skb->len for non-TSO packets.
*/
......@@ -183,10 +186,18 @@ struct nfp_net_tx_buf {
union {
struct sk_buff *skb;
void *frag;
struct xdp_buff *xdp;
};
dma_addr_t dma_addr;
union {
struct {
short int fidx;
u16 pkt_cnt;
};
struct {
bool is_xsk_tx;
};
};
u32 real_len;
};
......@@ -315,6 +326,16 @@ struct nfp_net_rx_buf {
dma_addr_t dma_addr;
};
/**
* struct nfp_net_xsk_rx_buf - software RX XSK buffer descriptor
* @dma_addr: DMA mapping address of the buffer
* @xdp: XSK buffer pool handle (for AF_XDP)
*/
struct nfp_net_xsk_rx_buf {
dma_addr_t dma_addr;
struct xdp_buff *xdp;
};
/**
* struct nfp_net_rx_ring - RX ring structure
* @r_vec: Back pointer to ring vector structure
......@@ -325,6 +346,7 @@ struct nfp_net_rx_buf {
* @fl_qcidx: Queue Controller Peripheral (QCP) queue index for the freelist
* @qcp_fl: Pointer to base of the QCP freelist queue
* @rxbufs: Array of transmitted FL/RX buffers
* @xsk_rxbufs: Array of transmitted FL/RX buffers (for AF_XDP)
* @rxds: Virtual address of FL/RX ring in host memory
* @xdp_rxq: RX-ring info avail for XDP
* @dma: DMA address of the FL/RX ring
......@@ -343,6 +365,7 @@ struct nfp_net_rx_ring {
u8 __iomem *qcp_fl;
struct nfp_net_rx_buf *rxbufs;
struct nfp_net_xsk_rx_buf *xsk_rxbufs;
struct nfp_net_rx_desc *rxds;
struct xdp_rxq_info xdp_rxq;
......@@ -361,6 +384,7 @@ struct nfp_net_rx_ring {
* @tx_ring: Pointer to TX ring
* @rx_ring: Pointer to RX ring
* @xdp_ring: Pointer to an extra TX ring for XDP
* @xsk_pool: XSK buffer pool active on vector queue pair (for AF_XDP)
* @irq_entry: MSI-X table entry (use for talking to the device)
* @event_ctr: Number of interrupt
* @rx_dim: Dynamic interrupt moderation structure for RX
......@@ -432,6 +456,7 @@ struct nfp_net_r_vector {
u64 rx_replace_buf_alloc_fail;
struct nfp_net_tx_ring *xdp_ring;
struct xsk_buff_pool *xsk_pool;
struct u64_stats_sync tx_sync;
u64 tx_pkts;
......@@ -502,7 +527,7 @@ struct nfp_stat_pair {
* @num_stack_tx_rings: Number of TX rings used by the stack (not XDP)
* @num_rx_rings: Currently configured number of RX rings
* @mtu: Device MTU
* @xsk_pools: AF_XDP UMEM table (@num_r_vecs in size)
* @xsk_pools: XSK buffer pools, @max_r_vecs in size (for AF_XDP).
*/
struct nfp_net_dp {
struct device *dev;
......
......@@ -46,6 +46,7 @@
#include "nfp_net_ctrl.h"
#include "nfp_net.h"
#include "nfp_net_sriov.h"
#include "nfp_net_xsk.h"
#include "nfp_port.h"
#include "crypto/crypto.h"
#include "crypto/fw.h"
......@@ -1316,6 +1317,9 @@ nfp_net_tx_ring_reset(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring)
tx_ring->rd_p++;
}
if (tx_ring->is_xdp)
nfp_net_xsk_tx_bufs_free(tx_ring);
memset(tx_ring->txds, 0, tx_ring->size);
tx_ring->wr_p = 0;
tx_ring->rd_p = 0;
......@@ -1504,10 +1508,14 @@ static void nfp_net_rx_ring_reset(struct nfp_net_rx_ring *rx_ring)
/* Move the empty entry to the end of the list */
wr_idx = D_IDX(rx_ring, rx_ring->wr_p);
last_idx = rx_ring->cnt - 1;
rx_ring->rxbufs[wr_idx].dma_addr = rx_ring->rxbufs[last_idx].dma_addr;
rx_ring->rxbufs[wr_idx].frag = rx_ring->rxbufs[last_idx].frag;
rx_ring->rxbufs[last_idx].dma_addr = 0;
rx_ring->rxbufs[last_idx].frag = NULL;
if (rx_ring->r_vec->xsk_pool) {
rx_ring->xsk_rxbufs[wr_idx] = rx_ring->xsk_rxbufs[last_idx];
memset(&rx_ring->xsk_rxbufs[last_idx], 0,
sizeof(*rx_ring->xsk_rxbufs));
} else {
rx_ring->rxbufs[wr_idx] = rx_ring->rxbufs[last_idx];
memset(&rx_ring->rxbufs[last_idx], 0, sizeof(*rx_ring->rxbufs));
}
memset(rx_ring->rxds, 0, rx_ring->size);
rx_ring->wr_p = 0;
......@@ -1529,6 +1537,9 @@ nfp_net_rx_ring_bufs_free(struct nfp_net_dp *dp,
{
unsigned int i;
if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx))
return;
for (i = 0; i < rx_ring->cnt - 1; i++) {
/* NULL skb can only happen when initial filling of the ring
* fails to allocate enough buffers and calls here to free
......@@ -1556,6 +1567,9 @@ nfp_net_rx_ring_bufs_alloc(struct nfp_net_dp *dp,
struct nfp_net_rx_buf *rxbufs;
unsigned int i;
if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx))
return 0;
rxbufs = rx_ring->rxbufs;
for (i = 0; i < rx_ring->cnt - 1; i++) {
......@@ -1580,6 +1594,9 @@ nfp_net_rx_ring_fill_freelist(struct nfp_net_dp *dp,
{
unsigned int i;
if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx))
return nfp_net_xsk_rx_ring_fill_freelist(rx_ring);
for (i = 0; i < rx_ring->cnt - 1; i++)
nfp_net_rx_give_one(dp, rx_ring, rx_ring->rxbufs[i].frag,
rx_ring->rxbufs[i].dma_addr);
......@@ -2560,6 +2577,10 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
if (dp->netdev)
xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx))
kvfree(rx_ring->xsk_rxbufs);
else
kvfree(rx_ring->rxbufs);
if (rx_ring->rxds)
......@@ -2568,6 +2589,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
rx_ring->cnt = 0;
rx_ring->rxbufs = NULL;
rx_ring->xsk_rxbufs = NULL;
rx_ring->rxds = NULL;
rx_ring->dma = 0;
rx_ring->size = 0;
......@@ -2583,8 +2605,18 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring *rx_ring)
static int
nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring)
{
enum xdp_mem_type mem_type;
size_t rxbuf_sw_desc_sz;
int err;
if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx)) {
mem_type = MEM_TYPE_XSK_BUFF_POOL;
rxbuf_sw_desc_sz = sizeof(*rx_ring->xsk_rxbufs);
} else {
mem_type = MEM_TYPE_PAGE_ORDER0;
rxbuf_sw_desc_sz = sizeof(*rx_ring->rxbufs);
}
if (dp->netdev) {
err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, dp->netdev,
rx_ring->idx, rx_ring->r_vec->napi.napi_id);
......@@ -2592,6 +2624,10 @@ nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring)
return err;
}
err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, mem_type, NULL);
if (err)
goto err_alloc;
rx_ring->cnt = dp->rxd_cnt;
rx_ring->size = array_size(rx_ring->cnt, sizeof(*rx_ring->rxds));
rx_ring->rxds = dma_alloc_coherent(dp->dev, rx_ring->size,
......@@ -2603,10 +2639,17 @@ nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring)
goto err_alloc;
}
rx_ring->rxbufs = kvcalloc(rx_ring->cnt, sizeof(*rx_ring->rxbufs),
if (nfp_net_has_xsk_pool_slow(dp, rx_ring->idx)) {
rx_ring->xsk_rxbufs = kvcalloc(rx_ring->cnt, rxbuf_sw_desc_sz,
GFP_KERNEL);
if (!rx_ring->xsk_rxbufs)
goto err_alloc;
} else {
rx_ring->rxbufs = kvcalloc(rx_ring->cnt, rxbuf_sw_desc_sz,
GFP_KERNEL);
if (!rx_ring->rxbufs)
goto err_alloc;
}
return 0;
......@@ -2659,11 +2702,13 @@ static void nfp_net_rx_rings_free(struct nfp_net_dp *dp)
}
static void
nfp_net_napi_add(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec)
nfp_net_napi_add(struct nfp_net_dp *dp, struct nfp_net_r_vector *r_vec, int idx)
{
if (dp->netdev)
netif_napi_add(dp->netdev, &r_vec->napi,
nfp_net_poll, NAPI_POLL_WEIGHT);
nfp_net_has_xsk_pool_slow(dp, idx) ?
nfp_net_xsk_poll : nfp_net_poll,
NAPI_POLL_WEIGHT);
else
tasklet_enable(&r_vec->tasklet);
}
......@@ -2687,6 +2732,17 @@ nfp_net_vector_assign_rings(struct nfp_net_dp *dp,
r_vec->xdp_ring = idx < dp->num_tx_rings - dp->num_stack_tx_rings ?
&dp->tx_rings[dp->num_stack_tx_rings + idx] : NULL;
if (nfp_net_has_xsk_pool_slow(dp, idx) || r_vec->xsk_pool) {
r_vec->xsk_pool = dp->xdp_prog ? dp->xsk_pools[idx] : NULL;
if (r_vec->xsk_pool)
xsk_pool_set_rxq_info(r_vec->xsk_pool,
&r_vec->rx_ring->xdp_rxq);
nfp_net_napi_del(dp, r_vec);
nfp_net_napi_add(dp, r_vec, idx);
}
}
static int
......@@ -2695,7 +2751,7 @@ nfp_net_prepare_vector(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
{
int err;
nfp_net_napi_add(&nn->dp, r_vec);
nfp_net_napi_add(&nn->dp, r_vec, idx);
snprintf(r_vec->name, sizeof(r_vec->name),
"%s-rxtx-%d", nfp_net_name(nn), idx);
......@@ -2834,8 +2890,11 @@ static void nfp_net_clear_config_and_disable(struct nfp_net *nn)
if (err)
nn_err(nn, "Could not disable device: %d\n", err);
for (r = 0; r < nn->dp.num_rx_rings; r++)
for (r = 0; r < nn->dp.num_rx_rings; r++) {
nfp_net_rx_ring_reset(&nn->dp.rx_rings[r]);
if (nfp_net_has_xsk_pool_slow(&nn->dp, nn->dp.rx_rings[r].idx))
nfp_net_xsk_rx_bufs_free(&nn->dp.rx_rings[r]);
}
for (r = 0; r < nn->dp.num_tx_rings; r++)
nfp_net_tx_ring_reset(&nn->dp, &nn->dp.tx_rings[r]);
for (r = 0; r < nn->dp.num_r_vecs; r++)
......@@ -3771,6 +3830,9 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp)
return nfp_net_xdp_setup_drv(nn, xdp);
case XDP_SETUP_PROG_HW:
return nfp_net_xdp_setup_hw(nn, xdp);
case XDP_SETUP_XSK_POOL:
return nfp_net_xsk_setup_pool(netdev, xdp->xsk.pool,
xdp->xsk.queue_id);
default:
return nfp_app_bpf(nn->app, nn, xdp);
}
......@@ -3821,6 +3883,7 @@ const struct net_device_ops nfp_net_netdev_ops = {
.ndo_features_check = nfp_net_features_check,
.ndo_get_phys_port_name = nfp_net_get_phys_port_name,
.ndo_bpf = nfp_net_xdp,
.ndo_xsk_wakeup = nfp_net_xsk_wakeup,
.ndo_get_devlink_port = nfp_devlink_get_devlink_port,
};
......@@ -3948,6 +4011,14 @@ nfp_net_alloc(struct pci_dev *pdev, void __iomem *ctrl_bar, bool needs_netdev,
nn->dp.num_r_vecs = max(nn->dp.num_tx_rings, nn->dp.num_rx_rings);
nn->dp.num_r_vecs = min_t(unsigned int,
nn->dp.num_r_vecs, num_online_cpus());
nn->max_r_vecs = nn->dp.num_r_vecs;
nn->dp.xsk_pools = kcalloc(nn->max_r_vecs, sizeof(nn->dp.xsk_pools),
GFP_KERNEL);
if (!nn->dp.xsk_pools) {
err = -ENOMEM;
goto err_free_nn;
}
nn->dp.txd_cnt = NFP_NET_TX_DESCS_DEFAULT;
nn->dp.rxd_cnt = NFP_NET_RX_DESCS_DEFAULT;
......@@ -3987,6 +4058,7 @@ void nfp_net_free(struct nfp_net *nn)
WARN_ON(timer_pending(&nn->reconfig_timer) || nn->reconfig_posted);
nfp_ccm_mbox_free(nn);
kfree(nn->dp.xsk_pools);
if (nn->dp.netdev)
free_netdev(nn->dp.netdev);
else
......
......@@ -42,6 +42,7 @@ static int nfp_rx_q_show(struct seq_file *file, void *data)
seq_printf(file, "%04d: 0x%08x 0x%08x", i,
rxd->vals[0], rxd->vals[1]);
if (!r_vec->xsk_pool) {
frag = READ_ONCE(rx_ring->rxbufs[i].frag);
if (frag)
seq_printf(file, " frag=%p", frag);
......@@ -49,6 +50,11 @@ static int nfp_rx_q_show(struct seq_file *file, void *data)
if (rx_ring->rxbufs[i].dma_addr)
seq_printf(file, " dma_addr=%pad",
&rx_ring->rxbufs[i].dma_addr);
} else {
if (rx_ring->xsk_rxbufs[i].dma_addr)
seq_printf(file, " dma_addr=%pad",
&rx_ring->xsk_rxbufs[i].dma_addr);
}
if (i == rx_ring->rd_p % rxd_cnt)
seq_puts(file, " H_RD ");
......@@ -103,20 +109,23 @@ static int nfp_tx_q_show(struct seq_file *file, void *data)
tx_ring->rd_p, tx_ring->wr_p, d_rd_p, d_wr_p);
for (i = 0; i < txd_cnt; i++) {
struct xdp_buff *xdp;
struct sk_buff *skb;
txd = &tx_ring->txds[i];
seq_printf(file, "%04d: 0x%08x 0x%08x 0x%08x 0x%08x", i,
txd->vals[0], txd->vals[1],
txd->vals[2], txd->vals[3]);
if (tx_ring == r_vec->tx_ring) {
struct sk_buff *skb = READ_ONCE(tx_ring->txbufs[i].skb);
if (!tx_ring->is_xdp) {
skb = READ_ONCE(tx_ring->txbufs[i].skb);
if (skb)
seq_printf(file, " skb->head=%p skb->data=%p",
skb->head, skb->data);
} else {
seq_printf(file, " frag=%p",
READ_ONCE(tx_ring->txbufs[i].frag));
xdp = READ_ONCE(tx_ring->txbufs[i].xdp);
if (xdp)
seq_printf(file, " xdp->data=%p", xdp->data);
}
if (tx_ring->txbufs[i].dma_addr)
......
This diff is collapsed.
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (C) 2018 Netronome Systems, Inc */
/* Copyright (C) 2021 Corigine, Inc */
#ifndef _NFP_XSK_H_
#define _NFP_XSK_H_
#include <net/xdp_sock_drv.h>
#define NFP_NET_XSK_TX_BATCH 16 /* XSK TX transmission batch size. */
static inline bool nfp_net_has_xsk_pool_slow(struct nfp_net_dp *dp,
unsigned int qid)
{
return dp->xdp_prog && dp->xsk_pools[qid];
}
int nfp_net_xsk_setup_pool(struct net_device *netdev, struct xsk_buff_pool *pool,
u16 queue_id);
void nfp_net_xsk_tx_bufs_free(struct nfp_net_tx_ring *tx_ring);
void nfp_net_xsk_rx_bufs_free(struct nfp_net_rx_ring *rx_ring);
void nfp_net_xsk_rx_ring_fill_freelist(struct nfp_net_rx_ring *rx_ring);
int nfp_net_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags);
int nfp_net_xsk_poll(struct napi_struct *napi, int budget);
#endif /* _NFP_XSK_H_ */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment