Commit 684009d4 authored by David S. Miller

Merge branch 'XDP-redirect-memory-return-API'

Jesper Dangaard Brouer says:

====================
XDP redirect memory return API

Submitted against net-next, as it contains NIC driver changes.

This patchset works towards supporting different XDP RX-ring memory
allocators, as this will be needed by the AF_XDP zero-copy mode.

The patchset uses mlx5 as the sample driver, which gets XDP_REDIRECT
RX-mode implemented, but not ndo_xdp_xmit (as this API is subject to
change throughout the patchset).

A new struct xdp_frame is introduced (modeled after the cpumap xdp_pkt),
and both ndo_xdp_xmit and the new xdp_return_frame end up using it.
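
For orientation, the driver-side pattern this converges on is sketched
below.  This is a condensed, informal sketch based on the i40e and ixgbe
changes in this merge; the drv_* and DRV_* names are placeholders, while
convert_to_xdp_frame() and xdp_return_frame() are the net/xdp.h helpers
added by the series:

  /* XDP_TX / ndo_xdp_xmit path: detach the packet from its RX context */
  static int drv_xmit_xdp(struct drv_ring *ring, struct xdp_buff *xdp)
  {
  	struct xdp_frame *xdpf = convert_to_xdp_frame(xdp);

  	if (unlikely(!xdpf))	/* not enough headroom to store frame info */
  		return DRV_XDP_CONSUMED;

  	/* DMA-map xdpf->data for xdpf->len bytes, post a TX descriptor,
  	 * and stash xdpf in the tx_buffer for completion time.
  	 */
  	return drv_post_tx_desc(ring, xdpf);
  }

  /* TX completion / ring cleanup: hand the frame back to its allocator,
   * instead of the old page_frag_free(tx_buf->raw_buf).
   */
  xdp_return_frame(tx_buf->xdpf);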

Support for a driver-supplied allocator is implemented, and a
refurbished version of page_pool is the first return-allocator type
introduced.  This will be an integration point for AF_XDP zero-copy.
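
For reference, the RX-side wiring of the page_pool looks roughly like
this (condensed from the mlx5 changes below; ring_size, node, dev and
rxq_info stand in for the driver's own state):

  struct page_pool_params pp_params = { 0 };
  struct page_pool *pool;
  int err;

  pp_params.order     = 0;	/* one page per frame */
  pp_params.flags     = 0;	/* driver still does its own DMA mapping */
  pp_params.pool_size = ring_size;
  pp_params.nid       = node;
  pp_params.dev       = dev;
  pp_params.dma_dir   = DMA_FROM_DEVICE;

  pool = page_pool_create(&pp_params);
  if (IS_ERR(pool))
  	return PTR_ERR(pool);

  /* Tie the pool to the RX-queue, so xdp_return_frame() knows where
   * redirected frames go back to.
   */
  err = xdp_rxq_info_reg_mem_model(&rxq_info, MEM_TYPE_PAGE_POOL, pool);

RX refill then uses page_pool_dev_alloc_pages(pool), and frames that stay
local can be recycled with page_pool_recycle_direct(pool, page).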

The mlx5 driver evolves into using the page_pool, and sees a performance
increase (with ndo_xdp_xmit out the ixgbe driver) from 6 Mpps to 12 Mpps.

The patchset stops at 16 patches (one over the limit), but more API
changes are planned, specifically extending the ndo_xdp_xmit and
xdp_return_frame APIs to support bulking, as this will address some
known limits.

V2: Updated according to Tariq's feedback
V3: Updated based on feedback from Jason Wang and Alex Duyck
V4: Updated based on feedback from Tariq and Jason
V5: Fix SPDX license, add Tariq's reviews, improve patch desc for perf test
V6: Updated based on feedback from Eric Dumazet and Alex Duyck
V7: Adapt to i40e that got XDP_REDIRECT support in-between
V8:
 Updated based on feedback from the kbuild test robot, and adjusted for mlx5 changes
 page_pool is only compiled into the kernel when a driver's Kconfig selects the feature
V9:
 Remove some inline statements, let compiler decide what to inline
 Fix return value in virtio_net driver
 Adjust for mlx5 changes in-between submissions
V10:
 Minor adjust for mlx5 requested by Tariq
 Resubmit against net-next
V11: avoid leaking info stored in frame data on page reuse
====================
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 897ddc24 6dfb970d
...@@ -638,7 +638,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring, ...@@ -638,7 +638,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB) if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
kfree(tx_buffer->raw_buf); kfree(tx_buffer->raw_buf);
else if (ring_is_xdp(ring)) else if (ring_is_xdp(ring))
page_frag_free(tx_buffer->raw_buf); xdp_return_frame(tx_buffer->xdpf);
else else
dev_kfree_skb_any(tx_buffer->skb); dev_kfree_skb_any(tx_buffer->skb);
if (dma_unmap_len(tx_buffer, len)) if (dma_unmap_len(tx_buffer, len))
...@@ -841,7 +841,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi, ...@@ -841,7 +841,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
/* free the skb/XDP data */ /* free the skb/XDP data */
if (ring_is_xdp(tx_ring)) if (ring_is_xdp(tx_ring))
page_frag_free(tx_buf->raw_buf); xdp_return_frame(tx_buf->xdpf);
else else
napi_consume_skb(tx_buf->skb, napi_budget); napi_consume_skb(tx_buf->skb, napi_budget);
...@@ -2203,9 +2203,20 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring, ...@@ -2203,9 +2203,20 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
#define I40E_XDP_CONSUMED 1 #define I40E_XDP_CONSUMED 1
#define I40E_XDP_TX 2 #define I40E_XDP_TX 2
static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
struct i40e_ring *xdp_ring); struct i40e_ring *xdp_ring);
static int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp,
struct i40e_ring *xdp_ring)
{
struct xdp_frame *xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdpf))
return I40E_XDP_CONSUMED;
return i40e_xmit_xdp_ring(xdpf, xdp_ring);
}
/** /**
* i40e_run_xdp - run an XDP program * i40e_run_xdp - run an XDP program
* @rx_ring: Rx ring being processed * @rx_ring: Rx ring being processed
...@@ -2225,13 +2236,15 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring, ...@@ -2225,13 +2236,15 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
if (!xdp_prog) if (!xdp_prog)
goto xdp_out; goto xdp_out;
prefetchw(xdp->data_hard_start); /* xdp_frame write */
act = bpf_prog_run_xdp(xdp_prog, xdp); act = bpf_prog_run_xdp(xdp_prog, xdp);
switch (act) { switch (act) {
case XDP_PASS: case XDP_PASS:
break; break;
case XDP_TX: case XDP_TX:
xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index]; xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
result = i40e_xmit_xdp_ring(xdp, xdp_ring); result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
break; break;
case XDP_REDIRECT: case XDP_REDIRECT:
err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
...@@ -3478,13 +3491,13 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb, ...@@ -3478,13 +3491,13 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
* @xdp: data to transmit * @xdp: data to transmit
* @xdp_ring: XDP Tx ring * @xdp_ring: XDP Tx ring
**/ **/
static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
struct i40e_ring *xdp_ring) struct i40e_ring *xdp_ring)
{ {
u32 size = xdp->data_end - xdp->data;
u16 i = xdp_ring->next_to_use; u16 i = xdp_ring->next_to_use;
struct i40e_tx_buffer *tx_bi; struct i40e_tx_buffer *tx_bi;
struct i40e_tx_desc *tx_desc; struct i40e_tx_desc *tx_desc;
u32 size = xdpf->len;
dma_addr_t dma; dma_addr_t dma;
if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) { if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
...@@ -3492,14 +3505,14 @@ static int i40e_xmit_xdp_ring(struct xdp_buff *xdp, ...@@ -3492,14 +3505,14 @@ static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
return I40E_XDP_CONSUMED; return I40E_XDP_CONSUMED;
} }
dma = dma_map_single(xdp_ring->dev, xdp->data, size, DMA_TO_DEVICE); dma = dma_map_single(xdp_ring->dev, xdpf->data, size, DMA_TO_DEVICE);
if (dma_mapping_error(xdp_ring->dev, dma)) if (dma_mapping_error(xdp_ring->dev, dma))
return I40E_XDP_CONSUMED; return I40E_XDP_CONSUMED;
tx_bi = &xdp_ring->tx_bi[i]; tx_bi = &xdp_ring->tx_bi[i];
tx_bi->bytecount = size; tx_bi->bytecount = size;
tx_bi->gso_segs = 1; tx_bi->gso_segs = 1;
tx_bi->raw_buf = xdp->data; tx_bi->xdpf = xdpf;
/* record length, and DMA address */ /* record length, and DMA address */
dma_unmap_len_set(tx_bi, len, size); dma_unmap_len_set(tx_bi, len, size);
...@@ -3675,7 +3688,7 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev) ...@@ -3675,7 +3688,7 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
* *
* Returns Zero if sent, else an error code * Returns Zero if sent, else an error code
**/ **/
int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
{ {
struct i40e_netdev_priv *np = netdev_priv(dev); struct i40e_netdev_priv *np = netdev_priv(dev);
unsigned int queue_index = smp_processor_id(); unsigned int queue_index = smp_processor_id();
...@@ -3688,7 +3701,7 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) ...@@ -3688,7 +3701,7 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs) if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
return -ENXIO; return -ENXIO;
err = i40e_xmit_xdp_ring(xdp, vsi->xdp_rings[queue_index]); err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
if (err != I40E_XDP_TX) if (err != I40E_XDP_TX)
return -ENOSPC; return -ENOSPC;
......
...@@ -306,6 +306,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size) ...@@ -306,6 +306,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
struct i40e_tx_buffer { struct i40e_tx_buffer {
struct i40e_tx_desc *next_to_watch; struct i40e_tx_desc *next_to_watch;
union { union {
struct xdp_frame *xdpf;
struct sk_buff *skb; struct sk_buff *skb;
void *raw_buf; void *raw_buf;
}; };
...@@ -510,7 +511,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw); ...@@ -510,7 +511,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
void i40e_detect_recover_hung(struct i40e_vsi *vsi); void i40e_detect_recover_hung(struct i40e_vsi *vsi);
int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size); int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
bool __i40e_chk_linearize(struct sk_buff *skb); bool __i40e_chk_linearize(struct sk_buff *skb);
int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp); int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
void i40e_xdp_flush(struct net_device *dev); void i40e_xdp_flush(struct net_device *dev);
/** /**
......
...@@ -241,8 +241,7 @@ struct ixgbe_tx_buffer { ...@@ -241,8 +241,7 @@ struct ixgbe_tx_buffer {
unsigned long time_stamp; unsigned long time_stamp;
union { union {
struct sk_buff *skb; struct sk_buff *skb;
/* XDP uses address ptr on irq_clean */ struct xdp_frame *xdpf;
void *data;
}; };
unsigned int bytecount; unsigned int bytecount;
unsigned short gso_segs; unsigned short gso_segs;
......
...@@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, ...@@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
/* free the skb */ /* free the skb */
if (ring_is_xdp(tx_ring)) if (ring_is_xdp(tx_ring))
page_frag_free(tx_buffer->data); xdp_return_frame(tx_buffer->xdpf);
else else
napi_consume_skb(tx_buffer->skb, napi_budget); napi_consume_skb(tx_buffer->skb, napi_budget);
...@@ -2262,7 +2262,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring, ...@@ -2262,7 +2262,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
#define IXGBE_XDP_TX 2 #define IXGBE_XDP_TX 2
static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
struct xdp_buff *xdp); struct xdp_frame *xdpf);
static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
struct ixgbe_ring *rx_ring, struct ixgbe_ring *rx_ring,
...@@ -2270,6 +2270,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, ...@@ -2270,6 +2270,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
{ {
int err, result = IXGBE_XDP_PASS; int err, result = IXGBE_XDP_PASS;
struct bpf_prog *xdp_prog; struct bpf_prog *xdp_prog;
struct xdp_frame *xdpf;
u32 act; u32 act;
rcu_read_lock(); rcu_read_lock();
...@@ -2278,12 +2279,19 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, ...@@ -2278,12 +2279,19 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
if (!xdp_prog) if (!xdp_prog)
goto xdp_out; goto xdp_out;
prefetchw(xdp->data_hard_start); /* xdp_frame write */
act = bpf_prog_run_xdp(xdp_prog, xdp); act = bpf_prog_run_xdp(xdp_prog, xdp);
switch (act) { switch (act) {
case XDP_PASS: case XDP_PASS:
break; break;
case XDP_TX: case XDP_TX:
result = ixgbe_xmit_xdp_ring(adapter, xdp); xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdpf)) {
result = IXGBE_XDP_CONSUMED;
break;
}
result = ixgbe_xmit_xdp_ring(adapter, xdpf);
break; break;
case XDP_REDIRECT: case XDP_REDIRECT:
err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog); err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
...@@ -5797,7 +5805,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring) ...@@ -5797,7 +5805,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
/* Free all the Tx ring sk_buffs */ /* Free all the Tx ring sk_buffs */
if (ring_is_xdp(tx_ring)) if (ring_is_xdp(tx_ring))
page_frag_free(tx_buffer->data); xdp_return_frame(tx_buffer->xdpf);
else else
dev_kfree_skb_any(tx_buffer->skb); dev_kfree_skb_any(tx_buffer->skb);
...@@ -6370,7 +6378,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, ...@@ -6370,7 +6378,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
struct device *dev = rx_ring->dev; struct device *dev = rx_ring->dev;
int orig_node = dev_to_node(dev); int orig_node = dev_to_node(dev);
int ring_node = -1; int ring_node = -1;
int size; int size, err;
size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count; size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
...@@ -6407,6 +6415,13 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, ...@@ -6407,6 +6415,13 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
rx_ring->queue_index) < 0) rx_ring->queue_index) < 0)
goto err; goto err;
err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq,
MEM_TYPE_PAGE_SHARED, NULL);
if (err) {
xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
goto err;
}
rx_ring->xdp_prog = adapter->xdp_prog; rx_ring->xdp_prog = adapter->xdp_prog;
return 0; return 0;
...@@ -8336,7 +8351,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, ...@@ -8336,7 +8351,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
} }
static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
struct xdp_buff *xdp) struct xdp_frame *xdpf)
{ {
struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()]; struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];
struct ixgbe_tx_buffer *tx_buffer; struct ixgbe_tx_buffer *tx_buffer;
...@@ -8345,12 +8360,12 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, ...@@ -8345,12 +8360,12 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
dma_addr_t dma; dma_addr_t dma;
u16 i; u16 i;
len = xdp->data_end - xdp->data; len = xdpf->len;
if (unlikely(!ixgbe_desc_unused(ring))) if (unlikely(!ixgbe_desc_unused(ring)))
return IXGBE_XDP_CONSUMED; return IXGBE_XDP_CONSUMED;
dma = dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE); dma = dma_map_single(ring->dev, xdpf->data, len, DMA_TO_DEVICE);
if (dma_mapping_error(ring->dev, dma)) if (dma_mapping_error(ring->dev, dma))
return IXGBE_XDP_CONSUMED; return IXGBE_XDP_CONSUMED;
...@@ -8365,7 +8380,8 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter, ...@@ -8365,7 +8380,8 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
dma_unmap_len_set(tx_buffer, len, len); dma_unmap_len_set(tx_buffer, len, len);
dma_unmap_addr_set(tx_buffer, dma, dma); dma_unmap_addr_set(tx_buffer, dma, dma);
tx_buffer->data = xdp->data; tx_buffer->xdpf = xdpf;
tx_desc->read.buffer_addr = cpu_to_le64(dma); tx_desc->read.buffer_addr = cpu_to_le64(dma);
/* put descriptor type bits */ /* put descriptor type bits */
...@@ -9996,7 +10012,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) ...@@ -9996,7 +10012,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
} }
} }
static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
{ {
struct ixgbe_adapter *adapter = netdev_priv(dev); struct ixgbe_adapter *adapter = netdev_priv(dev);
struct ixgbe_ring *ring; struct ixgbe_ring *ring;
...@@ -10012,7 +10028,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) ...@@ -10012,7 +10028,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
if (unlikely(!ring)) if (unlikely(!ring))
return -ENXIO; return -ENXIO;
err = ixgbe_xmit_xdp_ring(adapter, xdp); err = ixgbe_xmit_xdp_ring(adapter, xdpf);
if (err != IXGBE_XDP_TX) if (err != IXGBE_XDP_TX)
return -ENOSPC; return -ENOSPC;
......
...@@ -30,6 +30,7 @@ config MLX5_CORE_EN ...@@ -30,6 +30,7 @@ config MLX5_CORE_EN
bool "Mellanox Technologies ConnectX-4 Ethernet support" bool "Mellanox Technologies ConnectX-4 Ethernet support"
depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE
depends on IPV6=y || IPV6=n || MLX5_CORE=m depends on IPV6=y || IPV6=n || MLX5_CORE=m
select PAGE_POOL
default n default n
---help--- ---help---
Ethernet support in Mellanox Technologies ConnectX-4 NIC. Ethernet support in Mellanox Technologies ConnectX-4 NIC.
......
...@@ -53,6 +53,8 @@ ...@@ -53,6 +53,8 @@
#include "mlx5_core.h" #include "mlx5_core.h"
#include "en_stats.h" #include "en_stats.h"
struct page_pool;
#define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v) #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
#define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN) #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
...@@ -392,6 +394,7 @@ struct mlx5e_xdpsq { ...@@ -392,6 +394,7 @@ struct mlx5e_xdpsq {
struct { struct {
struct mlx5e_dma_info *di; struct mlx5e_dma_info *di;
bool doorbell; bool doorbell;
bool redirect_flush;
} db; } db;
/* read only */ /* read only */
...@@ -533,6 +536,7 @@ struct mlx5e_rq { ...@@ -533,6 +536,7 @@ struct mlx5e_rq {
unsigned int hw_mtu; unsigned int hw_mtu;
struct mlx5e_xdpsq xdpsq; struct mlx5e_xdpsq xdpsq;
DECLARE_BITMAP(flags, 8); DECLARE_BITMAP(flags, 8);
struct page_pool *page_pool;
/* control */ /* control */
struct mlx5_wq_ctrl wq_ctrl; struct mlx5_wq_ctrl wq_ctrl;
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include <linux/mlx5/fs.h> #include <linux/mlx5/fs.h>
#include <net/vxlan.h> #include <net/vxlan.h>
#include <linux/bpf.h> #include <linux/bpf.h>
#include <net/page_pool.h>
#include "eswitch.h" #include "eswitch.h"
#include "en.h" #include "en.h"
#include "en_tc.h" #include "en_tc.h"
...@@ -389,10 +390,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, ...@@ -389,10 +390,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
struct mlx5e_rq_param *rqp, struct mlx5e_rq_param *rqp,
struct mlx5e_rq *rq) struct mlx5e_rq *rq)
{ {
struct page_pool_params pp_params = { 0 };
struct mlx5_core_dev *mdev = c->mdev; struct mlx5_core_dev *mdev = c->mdev;
void *rqc = rqp->rqc; void *rqc = rqp->rqc;
void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
u32 byte_count; u32 byte_count, pool_size;
int npages; int npages;
int wq_sz; int wq_sz;
int err; int err;
...@@ -432,9 +434,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, ...@@ -432,9 +434,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params); rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params);
pool_size = 1 << params->log_rq_mtu_frames;
switch (rq->wq_type) { switch (rq->wq_type) {
case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
rq->post_wqes = mlx5e_post_rx_mpwqes; rq->post_wqes = mlx5e_post_rx_mpwqes;
rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe; rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
...@@ -512,6 +517,32 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, ...@@ -512,6 +517,32 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq->mkey_be = c->mkey_be; rq->mkey_be = c->mkey_be;
} }
/* Create a page_pool and register it with rxq */
pp_params.order = rq->buff.page_order;
pp_params.flags = 0; /* No-internal DMA mapping in page_pool */
pp_params.pool_size = pool_size;
pp_params.nid = cpu_to_node(c->cpu);
pp_params.dev = c->pdev;
pp_params.dma_dir = rq->buff.map_dir;
/* page_pool can be used even when there is no rq->xdp_prog,
* given page_pool does not handle DMA mapping, there is no
* required state to clear. And page_pool gracefully handles
* an elevated refcnt.
*/
rq->page_pool = page_pool_create(&pp_params);
if (IS_ERR(rq->page_pool)) {
if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
kfree(rq->wqe.frag_info);
err = PTR_ERR(rq->page_pool);
rq->page_pool = NULL;
goto err_rq_wq_destroy;
}
err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
MEM_TYPE_PAGE_POOL, rq->page_pool);
if (err)
goto err_rq_wq_destroy;
for (i = 0; i < wq_sz; i++) { for (i = 0; i < wq_sz; i++) {
struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i); struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
...@@ -548,6 +579,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, ...@@ -548,6 +579,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
if (rq->xdp_prog) if (rq->xdp_prog)
bpf_prog_put(rq->xdp_prog); bpf_prog_put(rq->xdp_prog);
xdp_rxq_info_unreg(&rq->xdp_rxq); xdp_rxq_info_unreg(&rq->xdp_rxq);
if (rq->page_pool)
page_pool_destroy(rq->page_pool);
mlx5_wq_destroy(&rq->wq_ctrl); mlx5_wq_destroy(&rq->wq_ctrl);
return err; return err;
...@@ -561,6 +594,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq) ...@@ -561,6 +594,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
bpf_prog_put(rq->xdp_prog); bpf_prog_put(rq->xdp_prog);
xdp_rxq_info_unreg(&rq->xdp_rxq); xdp_rxq_info_unreg(&rq->xdp_rxq);
if (rq->page_pool)
page_pool_destroy(rq->page_pool);
switch (rq->wq_type) { switch (rq->wq_type) {
case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
......
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#include <linux/bpf_trace.h> #include <linux/bpf_trace.h>
#include <net/busy_poll.h> #include <net/busy_poll.h>
#include <net/ip6_checksum.h> #include <net/ip6_checksum.h>
#include <net/page_pool.h>
#include "en.h" #include "en.h"
#include "en_tc.h" #include "en_tc.h"
#include "eswitch.h" #include "eswitch.h"
...@@ -221,7 +222,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq, ...@@ -221,7 +222,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
if (mlx5e_rx_cache_get(rq, dma_info)) if (mlx5e_rx_cache_get(rq, dma_info))
return 0; return 0;
dma_info->page = dev_alloc_pages(rq->buff.page_order); dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
if (unlikely(!dma_info->page)) if (unlikely(!dma_info->page))
return -ENOMEM; return -ENOMEM;
...@@ -236,15 +237,26 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq, ...@@ -236,15 +237,26 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
return 0; return 0;
} }
static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info)
{
dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
rq->buff.map_dir);
}
void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info, void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
bool recycle) bool recycle)
{ {
if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info)) if (likely(recycle)) {
if (mlx5e_rx_cache_put(rq, dma_info))
return; return;
dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq), mlx5e_page_dma_unmap(rq, dma_info);
rq->buff.map_dir); page_pool_recycle_direct(rq->page_pool, dma_info->page);
} else {
mlx5e_page_dma_unmap(rq, dma_info);
put_page(dma_info->page); put_page(dma_info->page);
}
} }
static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq, static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
...@@ -800,9 +812,10 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, ...@@ -800,9 +812,10 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
struct mlx5e_dma_info *di, struct mlx5e_dma_info *di,
void *va, u16 *rx_headroom, u32 *len) void *va, u16 *rx_headroom, u32 *len)
{ {
const struct bpf_prog *prog = READ_ONCE(rq->xdp_prog); struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
struct xdp_buff xdp; struct xdp_buff xdp;
u32 act; u32 act;
int err;
if (!prog) if (!prog)
return false; return false;
...@@ -823,6 +836,15 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq, ...@@ -823,6 +836,15 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp))) if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp)))
trace_xdp_exception(rq->netdev, prog, act); trace_xdp_exception(rq->netdev, prog, act);
return true; return true;
case XDP_REDIRECT:
/* When XDP is enabled then page-refcnt==1 here */
err = xdp_do_redirect(rq->netdev, &xdp, prog);
if (!err) {
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
rq->xdpsq.db.redirect_flush = true;
mlx5e_page_dma_unmap(rq, di);
}
return true;
default: default:
bpf_warn_invalid_xdp_action(act); bpf_warn_invalid_xdp_action(act);
case XDP_ABORTED: case XDP_ABORTED:
...@@ -868,6 +890,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, ...@@ -868,6 +890,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
dma_sync_single_range_for_cpu(rq->pdev, di->addr, wi->offset, dma_sync_single_range_for_cpu(rq->pdev, di->addr, wi->offset,
frag_size, DMA_FROM_DEVICE); frag_size, DMA_FROM_DEVICE);
prefetchw(va); /* xdp_frame data area */
prefetch(data); prefetch(data);
wi->offset += frag_size; wi->offset += frag_size;
...@@ -1140,6 +1163,11 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) ...@@ -1140,6 +1163,11 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
xdpsq->db.doorbell = false; xdpsq->db.doorbell = false;
} }
if (xdpsq->db.redirect_flush) {
xdp_do_flush_map();
xdpsq->db.redirect_flush = false;
}
mlx5_cqwq_update_db_record(&cq->wq); mlx5_cqwq_update_db_record(&cq->wq);
/* ensure cq space is freed before enabling more cqes */ /* ensure cq space is freed before enabling more cqes */
......
...@@ -248,11 +248,11 @@ struct veth { ...@@ -248,11 +248,11 @@ struct veth {
__be16 h_vlan_TCI; __be16 h_vlan_TCI;
}; };
bool tun_is_xdp_buff(void *ptr) bool tun_is_xdp_frame(void *ptr)
{ {
return (unsigned long)ptr & TUN_XDP_FLAG; return (unsigned long)ptr & TUN_XDP_FLAG;
} }
EXPORT_SYMBOL(tun_is_xdp_buff); EXPORT_SYMBOL(tun_is_xdp_frame);
void *tun_xdp_to_ptr(void *ptr) void *tun_xdp_to_ptr(void *ptr)
{ {
...@@ -660,10 +660,10 @@ void tun_ptr_free(void *ptr) ...@@ -660,10 +660,10 @@ void tun_ptr_free(void *ptr)
{ {
if (!ptr) if (!ptr)
return; return;
if (tun_is_xdp_buff(ptr)) { if (tun_is_xdp_frame(ptr)) {
struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
put_page(virt_to_head_page(xdp->data)); xdp_return_frame(xdpf);
} else { } else {
__skb_array_destroy_skb(ptr); __skb_array_destroy_skb(ptr);
} }
...@@ -854,6 +854,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file, ...@@ -854,6 +854,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
tun->dev, tfile->queue_index); tun->dev, tfile->queue_index);
if (err < 0) if (err < 0)
goto out; goto out;
err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
MEM_TYPE_PAGE_SHARED, NULL);
if (err < 0) {
xdp_rxq_info_unreg(&tfile->xdp_rxq);
goto out;
}
err = 0; err = 0;
} }
...@@ -1295,21 +1301,13 @@ static const struct net_device_ops tun_netdev_ops = { ...@@ -1295,21 +1301,13 @@ static const struct net_device_ops tun_netdev_ops = {
.ndo_get_stats64 = tun_net_get_stats64, .ndo_get_stats64 = tun_net_get_stats64,
}; };
static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
{ {
struct tun_struct *tun = netdev_priv(dev); struct tun_struct *tun = netdev_priv(dev);
struct xdp_buff *buff = xdp->data_hard_start;
int headroom = xdp->data - xdp->data_hard_start;
struct tun_file *tfile; struct tun_file *tfile;
u32 numqueues; u32 numqueues;
int ret = 0; int ret = 0;
/* Assure headroom is available and buff is properly aligned */
if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp)))
return -ENOSPC;
*buff = *xdp;
rcu_read_lock(); rcu_read_lock();
numqueues = READ_ONCE(tun->numqueues); numqueues = READ_ONCE(tun->numqueues);
...@@ -1323,7 +1321,7 @@ static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) ...@@ -1323,7 +1321,7 @@ static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
/* Encode the XDP flag into lowest bit for consumer to differ /* Encode the XDP flag into lowest bit for consumer to differ
* XDP buffer from sk_buff. * XDP buffer from sk_buff.
*/ */
if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) { if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
this_cpu_inc(tun->pcpu_stats->tx_dropped); this_cpu_inc(tun->pcpu_stats->tx_dropped);
ret = -ENOSPC; ret = -ENOSPC;
} }
...@@ -1333,6 +1331,16 @@ static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) ...@@ -1333,6 +1331,16 @@ static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
return ret; return ret;
} }
static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
{
struct xdp_frame *frame = convert_to_xdp_frame(xdp);
if (unlikely(!frame))
return -EOVERFLOW;
return tun_xdp_xmit(dev, frame);
}
static void tun_xdp_flush(struct net_device *dev) static void tun_xdp_flush(struct net_device *dev)
{ {
struct tun_struct *tun = netdev_priv(dev); struct tun_struct *tun = netdev_priv(dev);
...@@ -1680,7 +1688,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, ...@@ -1680,7 +1688,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
case XDP_TX: case XDP_TX:
get_page(alloc_frag->page); get_page(alloc_frag->page);
alloc_frag->offset += buflen; alloc_frag->offset += buflen;
if (tun_xdp_xmit(tun->dev, &xdp)) if (tun_xdp_tx(tun->dev, &xdp))
goto err_redirect; goto err_redirect;
tun_xdp_flush(tun->dev); tun_xdp_flush(tun->dev);
rcu_read_unlock(); rcu_read_unlock();
...@@ -2001,11 +2009,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -2001,11 +2009,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t tun_put_user_xdp(struct tun_struct *tun, static ssize_t tun_put_user_xdp(struct tun_struct *tun,
struct tun_file *tfile, struct tun_file *tfile,
struct xdp_buff *xdp, struct xdp_frame *xdp_frame,
struct iov_iter *iter) struct iov_iter *iter)
{ {
int vnet_hdr_sz = 0; int vnet_hdr_sz = 0;
size_t size = xdp->data_end - xdp->data; size_t size = xdp_frame->len;
struct tun_pcpu_stats *stats; struct tun_pcpu_stats *stats;
size_t ret; size_t ret;
...@@ -2021,7 +2029,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun, ...@@ -2021,7 +2029,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso)); iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
} }
ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz; ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
stats = get_cpu_ptr(tun->pcpu_stats); stats = get_cpu_ptr(tun->pcpu_stats);
u64_stats_update_begin(&stats->syncp); u64_stats_update_begin(&stats->syncp);
...@@ -2189,11 +2197,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, ...@@ -2189,11 +2197,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
return err; return err;
} }
if (tun_is_xdp_buff(ptr)) { if (tun_is_xdp_frame(ptr)) {
struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
ret = tun_put_user_xdp(tun, tfile, xdp, to); ret = tun_put_user_xdp(tun, tfile, xdpf, to);
put_page(virt_to_head_page(xdp->data)); xdp_return_frame(xdpf);
} else { } else {
struct sk_buff *skb = ptr; struct sk_buff *skb = ptr;
...@@ -2432,10 +2440,10 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, ...@@ -2432,10 +2440,10 @@ static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
static int tun_ptr_peek_len(void *ptr) static int tun_ptr_peek_len(void *ptr)
{ {
if (likely(ptr)) { if (likely(ptr)) {
if (tun_is_xdp_buff(ptr)) { if (tun_is_xdp_frame(ptr)) {
struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
return xdp->data_end - xdp->data; return xdpf->len;
} }
return __skb_array_len_with_tag(ptr); return __skb_array_len_with_tag(ptr);
} else { } else {
......
...@@ -415,46 +415,51 @@ static void virtnet_xdp_flush(struct net_device *dev) ...@@ -415,46 +415,51 @@ static void virtnet_xdp_flush(struct net_device *dev)
virtqueue_kick(sq->vq); virtqueue_kick(sq->vq);
} }
static bool __virtnet_xdp_xmit(struct virtnet_info *vi, static int __virtnet_xdp_xmit(struct virtnet_info *vi,
struct xdp_buff *xdp) struct xdp_frame *xdpf)
{ {
struct virtio_net_hdr_mrg_rxbuf *hdr; struct virtio_net_hdr_mrg_rxbuf *hdr;
unsigned int len; struct xdp_frame *xdpf_sent;
struct send_queue *sq; struct send_queue *sq;
unsigned int len;
unsigned int qp; unsigned int qp;
void *xdp_sent;
int err; int err;
qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
sq = &vi->sq[qp]; sq = &vi->sq[qp];
/* Free up any pending old buffers before queueing new ones. */ /* Free up any pending old buffers before queueing new ones. */
while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) { while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
struct page *sent_page = virt_to_head_page(xdp_sent); xdp_return_frame(xdpf_sent);
put_page(sent_page); /* virtqueue want to use data area in-front of packet */
} if (unlikely(xdpf->metasize > 0))
return -EOPNOTSUPP;
xdp->data -= vi->hdr_len; if (unlikely(xdpf->headroom < vi->hdr_len))
return -EOVERFLOW;
/* Make room for virtqueue hdr (also change xdpf->headroom?) */
xdpf->data -= vi->hdr_len;
/* Zero header and leave csum up to XDP layers */ /* Zero header and leave csum up to XDP layers */
hdr = xdp->data; hdr = xdpf->data;
memset(hdr, 0, vi->hdr_len); memset(hdr, 0, vi->hdr_len);
xdpf->len += vi->hdr_len;
sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data); sg_init_one(sq->sg, xdpf->data, xdpf->len);
err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC); err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdpf, GFP_ATOMIC);
if (unlikely(err)) if (unlikely(err))
return false; /* Caller handle free/refcnt */ return -ENOSPC; /* Caller handle free/refcnt */
return true; return 0;
} }
static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
{ {
struct virtnet_info *vi = netdev_priv(dev); struct virtnet_info *vi = netdev_priv(dev);
struct receive_queue *rq = vi->rq; struct receive_queue *rq = vi->rq;
struct bpf_prog *xdp_prog; struct bpf_prog *xdp_prog;
bool sent;
/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this /* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
* indicate XDP resources have been successfully allocated. * indicate XDP resources have been successfully allocated.
...@@ -463,10 +468,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp) ...@@ -463,10 +468,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
if (!xdp_prog) if (!xdp_prog)
return -ENXIO; return -ENXIO;
sent = __virtnet_xdp_xmit(vi, xdp); return __virtnet_xdp_xmit(vi, xdpf);
if (!sent)
return -ENOSPC;
return 0;
} }
static unsigned int virtnet_get_headroom(struct virtnet_info *vi) static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
...@@ -555,7 +557,6 @@ static struct sk_buff *receive_small(struct net_device *dev, ...@@ -555,7 +557,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
struct page *page = virt_to_head_page(buf); struct page *page = virt_to_head_page(buf);
unsigned int delta = 0; unsigned int delta = 0;
struct page *xdp_page; struct page *xdp_page;
bool sent;
int err; int err;
len -= vi->hdr_len; len -= vi->hdr_len;
...@@ -564,6 +565,7 @@ static struct sk_buff *receive_small(struct net_device *dev, ...@@ -564,6 +565,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
xdp_prog = rcu_dereference(rq->xdp_prog); xdp_prog = rcu_dereference(rq->xdp_prog);
if (xdp_prog) { if (xdp_prog) {
struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset; struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
struct xdp_frame *xdpf;
struct xdp_buff xdp; struct xdp_buff xdp;
void *orig_data; void *orig_data;
u32 act; u32 act;
...@@ -606,8 +608,11 @@ static struct sk_buff *receive_small(struct net_device *dev, ...@@ -606,8 +608,11 @@ static struct sk_buff *receive_small(struct net_device *dev,
delta = orig_data - xdp.data; delta = orig_data - xdp.data;
break; break;
case XDP_TX: case XDP_TX:
sent = __virtnet_xdp_xmit(vi, &xdp); xdpf = convert_to_xdp_frame(&xdp);
if (unlikely(!sent)) { if (unlikely(!xdpf))
goto err_xdp;
err = __virtnet_xdp_xmit(vi, xdpf);
if (unlikely(err)) {
trace_xdp_exception(vi->dev, xdp_prog, act); trace_xdp_exception(vi->dev, xdp_prog, act);
goto err_xdp; goto err_xdp;
} }
...@@ -690,7 +695,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, ...@@ -690,7 +695,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
struct bpf_prog *xdp_prog; struct bpf_prog *xdp_prog;
unsigned int truesize; unsigned int truesize;
unsigned int headroom = mergeable_ctx_to_headroom(ctx); unsigned int headroom = mergeable_ctx_to_headroom(ctx);
bool sent;
int err; int err;
head_skb = NULL; head_skb = NULL;
...@@ -698,6 +702,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, ...@@ -698,6 +702,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
rcu_read_lock(); rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog); xdp_prog = rcu_dereference(rq->xdp_prog);
if (xdp_prog) { if (xdp_prog) {
struct xdp_frame *xdpf;
struct page *xdp_page; struct page *xdp_page;
struct xdp_buff xdp; struct xdp_buff xdp;
void *data; void *data;
...@@ -762,8 +767,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, ...@@ -762,8 +767,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
} }
break; break;
case XDP_TX: case XDP_TX:
sent = __virtnet_xdp_xmit(vi, &xdp); xdpf = convert_to_xdp_frame(&xdp);
if (unlikely(!sent)) { if (unlikely(!xdpf))
goto err_xdp;
err = __virtnet_xdp_xmit(vi, xdpf);
if (unlikely(err)) {
trace_xdp_exception(vi->dev, xdp_prog, act); trace_xdp_exception(vi->dev, xdp_prog, act);
if (unlikely(xdp_page != page)) if (unlikely(xdp_page != page))
put_page(xdp_page); put_page(xdp_page);
...@@ -1301,6 +1309,13 @@ static int virtnet_open(struct net_device *dev) ...@@ -1301,6 +1309,13 @@ static int virtnet_open(struct net_device *dev)
if (err < 0) if (err < 0)
return err; return err;
err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
MEM_TYPE_PAGE_SHARED, NULL);
if (err < 0) {
xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
return err;
}
virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi); virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi); virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
} }
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include <linux/skbuff.h> #include <linux/skbuff.h>
#include <net/sock.h> #include <net/sock.h>
#include <net/xdp.h>
#include "vhost.h" #include "vhost.h"
...@@ -181,10 +182,10 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq) ...@@ -181,10 +182,10 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
static int vhost_net_buf_peek_len(void *ptr) static int vhost_net_buf_peek_len(void *ptr)
{ {
if (tun_is_xdp_buff(ptr)) { if (tun_is_xdp_frame(ptr)) {
struct xdp_buff *xdp = tun_ptr_to_xdp(ptr); struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
return xdp->data_end - xdp->data; return xdpf->len;
} }
return __skb_array_len_with_tag(ptr); return __skb_array_len_with_tag(ptr);
......
...@@ -30,6 +30,7 @@ struct sock; ...@@ -30,6 +30,7 @@ struct sock;
struct seccomp_data; struct seccomp_data;
struct bpf_prog_aux; struct bpf_prog_aux;
struct xdp_rxq_info; struct xdp_rxq_info;
struct xdp_buff;
/* ArgX, context and stack frame pointer register positions. Note, /* ArgX, context and stack frame pointer register positions. Note,
* Arg1, Arg2, Arg3, etc are used as argument mappings of function * Arg1, Arg2, Arg3, etc are used as argument mappings of function
...@@ -500,14 +501,6 @@ struct bpf_skb_data_end { ...@@ -500,14 +501,6 @@ struct bpf_skb_data_end {
void *data_end; void *data_end;
}; };
struct xdp_buff {
void *data;
void *data_end;
void *data_meta;
void *data_hard_start;
struct xdp_rxq_info *rxq;
};
struct sk_msg_buff { struct sk_msg_buff {
void *data; void *data;
void *data_end; void *data_end;
...@@ -772,21 +765,6 @@ int xdp_do_redirect(struct net_device *dev, ...@@ -772,21 +765,6 @@ int xdp_do_redirect(struct net_device *dev,
struct bpf_prog *prog); struct bpf_prog *prog);
void xdp_do_flush_map(void); void xdp_do_flush_map(void);
/* Drivers not supporting XDP metadata can use this helper, which
* rejects any room expansion for metadata as a result.
*/
static __always_inline void
xdp_set_data_meta_invalid(struct xdp_buff *xdp)
{
xdp->data_meta = xdp->data + 1;
}
static __always_inline bool
xdp_data_meta_unsupported(const struct xdp_buff *xdp)
{
return unlikely(xdp->data_meta > xdp->data);
}
void bpf_warn_invalid_xdp_action(u32 act); void bpf_warn_invalid_xdp_action(u32 act);
struct sock *do_sk_redirect_map(struct sk_buff *skb); struct sock *do_sk_redirect_map(struct sk_buff *skb);
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
struct socket *tun_get_socket(struct file *); struct socket *tun_get_socket(struct file *);
struct ptr_ring *tun_get_tx_ring(struct file *file); struct ptr_ring *tun_get_tx_ring(struct file *file);
bool tun_is_xdp_buff(void *ptr); bool tun_is_xdp_frame(void *ptr);
void *tun_xdp_to_ptr(void *ptr); void *tun_xdp_to_ptr(void *ptr);
void *tun_ptr_to_xdp(void *ptr); void *tun_ptr_to_xdp(void *ptr);
void tun_ptr_free(void *ptr); void tun_ptr_free(void *ptr);
...@@ -39,7 +39,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f) ...@@ -39,7 +39,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
{ {
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
static inline bool tun_is_xdp_buff(void *ptr) static inline bool tun_is_xdp_frame(void *ptr)
{ {
return false; return false;
} }
......
...@@ -1165,7 +1165,7 @@ struct dev_ifalias { ...@@ -1165,7 +1165,7 @@ struct dev_ifalias {
* This function is used to set or query state related to XDP on the * This function is used to set or query state related to XDP on the
* netdevice and manage BPF offload. See definition of * netdevice and manage BPF offload. See definition of
* enum bpf_netdev_command for details. * enum bpf_netdev_command for details.
* int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp); * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
* This function is used to submit a XDP packet for transmit on a * This function is used to submit a XDP packet for transmit on a
* netdevice. * netdevice.
* void (*ndo_xdp_flush)(struct net_device *dev); * void (*ndo_xdp_flush)(struct net_device *dev);
...@@ -1356,7 +1356,7 @@ struct net_device_ops { ...@@ -1356,7 +1356,7 @@ struct net_device_ops {
int (*ndo_bpf)(struct net_device *dev, int (*ndo_bpf)(struct net_device *dev,
struct netdev_bpf *bpf); struct netdev_bpf *bpf);
int (*ndo_xdp_xmit)(struct net_device *dev, int (*ndo_xdp_xmit)(struct net_device *dev,
struct xdp_buff *xdp); struct xdp_frame *xdp);
void (*ndo_xdp_flush)(struct net_device *dev); void (*ndo_xdp_flush)(struct net_device *dev);
}; };
......
/* SPDX-License-Identifier: GPL-2.0
*
* page_pool.h
* Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
* Copyright (C) 2016 Red Hat, Inc.
*/
/**
* DOC: page_pool allocator
*
* This page_pool allocator is optimized for the XDP mode that
* uses one-frame-per-page, but has fallbacks that act like the
* regular page allocator APIs.
*
* Basic use involves replacing alloc_pages() calls with the
* page_pool_alloc_pages() call. Drivers should likely use
* page_pool_dev_alloc_pages() replacing dev_alloc_pages().
*
* If page_pool handles DMA mapping (using page->private), then the
* API user is responsible for invoking page_pool_put_page() once.
* In case of an elevated refcnt, the DMA state is released, assuming
* other users of the page will eventually call put_page().
*
* If no DMA mapping is done, then it acts as a shim layer that falls
* through to alloc_pages(). As no state is kept on the page, the
* regular put_page() call is sufficient.
*/
#ifndef _NET_PAGE_POOL_H
#define _NET_PAGE_POOL_H
#include <linux/mm.h> /* Needed by ptr_ring */
#include <linux/ptr_ring.h>
#include <linux/dma-direction.h>
#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */
#define PP_FLAG_ALL PP_FLAG_DMA_MAP
/*
* Fast allocation side cache array/stack
*
* The cache size and refill watermark are related to the network
* use-case. The NAPI budget is 64 packets. After a NAPI poll the RX
* ring is usually refilled and the max consumed elements will be 64,
* thus a natural max size of objects needed in the cache.
*
* Keeping room for more objects is due to the XDP_DROP use-case, as
* XDP_DROP allows the opportunity to recycle objects directly into
* this array, since it shares the same softirq/NAPI protection. If
* the cache is already full (or partly full), then the XDP_DROP
* recycles would have to take a slower code path.
*/
#define PP_ALLOC_CACHE_SIZE 128
#define PP_ALLOC_CACHE_REFILL 64
struct pp_alloc_cache {
u32 count;
void *cache[PP_ALLOC_CACHE_SIZE];
};
struct page_pool_params {
unsigned int flags;
unsigned int order;
unsigned int pool_size;
int nid; /* NUMA node id to allocate pages from */
struct device *dev; /* device, for DMA pre-mapping purposes */
enum dma_data_direction dma_dir; /* DMA mapping direction */
};
struct page_pool {
struct rcu_head rcu;
struct page_pool_params p;
/*
* Data structure for allocation side
*
* The driver's allocation side usually already performs some kind
* of resource protection. Piggyback on this protection, and
* require the driver to protect the allocation side.
*
* For NIC drivers this means allocating a page_pool per
* RX-queue, as the RX-queue is already protected by
* softirq/BH scheduling and napi_schedule. NAPI scheduling
* guarantees that a single napi_struct will only be scheduled
* on a single CPU (see napi_schedule).
*/
struct pp_alloc_cache alloc ____cacheline_aligned_in_smp;
/* Data structure for storing recycled pages.
*
* Returning/freeing pages is more complicated synchronization-wise,
* because frees can happen on remote CPUs, with no association
* with the allocation resource.
*
* Use ptr_ring, as it separates consumer and producer
* efficiently, in a way that doesn't bounce cache-lines.
*
* TODO: Implement bulk return pages into this structure.
*/
struct ptr_ring ring;
};
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
{
gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
return page_pool_alloc_pages(pool, gfp);
}
struct page_pool *page_pool_create(const struct page_pool_params *params);
void page_pool_destroy(struct page_pool *pool);
/* Never call this directly, use helpers below */
void __page_pool_put_page(struct page_pool *pool,
struct page *page, bool allow_direct);
static inline void page_pool_put_page(struct page_pool *pool, struct page *page)
{
/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
* allow registering MEM_TYPE_PAGE_POOL, but this shields the linker.
*/
#ifdef CONFIG_PAGE_POOL
__page_pool_put_page(pool, page, false);
#endif
}
/* Very limited use-cases allow recycle direct */
static inline void page_pool_recycle_direct(struct page_pool *pool,
struct page *page)
{
__page_pool_put_page(pool, page, true);
}
static inline bool is_page_pool_compiled_in(void)
{
#ifdef CONFIG_PAGE_POOL
return true;
#else
return false;
#endif
}
#endif /* _NET_PAGE_POOL_H */
...@@ -33,16 +33,99 @@ ...@@ -33,16 +33,99 @@
* also mandatory during RX-ring setup. * also mandatory during RX-ring setup.
*/ */
enum xdp_mem_type {
MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */
MEM_TYPE_PAGE_POOL,
MEM_TYPE_MAX,
};
struct xdp_mem_info {
u32 type; /* enum xdp_mem_type, but known size type */
u32 id;
};
struct page_pool;
struct xdp_rxq_info { struct xdp_rxq_info {
struct net_device *dev; struct net_device *dev;
u32 queue_index; u32 queue_index;
u32 reg_state; u32 reg_state;
struct xdp_mem_info mem;
} ____cacheline_aligned; /* perf critical, avoid false-sharing */ } ____cacheline_aligned; /* perf critical, avoid false-sharing */
struct xdp_buff {
void *data;
void *data_end;
void *data_meta;
void *data_hard_start;
struct xdp_rxq_info *rxq;
};
struct xdp_frame {
void *data;
u16 len;
u16 headroom;
u16 metasize;
/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
* while mem info is valid on remote CPU.
*/
struct xdp_mem_info mem;
struct net_device *dev_rx; /* used by cpumap */
};
/* Convert xdp_buff to xdp_frame */
static inline
struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
{
struct xdp_frame *xdp_frame;
int metasize;
int headroom;
/* Assure headroom is available for storing info */
headroom = xdp->data - xdp->data_hard_start;
metasize = xdp->data - xdp->data_meta;
metasize = metasize > 0 ? metasize : 0;
if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
return NULL;
/* Store info in top of packet */
xdp_frame = xdp->data_hard_start;
xdp_frame->data = xdp->data;
xdp_frame->len = xdp->data_end - xdp->data;
xdp_frame->headroom = headroom - sizeof(*xdp_frame);
xdp_frame->metasize = metasize;
/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
xdp_frame->mem = xdp->rxq->mem;
return xdp_frame;
}
void xdp_return_frame(struct xdp_frame *xdpf);
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
struct net_device *dev, u32 queue_index); struct net_device *dev, u32 queue_index);
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
enum xdp_mem_type type, void *allocator);
/* Drivers not supporting XDP metadata can use this helper, which
* rejects any room expansion for metadata as a result.
*/
static __always_inline void
xdp_set_data_meta_invalid(struct xdp_buff *xdp)
{
xdp->data_meta = xdp->data + 1;
}
static __always_inline bool
xdp_data_meta_unsupported(const struct xdp_buff *xdp)
{
return unlikely(xdp->data_meta > xdp->data);
}
#endif /* __LINUX_NET_XDP_H__ */ #endif /* __LINUX_NET_XDP_H__ */
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <linux/bpf.h> #include <linux/bpf.h>
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/ptr_ring.h> #include <linux/ptr_ring.h>
#include <net/xdp.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
...@@ -137,27 +138,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) ...@@ -137,27 +138,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
return ERR_PTR(err); return ERR_PTR(err);
} }
static void __cpu_map_queue_destructor(void *ptr)
{
/* The tear-down procedure should have made sure that queue is
* empty. See __cpu_map_entry_replace() and work-queue
* invoked cpu_map_kthread_stop(). Catch any broken behaviour
* gracefully and warn once.
*/
if (WARN_ON_ONCE(ptr))
page_frag_free(ptr);
}
static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
{
if (atomic_dec_and_test(&rcpu->refcnt)) {
/* The queue should be empty at this point */
ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
kfree(rcpu->queue);
kfree(rcpu);
}
}
static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
{ {
atomic_inc(&rcpu->refcnt); atomic_inc(&rcpu->refcnt);
...@@ -179,45 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work) ...@@ -179,45 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work)
kthread_stop(rcpu->kthread); kthread_stop(rcpu->kthread);
} }
/* For now, xdp_pkt is a cpumap internal data structure, with info
* carried between enqueue to dequeue. It is mapped into the top
* headroom of the packet, to avoid allocating separate mem.
*/
struct xdp_pkt {
void *data;
u16 len;
u16 headroom;
u16 metasize;
struct net_device *dev_rx;
};
/* Convert xdp_buff to xdp_pkt */
static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
{
struct xdp_pkt *xdp_pkt;
int metasize;
int headroom;
/* Assure headroom is available for storing info */
headroom = xdp->data - xdp->data_hard_start;
metasize = xdp->data - xdp->data_meta;
metasize = metasize > 0 ? metasize : 0;
if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
return NULL;
/* Store info in top of packet */
xdp_pkt = xdp->data_hard_start;
xdp_pkt->data = xdp->data;
xdp_pkt->len = xdp->data_end - xdp->data;
xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
xdp_pkt->metasize = metasize;
return xdp_pkt;
}
static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
struct xdp_pkt *xdp_pkt) struct xdp_frame *xdpf)
{ {
unsigned int frame_size; unsigned int frame_size;
void *pkt_data_start; void *pkt_data_start;
...@@ -232,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, ...@@ -232,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* would be preferred to set frame_size to 2048 or 4096 * would be preferred to set frame_size to 2048 or 4096
* depending on the driver. * depending on the driver.
* frame_size = 2048; * frame_size = 2048;
* frame_len = frame_size - sizeof(*xdp_pkt); * frame_len = frame_size - sizeof(*xdp_frame);
* *
* Instead, with info avail, skb_shared_info in placed after * Instead, with info avail, skb_shared_info in placed after
* packet len. This, unfortunately fakes the truesize. * packet len. This, unfortunately fakes the truesize.
...@@ -240,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, ...@@ -240,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* is not at a fixed memory location, with mixed length * is not at a fixed memory location, with mixed length
* packets, which is bad for cache-line hotness. * packets, which is bad for cache-line hotness.
*/ */
frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; pkt_data_start = xdpf->data - xdpf->headroom;
skb = build_skb(pkt_data_start, frame_size); skb = build_skb(pkt_data_start, frame_size);
if (!skb) if (!skb)
return NULL; return NULL;
skb_reserve(skb, xdp_pkt->headroom); skb_reserve(skb, xdpf->headroom);
__skb_put(skb, xdp_pkt->len); __skb_put(skb, xdpf->len);
if (xdp_pkt->metasize) if (xdpf->metasize)
skb_metadata_set(skb, xdp_pkt->metasize); skb_metadata_set(skb, xdpf->metasize);
/* Essential SKB info: protocol and skb->dev */ /* Essential SKB info: protocol and skb->dev */
skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); skb->protocol = eth_type_trans(skb, xdpf->dev_rx);
/* Optional SKB info, currently missing: /* Optional SKB info, currently missing:
* - HW checksum info (skb->ip_summed) * - HW checksum info (skb->ip_summed)
...@@ -265,6 +208,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, ...@@ -265,6 +208,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
return skb; return skb;
} }
static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
{
/* The tear-down procedure should have made sure that queue is
* empty. See __cpu_map_entry_replace() and work-queue
* invoked cpu_map_kthread_stop(). Catch any broken behaviour
* gracefully and warn once.
*/
struct xdp_frame *xdpf;
while ((xdpf = ptr_ring_consume(ring)))
if (WARN_ON_ONCE(xdpf))
xdp_return_frame(xdpf);
}
static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
{
if (atomic_dec_and_test(&rcpu->refcnt)) {
/* The queue should be empty at this point */
__cpu_map_ring_cleanup(rcpu->queue);
ptr_ring_cleanup(rcpu->queue, NULL);
kfree(rcpu->queue);
kfree(rcpu);
}
}
static int cpu_map_kthread_run(void *data) static int cpu_map_kthread_run(void *data)
{ {
struct bpf_cpu_map_entry *rcpu = data; struct bpf_cpu_map_entry *rcpu = data;
...@@ -278,7 +246,7 @@ static int cpu_map_kthread_run(void *data) ...@@ -278,7 +246,7 @@ static int cpu_map_kthread_run(void *data)
*/ */
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
unsigned int processed = 0, drops = 0, sched = 0; unsigned int processed = 0, drops = 0, sched = 0;
struct xdp_pkt *xdp_pkt; struct xdp_frame *xdpf;
/* Release CPU reschedule checks */ /* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) { if (__ptr_ring_empty(rcpu->queue)) {
...@@ -301,13 +269,13 @@ static int cpu_map_kthread_run(void *data) ...@@ -301,13 +269,13 @@ static int cpu_map_kthread_run(void *data)
* kthread CPU pinned. Lockless access to ptr_ring * kthread CPU pinned. Lockless access to ptr_ring
* consume side valid as no-resize allowed of queue. * consume side valid as no-resize allowed of queue.
*/ */
while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
struct sk_buff *skb; struct sk_buff *skb;
int ret; int ret;
skb = cpu_map_build_skb(rcpu, xdp_pkt); skb = cpu_map_build_skb(rcpu, xdpf);
if (!skb) { if (!skb) {
page_frag_free(xdp_pkt); xdp_return_frame(xdpf);
continue; continue;
} }
...@@ -604,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, ...@@ -604,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
spin_lock(&q->producer_lock); spin_lock(&q->producer_lock);
for (i = 0; i < bq->count; i++) { for (i = 0; i < bq->count; i++) {
void *xdp_pkt = bq->q[i]; struct xdp_frame *xdpf = bq->q[i];
int err; int err;
err = __ptr_ring_produce(q, xdp_pkt); err = __ptr_ring_produce(q, xdpf);
if (err) { if (err) {
drops++; drops++;
page_frag_free(xdp_pkt); /* Free xdp_pkt */ xdp_return_frame(xdpf);
} }
processed++; processed++;
} }
...@@ -625,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, ...@@ -625,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
/* Runs under RCU-read-side, plus in softirq under NAPI protection. /* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access. * Thus, safe percpu variable access.
*/ */
static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{ {
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
...@@ -636,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) ...@@ -636,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
* driver code invoking us to finish, due to driver * driver code invoking us to finish, due to driver
* (e.g. ixgbe) recycle tricks based on page-refcnt. * (e.g. ixgbe) recycle tricks based on page-refcnt.
* *
* Thus, incoming xdp_pkt is always queued here (else we race * Thus, incoming xdp_frame is always queued here (else we race
* with another CPU on page-refcnt and remaining driver code). * with another CPU on page-refcnt and remaining driver code).
* Queue time is very short, as driver will invoke flush * Queue time is very short, as driver will invoke flush
* operation, when completing napi->poll call. * operation, when completing napi->poll call.
*/ */
bq->q[bq->count++] = xdp_pkt; bq->q[bq->count++] = xdpf;
return 0; return 0;
} }
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
struct net_device *dev_rx) struct net_device *dev_rx)
{ {
struct xdp_pkt *xdp_pkt; struct xdp_frame *xdpf;
xdp_pkt = convert_to_xdp_pkt(xdp); xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdp_pkt)) if (unlikely(!xdpf))
return -EOVERFLOW; return -EOVERFLOW;
/* Info needed when constructing SKB on remote CPU */ /* Info needed when constructing SKB on remote CPU */
xdp_pkt->dev_rx = dev_rx; xdpf->dev_rx = dev_rx;
bq_enqueue(rcpu, xdp_pkt); bq_enqueue(rcpu, xdpf);
return 0; return 0;
} }
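cpu_map_enqueue() now depends on convert_to_xdp_frame(), added to include/net/xdp.h earlier in this series. Below is a hedged sketch of what that helper is expected to do, reconstructed from how its result is used here and from the sizeof(struct xdp_frame) headroom reservation visible in the bpf_xdp_adjust_head() hunk further down; treat it as an illustration, not the authoritative implementation.

/* Sketch: reuse the packet's own headroom at data_hard_start to carry
 * the frame descriptor, so the conversion needs no allocation.
 */
static struct xdp_frame *convert_to_xdp_frame_sketch(struct xdp_buff *xdp)
{
	struct xdp_frame *xdpf;
	int metasize;
	int headroom;

	headroom = xdp->data - xdp->data_hard_start;
	metasize = xdp->data - xdp->data_meta;
	metasize = metasize > 0 ? metasize : 0;

	/* Not enough headroom to store the descriptor: callers such as
	 * cpu_map_enqueue() map a NULL return to -EOVERFLOW.
	 */
	if (unlikely((headroom - metasize) < sizeof(*xdpf)))
		return NULL;

	xdpf = xdp->data_hard_start;
	xdpf->data = xdp->data;
	xdpf->len = xdp->data_end - xdp->data;
	xdpf->headroom = headroom - sizeof(*xdpf);
	xdpf->metasize = metasize;
	xdpf->mem = xdp->rxq->mem;	/* rxq is only valid during NAPI */

	return xdpf;
}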
......
...@@ -423,6 +423,9 @@ config MAY_USE_DEVLINK ...@@ -423,6 +423,9 @@ config MAY_USE_DEVLINK
on MAY_USE_DEVLINK to ensure they do not cause link errors when on MAY_USE_DEVLINK to ensure they do not cause link errors when
devlink is a loadable module and the driver using it is built-in. devlink is a loadable module and the driver using it is built-in.
config PAGE_POOL
bool
endif # if NET endif # if NET
# Used by archs to tell that they support BPF JIT compiler plus which flavour. # Used by archs to tell that they support BPF JIT compiler plus which flavour.
......
...@@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ ...@@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
fib_notifier.o xdp.o fib_notifier.o xdp.o
obj-y += net-sysfs.o obj-y += net-sysfs.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_NETPOLL) += netpoll.o
......
...@@ -2692,6 +2692,7 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) ...@@ -2692,6 +2692,7 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{ {
void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
unsigned long metalen = xdp_get_metalen(xdp); unsigned long metalen = xdp_get_metalen(xdp);
void *data_start = xdp->data_hard_start + metalen; void *data_start = xdp->data_hard_start + metalen;
void *data = xdp->data + offset; void *data = xdp->data + offset;
...@@ -2700,6 +2701,13 @@ BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) ...@@ -2700,6 +2701,13 @@ BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
data > xdp->data_end - ETH_HLEN)) data > xdp->data_end - ETH_HLEN))
return -EINVAL; return -EINVAL;
/* Avoid info leak, when reusing area prev used by xdp_frame */
if (data < xdp_frame_end) {
unsigned long clearlen = xdp_frame_end - data;
memset(data, 0, clearlen);
}
if (metalen) if (metalen)
memmove(xdp->data_meta + offset, memmove(xdp->data_meta + offset,
xdp->data_meta, metalen); xdp->data_meta, metalen);
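The new memset() in bpf_xdp_adjust_head() exists because the area at data_hard_start may still hold a stale struct xdp_frame from a previous use of the page (for instance after a cpumap/devmap redirect and page recycle); a program growing its headroom with a negative offset could otherwise read those bytes. The same check, restated as a standalone helper purely for clarity (the helper name is made up):

/* Layout at play (sketch):
 *
 *  data_hard_start                       data (after adjust_head)
 *  |<- sizeof(struct xdp_frame) ->|           |
 *  +------------------------------+-----------+----------------+
 *  | possibly stale xdp_frame     | headroom  | packet payload |
 *  +------------------------------+-----------+----------------+
 *        ^-- cleared whenever data moves below xdp_frame_end
 */
static void clear_stale_xdp_frame_area(struct xdp_buff *xdp, void *new_data)
{
	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);

	if (new_data < xdp_frame_end)
		memset(new_data, 0, xdp_frame_end - new_data);
}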
...@@ -2749,13 +2757,18 @@ static int __bpf_tx_xdp(struct net_device *dev, ...@@ -2749,13 +2757,18 @@ static int __bpf_tx_xdp(struct net_device *dev,
struct xdp_buff *xdp, struct xdp_buff *xdp,
u32 index) u32 index)
{ {
struct xdp_frame *xdpf;
int err; int err;
if (!dev->netdev_ops->ndo_xdp_xmit) { if (!dev->netdev_ops->ndo_xdp_xmit) {
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
if (err) if (err)
return err; return err;
dev->netdev_ops->ndo_xdp_flush(dev); dev->netdev_ops->ndo_xdp_flush(dev);
...@@ -2771,11 +2784,19 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, ...@@ -2771,11 +2784,19 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
if (map->map_type == BPF_MAP_TYPE_DEVMAP) { if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
struct net_device *dev = fwd; struct net_device *dev = fwd;
struct xdp_frame *xdpf;
if (!dev->netdev_ops->ndo_xdp_xmit) if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP; return -EOPNOTSUPP;
err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
/* TODO: move to inside map code instead, for bulk support
* err = dev_map_enqueue(dev, xdp);
*/
err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
if (err) if (err)
return err; return err;
__dev_map_insert_ctx(map, index); __dev_map_insert_ctx(map, index);
......
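With these two hunks, both the devmap path and the plain XDP_REDIRECT TX path hand a struct xdp_frame (rather than the xdp_buff) to ndo_xdp_xmit(). On the driver side that typically means the XDP TX ring stores the xdpf pointer so the completion path can hand it back via xdp_return_frame(). A hedged sketch of such a transmit hook; all mydrv_* names and the tx_buf layout are hypothetical and not part of this series:

/* Hypothetical driver: queue one xdp_frame for transmission. */
static int mydrv_ndo_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
{
	struct mydrv_tx_ring *ring = mydrv_select_xdp_ring(dev); /* hypothetical */
	dma_addr_t dma;

	dma = dma_map_single(ring->dma_dev, xdpf->data, xdpf->len,
			     DMA_TO_DEVICE);
	if (dma_mapping_error(ring->dma_dev, dma))
		return -ENOMEM;

	/* Keep the frame around; TX completion calls xdp_return_frame() */
	ring->tx_buf[ring->next_to_use].xdpf = xdpf;
	mydrv_post_tx_desc(ring, dma, xdpf->len);	/* hypothetical */
	return 0;
}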
/* SPDX-License-Identifier: GPL-2.0
*
* page_pool.c
* Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
* Copyright (C) 2016 Red Hat, Inc.
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
unsigned int ring_qsize = 1024; /* Default */
memcpy(&pool->p, params, sizeof(pool->p));
/* Validate only known flags were used */
if (pool->p.flags & ~(PP_FLAG_ALL))
return -EINVAL;
if (pool->p.pool_size)
ring_qsize = pool->p.pool_size;
/* Sanity limit mem that can be pinned down */
if (ring_qsize > 32768)
return -E2BIG;
/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
* which is the XDP_TX use-case.
*/
if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
(pool->p.dma_dir != DMA_BIDIRECTIONAL))
return -EINVAL;
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;
return 0;
}
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
struct page_pool *pool;
int err = 0;
pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
if (!pool)
return ERR_PTR(-ENOMEM);
err = page_pool_init(pool, params);
if (err < 0) {
pr_warn("%s() gave up with errno %d\n", __func__, err);
kfree(pool);
return ERR_PTR(err);
}
return pool;
}
EXPORT_SYMBOL(page_pool_create);
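A hedged example of creating a pool from a driver's RX-queue setup path. The page_pool_params field names follow their use in page_pool_init() above; the surrounding mydrv_* context is hypothetical:

#include <net/page_pool.h>

/* Hypothetical RX-queue setup: one pool per RX ring, order-0 pages,
 * DMA-mapped by the pool itself (PP_FLAG_DMA_MAP).
 */
static struct page_pool *mydrv_create_rx_pool(struct device *dma_dev,
					      int numa_node, u32 ring_size)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP,
		.order		= 0,
		.pool_size	= ring_size,	/* elements in recycle ring */
		.nid		= numa_node,
		.dev		= dma_dev,
		.dma_dir	= DMA_FROM_DEVICE, /* DMA_BIDIRECTIONAL for XDP_TX */
	};

	return page_pool_create(&pp_params);	/* ERR_PTR() on failure */
}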
/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
struct ptr_ring *r = &pool->ring;
struct page *page;
/* Quicker fallback, avoid locks when ring is empty */
if (__ptr_ring_empty(r))
return NULL;
/* Test for safe-context, caller should provide this guarantee */
if (likely(in_serving_softirq())) {
if (likely(pool->alloc.count)) {
/* Fast-path */
page = pool->alloc.cache[--pool->alloc.count];
return page;
}
/* Slower-path: Alloc array empty, time to refill
*
* Open-coded bulk ptr_ring consumer.
*
* Discussion: the ring consumer lock is not really
* needed due to the softirq/NAPI protection, but
* we will later need the ability to reclaim pages on the
* ring. Thus, keeping the locks.
*/
spin_lock(&r->consumer_lock);
while ((page = __ptr_ring_consume(r))) {
if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
break;
pool->alloc.cache[pool->alloc.count++] = page;
}
spin_unlock(&r->consumer_lock);
return page;
}
/* Slow-path: Get page from locked ring queue */
page = ptr_ring_consume(&pool->ring);
return page;
}
/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
gfp_t _gfp)
{
struct page *page;
gfp_t gfp = _gfp;
dma_addr_t dma;
/* We could always set __GFP_COMP, and avoid this branch, as
* prep_new_page() can handle order-0 with __GFP_COMP.
*/
if (pool->p.order)
gfp |= __GFP_COMP;
/* FUTURE development:
*
* Current slow-path essentially falls back to single page
* allocations, which doesn't improve performance. This code
* needs bulk allocation support from the page allocator code.
*/
/* Cache was empty, do real allocation */
page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
if (!page)
return NULL;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
goto skip_dma_map;
/* Setup DMA mapping: use page->private for DMA-addr
* This mapping is kept for lifetime of page, until leaving pool.
*/
dma = dma_map_page(pool->p.dev, page, 0,
(PAGE_SIZE << pool->p.order),
pool->p.dma_dir);
if (dma_mapping_error(pool->p.dev, dma)) {
put_page(page);
return NULL;
}
set_page_private(page, dma); /* page->private = dma; */
skip_dma_map:
/* When a page is just alloc'ed, it should/must have refcnt 1. */
return page;
}
/* page_pool is used as a replacement for alloc_pages() API calls, but it
 * additionally provides a synchronization guarantee for the allocation side.
*/
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
struct page *page;
/* Fast-path: Get a page from cache */
page = __page_pool_get_cached(pool);
if (page)
return page;
/* Slow-path: cache empty, do real allocation */
page = __page_pool_alloc_pages_slow(pool, gfp);
return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
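Allocation-side usage is intended to mirror alloc_pages(). A hedged sketch of an RX refill step; note that with PP_FLAG_DMA_MAP the slow path above stashed the DMA address in page->private via set_page_private(), so a driver can read it back with page_private(). The mydrv_rx_slot layout is hypothetical:

/* Hypothetical RX refill: take a page from the pool and remember the
 * DMA handle the pool established for it.
 */
static int mydrv_refill_rx_slot(struct page_pool *pool,
				struct mydrv_rx_slot *slot)
{
	struct page *page;

	page = page_pool_alloc_pages(pool, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!page))
		return -ENOMEM;

	slot->page = page;
	/* Set by __page_pool_alloc_pages_slow() when PP_FLAG_DMA_MAP is on */
	slot->dma = page_private(page);
	return 0;
}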
/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
struct page *page)
{
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
return;
/* DMA unmap */
dma_unmap_page(pool->p.dev, page_private(page),
PAGE_SIZE << pool->p.order, pool->p.dma_dir);
set_page_private(page, 0);
}
/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
__page_pool_clean_page(pool, page);
put_page(page);
/* An optimization would be to call __free_pages(page, pool->p.order)
* knowing page is not part of page-cache (thus avoiding a
* __page_cache_release() call).
*/
}
static bool __page_pool_recycle_into_ring(struct page_pool *pool,
struct page *page)
{
int ret;
/* BH protection not needed if current is serving softirq */
if (in_serving_softirq())
ret = ptr_ring_produce(&pool->ring, page);
else
ret = ptr_ring_produce_bh(&pool->ring, page);
return (ret == 0) ? true : false;
}
/* Only allow direct recycling in special circumstances, into the
* alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
*
* Caller must provide appropriate safe context.
*/
static bool __page_pool_recycle_direct(struct page *page,
struct page_pool *pool)
{
if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
return false;
/* Caller MUST have verified/know (page_ref_count(page) == 1) */
pool->alloc.cache[pool->alloc.count++] = page;
return true;
}
void __page_pool_put_page(struct page_pool *pool,
struct page *page, bool allow_direct)
{
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but has fallbacks that act like the
* regular page allocator APIs.
*
* refcnt == 1 means page_pool owns page, and can recycle it.
*/
if (likely(page_ref_count(page) == 1)) {
/* Read barrier done in page_ref_count / READ_ONCE */
if (allow_direct && in_serving_softirq())
if (__page_pool_recycle_direct(page, pool))
return;
if (!__page_pool_recycle_into_ring(pool, page)) {
/* Cache full, fallback to free pages */
__page_pool_return_page(pool, page);
}
return;
}
/* Fallback/non-XDP mode: the API user has elevated the page refcnt.
*
* Many drivers split up the page into fragments, and some
* want to keep doing this to save memory and do refcnt based
* recycling. Support this use case too, to ease drivers
* switching between XDP/non-XDP.
*
* In case page_pool maintains the DMA mapping, the API user must
* call page_pool_put_page() once. In this elevated refcnt
* case, the DMA is unmapped/released, as the driver is likely
* doing refcnt based recycle tricks, meaning another process
* will be invoking put_page.
*/
__page_pool_clean_page(pool, page);
put_page(page);
}
EXPORT_SYMBOL(__page_pool_put_page);
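__page_pool_put_page() is the single return point for both modes: refcnt == 1 means the pool still owns the page and may recycle it, anything else falls back to the normal page allocator path. A hedged sketch of the RX-NAPI XDP_DROP case using direct (lockless, alloc-cache) recycling; the call goes through __page_pool_put_page() because that is the symbol visible in this file, and the mydrv_* name is hypothetical:

/* Hypothetical NAPI fast path: frame dropped by the XDP program. */
static void mydrv_xdp_drop_page(struct page_pool *pool, struct page *page)
{
	/* allow_direct=true is only legal from softirq/NAPI context and
	 * when the driver knows page_ref_count(page) == 1.
	 */
	__page_pool_put_page(pool, page, true);
}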
static void __page_pool_empty_ring(struct page_pool *pool)
{
struct page *page;
/* Empty recycle ring */
while ((page = ptr_ring_consume(&pool->ring))) {
/* Verify the refcnt invariant of cached pages */
if (!(page_ref_count(page) == 1))
pr_crit("%s() page_pool refcnt %d violation\n",
__func__, page_ref_count(page));
__page_pool_return_page(pool, page);
}
}
static void __page_pool_destroy_rcu(struct rcu_head *rcu)
{
struct page_pool *pool;
pool = container_of(rcu, struct page_pool, rcu);
WARN(pool->alloc.count, "API usage violation");
__page_pool_empty_ring(pool);
ptr_ring_cleanup(&pool->ring, NULL);
kfree(pool);
}
/* Cleanup and release resources */
void page_pool_destroy(struct page_pool *pool)
{
struct page *page;
/* Empty alloc cache, assume caller made sure this is
* no longer in use, and page_pool_alloc_pages() cannot be
* called concurrently.
*/
while (pool->alloc.count) {
page = pool->alloc.cache[--pool->alloc.count];
__page_pool_return_page(pool, page);
}
/* No more consumers should exist, but producers could still
* be in-flight.
*/
__page_pool_empty_ring(pool);
/* An xdp_mem_allocator can still ref page_pool pointer */
call_rcu(&pool->rcu, __page_pool_destroy_rcu);
}
EXPORT_SYMBOL(page_pool_destroy);
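Because an xdp_mem_allocator entry (see net/core/xdp.c below) can still reference the pool, teardown on the driver side is expected to unregister the RX queue's memory model first and only then destroy the pool; both sides defer the final free through RCU. A hedged sketch of that ordering; the mydrv_rx_queue fields are hypothetical:

/* Hypothetical RX-queue teardown sequence. */
static void mydrv_destroy_rx_queue(struct mydrv_rx_queue *rq)
{
	/* Drops the mem model and schedules the xdp_mem_allocator
	 * entry for RCU free (see __xdp_rxq_info_unreg_mem_model()).
	 */
	xdp_rxq_info_unreg(&rq->xdp_rxq);

	/* Pool freeing is also RCU deferred; in-flight producers may
	 * still return pages until the grace period ends.
	 */
	page_pool_destroy(rq->page_pool);
}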
...@@ -5,6 +5,10 @@ ...@@ -5,6 +5,10 @@
*/ */
#include <linux/types.h> #include <linux/types.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
#include <net/page_pool.h>
#include <net/xdp.h> #include <net/xdp.h>
...@@ -13,6 +17,104 @@ ...@@ -13,6 +17,104 @@
#define REG_STATE_UNREGISTERED 0x2 #define REG_STATE_UNREGISTERED 0x2
#define REG_STATE_UNUSED 0x3 #define REG_STATE_UNUSED 0x3
static DEFINE_IDA(mem_id_pool);
static DEFINE_MUTEX(mem_id_lock);
#define MEM_ID_MAX 0xFFFE
#define MEM_ID_MIN 1
static int mem_id_next = MEM_ID_MIN;
static bool mem_id_init; /* false */
static struct rhashtable *mem_id_ht;
struct xdp_mem_allocator {
struct xdp_mem_info mem;
union {
void *allocator;
struct page_pool *page_pool;
};
struct rhash_head node;
struct rcu_head rcu;
};
static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
{
const u32 *k = data;
const u32 key = *k;
BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
!= sizeof(u32));
/* Use cyclic increasing ID as direct hash key, see rht_bucket_index */
return key << RHT_HASH_RESERVED_SPACE;
}
static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
{
const struct xdp_mem_allocator *xa = ptr;
u32 mem_id = *(u32 *)arg->key;
return xa->mem.id != mem_id;
}
static const struct rhashtable_params mem_id_rht_params = {
.nelem_hint = 64,
.head_offset = offsetof(struct xdp_mem_allocator, node),
.key_offset = offsetof(struct xdp_mem_allocator, mem.id),
.key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id),
.max_size = MEM_ID_MAX,
.min_size = 8,
.automatic_shrinking = true,
.hashfn = xdp_mem_id_hashfn,
.obj_cmpfn = xdp_mem_id_cmp,
};
static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
{
struct xdp_mem_allocator *xa;
xa = container_of(rcu, struct xdp_mem_allocator, rcu);
/* Allow this ID to be reused */
ida_simple_remove(&mem_id_pool, xa->mem.id);
/* Notice, driver is expected to free the *allocator,
* e.g. page_pool, and MUST also use RCU free.
*/
/* Poison memory */
xa->mem.id = 0xFFFF;
xa->mem.type = 0xF0F0;
xa->allocator = (void *)0xDEAD9001;
kfree(xa);
}
static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
int id = xdp_rxq->mem.id;
int err;
if (id == 0)
return;
mutex_lock(&mem_id_lock);
xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
if (!xa) {
mutex_unlock(&mem_id_lock);
return;
}
err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params);
WARN_ON(err);
call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
mutex_unlock(&mem_id_lock);
}
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{ {
/* Simplify driver cleanup code paths, allow unreg "unused" */ /* Simplify driver cleanup code paths, allow unreg "unused" */
...@@ -21,8 +123,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) ...@@ -21,8 +123,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
__xdp_rxq_info_unreg_mem_model(xdp_rxq);
xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
xdp_rxq->dev = NULL; xdp_rxq->dev = NULL;
/* Reset mem info to defaults */
xdp_rxq->mem.id = 0;
xdp_rxq->mem.type = 0;
} }
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
...@@ -71,3 +179,164 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) ...@@ -71,3 +179,164 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
return (xdp_rxq->reg_state == REG_STATE_REGISTERED); return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
} }
EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
static int __mem_id_init_hash_table(void)
{
struct rhashtable *rht;
int ret;
if (unlikely(mem_id_init))
return 0;
rht = kzalloc(sizeof(*rht), GFP_KERNEL);
if (!rht)
return -ENOMEM;
ret = rhashtable_init(rht, &mem_id_rht_params);
if (ret < 0) {
kfree(rht);
return ret;
}
mem_id_ht = rht;
smp_mb(); /* mutex lock should provide enough pairing */
mem_id_init = true;
return 0;
}
/* Allocate a cyclic ID that maps to allocator pointer.
* See: https://www.kernel.org/doc/html/latest/core-api/idr.html
*
* Caller must lock mem_id_lock.
*/
static int __mem_id_cyclic_get(gfp_t gfp)
{
int retries = 1;
int id;
again:
id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp);
if (id < 0) {
if (id == -ENOSPC) {
/* Cyclic allocator, reset next id */
if (retries--) {
mem_id_next = MEM_ID_MIN;
goto again;
}
}
return id; /* errno */
}
mem_id_next = id + 1;
return id;
}
static bool __is_supported_mem_type(enum xdp_mem_type type)
{
if (type == MEM_TYPE_PAGE_POOL)
return is_page_pool_compiled_in();
if (type >= MEM_TYPE_MAX)
return false;
return true;
}
int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
enum xdp_mem_type type, void *allocator)
{
struct xdp_mem_allocator *xdp_alloc;
gfp_t gfp = GFP_KERNEL;
int id, errno, ret;
void *ptr;
if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
WARN(1, "Missing register, driver bug");
return -EFAULT;
}
if (!__is_supported_mem_type(type))
return -EOPNOTSUPP;
xdp_rxq->mem.type = type;
if (!allocator) {
if (type == MEM_TYPE_PAGE_POOL)
return -EINVAL; /* Setup time check page_pool req */
return 0;
}
/* Delay init of rhashtable to save memory if feature isn't used */
if (!mem_id_init) {
mutex_lock(&mem_id_lock);
ret = __mem_id_init_hash_table();
mutex_unlock(&mem_id_lock);
if (ret < 0) {
WARN_ON(1);
return ret;
}
}
xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
if (!xdp_alloc)
return -ENOMEM;
mutex_lock(&mem_id_lock);
id = __mem_id_cyclic_get(gfp);
if (id < 0) {
errno = id;
goto err;
}
xdp_rxq->mem.id = id;
xdp_alloc->mem = xdp_rxq->mem;
xdp_alloc->allocator = allocator;
/* Insert allocator into ID lookup table */
ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
if (IS_ERR(ptr)) {
errno = PTR_ERR(ptr);
goto err;
}
mutex_unlock(&mem_id_lock);
return 0;
err:
mutex_unlock(&mem_id_lock);
kfree(xdp_alloc);
return errno;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
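Seen from a driver, registration is a two-step dance: register the xdp_rxq_info as before, then attach the allocator so every frame built from this queue carries a mem.id that the rhashtable above resolves back to the pool. A hedged sketch; mydrv_rx_queue is hypothetical:

/* Hypothetical RX-queue init: tie the queue's frames to its page_pool. */
static int mydrv_setup_xdp_rxq(struct mydrv_rx_queue *rq,
			       struct net_device *dev, u32 queue_index)
{
	int err;

	err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, queue_index);
	if (err)
		return err;

	/* From now on, xdp_return_frame() can recycle this queue's
	 * frames into rq->page_pool from any CPU, via the ID lookup.
	 */
	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, MEM_TYPE_PAGE_POOL,
					 rq->page_pool);
	if (err)
		xdp_rxq_info_unreg(&rq->xdp_rxq);
	return err;
}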
void xdp_return_frame(struct xdp_frame *xdpf)
{
struct xdp_mem_info *mem = &xdpf->mem;
struct xdp_mem_allocator *xa;
void *data = xdpf->data;
struct page *page;
switch (mem->type) {
case MEM_TYPE_PAGE_POOL:
rcu_read_lock();
/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
if (xa)
page_pool_put_page(xa->page_pool, page);
else
put_page(page);
rcu_read_unlock();
break;
case MEM_TYPE_PAGE_SHARED:
page_frag_free(data);
break;
case MEM_TYPE_PAGE_ORDER0:
page = virt_to_page(data); /* Assumes order-0 page */
put_page(page);
break;
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
break;
}
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
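The typical consumer of xdp_return_frame() is a driver's XDP TX completion path, which can now free redirected frames without knowing which RX driver (or allocator) produced them, since each frame carries its own xdp_mem_info. A hedged sketch; the mydrv_tx_buf layout is hypothetical:

/* Hypothetical XDP TX-ring cleanup: the stored xdp_frame carries its
 * own xdp_mem_info, so the right allocator (page_pool, page_frag or
 * order-0 page) is picked inside xdp_return_frame().
 */
static void mydrv_clean_xdp_tx_buf(struct device *dma_dev,
				   struct mydrv_tx_buf *tx_buf)
{
	dma_unmap_single(dma_dev, tx_buf->dma, tx_buf->len, DMA_TO_DEVICE);
	xdp_return_frame(tx_buf->xdpf);
	tx_buf->xdpf = NULL;
}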