Commit 3b1c667e authored by Daniel Borkmann

Merge branch 'bpf-af-xdp-mlx5e'

Tariq Toukan says:

====================
This series contains improvements to the AF_XDP kernel infrastructure
and AF_XDP support in mlx5e. The infrastructure improvements are
required for mlx5e, but some of them also benefit all drivers, and
some can be useful for other drivers that want to implement AF_XDP.

The performance testing was performed on a machine with the following
configuration:

- 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz
- Mellanox ConnectX-5 Ex with 100 Gbit/s link

The results with retpoline disabled, single stream:

txonly: 33.3 Mpps (21.5 Mpps with queue and app pinned to the same CPU)
rxdrop: 12.2 Mpps
l2fwd: 9.4 Mpps

The results with retpoline enabled, single stream:

txonly: 21.3 Mpps (14.1 Mpps with queue and app pinned to the same CPU)
rxdrop: 9.9 Mpps
l2fwd: 6.8 Mpps

v2 changes:

Added patches for mlx5e and addressed the comments for v1. Rebased for
bpf-next.

v3 changes:

Rebased for the newer bpf-next, resolved conflicts in libbpf. Addressed
Björn's comments for coding style. Fixed a bug in error handling flow in
mlx5e_open_xsk.

v4 changes:

UAPI is not changed, XSK RX queues are exposed to the kernel. The lower
half of the available amount of RX queues are regular queues, and the
upper half are XSK RX queues. The patch "xsk: Extend channels to support
combined XSK/non-XSK traffic" was dropped. The final patch was reworked
accordingly.

Added "net/mlx5e: Attach/detach XDP program safely", as the changes
introduced in the XSK patch are based on this one.

Added "libbpf: Support drivers with non-combined channels", which aligns
the condition in libbpf with the condition in the kernel.

Rebased over the newer bpf-next.

v5 changes:

In v4, ethtool reports the number of channels as 'combined' and the
number of XSK RX queues as 'rx' for mlx5e. This was changed so that 'rx'
is 0, and 'combined' reports double the number of channels if there is
an active UMEM - to make libbpf happy.

The patch for libbpf was dropped. Although it's still useful and fixes
things, it raised some disagreement, so I'm dropping it - it's no longer
needed for mlx5e after the change above.

v6 changes:

As Maxim is out of office, I rebased the series on his behalf,
resolved some conflicts, and respun it.
====================
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Tested-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents e5c891a3 db05815b
......@@ -641,8 +641,8 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
struct i40e_tx_desc *tx_desc = NULL;
struct i40e_tx_buffer *tx_bi;
bool work_done = true;
struct xdp_desc desc;
dma_addr_t dma;
u32 len;
while (budget-- > 0) {
if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
......@@ -651,21 +651,23 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
break;
}
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &dma, &len))
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
dma_sync_single_for_device(xdp_ring->dev, dma, len,
dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
DMA_BIDIRECTIONAL);
tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
tx_bi->bytecount = len;
tx_bi->bytecount = desc.len;
tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
tx_desc->buffer_addr = cpu_to_le64(dma);
tx_desc->cmd_type_offset_bsz =
build_ctob(I40E_TX_DESC_CMD_ICRC
| I40E_TX_DESC_CMD_EOP,
0, len, 0);
0, desc.len, 0);
xdp_ring->next_to_use++;
if (xdp_ring->next_to_use == xdp_ring->count)
......
......@@ -571,8 +571,9 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
union ixgbe_adv_tx_desc *tx_desc = NULL;
struct ixgbe_tx_buffer *tx_bi;
bool work_done = true;
u32 len, cmd_type;
struct xdp_desc desc;
dma_addr_t dma;
u32 cmd_type;
while (budget-- > 0) {
if (unlikely(!ixgbe_desc_unused(xdp_ring)) ||
......@@ -581,14 +582,16 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
break;
}
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &dma, &len))
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
dma_sync_single_for_device(xdp_ring->dev, dma, len,
dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
DMA_BIDIRECTIONAL);
tx_bi = &xdp_ring->tx_buffer_info[xdp_ring->next_to_use];
tx_bi->bytecount = len;
tx_bi->bytecount = desc.len;
tx_bi->xdpf = NULL;
tx_bi->gso_segs = 1;
......@@ -599,10 +602,10 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
cmd_type = IXGBE_ADVTXD_DTYP_DATA |
IXGBE_ADVTXD_DCMD_DEXT |
IXGBE_ADVTXD_DCMD_IFCS;
cmd_type |= len | IXGBE_TXD_CMD;
cmd_type |= desc.len | IXGBE_TXD_CMD;
tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
tx_desc->read.olinfo_status =
cpu_to_le32(len << IXGBE_ADVTXD_PAYLEN_SHIFT);
cpu_to_le32(desc.len << IXGBE_ADVTXD_PAYLEN_SHIFT);
xdp_ring->next_to_use++;
if (xdp_ring->next_to_use == xdp_ring->count)
......
......@@ -24,7 +24,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \
en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \
en_selftest.o en/port.o en/monitor_stats.o en/reporter_tx.o \
en/params.o
en/params.o en/xsk/umem.o en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o
#
# Netdev extra
......
......@@ -3,65 +3,102 @@
#include "en/params.h"
u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params)
static inline bool mlx5e_rx_is_xdp(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
u16 linear_rq_headroom = params->xdp_prog ?
XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM;
u32 frag_sz;
return params->xdp_prog || xsk;
}
/* Headroom kept in front of the packet data of a linear RQ buffer.
 *
 * @params: channel parameters (consulted through mlx5e_rx_is_xdp()).
 * @xsk:    XSK parameters, or NULL for a regular RQ.
 *
 * Always includes NET_IP_ALIGN. XDP-enabled queues add XDP_PACKET_HEADROOM
 * and, for XSK queues, the XSK headroom on top; all other queues add
 * MLX5_RX_HEADROOM instead.
 */
u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
				 struct mlx5e_xsk_param *xsk)
{
	u16 hr = NET_IP_ALIGN;

	if (!mlx5e_rx_is_xdp(params, xsk))
		return hr + MLX5_RX_HEADROOM;

	hr += XDP_PACKET_HEADROOM;

	return xsk ? hr + xsk->headroom : hr;
}
u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
u32 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
u16 linear_rq_headroom = mlx5e_get_linear_rq_headroom(params, xsk);
u32 frag_sz = linear_rq_headroom + hw_mtu;
linear_rq_headroom += NET_IP_ALIGN;
/* AF_XDP doesn't build SKBs in place. */
if (!xsk)
frag_sz = MLX5_SKB_FRAG_SZ(frag_sz);
frag_sz = MLX5_SKB_FRAG_SZ(linear_rq_headroom + hw_mtu);
/* XDP in mlx5e doesn't support multiple packets per page. */
if (mlx5e_rx_is_xdp(params, xsk))
frag_sz = max_t(u32, frag_sz, PAGE_SIZE);
if (params->xdp_prog && frag_sz < PAGE_SIZE)
frag_sz = PAGE_SIZE;
/* Even if we can go with a smaller fragment size, we must not put
* multiple packets into a single frame.
*/
if (xsk)
frag_sz = max_t(u32, frag_sz, xsk->chunk_size);
return frag_sz;
}
u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params)
u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params);
u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params, xsk);
return MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
}
bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params)
bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
/* AF_XDP allocates SKBs on XDP_PASS - ensure they don't occupy more
* than one page. For this, check both with and without xsk.
*/
u32 linear_frag_sz = max(mlx5e_rx_get_linear_frag_sz(params, xsk),
mlx5e_rx_get_linear_frag_sz(params, NULL));
return !params->lro_en && frag_sz <= PAGE_SIZE;
return !params->lro_en && linear_frag_sz <= PAGE_SIZE;
}
#define MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ ((BIT(__mlx5_bit_sz(wq, log_wqe_stride_size)) - 1) + \
MLX5_MPWQE_LOG_STRIDE_SZ_BASE)
bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
struct mlx5e_params *params)
struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params, xsk);
s8 signed_log_num_strides_param;
u8 log_num_strides;
if (!mlx5e_rx_is_linear_skb(params))
if (!mlx5e_rx_is_linear_skb(params, xsk))
return false;
if (order_base_2(frag_sz) > MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ)
if (order_base_2(linear_frag_sz) > MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ)
return false;
if (MLX5_CAP_GEN(mdev, ext_stride_num_range))
return true;
log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(frag_sz);
log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
signed_log_num_strides_param =
(s8)log_num_strides - MLX5_MPWQE_LOG_NUM_STRIDES_BASE;
return signed_log_num_strides_param >= 0;
}
u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params)
u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
u8 log_pkts_per_wqe = mlx5e_mpwqe_log_pkts_per_wqe(params);
u8 log_pkts_per_wqe = mlx5e_mpwqe_log_pkts_per_wqe(params, xsk);
/* Numbers are unsigned, don't subtract to avoid underflow. */
if (params->log_rq_mtu_frames <
......@@ -72,33 +109,30 @@ u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params)
}
u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev,
struct mlx5e_params *params)
struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
return order_base_2(mlx5e_rx_get_linear_frag_sz(params));
if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk))
return order_base_2(mlx5e_rx_get_linear_frag_sz(params, xsk));
return MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev);
}
u8 mlx5e_mpwqe_get_log_num_strides(struct mlx5_core_dev *mdev,
struct mlx5e_params *params)
struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
return MLX5_MPWRQ_LOG_WQE_SZ -
mlx5e_mpwqe_get_log_stride_size(mdev, params);
mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk);
}
u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
struct mlx5e_params *params)
struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
u16 linear_rq_headroom = params->xdp_prog ?
XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM;
bool is_linear_skb;
linear_rq_headroom += NET_IP_ALIGN;
is_linear_skb = (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC) ?
mlx5e_rx_is_linear_skb(params) :
mlx5e_rx_mpwqe_is_linear_skb(mdev, params);
bool is_linear_skb = (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC) ?
mlx5e_rx_is_linear_skb(params, xsk) :
mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk);
return is_linear_skb ? linear_rq_headroom : 0;
return is_linear_skb ? mlx5e_get_linear_rq_headroom(params, xsk) : 0;
}
......@@ -6,17 +6,119 @@
#include "en.h"
u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params);
u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params);
bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params);
/* AF_XDP (XSK) parameters consumed by the RX parameter calculations. The
 * calculation helpers accept a NULL pointer to this struct for a regular
 * (non-XSK) RQ.
 */
struct mlx5e_xsk_param {
/* Extra headroom added on top of XDP_PACKET_HEADROOM for XSK RQs. */
u16 headroom;
/* Lower bound for the linear fragment size; mlx5e_validate_xsk_param()
 * only accepts PAGE_SIZE.
 */
u16 chunk_size;
};
/* Parameters for creating an RQ; filled in by mlx5e_build_rq_param(). */
struct mlx5e_rq_param {
/* RQ context buffer sized by MLX5_ST_SZ_DW(rqc) (presumably the device
 * layout of the RQ context, in 32-bit words - confirm).
 */
u32 rqc[MLX5_ST_SZ_DW(rqc)];
/* Work queue parameters. */
struct mlx5_wq_param wq;
/* RX fragment layout information. */
struct mlx5e_rq_frags_info frags_info;
};
/* Parameters for creating an SQ (regular, XDP or ICO); the common part is
 * filled in by mlx5e_build_sq_param_common().
 */
struct mlx5e_sq_param {
/* SQ context buffer sized by MLX5_ST_SZ_DW(sqc); the embedded 'wq' part
 * carries log_wq_sz.
 */
u32 sqc[MLX5_ST_SZ_DW(sqc)];
/* Work queue parameters. */
struct mlx5_wq_param wq;
/* NOTE(review): presumably selects the multi-packet WQE transmit mode for
 * XDP SQs (cf. mlx5e_set_xmit_fp) - confirm.
 */
bool is_mpw;
};
/* Parameters for creating a CQ; filled in by the mlx5e_build_*_cq_param()
 * helpers.
 */
struct mlx5e_cq_param {
/* CQ context buffer sized by MLX5_ST_SZ_DW(cqc). */
u32 cqc[MLX5_ST_SZ_DW(cqc)];
/* Work queue parameters. */
struct mlx5_wq_param wq;
/* NOTE(review): presumably the index of the event queue serving this CQ -
 * confirm.
 */
u16 eq_ix;
/* NOTE(review): presumably the CQ moderation period mode - confirm. */
u8 cq_period_mode;
};
/* Creation parameters for all queues of one channel. For XSK queues,
 * mlx5e_build_xsk_cparam() fills rq, xdp_sq, icosq, rx_cq, tx_cq and
 * icosq_cq; 'sq' is not touched on that path.
 */
struct mlx5e_channel_param {
struct mlx5e_rq_param rq;
struct mlx5e_sq_param sq;
struct mlx5e_sq_param xdp_sq;
struct mlx5e_sq_param icosq;
struct mlx5e_cq_param rx_cq;
struct mlx5e_cq_param tx_cq;
struct mlx5e_cq_param icosq_cq;
};
/* Translate a queue ID into a channel index, provided the queue belongs to
 * the given RQ group.
 *
 * Queue IDs are laid out group after group, num_channels IDs per group.
 * Returns true and stores the channel index in @ix when @qid falls inside
 * @group; returns false and leaves @ix untouched otherwise.
 */
static inline bool mlx5e_qid_get_ch_if_in_group(struct mlx5e_params *params,
						u16 qid,
						enum mlx5e_rq_group group,
						u16 *ix)
{
	int num_ch = params->num_channels;
	int rel = qid - num_ch * group;
	bool in_group = rel >= 0 && rel < num_ch;

	if (in_group)
		*ix = rel;

	return in_group;
}
/* Split a queue ID into its RQ group and the channel index inside that group.
 * Queue IDs are laid out group after group, num_channels IDs per group, so
 * the group is the quotient and the channel index the remainder.
 */
static inline void mlx5e_qid_get_ch_and_group(struct mlx5e_params *params,
					      u16 qid,
					      u16 *ix,
					      enum mlx5e_rq_group *group)
{
	u16 channels = params->num_channels;
	u16 grp = qid / channels;

	*group = grp;
	*ix = qid - grp * channels;
}
/* A queue ID is valid when it is below the total number of RX queues, i.e.
 * num_channels queues in each of the MLX5E_NUM_RQ_GROUPS groups.
 */
static inline bool mlx5e_qid_validate(struct mlx5e_params *params, u64 qid)
{
	return params->num_channels * MLX5E_NUM_RQ_GROUPS > qid;
}
/* Parameter calculations */
u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
struct mlx5e_params *params);
u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params);
struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev,
struct mlx5e_params *params);
struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
u8 mlx5e_mpwqe_get_log_num_strides(struct mlx5_core_dev *mdev,
struct mlx5e_params *params);
struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
struct mlx5e_params *params);
struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk);
/* Build queue parameters */
void mlx5e_build_rq_param(struct mlx5e_priv *priv,
struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk,
struct mlx5e_rq_param *param);
void mlx5e_build_sq_param_common(struct mlx5e_priv *priv,
struct mlx5e_sq_param *param);
void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk,
struct mlx5e_cq_param *param);
void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv,
struct mlx5e_params *params,
struct mlx5e_cq_param *param);
void mlx5e_build_ico_cq_param(struct mlx5e_priv *priv,
u8 log_wq_size,
struct mlx5e_cq_param *param);
void mlx5e_build_icosq_param(struct mlx5e_priv *priv,
u8 log_wq_size,
struct mlx5e_sq_param *param);
void mlx5e_build_xdpsq_param(struct mlx5e_priv *priv,
struct mlx5e_params *params,
struct mlx5e_sq_param *param);
#endif /* __MLX5_EN_PARAMS_H__ */
......@@ -39,11 +39,13 @@
(sizeof(struct mlx5e_tx_wqe) / MLX5_SEND_WQE_DS)
#define MLX5E_XDP_TX_DS_COUNT (MLX5E_XDP_TX_EMPTY_DS_COUNT + 1 /* SG DS */)
int mlx5e_xdp_max_mtu(struct mlx5e_params *params);
struct mlx5e_xsk_param;
int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk);
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
void *va, u16 *rx_headroom, u32 *len);
bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq);
void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq);
void *va, u16 *rx_headroom, u32 *len, bool xsk);
void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq);
bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq);
void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq);
void mlx5e_set_xmit_fp(struct mlx5e_xdpsq *sq, bool is_mpw);
void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq);
int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
......@@ -66,6 +68,21 @@ static inline bool mlx5e_xdp_tx_is_enabled(struct mlx5e_priv *priv)
return test_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state);
}
/* Mark XDP as open by setting MLX5E_STATE_XDP_OPEN in priv->state; the flag
 * is read back through mlx5e_xdp_is_open().
 */
static inline void mlx5e_xdp_set_open(struct mlx5e_priv *priv)
{
set_bit(MLX5E_STATE_XDP_OPEN, &priv->state);
}
/* Mark XDP as closed by clearing MLX5E_STATE_XDP_OPEN in priv->state. */
static inline void mlx5e_xdp_set_closed(struct mlx5e_priv *priv)
{
clear_bit(MLX5E_STATE_XDP_OPEN, &priv->state);
}
/* Return true while MLX5E_STATE_XDP_OPEN is set in priv->state.
 * mlx5e_xsk_async_xmit() fails with -ENETDOWN when this returns false.
 */
static inline bool mlx5e_xdp_is_open(struct mlx5e_priv *priv)
{
return test_bit(MLX5E_STATE_XDP_OPEN, &priv->state);
}
static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_xdpsq *sq)
{
if (sq->doorbell_cseg) {
......@@ -97,15 +114,14 @@ static inline void mlx5e_xdp_update_inline_state(struct mlx5e_xdpsq *sq)
}
static inline void
mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi,
mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq,
struct mlx5e_xdp_xmit_data *xdptxd,
struct mlx5e_xdpsq_stats *stats)
{
struct mlx5e_xdp_mpwqe *session = &sq->mpwqe;
dma_addr_t dma_addr = xdpi->dma_addr;
struct xdp_frame *xdpf = xdpi->xdpf;
struct mlx5_wqe_data_seg *dseg =
(struct mlx5_wqe_data_seg *)session->wqe + session->ds_count;
u16 dma_len = xdpf->len;
u32 dma_len = xdptxd->len;
session->pkt_count++;
......@@ -124,7 +140,7 @@ mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi,
}
inline_dseg->byte_count = cpu_to_be32(dma_len | MLX5_INLINE_SEG);
memcpy(inline_dseg->data, xdpf->data, dma_len);
memcpy(inline_dseg->data, xdptxd->data, dma_len);
session->ds_count += ds_cnt;
stats->inlnw++;
......@@ -132,7 +148,7 @@ mlx5e_xdp_mpwqe_add_dseg(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_info *xdpi,
}
no_inline:
dseg->addr = cpu_to_be64(dma_addr);
dseg->addr = cpu_to_be64(xdptxd->dma_addr);
dseg->byte_count = cpu_to_be32(dma_len);
dseg->lkey = sq->mkey_be;
session->ds_count++;
......
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */
#include "rx.h"
#include "en/xdp.h"
#include <net/xdp_sock.h>
/* RX data path */
/* Report whether the UMEM of @rq can currently supply @count frame addresses,
 * as answered by xsk_umem_has_addrs_rq().
 */
bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count)
{
/* Check in advance that we have enough frames, instead of allocating
 * one-by-one, failing and moving frames to the Reuse Ring.
 */
return xsk_umem_has_addrs_rq(rq->umem, count);
}
/* Take the next frame address offered by the UMEM of @rq and describe the
 * frame in @dma_info (XSK handle, CPU pointer and DMA address).
 *
 * Returns 0 on success, or -ENOMEM when no frame address can be peeked.
 */
int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
			      struct mlx5e_dma_info *dma_info)
{
	u64 frame;

	if (!xsk_umem_peek_addr_rq(rq->umem, &frame))
		return -ENOMEM;

	/* The handle and the CPU pointer both skip the UMEM headroom. */
	dma_info->xsk.handle = frame + rq->buff.umem_headroom;
	dma_info->xsk.data = xdp_umem_get_data(rq->umem, dma_info->xsk.handle);

	/* The DMA address deliberately stays at the start of the frame. With a
	 * striding RQ the pages are only handed to UMR and the headroom is
	 * applied when the WQE is set up; with a non-striding RQ the headroom
	 * is accounted for in mlx5e_alloc_rx_wqe.
	 */
	dma_info->addr = xdp_umem_get_dma(rq->umem, frame);

	/* The peeked address has been taken over, so drop it. */
	xsk_umem_discard_addr_rq(rq->umem);

	dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE,
				   DMA_BIDIRECTIONAL);

	return 0;
}
/* Hand a frame back for reuse by the driver. @handle is masked with the
 * UMEM chunk mask before it is passed to xsk_umem_fq_reuse().
 */
static inline void mlx5e_xsk_recycle_frame(struct mlx5e_rq *rq, u64 handle)
{
	u64 frame_base = handle & rq->umem->chunk_mask;

	xsk_umem_fq_reuse(rq->umem, frame_base);
}
/* Release callback for frames of an XSK RQ.
 *
 * XSKRQ uses pages from UMEM, they must not be released. They are returned to
 * the userspace if possible, and if not, this function is called to reuse them
 * in the driver: the frame identified by dma_info->xsk.handle is recycled via
 * mlx5e_xsk_recycle_frame().
 */
void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info)
{
mlx5e_xsk_recycle_frame(rq, dma_info->xsk.handle);
}
/* Zero-copy allocator free callback: gives a frame back to the hardware to be
 * filled again. XDP uses it when the program returns XDP_TX, or XDP_REDIRECT
 * to something other than an XSKMAP.
 */
void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
{
	/* The allocator is embedded in its RQ as the 'zca' member. */
	mlx5e_xsk_recycle_frame(container_of(zca, struct mlx5e_rq, zca), handle);
}
/* Allocate an SKB from the RQ's NAPI context and copy @cqe_bcnt bytes of
 * packet data starting at @data into it.
 *
 * Returns the new SKB, or NULL after bumping the buff_alloc_err counter when
 * the allocation fails.
 */
static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data,
					       u32 cqe_bcnt)
{
	struct sk_buff *copy = napi_alloc_skb(rq->cq.napi, cqe_bcnt);

	if (unlikely(!copy))
		rq->stats->buff_alloc_err++;
	else
		skb_put_data(copy, data, cqe_bcnt);

	return copy;
}
/* Handle a completed packet on an XSK striding RQ (MPWQE) in linear mode.
 *
 * @rq:          XSK receive queue the completion belongs to.
 * @wi:          MPWQE info; wi->umr.dma_info[page_idx] describes the frame.
 * @cqe_bcnt:    packet length reported by the CQE.
 * @head_offset: expected to be 0 (one frame holds one packet); only checked.
 * @page_idx:    index of the frame inside the MPWQE.
 *
 * Returns a newly allocated SKB holding a copy of the packet when
 * mlx5e_xdp_handle() does not consume it, or NULL when the packet is
 * oversized, was consumed by XDP, or the SKB allocation failed.
 */
struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
						    struct mlx5e_mpw_info *wi,
						    u16 cqe_bcnt,
						    u32 head_offset,
						    u32 page_idx)
{
	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
	u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
	u32 cqe_bcnt32 = cqe_bcnt;
	void *va, *data;
	u32 frag_size;
	bool consumed;

	/* Check packet size. Note LRO doesn't use linear SKB */
	if (unlikely(cqe_bcnt > rq->hw_mtu)) {
		rq->stats->oversize_pkts_sw_drop++;
		return NULL;
	}

	/* head_offset is not used in this function, because di->xsk.data and
	 * di->addr point directly to the necessary place. Furthermore, in the
	 * current implementation, one page = one packet = one frame, so
	 * head_offset should always be 0.
	 */
	WARN_ON_ONCE(head_offset);

	va = di->xsk.data;
	data = va + rx_headroom;
	frag_size = rq->buff.headroom + cqe_bcnt32;

	dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
	prefetch(data);

	rcu_read_lock();
	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, true);
	rcu_read_unlock();

	/* Possible flows:
	 * - XDP_REDIRECT to XSKMAP:
	 *   The page is owned by the userspace from now.
	 * - XDP_TX and other XDP_REDIRECTs:
	 *   The page was returned by ZCA and recycled.
	 * - XDP_DROP:
	 *   Recycle the page.
	 * - XDP_PASS:
	 *   Allocate an SKB, copy the data and recycle the page.
	 *
	 * Pages to be recycled go to the Reuse Ring on MPWQE deallocation. Its
	 * size is the same as the Driver RX Ring's size, and pages for WQEs are
	 * allocated first from the Reuse Ring, so it has enough space.
	 */

	if (likely(consumed)) {
		if (likely(__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)))
			__set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */
		return NULL; /* page/packet was consumed by XDP */
	}

	/* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
	 * frame. On SKB allocation failure, NULL is returned.
	 *
	 * rx_headroom and cqe_bcnt32 were handed to mlx5e_xdp_handle() by
	 * pointer, so the program may have moved the packet start as well as
	 * changed its length (NOTE(review): assumes the handler reports the
	 * new offset from va through rx_headroom - confirm in en/xdp.c). Copy
	 * from the current start instead of the pre-XDP 'data' pointer so that
	 * the copied bytes match the reported length; when the headroom was
	 * not changed this is the same address as before.
	 */
	return mlx5e_xsk_construct_skb(rq, va + rx_headroom, cqe_bcnt32);
}
/* Handle a completed packet on an XSK non-striding (cyclic) RQ.
 *
 * @rq:       XSK receive queue the completion belongs to.
 * @cqe:      completion entry; anything but MLX5_CQE_RESP_SEND is counted as
 *            a WQE error and dropped.
 * @wi:       fragment info; wi->di describes the frame.
 * @cqe_bcnt: packet length reported by the CQE.
 *
 * Returns a newly allocated SKB holding a copy of the packet when
 * mlx5e_xdp_handle() does not consume it, or NULL on a WQE error, when the
 * packet was consumed by XDP, or when the SKB allocation failed.
 */
struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
					      struct mlx5_cqe64 *cqe,
					      struct mlx5e_wqe_frag_info *wi,
					      u32 cqe_bcnt)
{
	struct mlx5e_dma_info *di = wi->di;
	u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
	void *va, *data;
	bool consumed;
	u32 frag_size;

	/* wi->offset is not used in this function, because di->xsk.data and
	 * di->addr point directly to the necessary place. Furthermore, in the
	 * current implementation, one page = one packet = one frame, so
	 * wi->offset should always be 0.
	 */
	WARN_ON_ONCE(wi->offset);

	va = di->xsk.data;
	data = va + rx_headroom;
	frag_size = rq->buff.headroom + cqe_bcnt;

	dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
	prefetch(data);

	if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) {
		rq->stats->wqe_err++;
		return NULL;
	}

	rcu_read_lock();
	consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, true);
	rcu_read_unlock();

	if (likely(consumed))
		return NULL; /* page/packet was consumed by XDP */

	/* XDP_PASS: copy the data from the UMEM to a new SKB. The frame reuse
	 * will be handled by mlx5e_put_rx_frag.
	 * On SKB allocation failure, NULL is returned.
	 *
	 * rx_headroom and cqe_bcnt were handed to mlx5e_xdp_handle() by
	 * pointer, so the program may have moved the packet start as well as
	 * changed its length (NOTE(review): assumes the handler reports the
	 * new offset from va through rx_headroom - confirm in en/xdp.c). Copy
	 * from the current start instead of the pre-XDP 'data' pointer so that
	 * the copied bytes match the reported length; when the headroom was
	 * not changed this is the same address as before.
	 */
	return mlx5e_xsk_construct_skb(rq, va + rx_headroom, cqe_bcnt);
}
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/* Copyright (c) 2019 Mellanox Technologies. */
#ifndef __MLX5_EN_XSK_RX_H__
#define __MLX5_EN_XSK_RX_H__
#include "en.h"
/* RX data path */
/* True when the UMEM of @rq can supply @count frame addresses. */
bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count);
/* Fill @dma_info with the next UMEM frame; 0 on success, -ENOMEM if none. */
int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info);
/* Recycle the frame described by @dma_info for reuse by the driver. */
void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info);
/* Zero-copy allocator free callback; recycles the frame behind @handle. */
void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
/* Striding RQ completion handler: runs XDP and returns an SKB copy of the
 * packet, or NULL if it was dropped, consumed by XDP, or allocation failed.
 */
struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
struct mlx5e_mpw_info *wi,
u16 cqe_bcnt,
u32 head_offset,
u32 page_idx);
/* Cyclic RQ completion handler: runs XDP and returns an SKB copy of the
 * packet, or NULL if it was dropped, consumed by XDP, or allocation failed.
 */
struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5_cqe64 *cqe,
struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt);
#endif /* __MLX5_EN_XSK_RX_H__ */
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */
#include "setup.h"
#include "en/params.h"
/* Check whether an XSK RQ can be created with the given parameters.
 *
 * @params: channel parameters (MTU, RQ type, ...).
 * @xsk:    XSK parameters derived from the UMEM.
 * @mdev:   device, consulted for the striding RQ linear-SKB check.
 *
 * Returns true only when the chunk size is exactly PAGE_SIZE, the linear
 * fragment fits into one chunk, and linear SKB mode is possible for the
 * configured RQ type.
 */
bool mlx5e_validate_xsk_param(struct mlx5e_params *params,
			      struct mlx5e_xsk_param *xsk,
			      struct mlx5_core_dev *mdev)
{
	/* AF_XDP doesn't support frames larger than PAGE_SIZE, and the current
	 * mlx5e XDP implementation doesn't support multiple packets per page.
	 * Beyond that, the current MTU and XSK headroom must allow packets to
	 * fit the frames.
	 */
	if (xsk->chunk_size != PAGE_SIZE ||
	    mlx5e_rx_get_linear_frag_sz(params, xsk) > xsk->chunk_size)
		return false;

	/* frag_sz is different for regular and XSK RQs, so ensure that linear
	 * SKB mode is possible.
	 */
	if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
		return mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk);

	/* Every other type is treated as MLX5_WQ_TYPE_CYCLIC. */
	return mlx5e_rx_is_linear_skb(params, xsk);
}
/* Build the parameters of the XSK ICOSQ: the common SQ parameters plus the
 * requested log2 work queue size.
 */
static void mlx5e_build_xskicosq_param(struct mlx5e_priv *priv,
				       u8 log_wq_size,
				       struct mlx5e_sq_param *param)
{
	void *wq_ctx = MLX5_ADDR_OF(sqc, param->sqc, wq);

	mlx5e_build_sq_param_common(priv, param);
	MLX5_SET(wq, wq_ctx, log_wq_sz, log_wq_size);
}
/* Fill @cparam with the creation parameters of every XSK queue of a channel:
 * the RQ, the XDP SQ, the ICOSQ and their three CQs. The ICOSQ and its CQ
 * share the minimum SQ size.
 */
static void mlx5e_build_xsk_cparam(struct mlx5e_priv *priv,
				   struct mlx5e_params *params,
				   struct mlx5e_xsk_param *xsk,
				   struct mlx5e_channel_param *cparam)
{
	const u8 log_icosq_sz = MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;

	/* Queues first ... */
	mlx5e_build_rq_param(priv, params, xsk, &cparam->rq);
	mlx5e_build_xdpsq_param(priv, params, &cparam->xdp_sq);
	mlx5e_build_xskicosq_param(priv, log_icosq_sz, &cparam->icosq);

	/* ... then their completion queues. */
	mlx5e_build_rx_cq_param(priv, params, xsk, &cparam->rx_cq);
	mlx5e_build_tx_cq_param(priv, params, &cparam->tx_cq);
	mlx5e_build_ico_cq_param(priv, log_icosq_sz, &cparam->icosq_cq);
}
/* Create all XSK queues of channel @c: RX CQ + RQ, TX CQ + XDP SQ, and
 * ICO CQ + ICOSQ, in that order.
 *
 * Returns 0 on success and marks the channel with MLX5E_CHANNEL_STATE_XSK.
 * Returns -EINVAL when the XSK parameters do not pass
 * mlx5e_validate_xsk_param(), or the error of the failing open call; in the
 * latter case everything opened so far is closed again in reverse order.
 */
int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk, struct xdp_umem *umem,
struct mlx5e_channel *c)
{
struct mlx5e_channel_param cparam = {};
/* Zero-initialized moderation settings for the ICOSQ's CQ. */
struct net_dim_cq_moder icocq_moder = {};
int err;
if (!mlx5e_validate_xsk_param(params, xsk, priv->mdev))
return -EINVAL;
mlx5e_build_xsk_cparam(priv, params, xsk, &cparam);
err = mlx5e_open_cq(c, params->rx_cq_moderation, &cparam.rx_cq, &c->xskrq.cq);
if (unlikely(err))
return err;
err = mlx5e_open_rq(c, params, &cparam.rq, xsk, umem, &c->xskrq);
if (unlikely(err))
goto err_close_rx_cq;
err = mlx5e_open_cq(c, params->tx_cq_moderation, &cparam.tx_cq, &c->xsksq.cq);
if (unlikely(err))
goto err_close_rq;
/* Create a separate SQ, so that when the UMEM is disabled, we could
 * close this SQ safely and stop receiving CQEs. In other case, e.g., if
 * the XDPSQ was used instead, we might run into trouble when the UMEM
 * is disabled and then reenabled, but the SQ continues receiving CQEs
 * from the old UMEM.
 */
err = mlx5e_open_xdpsq(c, params, &cparam.xdp_sq, umem, &c->xsksq, true);
if (unlikely(err))
goto err_close_tx_cq;
err = mlx5e_open_cq(c, icocq_moder, &cparam.icosq_cq, &c->xskicosq.cq);
if (unlikely(err))
goto err_close_sq;
/* Create a dedicated SQ for posting NOPs whenever we need an IRQ to be
 * triggered and NAPI to be called on the correct CPU.
 */
err = mlx5e_open_icosq(c, params, &cparam.icosq, &c->xskicosq);
if (unlikely(err))
goto err_close_icocq;
spin_lock_init(&c->xskicosq_lock);
/* Everything is in place: mark the channel as having XSK queues. */
set_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
return 0;
/* Error unwinding: each label closes what was opened before the failing step. */
err_close_icocq:
mlx5e_close_cq(&c->xskicosq.cq);
err_close_sq:
mlx5e_close_xdpsq(&c->xsksq);
err_close_tx_cq:
mlx5e_close_cq(&c->xsksq.cq);
err_close_rq:
mlx5e_close_rq(&c->xskrq);
err_close_rx_cq:
mlx5e_close_cq(&c->xskrq.cq);
return err;
}
/* Tear down the XSK queues of channel @c that mlx5e_open_xsk() created. */
void mlx5e_close_xsk(struct mlx5e_channel *c)
{
/* Clear the XSK state bit first, then synchronize with NAPI
 * (NOTE(review): presumably so that a running poll no longer touches the
 * XSK queues - confirm), before closing the queues.
 */
clear_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
napi_synchronize(&c->napi);
/* Each queue is closed before its completion queue. */
mlx5e_close_rq(&c->xskrq);
mlx5e_close_cq(&c->xskrq.cq);
mlx5e_close_icosq(&c->xskicosq);
mlx5e_close_cq(&c->xskicosq.cq);
mlx5e_close_xdpsq(&c->xsksq);
mlx5e_close_cq(&c->xsksq.cq);
}
/* Activate the XSK RQ of channel @c: set its ENABLED state bit and trigger
 * an IRQ through the XSK ICOSQ.
 */
void mlx5e_activate_xsk(struct mlx5e_channel *c)
{
set_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state);
/* TX queue is created active. */
mlx5e_trigger_irq(&c->xskicosq);
}
/* Deactivate the XSK RQ of channel @c; nothing is done for the TX side here. */
void mlx5e_deactivate_xsk(struct mlx5e_channel *c)
{
mlx5e_deactivate_rq(&c->xskrq);
/* TX queue is disabled on close. */
}
/* Point the single-entry RQT of the XSK TIR with index @ix at RQ @rqn,
 * without RSS. Returns the result of mlx5e_redirect_rqt().
 */
static int mlx5e_redirect_xsk_rqt(struct mlx5e_priv *priv, u16 ix, u32 rqn)
{
	struct mlx5e_redirect_rqt_param rrp = {
		.is_rss = false,
		{
			.rqn = rqn,
		},
	};

	return mlx5e_redirect_rqt(priv, priv->xsk_tir[ix].rqt.rqtn, 1, rrp);
}
/* Redirect the XSK RQT of channel @c (indexed by c->ix) to the channel's own
 * XSK RQ. Returns the result of mlx5e_redirect_xsk_rqt().
 */
int mlx5e_xsk_redirect_rqt_to_channel(struct mlx5e_priv *priv, struct mlx5e_channel *c)
{
return mlx5e_redirect_xsk_rqt(priv, c->ix, c->xskrq.rqn);
}
/* Redirect the XSK RQT with index @ix to the drop RQ (priv->drop_rq).
 * Returns the result of mlx5e_redirect_xsk_rqt().
 */
int mlx5e_xsk_redirect_rqt_to_drop(struct mlx5e_priv *priv, u16 ix)
{
return mlx5e_redirect_xsk_rqt(priv, ix, priv->drop_rq.rqn);
}
/* Redirect the XSK RQT of every channel in @chs that has XSK queues to that
 * channel's XSK RQ.
 *
 * Does nothing and returns 0 when no UMEM is registered (xsk.refcnt == 0).
 * On failure, the RQTs of the XSK channels handled before the failing one are
 * pointed back at the drop RQ and the error is returned.
 */
int mlx5e_xsk_redirect_rqts_to_channels(struct mlx5e_priv *priv, struct mlx5e_channels *chs)
{
	int err = 0;
	int i;

	if (!priv->xsk.refcnt)
		return 0;

	for (i = 0; i < chs->num; i++) {
		struct mlx5e_channel *c = chs->c[i];

		if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) {
			err = mlx5e_xsk_redirect_rqt_to_channel(priv, c);
			if (unlikely(err))
				break;
		}
	}

	if (!err)
		return 0;

	/* Roll back: 'i' is the failing channel, undo the ones below it. */
	while (i-- > 0) {
		if (test_bit(MLX5E_CHANNEL_STATE_XSK, chs->c[i]->state))
			mlx5e_xsk_redirect_rqt_to_drop(priv, i);
	}

	return err;
}
/* Point the XSK RQT of every channel in @chs that has XSK queues at the drop
 * RQ. Does nothing when no UMEM is registered (xsk.refcnt == 0). Errors from
 * the individual redirects are ignored.
 */
void mlx5e_xsk_redirect_rqts_to_drop(struct mlx5e_priv *priv, struct mlx5e_channels *chs)
{
	int ch;

	if (!priv->xsk.refcnt)
		return;

	for (ch = 0; ch < chs->num; ch++) {
		if (test_bit(MLX5E_CHANNEL_STATE_XSK, chs->c[ch]->state))
			mlx5e_xsk_redirect_rqt_to_drop(priv, ch);
	}
}
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/* Copyright (c) 2019 Mellanox Technologies. */
#ifndef __MLX5_EN_XSK_SETUP_H__
#define __MLX5_EN_XSK_SETUP_H__
#include "en.h"
struct mlx5e_xsk_param;
/* True when an XSK RQ can be created with @xsk under the current @params. */
bool mlx5e_validate_xsk_param(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk,
struct mlx5_core_dev *mdev);
/* Create the XSK queues of channel @c; 0 on success, negative errno on error. */
int mlx5e_open_xsk(struct mlx5e_priv *priv, struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk, struct xdp_umem *umem,
struct mlx5e_channel *c);
/* Destroy the XSK queues of channel @c. */
void mlx5e_close_xsk(struct mlx5e_channel *c);
/* Enable the XSK RQ of @c and trigger an IRQ. */
void mlx5e_activate_xsk(struct mlx5e_channel *c);
/* Deactivate the XSK RQ of @c. */
void mlx5e_deactivate_xsk(struct mlx5e_channel *c);
/* Point one XSK RQT at the channel's XSK RQ or at the drop RQ. */
int mlx5e_xsk_redirect_rqt_to_channel(struct mlx5e_priv *priv, struct mlx5e_channel *c);
int mlx5e_xsk_redirect_rqt_to_drop(struct mlx5e_priv *priv, u16 ix);
/* Same, for every channel in @chs that has XSK queues. */
int mlx5e_xsk_redirect_rqts_to_channels(struct mlx5e_priv *priv, struct mlx5e_channels *chs);
void mlx5e_xsk_redirect_rqts_to_drop(struct mlx5e_priv *priv, struct mlx5e_channels *chs);
#endif /* __MLX5_EN_XSK_SETUP_H__ */
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */
#include "tx.h"
#include "umem.h"
#include "en/xdp.h"
#include "en/params.h"
#include <net/xdp_sock.h>
/* Kick transmission on the XSK queue @qid of @dev.
 *
 * Returns 0 on success, -ENETDOWN when XDP is not open on the device,
 * -EINVAL when @qid is not in the XSK RQ group, and -ENXIO when the channel
 * has no XSK queues.
 */
int mlx5e_xsk_async_xmit(struct net_device *dev, u32 qid)
{
	struct mlx5e_priv *priv = netdev_priv(dev);
	struct mlx5e_channel *chan;
	u16 ch_ix;

	if (unlikely(!mlx5e_xdp_is_open(priv)))
		return -ENETDOWN;

	if (unlikely(!mlx5e_qid_get_ch_if_in_group(&priv->channels.params, qid,
						   MLX5E_RQ_GROUP_XSK, &ch_ix)))
		return -EINVAL;

	chan = priv->channels.c[ch_ix];
	if (unlikely(!test_bit(MLX5E_CHANNEL_STATE_XSK, chan->state)))
		return -ENXIO;

	/* Nothing to force if NAPI is already scheduled (going by its name the
	 * helper then marks the poll as missed - confirm).
	 */
	if (napi_if_scheduled_mark_missed(&chan->napi))
		return 0;

	/* Otherwise trigger an IRQ through the XSK ICOSQ, under its lock. */
	spin_lock(&chan->xskicosq_lock);
	mlx5e_trigger_irq(&chan->xskicosq);
	spin_unlock(&chan->xskicosq_lock);

	return 0;
}
/* Account for a frame that could not be transmitted (because of its size).
 * Completions have to arrive in order, so a NOP is posted to get a CQE for
 * the frame. AF_XDP does not tell a successful TX from a failed one, so
 * mlx5e_poll_xdpsq_cq handles both the same way.
 */
static void mlx5e_xsk_tx_post_err(struct mlx5e_xdpsq *sq,
				  struct mlx5e_xdp_info *xdpi)
{
	struct mlx5e_xdp_wqe_info *nop_info;
	struct mlx5e_tx_wqe *nop;
	u16 slot;

	/* Describe the NOP as a one-WQEBB, one-packet WQE before posting it. */
	slot = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc);
	nop_info = &sq->db.wqe_info[slot];
	nop_info->num_wqebbs = 1;
	nop_info->num_pkts = 1;

	nop = mlx5e_post_nop(&sq->wq, sq->sqn, &sq->pc);
	mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi);
	sq->doorbell_cseg = &nop->ctrl;
}
/* Transmit up to @budget descriptors taken from the UMEM of @sq.
 *
 * Returns true when the whole budget was used or the SQ's frame check
 * failed, and false when the UMEM ran out of descriptors before the budget
 * was exhausted.
 */
bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
{
	struct xdp_umem *umem = sq->umem;
	struct mlx5e_xdp_xmit_data xdptxd;
	struct mlx5e_xdp_info xdpi;
	bool work_done = true;
	bool flush = false;

	xdpi.mode = MLX5E_XDP_XMIT_MODE_XSK;

	while (budget) {
		struct xdp_desc desc;
		int check_result;

		check_result = sq->xmit_xdp_frame_check(sq);
		if (unlikely(check_result < 0)) {
			work_done = false;
			break;
		}

		/* TX will get stuck until something wakes it up by triggering
		 * NAPI. Currently it's expected that the application calls
		 * sendto() if there are consumed, but not completed frames.
		 */
		if (!xsk_umem_consume_tx(umem, &desc))
			break;

		xdptxd.dma_addr = xdp_umem_get_dma(umem, desc.addr);
		xdptxd.data = xdp_umem_get_data(umem, desc.addr);
		xdptxd.len = desc.len;

		dma_sync_single_for_device(sq->pdev, xdptxd.dma_addr,
					   xdptxd.len, DMA_BIDIRECTIONAL);

		if (unlikely(!sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, check_result))) {
			/* Close an open MPWQE session before posting the NOP
			 * that stands in for the failed frame.
			 */
			if (sq->mpwqe.wqe)
				mlx5e_xdp_mpwqe_complete(sq);

			mlx5e_xsk_tx_post_err(sq, &xdpi);
		}

		flush = true;
		budget--;
	}

	if (flush) {
		if (sq->mpwqe.wqe)
			mlx5e_xdp_mpwqe_complete(sq);
		mlx5e_xmit_xdp_doorbell(sq);
		xsk_umem_consume_tx_done(umem);
	}

	return !budget || !work_done;
}
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/* Copyright (c) 2019 Mellanox Technologies. */
#ifndef __MLX5_EN_XSK_TX_H__
#define __MLX5_EN_XSK_TX_H__
#include "en.h"
/* TX data path */
/* NOTE(review): the definition is not fully visible from here; the visible
 * tail triggers an IRQ on the channel's xskicosq under xskicosq_lock and
 * returns 0. Confirm the full contract against en/xsk/tx.c.
 */
int mlx5e_xsk_async_xmit(struct net_device *dev, u32 qid);
/* Sends up to @budget frames from the UMEM TX ring on @sq. Returns true when
 * the budget was used up or the SQ check failed, false when the ring had
 * nothing more to consume.
 */
bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget);
#endif /* __MLX5_EN_XSK_TX_H__ */
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */
#include <net/xdp_sock.h>
#include "umem.h"
#include "setup.h"
#include "en/params.h"
/* DMA-map every page of @umem (PAGE_SIZE each, bidirectional) and record the
 * resulting address in umem->pages[].dma.
 *
 * Returns 0 on success. If any mapping fails, the pages mapped so far are
 * unmapped again, their dma fields are reset to 0 and -ENOMEM is returned.
 */
static int mlx5e_xsk_map_umem(struct mlx5e_priv *priv,
			      struct xdp_umem *umem)
{
	struct device *dev = priv->mdev->device;
	u32 mapped = 0;

	while (mapped < umem->npgs) {
		dma_addr_t addr;

		addr = dma_map_page(dev, umem->pgs[mapped], 0, PAGE_SIZE,
				    DMA_BIDIRECTIONAL);
		if (unlikely(dma_mapping_error(dev, addr)))
			break;

		umem->pages[mapped].dma = addr;
		mapped++;
	}

	if (mapped == umem->npgs)
		return 0;

	/* Roll back in reverse order; 'mapped' counts the pages that were
	 * mapped successfully before the failure.
	 */
	while (mapped) {
		mapped--;
		dma_unmap_page(dev, umem->pages[mapped].dma, PAGE_SIZE,
			       DMA_BIDIRECTIONAL);
		umem->pages[mapped].dma = 0;
	}

	return -ENOMEM;
}
/* Undo mlx5e_xsk_map_umem(): unmap each of the umem->npgs pages and clear the
 * stored DMA address.
 */
static void mlx5e_xsk_unmap_umem(struct mlx5e_priv *priv,
				 struct xdp_umem *umem)
{
	struct device *dev = priv->mdev->device;
	u32 pg = 0;

	while (pg < umem->npgs) {
		dma_unmap_page(dev, umem->pages[pg].dma, PAGE_SIZE,
			       DMA_BIDIRECTIONAL);
		umem->pages[pg].dma = 0;
		pg++;
	}
}
/* Take a reference on the per-channel UMEM table, allocating it (zeroed,
 * MLX5E_MAX_NUM_CHANNELS entries) on first use. Also marks the XSK state as
 * having been used at least once.
 *
 * Returns 0 on success or -ENOMEM if the table could not be allocated; no
 * reference is taken in the failure case.
 */
static int mlx5e_xsk_get_umems(struct mlx5e_xsk *xsk)
{
	if (!xsk->umems)
		xsk->umems = kcalloc(MLX5E_MAX_NUM_CHANNELS,
				     sizeof(*xsk->umems), GFP_KERNEL);

	if (unlikely(!xsk->umems))
		return -ENOMEM;

	xsk->ever_used = true;
	xsk->refcnt++;

	return 0;
}
/* Drop one reference on the UMEM table; free the table and reset the pointer
 * once the last reference is gone.
 */
static void mlx5e_xsk_put_umems(struct mlx5e_xsk *xsk)
{
	xsk->refcnt--;
	if (xsk->refcnt)
		return;

	kfree(xsk->umems);
	xsk->umems = NULL;
}
/* Register @umem in slot @ix of the UMEM table, taking a table reference
 * (and allocating the table if necessary).
 *
 * Returns 0 on success or the error from mlx5e_xsk_get_umems(), in which
 * case the table is left untouched.
 */
static int mlx5e_xsk_add_umem(struct mlx5e_xsk *xsk, struct xdp_umem *umem, u16 ix)
{
	int err = mlx5e_xsk_get_umems(xsk);

	if (!err)
		xsk->umems[ix] = umem;

	return err;
}
/* Clear slot @ix of the UMEM table and drop the table reference taken by
 * mlx5e_xsk_add_umem().
 */
static void mlx5e_xsk_remove_umem(struct mlx5e_xsk *xsk, u16 ix)
{
/* The slot must be cleared first: dropping the last reference frees the
 * table.
 */
xsk->umems[ix] = NULL;
mlx5e_xsk_put_umems(xsk);
}
/* A UMEM is accepted only if both its headroom and its chunk size without
 * headroom are at most 0xffff.
 */
static bool mlx5e_xsk_is_umem_sane(struct xdp_umem *umem)
{
	return !(umem->headroom > 0xffff || umem->chunk_size_nohr > 0xffff);
}
/* Fill @xsk from @umem: the headroom is copied as is, and the chunk size is
 * the UMEM chunk size without headroom plus that headroom.
 */
void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk)
{
	xsk->chunk_size = umem->chunk_size_nohr + umem->headroom;
	xsk->headroom = umem->headroom;
}
/* Attach @umem to channel @ix. The caller in this file,
 * mlx5e_xsk_enable_umem(), invokes this with priv->state_lock held.
 *
 * Returns 0 on success, -EBUSY if the channel already has a UMEM, -EINVAL if
 * the UMEM fails the sanity check or the resulting parameters fail
 * validation, or the error reported by the mapping, registration, open or
 * RQT redirect step. On failure every step completed so far is undone.
 */
static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
				   struct xdp_umem *umem, u16 ix)
{
	struct mlx5e_params *params = &priv->channels.params;
	struct mlx5e_xsk_param xsk;
	struct mlx5e_channel *c;
	int err;

	if (unlikely(mlx5e_xsk_get_umem(params, &priv->xsk, ix)))
		return -EBUSY;

	if (unlikely(!mlx5e_xsk_is_umem_sane(umem)))
		return -EINVAL;

	err = mlx5e_xsk_map_umem(priv, umem);
	if (unlikely(err))
		return err;

	err = mlx5e_xsk_add_umem(&priv->xsk, umem, ix);
	if (unlikely(err))
		goto err_unmap_umem;

	mlx5e_build_xsk_param(umem, &xsk);

	/* If the interface is not opened, XSK objects will be created on
	 * open. If it is opened but no XDP program is set, they will be
	 * created when a program is set and the channels are reopened.
	 */
	if (!test_bit(MLX5E_STATE_OPENED, &priv->state) || !params->xdp_prog) {
		/* Check the configuration in advance, rather than fail at a
		 * later stage (in mlx5e_xdp_set or on open) and end up with
		 * no channels.
		 */
		if (mlx5e_validate_xsk_param(params, &xsk, priv->mdev))
			return 0;

		err = -EINVAL;
		goto err_remove_umem;
	}

	c = priv->channels.c[ix];

	err = mlx5e_open_xsk(priv, params, &xsk, umem, c);
	if (unlikely(err))
		goto err_remove_umem;

	mlx5e_activate_xsk(c);

	/* Don't wait for WQEs, because the newer xdpsock sample doesn't provide
	 * any Fill Ring entries at the setup stage.
	 */

	err = mlx5e_xsk_redirect_rqt_to_channel(priv, priv->channels.c[ix]);
	if (unlikely(err))
		goto err_deactivate;

	return 0;

err_deactivate:
	mlx5e_deactivate_xsk(c);
	mlx5e_close_xsk(c);

err_remove_umem:
	mlx5e_xsk_remove_umem(&priv->xsk, ix);

err_unmap_umem:
	mlx5e_xsk_unmap_umem(priv, umem);

	return err;
}
/* Detach the UMEM registered for channel @ix. The caller in this file,
 * mlx5e_xsk_disable_umem(), invokes this with priv->state_lock held.
 *
 * Returns -EINVAL if no UMEM is registered for @ix, 0 otherwise.
 */
static int mlx5e_xsk_disable_locked(struct mlx5e_priv *priv, u16 ix)
{
	struct xdp_umem *umem;

	umem = mlx5e_xsk_get_umem(&priv->channels.params, &priv->xsk, ix);
	if (unlikely(!umem))
		return -EINVAL;

	/* XSK RQ and SQ are only created if the interface is opened and an
	 * XDP program is set; otherwise there is nothing to tear down.
	 */
	if (test_bit(MLX5E_STATE_OPENED, &priv->state) &&
	    priv->channels.params.xdp_prog) {
		struct mlx5e_channel *c = priv->channels.c[ix];

		mlx5e_xsk_redirect_rqt_to_drop(priv, ix);
		mlx5e_deactivate_xsk(c);
		mlx5e_close_xsk(c);
	}

	mlx5e_xsk_remove_umem(&priv->xsk, ix);
	mlx5e_xsk_unmap_umem(priv, umem);

	return 0;
}
/* Locked wrapper: runs mlx5e_xsk_enable_locked() under priv->state_lock and
 * passes its return value through.
 */
static int mlx5e_xsk_enable_umem(struct mlx5e_priv *priv, struct xdp_umem *umem,
				 u16 ix)
{
	int ret;

	mutex_lock(&priv->state_lock);
	ret = mlx5e_xsk_enable_locked(priv, umem, ix);
	mutex_unlock(&priv->state_lock);

	return ret;
}
/* Locked wrapper: runs mlx5e_xsk_disable_locked() under priv->state_lock and
 * passes its return value through.
 */
static int mlx5e_xsk_disable_umem(struct mlx5e_priv *priv, u16 ix)
{
	int ret;

	mutex_lock(&priv->state_lock);
	ret = mlx5e_xsk_disable_locked(priv, ix);
	mutex_unlock(&priv->state_lock);

	return ret;
}
/* Enable or disable a UMEM on queue @qid: a non-NULL @umem enables it, a NULL
 * @umem disables whatever is attached.
 *
 * Returns -EINVAL if @qid does not resolve to a channel in the XSK RQ group,
 * otherwise the result of the enable/disable operation.
 */
int mlx5e_xsk_setup_umem(struct net_device *dev, struct xdp_umem *umem, u16 qid)
{
	struct mlx5e_priv *priv = netdev_priv(dev);
	struct mlx5e_params *params = &priv->channels.params;
	u16 ix;

	if (unlikely(!mlx5e_qid_get_ch_if_in_group(params, qid, MLX5E_RQ_GROUP_XSK, &ix)))
		return -EINVAL;

	if (!umem)
		return mlx5e_xsk_disable_umem(priv, ix);

	return mlx5e_xsk_enable_umem(priv, umem, ix);
}
/* Install a reuse queue prepared for @nentries entries on @umem and free
 * whatever xsk_reuseq_swap() hands back (presumably the queue that was
 * installed before - confirm against the xsk core).
 *
 * Returns 0 on success or -ENOMEM if the new queue could not be prepared.
 */
int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries)
{
	struct xdp_umem_fq_reuse *fresh = xsk_reuseq_prepare(nentries);
	struct xdp_umem_fq_reuse *returned;

	if (unlikely(!fresh))
		return -ENOMEM;

	returned = xsk_reuseq_swap(umem, fresh);
	xsk_reuseq_free(returned);

	return 0;
}
/* Return the index just above the highest channel that has a UMEM
 * registered, i.e. the lowest index from which no channel up to
 * params->num_channels has a UMEM. Returns 0 when the UMEM table holds no
 * references or no channel has a UMEM.
 */
u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk)
{
	u16 ch;

	if (!xsk->refcnt)
		return 0;

	/* Scan downwards and stop at the first channel with a UMEM. */
	for (ch = params->num_channels; ch; ch--)
		if (mlx5e_xsk_get_umem(params, xsk, ch - 1))
			break;

	return ch;
}
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/* Copyright (c) 2019 Mellanox Technologies. */
#ifndef __MLX5_EN_XSK_UMEM_H__
#define __MLX5_EN_XSK_UMEM_H__
#include "en.h"
/* Look up the UMEM registered for channel @ix.
 *
 * Returns NULL when @xsk is NULL, when the UMEM table has not been allocated,
 * when @ix is not below params->num_channels, or when the slot is empty.
 */
static inline struct xdp_umem *mlx5e_xsk_get_umem(struct mlx5e_params *params,
						  struct mlx5e_xsk *xsk, u16 ix)
{
	if (!xsk || !xsk->umems || unlikely(ix >= params->num_channels))
		return NULL;

	return xsk->umems[ix];
}
/* Forward declaration; only a pointer to it is needed below. */
struct mlx5e_xsk_param;
/* Fills @xsk (headroom and chunk size) from @umem. */
void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk);
/* .ndo_bpf callback. */
/* A non-NULL @umem enables it on queue @qid, a NULL @umem disables the one
 * attached there.
 */
int mlx5e_xsk_setup_umem(struct net_device *dev, struct xdp_umem *umem, u16 qid);
/* Swaps in a reuse queue prepared for @nentries entries; -ENOMEM on failure. */
int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries);
/* Index just above the highest channel with a UMEM, or 0 if there is none. */
u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk);
#endif /* __MLX5_EN_XSK_UMEM_H__ */
......@@ -32,6 +32,7 @@
#include "en.h"
#include "en/port.h"
#include "en/xsk/umem.h"
#include "lib/clock.h"
void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv,
......@@ -388,8 +389,17 @@ static int mlx5e_set_ringparam(struct net_device *dev,
/* Report the maximum and current number of combined channels in @ch, under
 * priv->state_lock. While any XSK UMEM reference is held, both numbers are
 * doubled because the upper half of the queue range are XSK queues.
 */
void mlx5e_ethtool_get_channels(struct mlx5e_priv *priv,
				struct ethtool_channels *ch)
{
	mutex_lock(&priv->state_lock);

	ch->max_combined = mlx5e_get_netdev_max_channels(priv->netdev);
	ch->combined_count = priv->channels.params.num_channels;

	if (!priv->xsk.refcnt)
		goto unlock;

	/* The upper half are XSK queues. */
	ch->max_combined *= 2;
	ch->combined_count *= 2;

unlock:
	mutex_unlock(&priv->state_lock);
}
static void mlx5e_get_channels(struct net_device *dev,
......@@ -403,6 +413,7 @@ static void mlx5e_get_channels(struct net_device *dev,
int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv,
struct ethtool_channels *ch)
{
struct mlx5e_params *cur_params = &priv->channels.params;
unsigned int count = ch->combined_count;
struct mlx5e_channels new_channels = {};
bool arfs_enabled;
......@@ -414,16 +425,26 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv,
return -EINVAL;
}
if (priv->channels.params.num_channels == count)
if (cur_params->num_channels == count)
return 0;
mutex_lock(&priv->state_lock);
/* Don't allow changing the number of channels if there is an active
* XSK, because the numeration of the XSK and regular RQs will change.
*/
if (priv->xsk.refcnt) {
err = -EINVAL;
netdev_err(priv->netdev, "%s: AF_XDP is active, cannot change the number of channels\n",
__func__);
goto out;
}
new_channels.params = priv->channels.params;
new_channels.params.num_channels = count;
if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
priv->channels.params = new_channels.params;
*cur_params = new_channels.params;
if (!netif_is_rxfh_configured(priv->netdev))
mlx5e_build_default_indir_rqt(priv->rss_params.indirection_rqt,
MLX5E_INDIR_RQT_SIZE, count);
......
......@@ -32,6 +32,8 @@
#include <linux/mlx5/fs.h>
#include "en.h"
#include "en/params.h"
#include "en/xsk/umem.h"
struct mlx5e_ethtool_rule {
struct list_head list;
......@@ -414,6 +416,14 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv,
if (fs->ring_cookie == RX_CLS_FLOW_DISC) {
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP;
} else {
struct mlx5e_params *params = &priv->channels.params;
enum mlx5e_rq_group group;
struct mlx5e_tir *tir;
u16 ix;
mlx5e_qid_get_ch_and_group(params, fs->ring_cookie, &ix, &group);
tir = group == MLX5E_RQ_GROUP_XSK ? priv->xsk_tir : priv->direct_tir;
dst = kzalloc(sizeof(*dst), GFP_KERNEL);
if (!dst) {
err = -ENOMEM;
......@@ -421,7 +431,7 @@ add_ethtool_flow_rule(struct mlx5e_priv *priv,
}
dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
dst->tir_num = priv->direct_tir[fs->ring_cookie].tirn;
dst->tir_num = tir[ix].tirn;
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
}
......@@ -600,9 +610,9 @@ static int validate_flow(struct mlx5e_priv *priv,
if (fs->location >= MAX_NUM_OF_ETHTOOL_RULES)
return -ENOSPC;
if (fs->ring_cookie >= priv->channels.params.num_channels &&
fs->ring_cookie != RX_CLS_FLOW_DISC)
return -EINVAL;
if (fs->ring_cookie != RX_CLS_FLOW_DISC)
if (!mlx5e_qid_validate(&priv->channels.params, fs->ring_cookie))
return -EINVAL;
switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT)) {
case ETHER_FLOW:
......
......@@ -1519,7 +1519,7 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
if (err)
goto err_close_drop_rq;
err = mlx5e_create_direct_rqts(priv);
err = mlx5e_create_direct_rqts(priv, priv->direct_tir);
if (err)
goto err_destroy_indirect_rqts;
......@@ -1527,7 +1527,7 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
if (err)
goto err_destroy_direct_rqts;
err = mlx5e_create_direct_tirs(priv);
err = mlx5e_create_direct_tirs(priv, priv->direct_tir);
if (err)
goto err_destroy_indirect_tirs;
......@@ -1544,11 +1544,11 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
err_destroy_ttc_table:
mlx5e_destroy_ttc_table(priv, &priv->fs.ttc);
err_destroy_direct_tirs:
mlx5e_destroy_direct_tirs(priv);
mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
err_destroy_indirect_tirs:
mlx5e_destroy_indirect_tirs(priv, false);
err_destroy_direct_rqts:
mlx5e_destroy_direct_rqts(priv);
mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
err_destroy_indirect_rqts:
mlx5e_destroy_rqt(priv, &priv->indir_rqt);
err_close_drop_rq:
......@@ -1562,9 +1562,9 @@ static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv)
mlx5_del_flow_rules(rpriv->vport_rx_rule);
mlx5e_destroy_ttc_table(priv, &priv->fs.ttc);
mlx5e_destroy_direct_tirs(priv);
mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
mlx5e_destroy_indirect_tirs(priv, false);
mlx5e_destroy_direct_rqts(priv);
mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
mlx5e_destroy_rqt(priv, &priv->indir_rqt);
mlx5e_close_drop_rq(&priv->drop_rq);
}
......
......@@ -47,6 +47,7 @@
#include "en_accel/tls_rxtx.h"
#include "lib/clock.h"
#include "en/xdp.h"
#include "en/xsk/rx.h"
static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config)
{
......@@ -235,8 +236,8 @@ static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
return true;
}
static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info)
static inline int mlx5e_page_alloc_pool(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info)
{
if (mlx5e_rx_cache_get(rq, dma_info))
return 0;
......@@ -256,13 +257,23 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
return 0;
}
/* Allocate an RX page for @rq into @dma_info: from the UMEM when the RQ has
 * one attached, otherwise through mlx5e_page_alloc_pool(). Returns the
 * chosen allocator's result.
 */
static inline int mlx5e_page_alloc(struct mlx5e_rq *rq,
				   struct mlx5e_dma_info *dma_info)
{
	return rq->umem ? mlx5e_xsk_page_alloc_umem(rq, dma_info) :
			  mlx5e_page_alloc_pool(rq, dma_info);
}
/* Unmap the PAGE_SIZE DMA mapping at dma_info->addr, using the RQ's device
 * and its buffer mapping direction.
 */
void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info)
{
dma_unmap_page(rq->pdev, dma_info->addr, PAGE_SIZE, rq->buff.map_dir);
}
void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
bool recycle)
void mlx5e_page_release_dynamic(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info,
bool recycle)
{
if (likely(recycle)) {
if (mlx5e_rx_cache_put(rq, dma_info))
......@@ -277,6 +288,20 @@ void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
}
}
/* Release an RX page of @rq. RQs without a UMEM go through
 * mlx5e_page_release_dynamic(), which honours @recycle; RQs with a UMEM go
 * through mlx5e_xsk_page_release().
 */
static inline void mlx5e_page_release(struct mlx5e_rq *rq,
				      struct mlx5e_dma_info *dma_info,
				      bool recycle)
{
	if (!rq->umem) {
		mlx5e_page_release_dynamic(rq, dma_info, recycle);
		return;
	}

	/* The `recycle` parameter is ignored, and the page is always
	 * put into the Reuse Ring, because there is no way to return
	 * the page to the userspace when the interface goes down.
	 */
	mlx5e_xsk_page_release(rq, dma_info);
}
static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *frag)
{
......@@ -288,7 +313,7 @@ static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
* offset) should just use the new one without replenishing again
* by themselves.
*/
err = mlx5e_page_alloc_mapped(rq, frag->di);
err = mlx5e_page_alloc(rq, frag->di);
return err;
}
......@@ -354,6 +379,13 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
int err;
int i;
if (rq->umem) {
int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags;
if (unlikely(!mlx5e_xsk_pages_enough_umem(rq, pages_desired)))
return -ENOMEM;
}
for (i = 0; i < wqe_bulk; i++) {
struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, ix + i);
......@@ -401,11 +433,17 @@ mlx5e_copy_skb_header(struct device *pdev, struct sk_buff *skb,
static void
mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, bool recycle)
{
const bool no_xdp_xmit =
bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
bool no_xdp_xmit;
struct mlx5e_dma_info *dma_info = wi->umr.dma_info;
int i;
/* A common case for AF_XDP. */
if (bitmap_full(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE))
return;
no_xdp_xmit = bitmap_empty(wi->xdp_xmit_bitmap,
MLX5_MPWRQ_PAGES_PER_WQE);
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++)
if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap))
mlx5e_page_release(rq, &dma_info[i], recycle);
......@@ -427,11 +465,6 @@ static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq, u8 n)
mlx5_wq_ll_update_db_record(wq);
}
static inline u16 mlx5e_icosq_wrap_cnt(struct mlx5e_icosq *sq)
{
return mlx5_wq_cyc_get_ctr_wrap_cnt(&sq->wq, sq->pc);
}
static inline void mlx5e_fill_icosq_frag_edge(struct mlx5e_icosq *sq,
struct mlx5_wq_cyc *wq,
u16 pi, u16 nnops)
......@@ -459,6 +492,12 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
int err;
int i;
if (rq->umem &&
unlikely(!mlx5e_xsk_pages_enough_umem(rq, MLX5_MPWRQ_PAGES_PER_WQE))) {
err = -ENOMEM;
goto err;
}
pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc);
contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi);
if (unlikely(contig_wqebbs_room < MLX5E_UMR_WQEBBS)) {
......@@ -467,12 +506,10 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
}
umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
if (unlikely(mlx5e_icosq_wrap_cnt(sq) < 2))
memcpy(umr_wqe, &rq->mpwqe.umr_wqe,
offsetof(struct mlx5e_umr_wqe, inline_mtts));
memcpy(umr_wqe, &rq->mpwqe.umr_wqe, offsetof(struct mlx5e_umr_wqe, inline_mtts));
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) {
err = mlx5e_page_alloc_mapped(rq, dma_info);
err = mlx5e_page_alloc(rq, dma_info);
if (unlikely(err))
goto err_unmap;
umr_wqe->inline_mtts[i].ptag = cpu_to_be64(dma_info->addr | MLX5_EN_WR);
......@@ -487,6 +524,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset);
sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_UMR;
sq->db.ico_wqe[pi].umr.rq = rq;
sq->pc += MLX5E_UMR_WQEBBS;
sq->doorbell_cseg = &umr_wqe->ctrl;
......@@ -498,6 +536,8 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
dma_info--;
mlx5e_page_release(rq, dma_info, true);
}
err:
rq->stats->buff_alloc_err++;
return err;
......@@ -544,11 +584,10 @@ bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
return !!err;
}
static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
void mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
{
struct mlx5e_icosq *sq = container_of(cq, struct mlx5e_icosq, cq);
struct mlx5_cqe64 *cqe;
u8 completed_umr = 0;
u16 sqcc;
int i;
......@@ -589,7 +628,7 @@ static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
if (likely(wi->opcode == MLX5_OPCODE_UMR)) {
sqcc += MLX5E_UMR_WQEBBS;
completed_umr++;
wi->umr.rq->mpwqe.umr_completed++;
} else if (likely(wi->opcode == MLX5_OPCODE_NOP)) {
sqcc++;
} else {
......@@ -605,24 +644,25 @@ static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
sq->cc = sqcc;
mlx5_cqwq_update_db_record(&cq->wq);
if (likely(completed_umr)) {
mlx5e_post_rx_mpwqe(rq, completed_umr);
rq->mpwqe.umr_in_progress -= completed_umr;
}
}
bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
{
struct mlx5e_icosq *sq = &rq->channel->icosq;
struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
u8 umr_completed = rq->mpwqe.umr_completed;
int alloc_err = 0;
u8 missing, i;
u16 head;
if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
return false;
mlx5e_poll_ico_cq(&sq->cq, rq);
if (umr_completed) {
mlx5e_post_rx_mpwqe(rq, umr_completed);
rq->mpwqe.umr_in_progress -= umr_completed;
rq->mpwqe.umr_completed = 0;
}
missing = mlx5_wq_ll_missing(wq) - rq->mpwqe.umr_in_progress;
......@@ -636,7 +676,9 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
head = rq->mpwqe.actual_wq_head;
i = missing;
do {
if (unlikely(mlx5e_alloc_rx_mpwqe(rq, head)))
alloc_err = mlx5e_alloc_rx_mpwqe(rq, head);
if (unlikely(alloc_err))
break;
head = mlx5_wq_ll_get_wqe_next_ix(wq, head);
} while (--i);
......@@ -650,6 +692,12 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
rq->mpwqe.umr_in_progress += rq->mpwqe.umr_last_bulk;
rq->mpwqe.actual_wq_head = head;
/* If XSK Fill Ring doesn't have enough frames, busy poll by
* rescheduling the NAPI poll.
*/
if (unlikely(alloc_err == -ENOMEM && rq->umem))
return true;
return false;
}
......@@ -1018,7 +1066,7 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
}
rcu_read_lock();
consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt);
consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, false);
rcu_read_unlock();
if (consumed)
return NULL; /* page/packet was consumed by XDP */
......@@ -1235,7 +1283,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
prefetch(data);
rcu_read_lock();
consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32);
consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, false);
rcu_read_unlock();
if (consumed) {
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
......
......@@ -46,6 +46,8 @@
#define MLX5E_DECLARE_TX_STAT(type, fld) "tx%d_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_XDPSQ_STAT(type, fld) "tx%d_xdp_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_RQ_XDPSQ_STAT(type, fld) "rx%d_xdp_tx_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_XSKRQ_STAT(type, fld) "rx%d_xsk_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_XSKSQ_STAT(type, fld) "tx%d_xsk_"#fld, offsetof(type, fld)
#define MLX5E_DECLARE_CH_STAT(type, fld) "ch%d_"#fld, offsetof(type, fld)
struct counter_desc {
......@@ -116,12 +118,39 @@ struct mlx5e_sw_stats {
u64 ch_poll;
u64 ch_arm;
u64 ch_aff_change;
u64 ch_force_irq;
u64 ch_eq_rearm;
#ifdef CONFIG_MLX5_EN_TLS
u64 tx_tls_ooo;
u64 tx_tls_resync_bytes;
#endif
u64 rx_xsk_packets;
u64 rx_xsk_bytes;
u64 rx_xsk_csum_complete;
u64 rx_xsk_csum_unnecessary;
u64 rx_xsk_csum_unnecessary_inner;
u64 rx_xsk_csum_none;
u64 rx_xsk_ecn_mark;
u64 rx_xsk_removed_vlan_packets;
u64 rx_xsk_xdp_drop;
u64 rx_xsk_xdp_redirect;
u64 rx_xsk_wqe_err;
u64 rx_xsk_mpwqe_filler_cqes;
u64 rx_xsk_mpwqe_filler_strides;
u64 rx_xsk_oversize_pkts_sw_drop;
u64 rx_xsk_buff_alloc_err;
u64 rx_xsk_cqe_compress_blks;
u64 rx_xsk_cqe_compress_pkts;
u64 rx_xsk_congst_umr;
u64 rx_xsk_arfs_err;
u64 tx_xsk_xmit;
u64 tx_xsk_mpwqe;
u64 tx_xsk_inlnw;
u64 tx_xsk_full;
u64 tx_xsk_err;
u64 tx_xsk_cqes;
};
struct mlx5e_qcounter_stats {
......@@ -256,6 +285,7 @@ struct mlx5e_ch_stats {
u64 poll;
u64 arm;
u64 aff_change;
u64 force_irq;
u64 eq_rearm;
};
......
......@@ -33,6 +33,7 @@
#include <linux/irq.h>
#include "en.h"
#include "en/xdp.h"
#include "en/xsk/tx.h"
static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
{
......@@ -87,7 +88,12 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
napi);
struct mlx5e_ch_stats *ch_stats = c->stats;
struct mlx5e_xdpsq *xsksq = &c->xsksq;
struct mlx5e_rq *xskrq = &c->xskrq;
struct mlx5e_rq *rq = &c->rq;
bool xsk_open = test_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
bool aff_change = false;
bool busy_xsk = false;
bool busy = false;
int work_done = 0;
int i;
......@@ -97,22 +103,38 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
for (i = 0; i < c->num_tc; i++)
busy |= mlx5e_poll_tx_cq(&c->sq[i].cq, budget);
busy |= mlx5e_poll_xdpsq_cq(&c->xdpsq.cq, NULL);
busy |= mlx5e_poll_xdpsq_cq(&c->xdpsq.cq);
if (c->xdp)
busy |= mlx5e_poll_xdpsq_cq(&rq->xdpsq.cq, rq);
busy |= mlx5e_poll_xdpsq_cq(&c->rq_xdpsq.cq);
if (likely(budget)) { /* budget=0 means: don't poll rx rings */
work_done = mlx5e_poll_rx_cq(&rq->cq, budget);
if (xsk_open)
work_done = mlx5e_poll_rx_cq(&xskrq->cq, budget);
if (likely(budget - work_done))
work_done += mlx5e_poll_rx_cq(&rq->cq, budget - work_done);
busy |= work_done == budget;
}
busy |= c->rq.post_wqes(rq);
mlx5e_poll_ico_cq(&c->icosq.cq);
busy |= rq->post_wqes(rq);
if (xsk_open) {
mlx5e_poll_ico_cq(&c->xskicosq.cq);
busy |= mlx5e_poll_xdpsq_cq(&xsksq->cq);
busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
busy_xsk |= xskrq->post_wqes(xskrq);
}
busy |= busy_xsk;
if (busy) {
if (likely(mlx5e_channel_no_affinity_change(c)))
return budget;
ch_stats->aff_change++;
aff_change = true;
if (budget && work_done == budget)
work_done--;
}
......@@ -133,6 +155,18 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
mlx5e_cq_arm(&c->icosq.cq);
mlx5e_cq_arm(&c->xdpsq.cq);
if (xsk_open) {
mlx5e_handle_rx_dim(xskrq);
mlx5e_cq_arm(&c->xskicosq.cq);
mlx5e_cq_arm(&xsksq->cq);
mlx5e_cq_arm(&xskrq->cq);
}
if (unlikely(aff_change && busy_xsk)) {
mlx5e_trigger_irq(&c->icosq);
ch_stats->force_irq++;
}
return work_done;
}
......
......@@ -87,7 +87,7 @@ int mlx5i_init(struct mlx5_core_dev *mdev,
mlx5e_set_netdev_mtu_boundaries(priv);
netdev->mtu = netdev->max_mtu;
mlx5e_build_nic_params(mdev, &priv->rss_params, &priv->channels.params,
mlx5e_build_nic_params(mdev, NULL, &priv->rss_params, &priv->channels.params,
mlx5e_get_netdev_max_channels(netdev),
netdev->mtu);
mlx5i_build_nic_params(mdev, &priv->channels.params);
......@@ -365,7 +365,7 @@ static int mlx5i_init_rx(struct mlx5e_priv *priv)
if (err)
goto err_close_drop_rq;
err = mlx5e_create_direct_rqts(priv);
err = mlx5e_create_direct_rqts(priv, priv->direct_tir);
if (err)
goto err_destroy_indirect_rqts;
......@@ -373,7 +373,7 @@ static int mlx5i_init_rx(struct mlx5e_priv *priv)
if (err)
goto err_destroy_direct_rqts;
err = mlx5e_create_direct_tirs(priv);
err = mlx5e_create_direct_tirs(priv, priv->direct_tir);
if (err)
goto err_destroy_indirect_tirs;
......@@ -384,11 +384,11 @@ static int mlx5i_init_rx(struct mlx5e_priv *priv)
return 0;
err_destroy_direct_tirs:
mlx5e_destroy_direct_tirs(priv);
mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
err_destroy_indirect_tirs:
mlx5e_destroy_indirect_tirs(priv, true);
err_destroy_direct_rqts:
mlx5e_destroy_direct_rqts(priv);
mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
err_destroy_indirect_rqts:
mlx5e_destroy_rqt(priv, &priv->indir_rqt);
err_close_drop_rq:
......@@ -401,9 +401,9 @@ static int mlx5i_init_rx(struct mlx5e_priv *priv)
static void mlx5i_cleanup_rx(struct mlx5e_priv *priv)
{
mlx5i_destroy_flow_steering(priv);
mlx5e_destroy_direct_tirs(priv);
mlx5e_destroy_direct_tirs(priv, priv->direct_tir);
mlx5e_destroy_indirect_tirs(priv, true);
mlx5e_destroy_direct_rqts(priv);
mlx5e_destroy_direct_rqts(priv, priv->direct_tir);
mlx5e_destroy_rqt(priv, &priv->indir_rqt);
mlx5e_close_drop_rq(&priv->drop_rq);
mlx5e_destroy_q_counters(priv);
......
......@@ -134,11 +134,6 @@ static inline void mlx5_wq_cyc_update_db_record(struct mlx5_wq_cyc *wq)
*wq->db = cpu_to_be32(wq->wqe_ctr);
}
static inline u16 mlx5_wq_cyc_get_ctr_wrap_cnt(struct mlx5_wq_cyc *wq, u16 ctr)
{
return ctr >> wq->fbc.log_sz;
}
static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr)
{
return ctr & wq->fbc.sz_m1;
......
......@@ -77,10 +77,11 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
void xsk_flush(struct xdp_sock *xs);
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
/* Used from netdev driver */
bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt);
u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
void xsk_umem_discard_addr(struct xdp_umem *umem);
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
void xsk_umem_consume_tx_done(struct xdp_umem *umem);
struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
......@@ -99,6 +100,16 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
}
/* Reuse-queue aware version of FILL queue helpers */
/* Report whether at least @cnt addresses are available, counting the entries
 * already held in the UMEM's reuse queue first and asking
 * xsk_umem_has_addrs() only for the remainder.
 */
static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
{
	struct xdp_umem_fq_reuse *reuse = umem->fq_reuse;

	return reuse->length >= cnt ||
	       xsk_umem_has_addrs(umem, cnt - reuse->length);
}
static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
{
struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
......@@ -146,6 +157,11 @@ static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
return false;
}
/* Always reports that no addresses are available.
 * NOTE(review): looks like the stub variant for builds without the real
 * implementation - confirm against the enclosing preprocessor condition.
 */
static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
return false;
}
static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
return NULL;
......@@ -159,8 +175,8 @@ static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
}
static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma,
u32 *len)
static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
struct xdp_desc *desc)
{
return false;
}
......@@ -200,6 +216,11 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
return 0;
}
/* Always reports that no addresses are available.
 * NOTE(review): looks like the stub variant for builds without the real
 * implementation - confirm against the enclosing preprocessor condition.
 */
static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
{
return false;
}
static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
{
return NULL;
......
......@@ -46,6 +46,7 @@ struct xdp_mmap_offsets {
#define XDP_UMEM_FILL_RING 5
#define XDP_UMEM_COMPLETION_RING 6
#define XDP_STATISTICS 7
#define XDP_OPTIONS 8
struct xdp_umem_reg {
__u64 addr; /* Start of packet data area */
......@@ -60,6 +61,13 @@ struct xdp_statistics {
__u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
};
/* Payload of the XDP_OPTIONS socket option. */
struct xdp_options {
__u32 flags; /* Bitmask of XDP_OPTIONS_* values */
};
/* Flags for the flags field of struct xdp_options */
#define XDP_OPTIONS_ZEROCOPY (1 << 0)
/* Pgoff for mmaping the rings */
#define XDP_PGOFF_RX_RING 0
#define XDP_PGOFF_TX_RING 0x80000000
......
......@@ -37,6 +37,12 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
READ_ONCE(xs->umem->fq);
}
/* Report whether the UMEM's fq queue holds at least @cnt entries; exported
 * wrapper around xskq_has_addrs().
 */
bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
return xskq_has_addrs(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);
u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
return xskq_peek_addr(umem->fq, addr);
......@@ -166,22 +172,18 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem)
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);
bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
struct xdp_desc desc;
struct xdp_sock *xs;
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
if (!xskq_peek_desc(xs->tx, &desc))
if (!xskq_peek_desc(xs->tx, desc))
continue;
if (xskq_produce_addr_lazy(umem->cq, desc.addr))
if (xskq_produce_addr_lazy(umem->cq, desc->addr))
goto out;
*dma = xdp_umem_get_dma(umem, desc.addr);
*len = desc.len;
xskq_discard_desc(xs->tx);
rcu_read_unlock();
return true;
......@@ -644,6 +646,26 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname,
return 0;
}
case XDP_OPTIONS:
{
struct xdp_options opts = {};
if (len < sizeof(opts))
return -EINVAL;
mutex_lock(&xs->mutex);
if (xs->zc)
opts.flags |= XDP_OPTIONS_ZEROCOPY;
mutex_unlock(&xs->mutex);
len = sizeof(opts);
if (copy_to_user(optval, &opts, len))
return -EFAULT;
if (put_user(len, optlen))
return -EFAULT;
return 0;
}
default:
break;
}
......
......@@ -117,6 +117,20 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
return q->nentries - (producer - q->cons_tail);
}
/* Report whether queue @q holds at least @cnt entries. The cached producer
 * position is tried first; only if that is not enough is it refreshed from
 * the shared ring and the check repeated.
 */
static inline bool xskq_has_addrs(struct xsk_queue *q, u32 cnt)
{
	u32 avail = q->prod_tail - q->cons_tail;

	if (avail < cnt) {
		/* Refresh the local pointer. */
		q->prod_tail = READ_ONCE(q->ring->producer);
		avail = q->prod_tail - q->cons_tail;
	}

	return avail >= cnt;
}
/* UMEM queue */
static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
......
......@@ -68,6 +68,7 @@ static int opt_queue;
static int opt_poll;
static int opt_interval = 1;
static u32 opt_xdp_bind_flags;
static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
static __u32 prog_id;
struct xsk_umem_info {
......@@ -276,6 +277,12 @@ static size_t gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
{
struct xsk_umem_info *umem;
struct xsk_umem_config cfg = {
.fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS,
.comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
.frame_size = opt_xsk_frame_size,
.frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
};
int ret;
umem = calloc(1, sizeof(*umem));
......@@ -283,7 +290,7 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
exit_with_error(errno);
ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
NULL);
&cfg);
if (ret)
exit_with_error(-ret);
......@@ -323,11 +330,9 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem)
&idx);
if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS)
exit_with_error(-ret);
for (i = 0;
i < XSK_RING_PROD__DEFAULT_NUM_DESCS *
XSK_UMEM__DEFAULT_FRAME_SIZE;
i += XSK_UMEM__DEFAULT_FRAME_SIZE)
*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = i;
for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++)
*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) =
i * opt_xsk_frame_size;
xsk_ring_prod__submit(&xsk->umem->fq,
XSK_RING_PROD__DEFAULT_NUM_DESCS);
......@@ -346,6 +351,7 @@ static struct option long_options[] = {
{"interval", required_argument, 0, 'n'},
{"zero-copy", no_argument, 0, 'z'},
{"copy", no_argument, 0, 'c'},
{"frame-size", required_argument, 0, 'f'},
{0, 0, 0, 0}
};
......@@ -365,8 +371,9 @@ static void usage(const char *prog)
" -n, --interval=n Specify statistics update interval (default 1 sec).\n"
" -z, --zero-copy Force zero-copy mode.\n"
" -c, --copy Force copy mode.\n"
" -f, --frame-size=n Set the frame size (must be a power of two, default is %d).\n"
"\n";
fprintf(stderr, str, prog);
fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE);
exit(EXIT_FAILURE);
}
......@@ -377,7 +384,7 @@ static void parse_command_line(int argc, char **argv)
opterr = 0;
for (;;) {
c = getopt_long(argc, argv, "Frtli:q:psSNn:cz", long_options,
c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options,
&option_index);
if (c == -1)
break;
......@@ -420,6 +427,9 @@ static void parse_command_line(int argc, char **argv)
case 'F':
opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
break;
case 'f':
opt_xsk_frame_size = atoi(optarg);
break;
default:
usage(basename(argv[0]));
}
......@@ -432,6 +442,11 @@ static void parse_command_line(int argc, char **argv)
usage(basename(argv[0]));
}
if (opt_xsk_frame_size & (opt_xsk_frame_size - 1)) {
fprintf(stderr, "--frame-size=%d is not a power of two\n",
opt_xsk_frame_size);
usage(basename(argv[0]));
}
}
static void kick_tx(struct xsk_socket_info *xsk)
......@@ -583,8 +598,7 @@ static void tx_only(struct xsk_socket_info *xsk)
for (i = 0; i < BATCH_SIZE; i++) {
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
= (frame_nb + i) <<
XSK_UMEM__DEFAULT_FRAME_SHIFT;
= (frame_nb + i) * opt_xsk_frame_size;
xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
sizeof(pkt_data) - 1;
}
......@@ -661,21 +675,19 @@ int main(int argc, char **argv)
}
ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE);
NUM_FRAMES * opt_xsk_frame_size);
if (ret)
exit_with_error(ret);
/* Create sockets... */
umem = xsk_configure_umem(bufs,
NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE);
umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size);
xsks[num_socks++] = xsk_configure_socket(umem);
if (opt_bench == BENCH_TXONLY) {
int i;
for (i = 0; i < NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
i += XSK_UMEM__DEFAULT_FRAME_SIZE)
(void)gen_eth_frame(umem, i);
for (i = 0; i < NUM_FRAMES; i++)
(void)gen_eth_frame(umem, i * opt_xsk_frame_size);
}
signal(SIGINT, int_exit);
......
......@@ -46,6 +46,7 @@ struct xdp_mmap_offsets {
#define XDP_UMEM_FILL_RING 5
#define XDP_UMEM_COMPLETION_RING 6
#define XDP_STATISTICS 7
#define XDP_OPTIONS 8
struct xdp_umem_reg {
__u64 addr; /* Start of packet data area */
......@@ -60,6 +61,13 @@ struct xdp_statistics {
__u64 tx_invalid_descs; /* Dropped due to invalid descriptor */
};
/*
 * Option block for an XDP socket, retrieved with
 * getsockopt(fd, SOL_XDP, XDP_OPTIONS, ...).
 */
struct xdp_options {
__u32 flags; /* Bitmask of XDP_OPTIONS_* values, e.g. XDP_OPTIONS_ZEROCOPY */
};
/* Flags for the flags field of struct xdp_options */
#define XDP_OPTIONS_ZEROCOPY (1 << 0)
/* Pgoff for mmapping the rings */
#define XDP_PGOFF_RX_RING 0
#define XDP_PGOFF_TX_RING 0x80000000
......
......@@ -65,6 +65,7 @@ struct xsk_socket {
int xsks_map_fd;
__u32 queue_id;
char ifname[IFNAMSIZ];
bool zc;
};
struct xsk_nl_info {
......@@ -480,6 +481,7 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
void *rx_map = NULL, *tx_map = NULL;
struct sockaddr_xdp sxdp = {};
struct xdp_mmap_offsets off;
struct xdp_options opts;
struct xsk_socket *xsk;
socklen_t optlen;
int err;
......@@ -597,6 +599,16 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
}
xsk->prog_fd = -1;
optlen = sizeof(opts);
err = getsockopt(xsk->fd, SOL_XDP, XDP_OPTIONS, &opts, &optlen);
if (err) {
err = -errno;
goto out_mmap_tx;
}
xsk->zc = opts.flags & XDP_OPTIONS_ZEROCOPY;
if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
err = xsk_setup_xdp_prog(xsk);
if (err)
......
......@@ -167,7 +167,7 @@ LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk);
#define XSK_RING_CONS__DEFAULT_NUM_DESCS 2048
#define XSK_RING_PROD__DEFAULT_NUM_DESCS 2048
#define XSK_UMEM__DEFAULT_FRAME_SHIFT 11 /* 2048 bytes */
#define XSK_UMEM__DEFAULT_FRAME_SHIFT 12 /* 4096 bytes */
#define XSK_UMEM__DEFAULT_FRAME_SIZE (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT)
#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment