Commit 1df845be authored by David S. Miller's avatar David S. Miller

Merge branch 'mlx5-next'

Saeed Mahameed says:

====================
Mellanox 100G mlx5 driver receive path optimizations

Changes from V2:
	- Rebased to 46e7b8d8 ("net: dsa: kill circular reference with slave priv")
	- Updated: ("net/mlx5e: Support RX multi-packet WQE (Striding RQ)")
		* Per Eric Dumazet comment we changed the driver memory handling scheme to
		work with order-0 pages rather than order-5 via split_page().
		* This means that now a mlx5e rx skb can hold one or (more in case of HW LRO)
                skb frag each pointing to a 4K order-0 page rather than one frag with order-5 page.
	- Updated: ("net/mlx5e: Add fragmented memory support for RX multi packet WQE")
		* Code refactoring and code reuse due the split_page() mechanism,
		  now the MPWQE and fragmented MPWQE handling almost look the same,
		  and share most of the code.
	- In some cases we see 2%-3% packet rate degradation in comparison to the order-5 pages approach,
	  due to split_page() cpu consumption, but still we do see 3%-10% improvement in comparison to the
          current linear SKB approach.
	- We do believe that now the driver memory scheme is significantly less vulnerable
	  to the memory DOS attack Eric pointed at.

Changes from V1:
	- Rebased to efde611b ("Merge branch 'nfp-next'")
	- Dropped: ("net/mlx5: Refactor mlx5_core_mr to mkey")
                Already merged into 4.6 from rdma tree.
	- Dropped: ("net/mlx5_core: Add ConnectX-5 to list of supported devices")
                Will be pushed to net as we want it in 4.6 release.
	- Dropped: ("net/mlx5e: Change RX moderation period to be based on CQE")
                Will be pushed in a later series with full software based adaptive moderation.
	- Added: ("net/mlx5e: Delay skb->data access")
		Small trivial optimization.
	- Updated: ("net/mlx5e: Support RX multi-packet WQE (Striding RQ)")
	 	Changed Striding RQ defaults to:
			> 	NUM WQEs = 16
			> 	Strides Per WQE = 1024
			> 	Stride Size = 128
	- Updated: ("net/mlx5e: Use napi_alloc_skb for RX SKB allocations")
		Consider the IP packet alignment already done in napi_alloc_skb.

Changes from V0:
	- Fixed a typo in commit message reported by Sergei
	- Align SKB fragments truesize to stride size
	- Use skb_add_rx_frag and remove the use of SKB_TRUESIZE
	- Fix: # MTTs alignment on Power PC
	- Fix: Free original (unaligned) pointer of MTT array
	- Use dev_alloc_pages and dev_alloc_page
	- Extend the stats.buff_alloc_err counter
	- Reform the copying of packet header into skb linear data
	- Add compiler hints for conditional statements
	- Prefetch skd->data prior to copying packet header into it
	- Rework: mlx5e_complete_rx_fragmented_mpwqe
	- Handle SKB fragments before linear data
	- Dropped ("net/mlx5e: Prefetch next RX CQE") for now
	- Added a small patch that Adds ConnectX-5 devices to the list of supported devices
	- Rebased to 1cdba550 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next")

This series includes Some RX modifications and optimizations for
the mlx5 Ethernet driver.

From Rana, we have one patch that adds the support for Connectx-4
queue counters.

From Tariq, several patches that are centralized around improving
RX path message rate, CPU and Memory utilization, in each patch
commit message you will find the performance improvements numbers
related to that specific patch.

In the 2nd patch we used a queue counter to report "out of buffer"
dropped packet count, "Dropped packets due to lack of software resources"

3rd patch modifies the driver's to RSS default value to be spread along the
close NUMA node cores only for better out of the box experience.

In the 4th and 5th patches we utilized the use of RX multi-packet WQE
(Striding RQ) for better memory utilization especially in case of hardware
LRO is enabled and for better message rate for small packets.

In the 6th and 7th patches we added a fallback mechanism to use fragmented
memory when allocating large WQE strides fails, using UMR
(User Memory Registration) and ICO (Internal Control Operations) SQs.

In the 8th to 11th patches we did some small modification which show some small
extra improvements.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents b8fd789a 54984407
......@@ -165,6 +165,8 @@ static const struct {
},
};
#define MLX5E_NUM_Q_CNTRS(priv) (NUM_Q_COUNTERS * (!!priv->q_counter))
static int mlx5e_get_sset_count(struct net_device *dev, int sset)
{
struct mlx5e_priv *priv = netdev_priv(dev);
......@@ -172,6 +174,7 @@ static int mlx5e_get_sset_count(struct net_device *dev, int sset)
switch (sset) {
case ETH_SS_STATS:
return NUM_VPORT_COUNTERS + NUM_PPORT_COUNTERS +
MLX5E_NUM_Q_CNTRS(priv) +
priv->params.num_channels * NUM_RQ_STATS +
priv->params.num_channels * priv->params.num_tc *
NUM_SQ_STATS;
......@@ -200,6 +203,11 @@ static void mlx5e_get_strings(struct net_device *dev,
strcpy(data + (idx++) * ETH_GSTRING_LEN,
vport_strings[i]);
/* Q counters */
for (i = 0; i < MLX5E_NUM_Q_CNTRS(priv); i++)
strcpy(data + (idx++) * ETH_GSTRING_LEN,
qcounter_stats_strings[i]);
/* PPORT counters */
for (i = 0; i < NUM_PPORT_COUNTERS; i++)
strcpy(data + (idx++) * ETH_GSTRING_LEN,
......@@ -240,6 +248,9 @@ static void mlx5e_get_ethtool_stats(struct net_device *dev,
for (i = 0; i < NUM_VPORT_COUNTERS; i++)
data[idx++] = ((u64 *)&priv->stats.vport)[i];
for (i = 0; i < MLX5E_NUM_Q_CNTRS(priv); i++)
data[idx++] = ((u32 *)&priv->stats.qcnt)[i];
for (i = 0; i < NUM_PPORT_COUNTERS; i++)
data[idx++] = be64_to_cpu(((__be64 *)&priv->stats.pport)[i]);
......@@ -262,8 +273,9 @@ static void mlx5e_get_ringparam(struct net_device *dev,
struct ethtool_ringparam *param)
{
struct mlx5e_priv *priv = netdev_priv(dev);
int rq_wq_type = priv->params.rq_wq_type;
param->rx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE;
param->rx_max_pending = 1 << mlx5_max_log_rq_size(rq_wq_type);
param->tx_max_pending = 1 << MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE;
param->rx_pending = 1 << priv->params.log_rq_size;
param->tx_pending = 1 << priv->params.log_sq_size;
......@@ -274,6 +286,7 @@ static int mlx5e_set_ringparam(struct net_device *dev,
{
struct mlx5e_priv *priv = netdev_priv(dev);
bool was_opened;
int rq_wq_type = priv->params.rq_wq_type;
u16 min_rx_wqes;
u8 log_rq_size;
u8 log_sq_size;
......@@ -289,16 +302,16 @@ static int mlx5e_set_ringparam(struct net_device *dev,
__func__);
return -EINVAL;
}
if (param->rx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE)) {
if (param->rx_pending < (1 << mlx5_min_log_rq_size(rq_wq_type))) {
netdev_info(dev, "%s: rx_pending (%d) < min (%d)\n",
__func__, param->rx_pending,
1 << MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE);
1 << mlx5_min_log_rq_size(rq_wq_type));
return -EINVAL;
}
if (param->rx_pending > (1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE)) {
if (param->rx_pending > (1 << mlx5_max_log_rq_size(rq_wq_type))) {
netdev_info(dev, "%s: rx_pending (%d) > max (%d)\n",
__func__, param->rx_pending,
1 << MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE);
1 << mlx5_max_log_rq_size(rq_wq_type));
return -EINVAL;
}
if (param->tx_pending < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) {
......@@ -316,8 +329,7 @@ static int mlx5e_set_ringparam(struct net_device *dev,
log_rq_size = order_base_2(param->rx_pending);
log_sq_size = order_base_2(param->tx_pending);
min_rx_wqes = min_t(u16, param->rx_pending - 1,
MLX5E_PARAMS_DEFAULT_MIN_RX_WQES);
min_rx_wqes = mlx5_min_rx_wqes(rq_wq_type, param->rx_pending);
if (log_rq_size == priv->params.log_rq_size &&
log_sq_size == priv->params.log_sq_size &&
......@@ -386,7 +398,7 @@ static int mlx5e_set_channels(struct net_device *dev,
mlx5e_close_locked(dev);
priv->params.num_channels = count;
mlx5e_build_default_indir_rqt(priv->params.indirection_rqt,
mlx5e_build_default_indir_rqt(priv->mdev, priv->params.indirection_rqt,
MLX5E_INDIR_RQT_SIZE, count);
if (was_opened)
......
......@@ -54,10 +54,11 @@ void mlx5e_send_nop(struct mlx5e_sq *sq, bool notify_hw)
sq->skb[pi] = NULL;
sq->pc++;
sq->stats.nop++;
if (notify_hw) {
cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
mlx5e_tx_notify_hw(sq, wqe, 0);
mlx5e_tx_notify_hw(sq, &wqe->ctrl, 0);
}
}
......@@ -309,7 +310,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb)
bf_sz = wi->num_wqebbs << 3;
cseg->fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
mlx5e_tx_notify_hw(sq, wqe, bf_sz);
mlx5e_tx_notify_hw(sq, &wqe->ctrl, bf_sz);
}
/* fill sq edge with nops to avoid wqe wrap around */
......@@ -387,7 +388,6 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
wi = &sq->wqe_info[ci];
if (unlikely(!skb)) { /* nop */
sq->stats.nop++;
sqcc++;
continue;
}
......
......@@ -49,6 +49,60 @@ struct mlx5_cqe64 *mlx5e_get_cqe(struct mlx5e_cq *cq)
return cqe;
}
static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
{
struct mlx5_wq_cyc *wq;
struct mlx5_cqe64 *cqe;
struct mlx5e_sq *sq;
u16 sqcc;
cqe = mlx5e_get_cqe(cq);
if (likely(!cqe))
return;
sq = container_of(cq, struct mlx5e_sq, cq);
wq = &sq->wq;
/* sq->cc must be updated only after mlx5_cqwq_update_db_record(),
* otherwise a cq overrun may occur
*/
sqcc = sq->cc;
do {
u16 ci = be16_to_cpu(cqe->wqe_counter) & wq->sz_m1;
struct mlx5e_ico_wqe_info *icowi = &sq->ico_wqe_info[ci];
mlx5_cqwq_pop(&cq->wq);
sqcc += icowi->num_wqebbs;
if (unlikely((cqe->op_own >> 4) != MLX5_CQE_REQ)) {
WARN_ONCE(true, "mlx5e: Bad OP in ICOSQ CQE: 0x%x\n",
cqe->op_own);
break;
}
switch (icowi->opcode) {
case MLX5_OPCODE_NOP:
break;
case MLX5_OPCODE_UMR:
mlx5e_post_rx_fragmented_mpwqe(&sq->channel->rq);
break;
default:
WARN_ONCE(true,
"mlx5e: Bad OPCODE in ICOSQ WQE info: 0x%x\n",
icowi->opcode);
}
} while ((cqe = mlx5e_get_cqe(cq)));
mlx5_cqwq_update_db_record(&cq->wq);
/* ensure cq space is freed before enabling more cqes */
wmb();
sq->cc = sqcc;
}
int mlx5e_napi_poll(struct napi_struct *napi, int budget)
{
struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
......@@ -64,6 +118,9 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
work_done = mlx5e_poll_rx_cq(&c->rq.cq, budget);
busy |= work_done == budget;
mlx5e_poll_ico_cq(&c->icosq.cq);
busy |= mlx5e_post_rx_wqes(&c->rq);
if (busy)
......@@ -80,6 +137,7 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
for (i = 0; i < c->num_tc; i++)
mlx5e_cq_arm(&c->sq[i].cq);
mlx5e_cq_arm(&c->rq.cq);
mlx5e_cq_arm(&c->icosq.cq);
return work_done;
}
......@@ -89,7 +147,6 @@ void mlx5e_completion_event(struct mlx5_core_cq *mcq)
struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq);
set_bit(MLX5E_CHANNEL_NAPI_SCHED, &cq->channel->flags);
barrier();
napi_schedule(cq->napi);
}
......
......@@ -538,3 +538,71 @@ void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
mlx5_core_destroy_sq(dev, sq->qpn);
}
EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked);
int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id)
{
u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)];
u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)];
int err;
memset(in, 0, sizeof(in));
memset(out, 0, sizeof(out));
MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
err = mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out));
if (!err)
*counter_id = MLX5_GET(alloc_q_counter_out, out,
counter_set_id);
return err;
}
EXPORT_SYMBOL_GPL(mlx5_core_alloc_q_counter);
int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id)
{
u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)];
u32 out[MLX5_ST_SZ_DW(dealloc_q_counter_out)];
memset(in, 0, sizeof(in));
memset(out, 0, sizeof(out));
MLX5_SET(dealloc_q_counter_in, in, opcode,
MLX5_CMD_OP_DEALLOC_Q_COUNTER);
MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter_id);
return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out,
sizeof(out));
}
EXPORT_SYMBOL_GPL(mlx5_core_dealloc_q_counter);
int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id,
int reset, void *out, int out_size)
{
u32 in[MLX5_ST_SZ_DW(query_q_counter_in)];
memset(in, 0, sizeof(in));
MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
MLX5_SET(query_q_counter_in, in, clear, reset);
MLX5_SET(query_q_counter_in, in, counter_set_id, counter_id);
return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, out_size);
}
EXPORT_SYMBOL_GPL(mlx5_core_query_q_counter);
int mlx5_core_query_out_of_buffer(struct mlx5_core_dev *dev, u16 counter_id,
u32 *out_of_buffer)
{
int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
void *out;
int err;
out = mlx5_vzalloc(outlen);
if (!out)
return -ENOMEM;
err = mlx5_core_query_q_counter(dev, counter_id, 0, out, outlen);
if (!err)
*out_of_buffer = MLX5_GET(query_q_counter_out, out,
out_of_buffer);
kfree(out);
return err;
}
......@@ -644,7 +644,8 @@ struct mlx5_err_cqe {
};
struct mlx5_cqe64 {
u8 rsvd0[4];
u8 rsvd0[2];
__be16 wqe_id;
u8 lro_tcppsh_abort_dupack;
u8 lro_min_ttl;
__be16 lro_tcp_win;
......@@ -696,6 +697,42 @@ static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe)
return (u64)lo | ((u64)hi << 32);
}
struct mpwrq_cqe_bc {
__be16 filler_consumed_strides;
__be16 byte_cnt;
};
static inline u16 mpwrq_get_cqe_byte_cnt(struct mlx5_cqe64 *cqe)
{
struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
return be16_to_cpu(bc->byte_cnt);
}
static inline u16 mpwrq_get_cqe_bc_consumed_strides(struct mpwrq_cqe_bc *bc)
{
return 0x7fff & be16_to_cpu(bc->filler_consumed_strides);
}
static inline u16 mpwrq_get_cqe_consumed_strides(struct mlx5_cqe64 *cqe)
{
struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
return mpwrq_get_cqe_bc_consumed_strides(bc);
}
static inline bool mpwrq_is_filler_cqe(struct mlx5_cqe64 *cqe)
{
struct mpwrq_cqe_bc *bc = (struct mpwrq_cqe_bc *)&cqe->byte_cnt;
return 0x8000 & be16_to_cpu(bc->filler_consumed_strides);
}
static inline u16 mpwrq_get_cqe_stride_index(struct mlx5_cqe64 *cqe)
{
return be16_to_cpu(cqe->wqe_counter);
}
enum {
CQE_L4_HDR_TYPE_NONE = 0x0,
CQE_L4_HDR_TYPE_TCP_NO_ACK = 0x1,
......
......@@ -668,6 +668,12 @@ int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen,
struct mlx5_core_qp *sq);
void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev,
struct mlx5_core_qp *sq);
int mlx5_core_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id);
int mlx5_core_dealloc_q_counter(struct mlx5_core_dev *dev, u16 counter_id);
int mlx5_core_query_q_counter(struct mlx5_core_dev *dev, u16 counter_id,
int reset, void *out, int out_size);
int mlx5_core_query_out_of_buffer(struct mlx5_core_dev *dev, u16 counter_id,
u32 *out_of_buffer);
static inline const char *mlx5_qp_type_str(int type)
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment