Commit c6a21c38 authored by Majd Dibbiny, committed by Jason Gunthorpe

IB/mlx5: Change TX affinity assignment in RoCE LAG mode

In the current code, the TX affinity is per RoCE device, which can cause
unfairness between different contexts: e.g., if we open two contexts and
each opens 10 QPs concurrently, all of the QPs of the first context might
end up on the first port instead of being distributed across the two ports
as expected.

To overcome this unfairness between processes, we maintain both a per-device
TX affinity and a per-process TX affinity.

The allocation algorithm is as follows (a minimal sketch appears after the list):

1. Hold two tx_port_affinity atomic variables, one per RoCE device and one
   per ucontext. Both initialized to 0.

2. In mlx5_ib_alloc_ucontext do:
 2.1. ucontext.tx_port_affinity = device.tx_port_affinity
 2.2. device.tx_port_affinity += 1

3. In modify QP, on the RST2INIT transition:
 3.1. qp.tx_port_affinity = ucontext.tx_port_affinity % MLX5_MAX_PORTS
 3.2. ucontext.tx_port_affinity += 1
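
For illustration only, here is a minimal user-space sketch of the two counters
described above, assuming a 2-port LAG. It is not the kernel implementation
(that is in the diff below); NUM_PORTS stands in for MLX5_MAX_PORTS, and
dev_affinity, ctx_init() and qp_tx_port() are names invented for this sketch.

/*
 * Hypothetical model of the two-counter round robin; not driver code.
 */
#include <stdio.h>

#define NUM_PORTS 2                     /* stands in for MLX5_MAX_PORTS */

static unsigned int dev_affinity;       /* per-device counter (step 2) */

struct ctx {
        unsigned int affinity;          /* per-ucontext counter (step 3) */
};

/* Step 2: seed the new context's counter from the device counter. */
static void ctx_init(struct ctx *c)
{
        c->affinity = ++dev_affinity;
}

/* Step 3: pick a 1-based TX port for a new QP and advance the counter. */
static unsigned int qp_tx_port(struct ctx *c)
{
        return ++c->affinity % NUM_PORTS + 1;
}

int main(void)
{
        struct ctx a = { 0 }, b = { 0 };
        int i;

        ctx_init(&a);   /* seeded with 1 */
        ctx_init(&b);   /* seeded with 2 */

        /* Each context alternates on its own: A -> 1,2,1,2  B -> 2,1,2,1 */
        for (i = 0; i < 4; i++)
                printf("qp%d: ctx A -> port %u, ctx B -> port %u\n",
                       i, qp_tx_port(&a), qp_tx_port(&b));
        return 0;
}

Because every context is seeded with a different starting value and then
round-robins on its own counter, concurrent contexts no longer share one
device-wide counter, yet each context's QPs still spread evenly across both
LAG ports.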
Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Reviewed-by: Moni Shoua <monis@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent dc9f5d0f
@@ -1826,6 +1826,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 	context->lib_caps = req.lib_caps;
 	print_lib_caps(dev, context->lib_caps);
 
+	if (mlx5_lag_is_active(dev->mdev)) {
+		u8 port = mlx5_core_native_port_num(dev->mdev);
+
+		atomic_set(&context->tx_port_affinity,
+			   atomic_add_return(
+				   1, &dev->roce[port].tx_port_affinity));
+	}
+
 	return &context->ibucontext;
 
 out_mdev:
@@ -139,6 +139,8 @@ struct mlx5_ib_ucontext {
 	u64			lib_caps;
 	DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES);
 	u16			devx_uid;
+	/* For RoCE LAG TX affinity */
+	atomic_t		tx_port_affinity;
 };
 
 static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -699,7 +701,7 @@ struct mlx5_roce {
 	rwlock_t		netdev_lock;
 	struct net_device	*netdev;
 	struct notifier_block	nb;
-	atomic_t		next_port;
+	atomic_t		tx_port_affinity;
 	enum ib_port_state	last_port_state;
 	struct mlx5_ib_dev	*dev;
 	u8			native_port_num;
@@ -2909,6 +2909,37 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 	return 0;
 }
 
+static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
+				    struct mlx5_ib_pd *pd,
+				    struct mlx5_ib_qp_base *qp_base,
+				    u8 port_num)
+{
+	struct mlx5_ib_ucontext *ucontext = NULL;
+	unsigned int tx_port_affinity;
+
+	if (pd && pd->ibpd.uobject && pd->ibpd.uobject->context)
+		ucontext = to_mucontext(pd->ibpd.uobject->context);
+
+	if (ucontext) {
+		tx_port_affinity = (unsigned int)atomic_add_return(
+					   1, &ucontext->tx_port_affinity) %
+					   MLX5_MAX_PORTS +
+				   1;
+		mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x ucontext %p\n",
+				tx_port_affinity, qp_base->mqp.qpn, ucontext);
+	} else {
+		tx_port_affinity =
+			(unsigned int)atomic_add_return(
+				1, &dev->roce[port_num].tx_port_affinity) %
+				MLX5_MAX_PORTS +
+			1;
+		mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x\n",
+				tx_port_affinity, qp_base->mqp.qpn);
+	}
+
+	return tx_port_affinity;
+}
+
 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 			       const struct ib_qp_attr *attr, int attr_mask,
 			       enum ib_qp_state cur_state, enum ib_qp_state new_state,
@@ -2974,6 +3005,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	if (!context)
 		return -ENOMEM;
 
+	pd = get_pd(qp);
 	context->flags = cpu_to_be32(mlx5_st << 16);
 
 	if (!(attr_mask & IB_QP_PATH_MIG_STATE)) {
@@ -3002,9 +3034,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 		    (ibqp->qp_type == IB_QPT_XRC_TGT)) {
 			if (mlx5_lag_is_active(dev->mdev)) {
 				u8 p = mlx5_core_native_port_num(dev->mdev);
-				tx_affinity = (unsigned int)atomic_add_return(1,
-						&dev->roce[p].next_port) %
-						MLX5_MAX_PORTS + 1;
+				tx_affinity = get_tx_affinity(dev, pd, base, p);
 				context->flags |= cpu_to_be32(tx_affinity << 24);
 			}
 		}
@@ -3062,7 +3092,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 		goto out;
 	}
 
-	pd = get_pd(qp);
 	get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
 		&send_cq, &recv_cq);
 