Commit f95ccffc authored by Jack Morgenstein, committed by Jason Gunthorpe

IB/mlx4: Use 4K pages for kernel QP's WQE buffer

In the current implementation, the driver first tries to allocate physically
contiguous memory for the WQE buffer, and only if that fails does it fall
back to a fragmented allocation of 4K pages.

Once memory becomes fragmented, that first contiguous allocation can take a
long time or fail outright, which can cause connection failures.

This patch changes the logic to always allocate with 4K granularity,
since it's more robust and more likely to succeed.
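
Illustration (not part of the patch): a minimal user-space sketch of the
policy change. mlx4_buf_alloc() here is only a stub that reports what would
be requested, and the 256 KB buffer size is hypothetical; in the real driver
the third argument of mlx4_buf_alloc() caps the largest physically contiguous
chunk requested, as the create_qp_common() hunk further below shows.

/* Stand-alone model of the allocation-policy change; this stub is
 * not the mlx4_core mlx4_buf_alloc() function. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/* Model of a fragmented system: any chunk larger than two pages
 * fails, mimicking the order:8 allocation failure in the log below. */
static bool mlx4_buf_alloc(int buf_size, int max_chunk)
{
	printf("buffer %d bytes, max contiguous chunk %d bytes\n",
	       buf_size, max_chunk);
	return max_chunk <= 2 * PAGE_SIZE;
}

int main(void)
{
	int buf_size = 256 * 1024;	/* hypothetical QP buffer size */

	/* Old policy: try one physically contiguous buffer first ... */
	if (!mlx4_buf_alloc(buf_size, buf_size))
		/* ... and fall back to 4K granularity only on failure. */
		mlx4_buf_alloc(buf_size, 2 * PAGE_SIZE);

	/* New policy: always allocate with 4K granularity. */
	mlx4_buf_alloc(buf_size, 2 * PAGE_SIZE);
	return 0;
}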

This patch was tested with Lustre and no performance degradation
was observed.

Note: This commit eliminates the "shrinking WQE" feature. This feature
depended on using vmap to create a virtually contiguous send WQ.
vmap use was abandoned due to problems with several processors (see the
commit cited below). As a result, shrinking WQE was available only with
physically contiguous send WQs. Allocating such send WQs caused the
problems described above.
Therefore, as a side effect of eliminating the use of large physically
contiguous send WQs, the shrinking WQE feature became unavailable.

Warning example:
kworker/20:1: page allocation failure: order:8, mode:0x80d0
CPU: 20 PID: 513 Comm: kworker/20:1 Tainted: G OE ------------
Workqueue: ib_cm cm_work_handler [ib_cm]
Call Trace:
[<ffffffff81686d81>] dump_stack+0x19/0x1b
[<ffffffff81186160>] warn_alloc_failed+0x110/0x180
[<ffffffff8118a954>] __alloc_pages_nodemask+0x9b4/0xba0
[<ffffffff811ce868>] alloc_pages_current+0x98/0x110
[<ffffffff81184fae>] __get_free_pages+0xe/0x50
[<ffffffff8133f6fe>] swiotlb_alloc_coherent+0x5e/0x150
[<ffffffff81062551>] x86_swiotlb_alloc_coherent+0x41/0x50
[<ffffffffa056b4c4>] mlx4_buf_direct_alloc.isra.7+0xc4/0x180 [mlx4_core]
[<ffffffffa056b73b>] mlx4_buf_alloc+0x1bb/0x260 [mlx4_core]
[<ffffffffa0b15496>] create_qp_common+0x536/0x1000 [mlx4_ib]
[<ffffffff811c6ef7>] ? dma_pool_free+0xa7/0xd0
[<ffffffffa0b163c1>] mlx4_ib_create_qp+0x3b1/0xdc0 [mlx4_ib]
[<ffffffffa0b01bc2>] ? mlx4_ib_create_cq+0x2d2/0x430 [mlx4_ib]
[<ffffffffa0b21f20>] mlx4_ib_create_qp_wrp+0x10/0x20 [mlx4_ib]
[<ffffffffa08f152a>] ib_create_qp+0x7a/0x2f0 [ib_core]
[<ffffffffa06205d4>] rdma_create_qp+0x34/0xb0 [rdma_cm]
[<ffffffffa08275c9>] kiblnd_create_conn+0xbf9/0x1950 [ko2iblnd]
[<ffffffffa074077a>] ? cfs_percpt_unlock+0x1a/0xb0 [libcfs]
[<ffffffffa0835519>] kiblnd_passive_connect+0xa99/0x18c0 [ko2iblnd]

Fixes: 73898db0 ("net/mlx4: Avoid wrong virtual mappings")
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent bccd0622
@@ -322,7 +322,6 @@ struct mlx4_ib_qp {
 	u32			doorbell_qpn;
 	__be32			sq_signal_bits;
 	unsigned		sq_next_wqe;
-	int			sq_max_wqes_per_wr;
 	int			sq_spare_wqes;
 	struct mlx4_ib_wq	sq;
@@ -204,89 +204,24 @@ static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
 
 /*
  * Stamp a SQ WQE so that it is invalid if prefetched by marking the
- * first four bytes of every 64 byte chunk with
- *     0x7FFFFFF | (invalid_ownership_value << 31).
- *
- * When the max work request size is less than or equal to the WQE
- * basic block size, as an optimization, we can stamp all WQEs with
- * 0xffffffff, and skip the very first chunk of each WQE.
+ * first four bytes of every 64 byte chunk with 0xffffffff, except for
+ * the very first chunk of the WQE.
  */
-static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
 {
 	__be32 *wqe;
 	int i;
 	int s;
-	int ind;
 	void *buf;
-	__be32 stamp;
 	struct mlx4_wqe_ctrl_seg *ctrl;
 
-	if (qp->sq_max_wqes_per_wr > 1) {
-		s = roundup(size, 1U << qp->sq.wqe_shift);
-		for (i = 0; i < s; i += 64) {
-			ind = (i >> qp->sq.wqe_shift) + n;
-			stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
-						       cpu_to_be32(0xffffffff);
-			buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
-			wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
-			*wqe = stamp;
-		}
-	} else {
-		ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
-		s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4;
-		for (i = 64; i < s; i += 64) {
-			wqe = buf + i;
-			*wqe = cpu_to_be32(0xffffffff);
-		}
+	buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+	ctrl = (struct mlx4_wqe_ctrl_seg *)buf;
+	s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4;
+	for (i = 64; i < s; i += 64) {
+		wqe = buf + i;
+		*wqe = cpu_to_be32(0xffffffff);
 	}
 }
 
-static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
-{
-	struct mlx4_wqe_ctrl_seg *ctrl;
-	struct mlx4_wqe_inline_seg *inl;
-	void *wqe;
-	int s;
-
-	ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
-	s = sizeof(struct mlx4_wqe_ctrl_seg);
-
-	if (qp->ibqp.qp_type == IB_QPT_UD) {
-		struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
-		struct mlx4_av *av = (struct mlx4_av *)dgram->av;
-		memset(dgram, 0, sizeof *dgram);
-		av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
-		s += sizeof(struct mlx4_wqe_datagram_seg);
-	}
-
-	/* Pad the remainder of the WQE with an inline data segment. */
-	if (size > s) {
-		inl = wqe + s;
-		inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
-	}
-	ctrl->srcrb_flags = 0;
-	ctrl->qpn_vlan.fence_size = size / 16;
-	/*
-	 * Make sure descriptor is fully written before setting ownership bit
-	 * (because HW can start executing as soon as we do).
-	 */
-	wmb();
-
-	ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
-		(n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
-
-	stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
-}
-
-/* Post NOP WQE to prevent wrap-around in the middle of WR */
-static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
-{
-	unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
-
-	if (unlikely(s < qp->sq_max_wqes_per_wr)) {
-		post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
-		ind += s;
-	}
-
-	return ind;
-}
-
 static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
@@ -433,8 +368,7 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 }
 
 static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
-			      enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp,
-			      bool shrink_wqe)
+			      enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
 {
 	int s;
@@ -461,69 +395,19 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 	if (s > dev->dev->caps.max_sq_desc_sz)
 		return -EINVAL;
 
-	/*
-	 * Hermon supports shrinking WQEs, such that a single work
-	 * request can include multiple units of 1 << wqe_shift.  This
-	 * way, work requests can differ in size, and do not have to
-	 * be a power of 2 in size, saving memory and speeding up send
-	 * WR posting.  Unfortunately, if we do this then the
-	 * wqe_index field in CQEs can't be used to look up the WR ID
-	 * anymore, so we do this only if selective signaling is off.
-	 *
-	 * Further, on 32-bit platforms, we can't use vmap() to make
-	 * the QP buffer virtually contiguous.  Thus we have to use
-	 * constant-sized WRs to make sure a WR is always fully within
-	 * a single page-sized chunk.
-	 *
-	 * Finally, we use NOP work requests to pad the end of the
-	 * work queue, to avoid wrap-around in the middle of WR.  We
-	 * set NEC bit to avoid getting completions with error for
-	 * these NOP WRs, but since NEC is only supported starting
-	 * with firmware 2.2.232, we use constant-sized WRs for older
-	 * firmware.
-	 *
-	 * And, since MLX QPs only support SEND, we use constant-sized
-	 * WRs in this case.
-	 *
-	 * We look for the smallest value of wqe_shift such that the
-	 * resulting number of wqes does not exceed device
-	 * capabilities.
-	 *
-	 * We set WQE size to at least 64 bytes, this way stamping
-	 * invalidates each WQE.
-	 */
-	if (shrink_wqe && dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
-	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
-	    type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI &&
-	    !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI |
-		      MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER)))
-		qp->sq.wqe_shift = ilog2(64);
-	else
-		qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
-
-	for (;;) {
-		qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
-
-		/*
-		 * We need to leave 2 KB + 1 WR of headroom in the SQ to
-		 * allow HW to prefetch.
-		 */
-		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
-		qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
-						    qp->sq_max_wqes_per_wr +
-						    qp->sq_spare_wqes);
-
-		if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
-			break;
-
-		if (qp->sq_max_wqes_per_wr <= 1)
-			return -EINVAL;
-
-		++qp->sq.wqe_shift;
-	}
-
-	qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
-			     (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) -
-			 send_wqe_overhead(type, qp->flags)) /
+	qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+	/*
+	 * We need to leave 2 KB + 1 WR of headroom in the SQ to
+	 * allow HW to prefetch.
+	 */
+	qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+	qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr +
+					    qp->sq_spare_wqes);
+
+	qp->sq.max_gs =
+		(min(dev->dev->caps.max_sq_desc_sz,
+		     (1 << qp->sq.wqe_shift)) -
+		 send_wqe_overhead(type, qp->flags)) /
 		sizeof (struct mlx4_wqe_data_seg);
@@ -538,7 +422,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 	}
 
 	cap->max_send_wr  = qp->sq.max_post =
-		(qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
+		qp->sq.wqe_cnt - qp->sq_spare_wqes;
 	cap->max_send_sge = min(qp->sq.max_gs,
 				min(dev->dev->caps.max_sq_sg,
 				    dev->dev->caps.max_rq_sg));
@@ -977,7 +861,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 {
 	int qpn;
 	int err;
-	struct ib_qp_cap backup_cap;
 	struct mlx4_ib_sqp *sqp = NULL;
 	struct mlx4_ib_qp *qp;
 	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
@@ -1178,9 +1061,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			goto err;
 		}
 
-		memcpy(&backup_cap, &init_attr->cap, sizeof(backup_cap));
-		err = set_kernel_sq_size(dev, &init_attr->cap,
-					 qp_type, qp, true);
+		err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
 		if (err)
 			goto err;
@@ -1192,21 +1073,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			*qp->db.db = 0;
 		}
 
-		if (mlx4_buf_alloc(dev->dev, qp->buf_size, qp->buf_size,
+		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2,
 				   &qp->buf)) {
-			memcpy(&init_attr->cap, &backup_cap,
-			       sizeof(backup_cap));
-			err = set_kernel_sq_size(dev, &init_attr->cap, qp_type,
-						 qp, false);
-			if (err)
-				goto err_db;
-
-			if (mlx4_buf_alloc(dev->dev, qp->buf_size,
-					   PAGE_SIZE * 2, &qp->buf)) {
-				err = -ENOMEM;
-				goto err_db;
-			}
+			err = -ENOMEM;
+			goto err_db;
 		}
-		}
 
 		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
 				    &qp->mtt);
@@ -2582,11 +2453,9 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 		for (i = 0; i < qp->sq.wqe_cnt; ++i) {
 			ctrl = get_send_wqe(qp, i);
 			ctrl->owner_opcode = cpu_to_be32(1 << 31);
-			if (qp->sq_max_wqes_per_wr == 1)
-				ctrl->qpn_vlan.fence_size =
-					1 << (qp->sq.wqe_shift - 4);
-
-			stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
+			ctrl->qpn_vlan.fence_size =
+				1 << (qp->sq.wqe_shift - 4);
+			stamp_send_wqe(qp, i);
 		}
 	}
@@ -3580,7 +3449,6 @@ static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 	int nreq;
 	int err = 0;
 	unsigned ind;
-	int uninitialized_var(stamp);
 	int uninitialized_var(size);
 	unsigned uninitialized_var(seglen);
 	__be32 dummy;
@@ -3853,22 +3721,14 @@ static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
 			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;
 
-		stamp = ind + qp->sq_spare_wqes;
-		ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
-
 		/*
 		 * We can improve latency by not stamping the last
 		 * send queue WQE until after ringing the doorbell, so
 		 * only stamp here if there are still more WQEs to post.
-		 *
-		 * Same optimization applies to padding with NOP wqe
-		 * in case of WQE shrinking (used to prevent wrap-around
-		 * in the middle of WR).
 		 */
-		if (wr->next) {
-			stamp_send_wqe(qp, stamp, size * 16);
-			ind = pad_wraparound(qp, ind);
-		}
+		if (wr->next)
+			stamp_send_wqe(qp, ind + qp->sq_spare_wqes);
+		ind++;
 	}
 
 out:
@@ -3890,9 +3750,8 @@ static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 		 */
 		mmiowb();
 
-		stamp_send_wqe(qp, stamp, size * 16);
-
-		ind = pad_wraparound(qp, ind);
+		stamp_send_wqe(qp, ind + qp->sq_spare_wqes - 1);
 		qp->sq_next_wqe = ind;
 	}