Commit f95ccffc authored by Jack Morgenstein, committed by Jason Gunthorpe

IB/mlx4: Use 4K pages for kernel QP's WQE buffer

In the current implementation, the driver first tries to allocate physically
contiguous memory for the WQE buffer, and only if that fails does it fall
back to a fragmented allocation of 4K pages.

Once memory becomes fragmented, that first contiguous allocation can take a
long time or fail outright, which can cause connection failures.

This patch changes the logic to always allocate with 4K granularity,
since it's more robust and more likely to succeed.
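
Illustration (not part of the patch): a minimal user-space sketch of the
policy change. mlx4_buf_alloc() here is only a stub that reports what would
be requested, and the 256 KB buffer size is hypothetical; in the real driver
the third argument of mlx4_buf_alloc() caps the largest physically contiguous
chunk requested, as the create_qp_common() hunk further below shows.

/* Stand-alone model of the allocation-policy change; this stub is
 * not the mlx4_core mlx4_buf_alloc() function. */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/* Model of a fragmented system: any chunk larger than two pages
 * fails, mimicking the order:8 allocation failure in the log below. */
static bool mlx4_buf_alloc(int buf_size, int max_chunk)
{
	printf("buffer %d bytes, max contiguous chunk %d bytes\n",
	       buf_size, max_chunk);
	return max_chunk <= 2 * PAGE_SIZE;
}

int main(void)
{
	int buf_size = 256 * 1024;	/* hypothetical QP buffer size */

	/* Old policy: try one physically contiguous buffer first ... */
	if (!mlx4_buf_alloc(buf_size, buf_size))
		/* ... and fall back to 4K granularity only on failure. */
		mlx4_buf_alloc(buf_size, 2 * PAGE_SIZE);

	/* New policy: always allocate with 4K granularity. */
	mlx4_buf_alloc(buf_size, 2 * PAGE_SIZE);
	return 0;
}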

This patch was tested with Lustre and no performance degradation
was observed.

Note: This commit eliminates the "shrinking WQE" feature. This feature
depended on using vmap to create a virtually contiguous send WQ.
vmap use was abandoned due to problems with several processors (see the
commit cited below). As a result, shrinking WQE was available only with
physically contiguous send WQs. Allocating such send WQs caused the
problems described above.
Therefore, as a side effect of eliminating the use of large physically
contiguous send WQs, the shrinking WQE feature became unavailable.

Warning example:
kworker/20:1: page allocation failure: order:8, mode:0x80d0
CPU: 20 PID: 513 Comm: kworker/20:1 Tainted: G OE ------------
Workqueue: ib_cm cm_work_handler [ib_cm]
Call Trace:
[<ffffffff81686d81>] dump_stack+0x19/0x1b
[<ffffffff81186160>] warn_alloc_failed+0x110/0x180
[<ffffffff8118a954>] __alloc_pages_nodemask+0x9b4/0xba0
[<ffffffff811ce868>] alloc_pages_current+0x98/0x110
[<ffffffff81184fae>] __get_free_pages+0xe/0x50
[<ffffffff8133f6fe>] swiotlb_alloc_coherent+0x5e/0x150
[<ffffffff81062551>] x86_swiotlb_alloc_coherent+0x41/0x50
[<ffffffffa056b4c4>] mlx4_buf_direct_alloc.isra.7+0xc4/0x180 [mlx4_core]
[<ffffffffa056b73b>] mlx4_buf_alloc+0x1bb/0x260 [mlx4_core]
[<ffffffffa0b15496>] create_qp_common+0x536/0x1000 [mlx4_ib]
[<ffffffff811c6ef7>] ? dma_pool_free+0xa7/0xd0
[<ffffffffa0b163c1>] mlx4_ib_create_qp+0x3b1/0xdc0 [mlx4_ib]
[<ffffffffa0b01bc2>] ? mlx4_ib_create_cq+0x2d2/0x430 [mlx4_ib]
[<ffffffffa0b21f20>] mlx4_ib_create_qp_wrp+0x10/0x20 [mlx4_ib]
[<ffffffffa08f152a>] ib_create_qp+0x7a/0x2f0 [ib_core]
[<ffffffffa06205d4>] rdma_create_qp+0x34/0xb0 [rdma_cm]
[<ffffffffa08275c9>] kiblnd_create_conn+0xbf9/0x1950 [ko2iblnd]
[<ffffffffa074077a>] ? cfs_percpt_unlock+0x1a/0xb0 [libcfs]
[<ffffffffa0835519>] kiblnd_passive_connect+0xa99/0x18c0 [ko2iblnd]

Fixes: 73898db0 ("net/mlx4: Avoid wrong virtual mappings")
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent bccd0622
@@ -322,7 +322,6 @@ struct mlx4_ib_qp {
 	u32			doorbell_qpn;
 	__be32			sq_signal_bits;
 	unsigned		sq_next_wqe;
-	int			sq_max_wqes_per_wr;
 	int			sq_spare_wqes;
 	struct mlx4_ib_wq	sq;
@@ -204,89 +204,24 @@ static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
 
 /*
  * Stamp a SQ WQE so that it is invalid if prefetched by marking the
- * first four bytes of every 64 byte chunk with
- *     0x7FFFFFF | (invalid_ownership_value << 31).
- *
- * When the max work request size is less than or equal to the WQE
- * basic block size, as an optimization, we can stamp all WQEs with
- * 0xffffffff, and skip the very first chunk of each WQE.
+ * first four bytes of every 64 byte chunk with 0xffffffff, except for
+ * the very first chunk of the WQE.
  */
-static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
 {
 	__be32 *wqe;
 	int i;
 	int s;
-	int ind;
 	void *buf;
-	__be32 stamp;
 	struct mlx4_wqe_ctrl_seg *ctrl;
 
-	if (qp->sq_max_wqes_per_wr > 1) {
-		s = roundup(size, 1U << qp->sq.wqe_shift);
-		for (i = 0; i < s; i += 64) {
-			ind = (i >> qp->sq.wqe_shift) + n;
-			stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
-						       cpu_to_be32(0xffffffff);
-			buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
-			wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
-			*wqe = stamp;
-		}
-	} else {
-		ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
-		s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4;
-		for (i = 64; i < s; i += 64) {
-			wqe = buf + i;
-			*wqe = cpu_to_be32(0xffffffff);
-		}
+	buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+	ctrl = (struct mlx4_wqe_ctrl_seg *)buf;
+	s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4;
+	for (i = 64; i < s; i += 64) {
+		wqe = buf + i;
+		*wqe = cpu_to_be32(0xffffffff);
 	}
 }
 
-static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
-{
-	struct mlx4_wqe_ctrl_seg *ctrl;
-	struct mlx4_wqe_inline_seg *inl;
-	void *wqe;
-	int s;
-
-	ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
-	s = sizeof(struct mlx4_wqe_ctrl_seg);
-
-	if (qp->ibqp.qp_type == IB_QPT_UD) {
-		struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
-		struct mlx4_av *av = (struct mlx4_av *)dgram->av;
-		memset(dgram, 0, sizeof *dgram);
-		av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
-		s += sizeof(struct mlx4_wqe_datagram_seg);
-	}
-
-	/* Pad the remainder of the WQE with an inline data segment. */
-	if (size > s) {
-		inl = wqe + s;
-		inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
-	}
-	ctrl->srcrb_flags = 0;
-	ctrl->qpn_vlan.fence_size = size / 16;
-	/*
-	 * Make sure descriptor is fully written before setting ownership bit
-	 * (because HW can start executing as soon as we do).
-	 */
-	wmb();
-
-	ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
-		(n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
-
-	stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
-}
-
-/* Post NOP WQE to prevent wrap-around in the middle of WR */
-static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
-{
-	unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
-
-	if (unlikely(s < qp->sq_max_wqes_per_wr)) {
-		post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
-		ind += s;
-	}
-
-	return ind;
-}
-
 static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
@@ -433,8 +368,7 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 }
 
 static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
-			      enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp,
-			      bool shrink_wqe)
+			      enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
 {
 	int s;
@@ -461,69 +395,19 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 	if (s > dev->dev->caps.max_sq_desc_sz)
 		return -EINVAL;
 
-	/*
-	 * Hermon supports shrinking WQEs, such that a single work
-	 * request can include multiple units of 1 << wqe_shift.  This
-	 * way, work requests can differ in size, and do not have to
-	 * be a power of 2 in size, saving memory and speeding up send
-	 * WR posting.  Unfortunately, if we do this then the
-	 * wqe_index field in CQEs can't be used to look up the WR ID
-	 * anymore, so we do this only if selective signaling is off.
-	 *
-	 * Further, on 32-bit platforms, we can't use vmap() to make
-	 * the QP buffer virtually contiguous.  Thus we have to use
-	 * constant-sized WRs to make sure a WR is always fully within
-	 * a single page-sized chunk.
-	 *
-	 * Finally, we use NOP work requests to pad the end of the
-	 * work queue, to avoid wrap-around in the middle of WR.  We
-	 * set NEC bit to avoid getting completions with error for
-	 * these NOP WRs, but since NEC is only supported starting
-	 * with firmware 2.2.232, we use constant-sized WRs for older
-	 * firmware.
-	 *
-	 * And, since MLX QPs only support SEND, we use constant-sized
-	 * WRs in this case.
-	 *
-	 * We look for the smallest value of wqe_shift such that the
-	 * resulting number of wqes does not exceed device
-	 * capabilities.
-	 *
-	 * We set WQE size to at least 64 bytes, this way stamping
-	 * invalidates each WQE.
-	 */
-	if (shrink_wqe && dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
-	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
-	    type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI &&
-	    !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI |
-		      MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER)))
-		qp->sq.wqe_shift = ilog2(64);
-	else
-		qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
-
-	for (;;) {
-		qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
-
-		/*
-		 * We need to leave 2 KB + 1 WR of headroom in the SQ to
-		 * allow HW to prefetch.
-		 */
-		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
-		qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
-						    qp->sq_max_wqes_per_wr +
-						    qp->sq_spare_wqes);
-
-		if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
-			break;
-
-		if (qp->sq_max_wqes_per_wr <= 1)
-			return -EINVAL;
-
-		++qp->sq.wqe_shift;
-	}
-
-	qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
-			     (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) -
-			 send_wqe_overhead(type, qp->flags)) /
+	qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+	/*
+	 * We need to leave 2 KB + 1 WR of headroom in the SQ to
+	 * allow HW to prefetch.
+	 */
+	qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+	qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr +
+					    qp->sq_spare_wqes);
+
+	qp->sq.max_gs =
+		(min(dev->dev->caps.max_sq_desc_sz,
+		     (1 << qp->sq.wqe_shift)) -
+		 send_wqe_overhead(type, qp->flags)) /
 		sizeof (struct mlx4_wqe_data_seg);
@@ -538,7 +422,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 	}
 
 	cap->max_send_wr  = qp->sq.max_post =
-		(qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
+		qp->sq.wqe_cnt - qp->sq_spare_wqes;
 	cap->max_send_sge = min(qp->sq.max_gs,
 				min(dev->dev->caps.max_sq_sg,
 				    dev->dev->caps.max_rq_sg));
@@ -977,7 +861,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 {
 	int qpn;
 	int err;
-	struct ib_qp_cap backup_cap;
 	struct mlx4_ib_sqp *sqp = NULL;
 	struct mlx4_ib_qp *qp;
 	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
@@ -1178,9 +1061,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			goto err;
 		}
 
-		memcpy(&backup_cap, &init_attr->cap, sizeof(backup_cap));
-		err = set_kernel_sq_size(dev, &init_attr->cap,
-					 qp_type, qp, true);
+		err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
 		if (err)
 			goto err;
@@ -1192,21 +1073,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			*qp->db.db = 0;
 		}
 
-		if (mlx4_buf_alloc(dev->dev, qp->buf_size, qp->buf_size,
+		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2,
 				   &qp->buf)) {
-			memcpy(&init_attr->cap, &backup_cap,
-			       sizeof(backup_cap));
-			err = set_kernel_sq_size(dev, &init_attr->cap, qp_type,
-						 qp, false);
-			if (err)
-				goto err_db;
-
-			if (mlx4_buf_alloc(dev->dev, qp->buf_size,
-					   PAGE_SIZE * 2, &qp->buf)) {
-				err = -ENOMEM;
-				goto err_db;
-			}
+			err = -ENOMEM;
+			goto err_db;
 		}
-		}
 
 		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
 				    &qp->mtt);
@@ -2582,11 +2453,9 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 		for (i = 0; i < qp->sq.wqe_cnt; ++i) {
 			ctrl = get_send_wqe(qp, i);
 			ctrl->owner_opcode = cpu_to_be32(1 << 31);
-			if (qp->sq_max_wqes_per_wr == 1)
-				ctrl->qpn_vlan.fence_size =
-					1 << (qp->sq.wqe_shift - 4);
-
-			stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
+			ctrl->qpn_vlan.fence_size =
+				1 << (qp->sq.wqe_shift - 4);
+			stamp_send_wqe(qp, i);
 		}
 	}
@@ -3580,7 +3449,6 @@ static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 	int nreq;
 	int err = 0;
 	unsigned ind;
-	int uninitialized_var(stamp);
 	int uninitialized_var(size);
 	unsigned uninitialized_var(seglen);
 	__be32 dummy;
@@ -3853,22 +3721,14 @@ static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
 			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;
 
-		stamp = ind + qp->sq_spare_wqes;
-		ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
-
 		/*
 		 * We can improve latency by not stamping the last
 		 * send queue WQE until after ringing the doorbell, so
 		 * only stamp here if there are still more WQEs to post.
-		 *
-		 * Same optimization applies to padding with NOP wqe
-		 * in case of WQE shrinking (used to prevent wrap-around
-		 * in the middle of WR).
 		 */
-		if (wr->next) {
-			stamp_send_wqe(qp, stamp, size * 16);
-			ind = pad_wraparound(qp, ind);
-		}
+		if (wr->next)
+			stamp_send_wqe(qp, ind + qp->sq_spare_wqes);
+		ind++;
 	}
 
 out:
@@ -3890,9 +3750,8 @@ static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 		 */
 		mmiowb();
 
-		stamp_send_wqe(qp, stamp, size * 16);
-
-		ind = pad_wraparound(qp, ind);
+		stamp_send_wqe(qp, ind + qp->sq_spare_wqes - 1);
 		qp->sq_next_wqe = ind;
 	}