Commit d9b13c20, authored by Jianxin Xiong, committed by Doug Ledford

IB/rdmavt, hfi1: Fix NFSoRDMA failure with FRMR enabled

Hanging has been observed while writing a file over NFSoRDMA. Dmesg on
the server contains messages like these:

[  931.992501] svcrdma: Error -22 posting RDMA_READ
[  952.076879] svcrdma: Error -22 posting RDMA_READ
[  982.154127] svcrdma: Error -22 posting RDMA_READ
[ 1012.235884] svcrdma: Error -22 posting RDMA_READ
[ 1042.319194] svcrdma: Error -22 posting RDMA_READ

Here is why:

With the base memory management extension enabled, FRMR is used instead
of FMR. The xprtrdma server issues each RDMA read request as the following
bundle:

(1) IB_WR_REG_MR, signaled;
(2) IB_WR_RDMA_READ, signaled;
(3) IB_WR_LOCAL_INV, signaled & fencing.

All of these requests are signaled. In order to generate a completion,
the fast register work request is processed by the hfi1 send engine only
after it has been posted to the work queue, so the corresponding lkey is
not valid until the request has been processed. However, the rdmavt
driver validates the lkey at the time the RDMA read request is posted,
and thus the post fails immediately with error -EINVAL (-22).

This patch changes the work flow of local operations (fast register and
local invalidate) so that fast register work requests are always
processed immediately to ensure that the corresponding lkey is valid
when subsequent work requests are posted. Local invalidate requests are
processed immediately if fencing is not required and no previous local
invalidate request is pending.

To allow completion generation for signaled local operations that have
been processed before posting to the work queue, an internal send flag
RVT_SEND_COMPLETION_ONLY is added. The hfi1 send engine checks this flag
and only generates completion for such requests.
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Jianxin Xiong <jianxin.xiong@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
parent 856cc4c2
...@@ -402,7 +402,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ...@@ -402,7 +402,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
char newreq; char newreq;
int middle = 0; int middle = 0;
int delta; int delta;
int err;
ps->s_txreq = get_txreq(ps->dev, qp); ps->s_txreq = get_txreq(ps->dev, qp);
if (IS_ERR(ps->s_txreq)) if (IS_ERR(ps->s_txreq))
...@@ -484,25 +483,27 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ...@@ -484,25 +483,27 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
*/ */
if (wqe->wr.opcode == IB_WR_REG_MR || if (wqe->wr.opcode == IB_WR_REG_MR ||
wqe->wr.opcode == IB_WR_LOCAL_INV) { wqe->wr.opcode == IB_WR_LOCAL_INV) {
int local_ops = 0;
int err = 0;
if (qp->s_last != qp->s_cur) if (qp->s_last != qp->s_cur)
goto bail; goto bail;
if (++qp->s_cur == qp->s_size) if (++qp->s_cur == qp->s_size)
qp->s_cur = 0; qp->s_cur = 0;
if (++qp->s_tail == qp->s_size) if (++qp->s_tail == qp->s_size)
qp->s_tail = 0; qp->s_tail = 0;
if (wqe->wr.opcode == IB_WR_REG_MR) if (!(wqe->wr.send_flags &
err = rvt_fast_reg_mr( RVT_SEND_COMPLETION_ONLY)) {
qp, wqe->reg_wr.mr,
wqe->reg_wr.key,
wqe->reg_wr.access);
else
err = rvt_invalidate_rkey( err = rvt_invalidate_rkey(
qp, qp,
wqe->wr.ex.invalidate_rkey); wqe->wr.ex.invalidate_rkey);
local_ops = 1;
}
hfi1_send_complete(qp, wqe, hfi1_send_complete(qp, wqe,
err ? IB_WC_LOC_PROT_ERR err ? IB_WC_LOC_PROT_ERR
: IB_WC_SUCCESS); : IB_WC_SUCCESS);
atomic_dec(&qp->local_ops_pending); if (local_ops)
atomic_dec(&qp->local_ops_pending);
qp->s_hdrwords = 0; qp->s_hdrwords = 0;
goto done_free_tx; goto done_free_tx;
} }
......
...@@ -442,16 +442,15 @@ static void ruc_loopback(struct rvt_qp *sqp) ...@@ -442,16 +442,15 @@ static void ruc_loopback(struct rvt_qp *sqp)
sqp->s_len = wqe->length; sqp->s_len = wqe->length;
switch (wqe->wr.opcode) { switch (wqe->wr.opcode) {
case IB_WR_REG_MR: case IB_WR_REG_MR:
if (rvt_fast_reg_mr(sqp, wqe->reg_wr.mr, wqe->reg_wr.key,
wqe->reg_wr.access))
send_status = IB_WC_LOC_PROT_ERR;
local_ops = 1;
goto send_comp; goto send_comp;
case IB_WR_LOCAL_INV: case IB_WR_LOCAL_INV:
if (rvt_invalidate_rkey(sqp, wqe->wr.ex.invalidate_rkey)) if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
send_status = IB_WC_LOC_PROT_ERR; if (rvt_invalidate_rkey(sqp,
local_ops = 1; wqe->wr.ex.invalidate_rkey))
send_status = IB_WC_LOC_PROT_ERR;
local_ops = 1;
}
goto send_comp; goto send_comp;
case IB_WR_SEND_WITH_INV: case IB_WR_SEND_WITH_INV:
......
...@@ -77,7 +77,6 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ...@@ -77,7 +77,6 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
u32 len; u32 len;
u32 pmtu = qp->pmtu; u32 pmtu = qp->pmtu;
int middle = 0; int middle = 0;
int err;
ps->s_txreq = get_txreq(ps->dev, qp); ps->s_txreq = get_txreq(ps->dev, qp);
if (IS_ERR(ps->s_txreq)) if (IS_ERR(ps->s_txreq))
...@@ -125,20 +124,22 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ...@@ -125,20 +124,22 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
*/ */
if (wqe->wr.opcode == IB_WR_REG_MR || if (wqe->wr.opcode == IB_WR_REG_MR ||
wqe->wr.opcode == IB_WR_LOCAL_INV) { wqe->wr.opcode == IB_WR_LOCAL_INV) {
int local_ops = 0;
int err = 0;
if (qp->s_last != qp->s_cur) if (qp->s_last != qp->s_cur)
goto bail; goto bail;
if (++qp->s_cur == qp->s_size) if (++qp->s_cur == qp->s_size)
qp->s_cur = 0; qp->s_cur = 0;
if (wqe->wr.opcode == IB_WR_REG_MR) if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
err = rvt_fast_reg_mr(qp, wqe->reg_wr.mr,
wqe->reg_wr.key,
wqe->reg_wr.access);
else
err = rvt_invalidate_rkey( err = rvt_invalidate_rkey(
qp, wqe->wr.ex.invalidate_rkey); qp, wqe->wr.ex.invalidate_rkey);
local_ops = 1;
}
hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR
: IB_WC_SUCCESS); : IB_WC_SUCCESS);
atomic_dec(&qp->local_ops_pending); if (local_ops)
atomic_dec(&qp->local_ops_pending);
qp->s_hdrwords = 0; qp->s_hdrwords = 0;
goto done_free_tx; goto done_free_tx;
} }
......
...@@ -1579,6 +1579,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, ...@@ -1579,6 +1579,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
int ret; int ret;
size_t cplen; size_t cplen;
bool reserved_op; bool reserved_op;
int local_ops_delayed = 0;
BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE)); BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE));
...@@ -1592,25 +1593,37 @@ static int rvt_post_one_wr(struct rvt_qp *qp, ...@@ -1592,25 +1593,37 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
cplen = ret; cplen = ret;
/* /*
* Local operations including fast register and local invalidate * Local operations include fast register and local invalidate.
* can be processed immediately w/o being posted to the send queue * Fast register needs to be processed immediately because the
* if neither fencing nor completion generation is needed. However, * registered lkey may be used by following work requests and the
* once fencing or completion is requested, direct processing of * lkey needs to be valid at the time those requests are posted.
* following local operations must be disabled until all the local * Local invalidate can be processed immediately if fencing is
* operations posted to the send queue have completed. This is * not required and no previous local invalidate ops are pending.
* necessary to ensure the correct ordering. * Signaled local operations that have been processed immediately
* need to have requests with "completion only" flags set posted
* to the send queue in order to generate completions.
*/ */
if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) && if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) {
!(wr->send_flags & (IB_SEND_FENCE | IB_SEND_SIGNALED)) &&
!atomic_read(&qp->local_ops_pending)) {
struct ib_reg_wr *reg = reg_wr(wr);
switch (wr->opcode) { switch (wr->opcode) {
case IB_WR_REG_MR: case IB_WR_REG_MR:
return rvt_fast_reg_mr(qp, reg->mr, reg->key, ret = rvt_fast_reg_mr(qp,
reg->access); reg_wr(wr)->mr,
reg_wr(wr)->key,
reg_wr(wr)->access);
if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
return ret;
break;
case IB_WR_LOCAL_INV: case IB_WR_LOCAL_INV:
return rvt_invalidate_rkey(qp, wr->ex.invalidate_rkey); if ((wr->send_flags & IB_SEND_FENCE) ||
atomic_read(&qp->local_ops_pending)) {
local_ops_delayed = 1;
} else {
ret = rvt_invalidate_rkey(
qp, wr->ex.invalidate_rkey);
if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
return ret;
}
break;
default: default:
return -EINVAL; return -EINVAL;
} }
...@@ -1675,7 +1688,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp, ...@@ -1675,7 +1688,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
} }
if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) { if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
atomic_inc(&qp->local_ops_pending); if (local_ops_delayed)
atomic_inc(&qp->local_ops_pending);
else
wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY;
wqe->ssn = 0; wqe->ssn = 0;
wqe->psn = 0; wqe->psn = 0;
wqe->lpsn = 0; wqe->lpsn = 0;
......
...@@ -148,6 +148,7 @@ ...@@ -148,6 +148,7 @@
* Internal send flags * Internal send flags
*/ */
#define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START #define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START
#define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1)
/* /*
* Send work request queue entry. * Send work request queue entry.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment