Commit b09160c3 authored by Israel Rukshin, committed by Christoph Hellwig

nvmet-rdma: add metadata/T10-PI support

For capable HCAs (e.g. ConnectX-5/ConnectX-6) this will allow end-to-end
protection information passthrough and validation for NVMe over RDMA
transport. Metadata support was implemented over the new RDMA signature
verbs API.
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
parent c6e3f133
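Note: the patch below is built on the RDMA signature verbs API (rdma_rw_ctx_signature_init(), ib_check_mr_status() and friends). For orientation only, here is a minimal sketch of that flow outside the nvmet context; the helper name pi_rw_example(), the 512-byte PI interval and the GUARD/REFTAG check mask are illustrative assumptions, and the actual posting of the RDMA READ/WRITE and its completion handling are omitted.

#include <rdma/ib_verbs.h>
#include <rdma/rw.h>

/*
 * Illustrative only: map a data SGL plus a protection-information SGL behind
 * one signature-enabled MR, and query the T10-DIF status after the transfer.
 */
static int pi_rw_example(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
        struct scatterlist *sg, u32 sg_cnt,
        struct scatterlist *prot_sg, u32 prot_sg_cnt,
        u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
        struct ib_sig_attrs sig_attrs = {};
        struct ib_mr_status mr_status;
        int ret;

        /* Same T10-DIF/CRC layout in memory and on the wire (512B interval assumed). */
        sig_attrs.mem.sig_type = IB_SIG_TYPE_T10_DIF;
        sig_attrs.mem.sig.dif.bg_type = IB_T10DIF_CRC;
        sig_attrs.mem.sig.dif.pi_interval = 512;
        sig_attrs.wire = sig_attrs.mem;
        sig_attrs.check_mask = IB_SIG_CHECK_GUARD | IB_SIG_CHECK_REFTAG;

        /* Map the data and protection SGLs behind one signature-enabled MR. */
        ret = rdma_rw_ctx_signature_init(ctx, qp, port_num, sg, sg_cnt,
                        prot_sg, prot_sg_cnt, &sig_attrs, remote_addr, rkey, dir);
        if (ret < 0)
                return ret;

        /*
         * ... post the transfer via rdma_rw_ctx_post()/rdma_rw_ctx_wrs() and
         * wait for its completion before checking the signature status ...
         */

        /* After completion, ask the HCA whether the PI checks passed. */
        ret = ib_check_mr_status(ctx->reg->mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
        if (!ret && (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS))
                pr_err("PI check failed, err_type %d\n", mr_status.sig_err.err_type);

        rdma_rw_ctx_destroy_signature(ctx, qp, port_num, sg, sg_cnt,
                        prot_sg, prot_sg_cnt, dir);
        return ret;
}

In the patch itself this is what the new nvmet_rdma_rw_ctx_init(), nvmet_rdma_check_pi_status() and nvmet_rdma_rw_ctx_destroy() helpers do, with the signature domains derived from the PRINFO bits of the incoming NVMe command.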
@@ -33,6 +33,7 @@
 /* Assume mpsmin == device_page_size == 4KB */
 #define NVMET_RDMA_MAX_MDTS 8
+#define NVMET_RDMA_MAX_METADATA_MDTS 5
 
 struct nvmet_rdma_srq;
@@ -60,6 +61,7 @@ struct nvmet_rdma_rsp {
        struct nvmet_rdma_queue *queue;
 
        struct ib_cqe read_cqe;
+       struct ib_cqe write_cqe;
        struct rdma_rw_ctx rw;
 
        struct nvmet_req req;
@@ -161,6 +163,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
 static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
+static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc);
 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
 static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
@@ -423,6 +426,9 @@ static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
        /* Data In / RDMA READ */
        r->read_cqe.done = nvmet_rdma_read_data_done;
 
+       /* Data Out / RDMA WRITE */
+       r->write_cqe.done = nvmet_rdma_write_data_done;
+
        return 0;
 
 out_free_rsp:
@@ -532,6 +538,129 @@ static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
        spin_unlock(&queue->rsp_wr_wait_lock);
 }
 
+static u16 nvmet_rdma_check_pi_status(struct ib_mr *sig_mr)
+{
+       struct ib_mr_status mr_status;
+       int ret;
+       u16 status = 0;
+
+       ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
+       if (ret) {
+               pr_err("ib_check_mr_status failed, ret %d\n", ret);
+               return NVME_SC_INVALID_PI;
+       }
+
+       if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
+               switch (mr_status.sig_err.err_type) {
+               case IB_SIG_BAD_GUARD:
+                       status = NVME_SC_GUARD_CHECK;
+                       break;
+               case IB_SIG_BAD_REFTAG:
+                       status = NVME_SC_REFTAG_CHECK;
+                       break;
+               case IB_SIG_BAD_APPTAG:
+                       status = NVME_SC_APPTAG_CHECK;
+                       break;
+               }
+               pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n",
+                      mr_status.sig_err.err_type,
+                      mr_status.sig_err.expected,
+                      mr_status.sig_err.actual);
+       }
+
+       return status;
+}
+
+static void nvmet_rdma_set_sig_domain(struct blk_integrity *bi,
+               struct nvme_command *cmd, struct ib_sig_domain *domain,
+               u16 control, u8 pi_type)
+{
+       domain->sig_type = IB_SIG_TYPE_T10_DIF;
+       domain->sig.dif.bg_type = IB_T10DIF_CRC;
+       domain->sig.dif.pi_interval = 1 << bi->interval_exp;
+       domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
+       if (control & NVME_RW_PRINFO_PRCHK_REF)
+               domain->sig.dif.ref_remap = true;
+
+       domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
+       domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
+       domain->sig.dif.app_escape = true;
+       if (pi_type == NVME_NS_DPS_PI_TYPE3)
+               domain->sig.dif.ref_escape = true;
+}
+
+static void nvmet_rdma_set_sig_attrs(struct nvmet_req *req,
+                                    struct ib_sig_attrs *sig_attrs)
+{
+       struct nvme_command *cmd = req->cmd;
+       u16 control = le16_to_cpu(cmd->rw.control);
+       u8 pi_type = req->ns->pi_type;
+       struct blk_integrity *bi;
+
+       bi = bdev_get_integrity(req->ns->bdev);
+
+       memset(sig_attrs, 0, sizeof(*sig_attrs));
+
+       if (control & NVME_RW_PRINFO_PRACT) {
+               /* for WRITE_INSERT/READ_STRIP no wire domain */
+               sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
+               nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
+                                         pi_type);
+               /* Clear the PRACT bit since HCA will generate/verify the PI */
+               control &= ~NVME_RW_PRINFO_PRACT;
+               cmd->rw.control = cpu_to_le16(control);
+               /* PI is added by the HW */
+               req->transfer_len += req->metadata_len;
+       } else {
+               /* for WRITE_PASS/READ_PASS both wire/memory domains exist */
+               nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
+                                         pi_type);
+               nvmet_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
+                                         pi_type);
+       }
+
+       if (control & NVME_RW_PRINFO_PRCHK_REF)
+               sig_attrs->check_mask |= IB_SIG_CHECK_REFTAG;
+       if (control & NVME_RW_PRINFO_PRCHK_GUARD)
+               sig_attrs->check_mask |= IB_SIG_CHECK_GUARD;
+       if (control & NVME_RW_PRINFO_PRCHK_APP)
+               sig_attrs->check_mask |= IB_SIG_CHECK_APPTAG;
+}
+
+static int nvmet_rdma_rw_ctx_init(struct nvmet_rdma_rsp *rsp, u64 addr, u32 key,
+                                 struct ib_sig_attrs *sig_attrs)
+{
+       struct rdma_cm_id *cm_id = rsp->queue->cm_id;
+       struct nvmet_req *req = &rsp->req;
+       int ret;
+
+       if (req->metadata_len)
+               ret = rdma_rw_ctx_signature_init(&rsp->rw, cm_id->qp,
+                       cm_id->port_num, req->sg, req->sg_cnt,
+                       req->metadata_sg, req->metadata_sg_cnt, sig_attrs,
+                       addr, key, nvmet_data_dir(req));
+       else
+               ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
+                                      req->sg, req->sg_cnt, 0, addr, key,
+                                      nvmet_data_dir(req));
+
+       return ret;
+}
+
+static void nvmet_rdma_rw_ctx_destroy(struct nvmet_rdma_rsp *rsp)
+{
+       struct rdma_cm_id *cm_id = rsp->queue->cm_id;
+       struct nvmet_req *req = &rsp->req;
+
+       if (req->metadata_len)
+               rdma_rw_ctx_destroy_signature(&rsp->rw, cm_id->qp,
+                       cm_id->port_num, req->sg, req->sg_cnt,
+                       req->metadata_sg, req->metadata_sg_cnt,
+                       nvmet_data_dir(req));
+       else
+               rdma_rw_ctx_destroy(&rsp->rw, cm_id->qp, cm_id->port_num,
+                                   req->sg, req->sg_cnt, nvmet_data_dir(req));
+}
+
 static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
 {
@@ -539,11 +668,8 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
        atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
 
-       if (rsp->n_rdma) {
-               rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
-                               queue->cm_id->port_num, rsp->req.sg,
-                               rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
-       }
+       if (rsp->n_rdma)
+               nvmet_rdma_rw_ctx_destroy(rsp);
 
        if (rsp->req.sg != rsp->cmd->inline_sg)
                nvmet_req_free_sgls(&rsp->req);
@@ -598,11 +724,16 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req)
                rsp->send_wr.opcode = IB_WR_SEND;
        }
 
-       if (nvmet_rdma_need_data_out(rsp))
-               first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
-                               cm_id->port_num, NULL, &rsp->send_wr);
-       else
+       if (nvmet_rdma_need_data_out(rsp)) {
+               if (rsp->req.metadata_len)
+                       first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
+                                       cm_id->port_num, &rsp->write_cqe, NULL);
+               else
+                       first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
+                                       cm_id->port_num, NULL, &rsp->send_wr);
+       } else {
                first_wr = &rsp->send_wr;
+       }
 
        nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);
@@ -621,15 +752,14 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
        struct nvmet_rdma_rsp *rsp =
                container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
        struct nvmet_rdma_queue *queue = cq->cq_context;
+       u16 status = 0;
 
        WARN_ON(rsp->n_rdma <= 0);
        atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
-       rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
-                       queue->cm_id->port_num, rsp->req.sg,
-                       rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
        rsp->n_rdma = 0;
 
        if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               nvmet_rdma_rw_ctx_destroy(rsp);
                nvmet_req_uninit(&rsp->req);
                nvmet_rdma_release_rsp(rsp);
                if (wc->status != IB_WC_WR_FLUSH_ERR) {
@@ -640,9 +770,60 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
                return;
        }
 
-       rsp->req.execute(&rsp->req);
+       if (rsp->req.metadata_len)
+               status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr);
+       nvmet_rdma_rw_ctx_destroy(rsp);
+       if (unlikely(status))
+               nvmet_req_complete(&rsp->req, status);
+       else
+               rsp->req.execute(&rsp->req);
 }
 
+static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+       struct nvmet_rdma_rsp *rsp =
+               container_of(wc->wr_cqe, struct nvmet_rdma_rsp, write_cqe);
+       struct nvmet_rdma_queue *queue = cq->cq_context;
+       struct rdma_cm_id *cm_id = rsp->queue->cm_id;
+       u16 status;
+
+       if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY))
+               return;
+
+       WARN_ON(rsp->n_rdma <= 0);
+       atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
+       rsp->n_rdma = 0;
+
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               nvmet_rdma_rw_ctx_destroy(rsp);
+               nvmet_req_uninit(&rsp->req);
+               nvmet_rdma_release_rsp(rsp);
+               if (wc->status != IB_WC_WR_FLUSH_ERR) {
+                       pr_info("RDMA WRITE for CQE 0x%p failed with status %s (%d).\n",
+                               wc->wr_cqe, ib_wc_status_msg(wc->status),
+                               wc->status);
+                       nvmet_rdma_error_comp(queue);
+               }
+               return;
+       }
+
+       /*
+        * Upon RDMA completion check the signature status
+        * - if succeeded send good NVMe response
+        * - if failed send bad NVMe response with appropriate error
+        */
+       status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr);
+       if (unlikely(status))
+               rsp->req.cqe->status = cpu_to_le16(status << 1);
+       nvmet_rdma_rw_ctx_destroy(rsp);
+
+       if (unlikely(ib_post_send(cm_id->qp, &rsp->send_wr, NULL))) {
+               pr_err("sending cmd response failed\n");
+               nvmet_rdma_release_rsp(rsp);
+       }
+}
+
 static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
                u64 off)
 {
@@ -697,9 +878,9 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
 static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
                struct nvme_keyed_sgl_desc *sgl, bool invalidate)
 {
-       struct rdma_cm_id *cm_id = rsp->queue->cm_id;
        u64 addr = le64_to_cpu(sgl->addr);
        u32 key = get_unaligned_le32(sgl->key);
+       struct ib_sig_attrs sig_attrs;
        int ret;
 
        rsp->req.transfer_len = get_unaligned_le24(sgl->length);
@@ -708,13 +889,14 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
        if (!rsp->req.transfer_len)
                return 0;
 
+       if (rsp->req.metadata_len)
+               nvmet_rdma_set_sig_attrs(&rsp->req, &sig_attrs);
+
        ret = nvmet_req_alloc_sgls(&rsp->req);
        if (unlikely(ret < 0))
                goto error_out;
 
-       ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
-                       rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
-                       nvmet_data_dir(&rsp->req));
+       ret = nvmet_rdma_rw_ctx_init(rsp, addr, key, &sig_attrs);
        if (unlikely(ret < 0))
                goto error_out;
        rsp->n_rdma += ret;
@@ -1108,6 +1290,9 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
                qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
        }
 
+       if (queue->port->pi_enable && queue->host_qid)
+               qp_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
+
        ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
        if (ret) {
                pr_err("failed to create_qp ret= %d\n", ret);
@@ -1226,6 +1411,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
                struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event)
 {
+       struct nvmet_rdma_port *port = cm_id->context;
        struct nvmet_rdma_queue *queue;
        int ret;
@@ -1252,6 +1438,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
        INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
        queue->dev = ndev;
        queue->cm_id = cm_id;
+       queue->port = port->nport;
 
        spin_lock_init(&queue->state_lock);
        queue->state = NVMET_RDMA_Q_CONNECTING;
@@ -1369,7 +1556,6 @@ static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
 static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event)
 {
-       struct nvmet_rdma_port *port = cm_id->context;
        struct nvmet_rdma_device *ndev;
        struct nvmet_rdma_queue *queue;
        int ret = -EINVAL;
@@ -1385,7 +1571,6 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
                ret = -ENOMEM;
                goto put_device;
        }
-       queue->port = port->nport;
 
        if (queue->host_qid == 0) {
                /* Let inflight controller teardown complete */
@@ -1657,6 +1842,14 @@ static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port)
                goto out_destroy_id;
        }
 
+       if (port->nport->pi_enable &&
+           !(cm_id->device->attrs.device_cap_flags &
+             IB_DEVICE_INTEGRITY_HANDOVER)) {
+               pr_err("T10-PI is not supported for %pISpcs\n", addr);
+               ret = -EINVAL;
+               goto out_destroy_id;
+       }
+
        port->cm_id = cm_id;
        return 0;
@@ -1766,6 +1959,8 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
 
 static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl)
 {
+       if (ctrl->pi_support)
+               return NVMET_RDMA_MAX_METADATA_MDTS;
        return NVMET_RDMA_MAX_MDTS;
 }
@@ -1774,6 +1969,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
        .type = NVMF_TRTYPE_RDMA,
        .msdbd = 1,
        .has_keyed_sgls = 1,
+       .metadata_support = 1,
        .add_port = nvmet_rdma_add_port,
        .remove_port = nvmet_rdma_remove_port,
        .queue_response = nvmet_rdma_queue_response,