Commit 4f5099af authored by Keith Busch, committed by Matthew Wilcox

NVMe: IOCTL path RCU protect queue access

This adds RCU-protected access to a queue in the NVMe IOCTL path
to fix potential races between a surprise removal and queue usage in
nvme_submit_sync_cmd. The fix holds rcu_read_lock() across the queue
access to prevent the nvme_queue from being freed while this path is
executing; since the path therefore cannot sleep, it no longer waits
for an available command id should they all be in use at the time a
passthrough IOCTL request is received.
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
parent 5a92e700
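
The message describes the standard RCU read-side pattern: readers may
dereference the queue pointer only between rcu_read_lock() and
rcu_read_unlock() and must not sleep in that window, while the removal
path unpublishes the pointer and waits for a grace period before freeing
it. Below is a minimal kernel-style sketch of that pattern; the foo_dev /
foo_queue names are hypothetical illustrations, not code from this driver.

/* Hypothetical sketch of the RCU scheme described above; not driver code. */
#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo_queue {
        int qid;
};

struct foo_dev {
        struct foo_queue __rcu *queue;  /* published/unpublished under RCU */
};

/* Reader side: safe against a concurrent removal, but must not sleep. */
static int foo_use_queue(struct foo_dev *dev)
{
        struct foo_queue *q;
        int ret = -ENODEV;

        rcu_read_lock();
        q = rcu_dereference(dev->queue);
        if (q)
                ret = q->qid;           /* use the queue; no sleeping here */
        rcu_read_unlock();
        return ret;
}

/* Removal side: unpublish the pointer, wait out readers, then free. */
static void foo_remove_queue(struct foo_dev *dev)
{
        struct foo_queue *q = rcu_dereference_protected(dev->queue, 1);

        RCU_INIT_POINTER(dev->queue, NULL);
        synchronize_rcu();              /* every rcu_read_lock() section ends */
        kfree(q);
}

Because the reader holds only rcu_read_lock(), it cannot block waiting for
resources such as a free command id; that is why the diff below makes the
passthrough path fail fast (-EBUSY on a suspended queue, -ENODEV on a
missing one) instead of sleeping.
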
@@ -268,18 +268,30 @@ static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
 	return rcu_dereference_raw(dev->queues[qid]);
 }
 
-struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
+static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
 {
 	rcu_read_lock();
 	return rcu_dereference(dev->queues[get_cpu() + 1]);
 }
 
-void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
+static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
 {
 	put_cpu();
 	rcu_read_unlock();
 }
 
+static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx)
+							__acquires(RCU)
+{
+	rcu_read_lock();
+	return rcu_dereference(dev->queues[q_idx]);
+}
+
+static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
+{
+	rcu_read_unlock();
+}
+
 /**
  * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
  * @nvmeq: The queue to use
@@ -292,6 +304,10 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
 	unsigned long flags;
 	u16 tail;
 	spin_lock_irqsave(&nvmeq->q_lock, flags);
+	if (nvmeq->q_suspended) {
+		spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+		return -EBUSY;
+	}
 	tail = nvmeq->sq_tail;
 	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
 	if (++tail == nvmeq->q_depth)
@@ -812,27 +828,46 @@ static void sync_completion(struct nvme_dev *dev, void *ctx,
  * Returns 0 on success. If the result is negative, it's a Linux error code;
  * if the result is positive, it's an NVM Express status code
  */
-int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
+static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx,
+						struct nvme_command *cmd,
 						u32 *result, unsigned timeout)
 {
-	int cmdid;
+	int cmdid, ret;
 	struct sync_cmd_info cmdinfo;
+	struct nvme_queue *nvmeq;
+
+	nvmeq = lock_nvmeq(dev, q_idx);
+	if (!nvmeq) {
+		unlock_nvmeq(nvmeq);
+		return -ENODEV;
+	}
 
 	cmdinfo.task = current;
 	cmdinfo.status = -EINTR;
 
-	cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
-								timeout);
-	if (cmdid < 0)
+	cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout);
+	if (cmdid < 0) {
+		unlock_nvmeq(nvmeq);
 		return cmdid;
+	}
 	cmd->common.command_id = cmdid;
 
 	set_current_state(TASK_KILLABLE);
-	nvme_submit_cmd(nvmeq, cmd);
+	ret = nvme_submit_cmd(nvmeq, cmd);
+	if (ret) {
+		free_cmdid(nvmeq, cmdid, NULL);
+		unlock_nvmeq(nvmeq);
+		set_current_state(TASK_RUNNING);
+		return ret;
+	}
+	unlock_nvmeq(nvmeq);
 	schedule_timeout(timeout);
 
 	if (cmdinfo.status == -EINTR) {
-		nvme_abort_command(nvmeq, cmdid);
+		nvmeq = lock_nvmeq(dev, q_idx);
+		if (nvmeq)
+			nvme_abort_command(nvmeq, cmdid);
+		unlock_nvmeq(nvmeq);
 		return -EINTR;
 	}
@@ -853,15 +888,20 @@ static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
 		return cmdid;
 	cmdinfo->status = -EINTR;
 	cmd->common.command_id = cmdid;
-	nvme_submit_cmd(nvmeq, cmd);
-	return 0;
+	return nvme_submit_cmd(nvmeq, cmd);
 }
 
 int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
 						u32 *result)
 {
-	return nvme_submit_sync_cmd(raw_nvmeq(dev, 0), cmd, result,
-								ADMIN_TIMEOUT);
+	return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT);
+}
+
+int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+						u32 *result)
+{
+	return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result,
+						NVME_IO_TIMEOUT);
 }
 
 static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
@@ -1434,7 +1474,6 @@ void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 {
 	struct nvme_dev *dev = ns->dev;
-	struct nvme_queue *nvmeq;
 	struct nvme_user_io io;
 	struct nvme_command c;
 	unsigned length, meta_len;
@@ -1510,20 +1549,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);
 
-	nvmeq = get_nvmeq(dev);
-	/*
-	 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
-	 * disabled. We may be preempted at any point, and be rescheduled
-	 * to a different CPU. That will cause cacheline bouncing, but no
-	 * additional races since q_lock already protects against other CPUs.
-	 */
-	put_nvmeq(nvmeq);
 	if (length != (io.nblocks + 1) << ns->lba_shift)
 		status = -ENOMEM;
-	else if (!nvmeq || nvmeq->q_suspended)
-		status = -EBUSY;
 	else
-		status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+		status = nvme_submit_io_cmd(dev, &c, NULL);
 
 	if (meta_len) {
 		if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
@@ -1597,8 +1626,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev,
 	if (length != cmd.data_len)
 		status = -ENOMEM;
 	else
-		status = nvme_submit_sync_cmd(raw_nvmeq(dev, 0), &c,
-						&cmd.result, timeout);
+		status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout);
 
 	if (cmd.data_len) {
 		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
...
@@ -2033,7 +2033,6 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
 	struct nvme_dev *dev = ns->dev;
-	struct nvme_queue *nvmeq;
 	u32 num_cmds;
 	struct nvme_iod *iod;
 	u64 unit_len;
@@ -2106,18 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		nvme_offset += unit_num_blocks;
 
-		nvmeq = get_nvmeq(dev);
-		/*
-		 * Since nvme_submit_sync_cmd sleeps, we can't keep
-		 * preemption disabled. We may be preempted at any
-		 * point, and be rescheduled to a different CPU. That
-		 * will cause cacheline bouncing, but no additional
-		 * races since q_lock already protects against other
-		 * CPUs.
-		 */
-		put_nvmeq(nvmeq);
-		nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL,
-						NVME_IO_TIMEOUT);
+		nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
 		if (nvme_sc != NVME_SC_SUCCESS) {
 			nvme_unmap_user_pages(dev,
 				(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
@@ -2644,7 +2632,6 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 {
 	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
-	struct nvme_queue *nvmeq;
 	struct nvme_command c;
 	u8 immed, pcmod, pc, no_flush, start;
@@ -2671,10 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 		c.common.opcode = nvme_cmd_flush;
 		c.common.nsid = cpu_to_le32(ns->ns_id);
 
-		nvmeq = get_nvmeq(ns->dev);
-		put_nvmeq(nvmeq);
-		nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+		nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
 
 		res = nvme_trans_status_code(hdr, nvme_sc);
 		if (res)
 			goto out;
@@ -2697,15 +2681,12 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
 	int res = SNTI_TRANSLATION_SUCCESS;
 	int nvme_sc;
 	struct nvme_command c;
-	struct nvme_queue *nvmeq;
 
 	memset(&c, 0, sizeof(c));
 	c.common.opcode = nvme_cmd_flush;
 	c.common.nsid = cpu_to_le32(ns->ns_id);
 
-	nvmeq = get_nvmeq(ns->dev);
-	put_nvmeq(nvmeq);
-	nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+	nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
 
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	if (res)
@@ -2872,7 +2853,6 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	struct nvme_dev *dev = ns->dev;
 	struct scsi_unmap_parm_list *plist;
 	struct nvme_dsm_range *range;
-	struct nvme_queue *nvmeq;
 	struct nvme_command c;
 	int i, nvme_sc, res = -ENOMEM;
 	u16 ndesc, list_len;
@@ -2914,10 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 	c.dsm.nr = cpu_to_le32(ndesc - 1);
 	c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
 
-	nvmeq = get_nvmeq(dev);
-	put_nvmeq(nvmeq);
-	nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);
+	nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
 
 	res = nvme_trans_status_code(hdr, nvme_sc);
 	dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
...
@@ -151,10 +151,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 				unsigned long addr, unsigned length);
 void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
 				struct nvme_iod *iod);
-struct nvme_queue *get_nvmeq(struct nvme_dev *dev);
-void put_nvmeq(struct nvme_queue *nvmeq);
-int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
-				u32 *result, unsigned timeout);
+int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_command *, u32 *);
 int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns);
 int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *,
 				u32 *result);
...