Commit bf0beec0 authored by Ming Lei, committed by Jens Axboe

blk-mq: drain I/O when all CPUs in a hctx are offline

Most blk-mq drivers depend on the automatic affinity of managed IRQs to set
up their queue mapping. Thomas mentioned the following point[1]:

"That was the constraint of managed interrupts from the very beginning:

 The driver/subsystem has to quiesce the interrupt line and the associated
 queue _before_ it gets shutdown in CPU unplug and not fiddle with it
 until it's restarted by the core when the CPU is plugged in again."
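
For illustration only (not part of this patch): a minimal sketch of how a PCI
driver typically lets the managed-IRQ affinity drive its hctx <-> CPU mapping.
The structure and names (my_ctrl, my_map_queues, my_mq_ops) are hypothetical.

#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
#include <linux/pci.h>

struct my_ctrl {
        struct pci_dev          *pdev;
        struct blk_mq_tag_set   tag_set;
};

/* Derive the hctx <-> CPU mapping from the managed IRQ affinity. */
static int my_map_queues(struct blk_mq_tag_set *set)
{
        struct my_ctrl *ctrl = set->driver_data;

        return blk_mq_pci_map_queues(&set->map[HCTX_TYPE_DEFAULT],
                                     ctrl->pdev, 0);
}

static const struct blk_mq_ops my_mq_ops = {
        /* .queue_rq and the other callbacks are elided */
        .map_queues     = my_map_queues,
};

With such a mapping, each hctx is served by exactly the CPUs of one managed
IRQ, which is why the hctx has to be drained before its last CPU, and thus
its interrupt, goes away.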

However, the current blk-mq implementation does not quiesce the hw queue
before the last CPU in the hctx is shut down.  Even worse, CPUHP_BLK_MQ_DEAD
is a cpuhp state handled after the CPU is down, so there is no chance to
quiesce the hctx before shutting down the CPU.

Add a new CPUHP_AP_BLK_MQ_ONLINE state to stop allocating from blk-mq hctxs
whose last CPU is going away, and to wait for completion of in-flight
requests.  This guarantees that there is no in-flight I/O before the managed
IRQ is shut down.

Add a BLK_MQ_F_STACKING flag and set it for dm-rq and loop, so we do not need
to wait for completion of in-flight requests from these drivers, which avoids
a potential deadlock.  This is safe for stacking drivers because they do not
use interrupts at all and their I/O completions are triggered by the
underlying devices' completions.
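
For illustration only (the real loop and dm-rq hunks are in the diff below),
a hedged sketch of how a stacking driver would opt out when setting up its
tag set; my_stacking_dev and my_stacking_init_tag_set are hypothetical names.

#include <linux/blk-mq.h>
#include <linux/numa.h>

struct my_stacking_dev {
        struct blk_mq_tag_set   tag_set;
};

static int my_stacking_init_tag_set(struct my_stacking_dev *dev,
                                    const struct blk_mq_ops *ops)
{
        /*
         * A stacking driver has no interrupt line of its own; its requests
         * complete from the lower device's completion path.  Marking the
         * tag set with BLK_MQ_F_STACKING therefore skips the new
         * CPUHP_AP_BLK_MQ_ONLINE drain for its hctxs.
         */
        dev->tag_set.ops = ops;
        dev->tag_set.nr_hw_queues = 1;
        dev->tag_set.queue_depth = 128;
        dev->tag_set.numa_node = NUMA_NO_NODE;
        dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
        dev->tag_set.driver_data = dev;

        return blk_mq_alloc_tag_set(&dev->tag_set);
}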

[1] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/

[hch: different retry mechanism, merged two patches, minor cleanups]
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 602380d2
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
 	HCTX_STATE_NAME(TAG_ACTIVE),
 	HCTX_STATE_NAME(SCHED_RESTART),
+	HCTX_STATE_NAME(INACTIVE),
 };
 #undef HCTX_STATE_NAME
@@ -239,6 +240,7 @@ static const char *const hctx_flag_name[] = {
 	HCTX_FLAG_NAME(TAG_SHARED),
 	HCTX_FLAG_NAME(BLOCKING),
 	HCTX_FLAG_NAME(NO_SCHED),
+	HCTX_FLAG_NAME(STACKING),
 };
 #undef HCTX_FLAG_NAME
...
@@ -180,6 +180,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
+	/*
+	 * Give up this allocation if the hctx is inactive.  The caller will
+	 * retry on an active hctx.
+	 */
+	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
+		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
+		return BLK_MQ_NO_TAG;
+	}
 	return tag + tag_offset;
 }
...
@@ -375,14 +375,30 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 			e->type->ops.limit_depth(data->cmd_flags, data);
 	}
 
+retry:
 	data->ctx = blk_mq_get_ctx(q);
 	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
 	if (!(data->flags & BLK_MQ_REQ_INTERNAL))
 		blk_mq_tag_busy(data->hctx);
 
+	/*
+	 * Waiting allocations only fail because of an inactive hctx.  In that
+	 * case just retry the hctx assignment and tag allocation as CPU hotplug
+	 * should have migrated us to an online CPU by now.
+	 */
 	tag = blk_mq_get_tag(data);
-	if (tag == BLK_MQ_NO_TAG)
-		return NULL;
+	if (tag == BLK_MQ_NO_TAG) {
+		if (data->flags & BLK_MQ_REQ_NOWAIT)
+			return NULL;
+
+		/*
+		 * Give up the CPU and sleep for a random short time to ensure
+		 * that threads using a realtime scheduling class are migrated
+		 * off the CPU, and thus off the hctx that is going away.
+		 */
+		msleep(3);
+		goto retry;
+	}
 	return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
 }
@@ -2335,6 +2351,86 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	return -ENOMEM;
 }
 
+struct rq_iter_data {
+	struct blk_mq_hw_ctx *hctx;
+	bool has_rq;
+};
+
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+{
+	struct rq_iter_data *iter_data = data;
+
+	if (rq->mq_hctx != iter_data->hctx)
+		return true;
+	iter_data->has_rq = true;
+	return false;
+}
+
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->sched_tags ?
+			hctx->sched_tags : hctx->tags;
+	struct rq_iter_data data = {
+		.hctx	= hctx,
+	};
+
+	blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+	return data.has_rq;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+		struct blk_mq_hw_ctx *hctx)
+{
+	if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+		return false;
+	if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+		return false;
+	return true;
+}
+
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
+{
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
+	    !blk_mq_last_cpu_in_hctx(cpu, hctx))
+		return 0;
+
+	/*
+	 * Prevent new requests from being allocated on the current hctx.
+	 *
+	 * The smp_mb__after_atomic() pairs with the implied barrier in
+	 * test_and_set_bit_lock() in sbitmap_get().  It ensures the inactive
+	 * flag is seen once we return from the tag allocator.
+	 */
+	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+	smp_mb__after_atomic();
+
+	/*
+	 * Try to grab a reference to the queue and wait for any outstanding
+	 * requests.  If we could not grab a reference the queue has been
+	 * frozen and there are no requests.
+	 */
+	if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
+		while (blk_mq_hctx_has_requests(hctx))
+			msleep(5);
+		percpu_ref_put(&hctx->queue->q_usage_counter);
+	}
+
+	return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (cpumask_test_cpu(cpu, hctx->cpumask))
+		clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+	return 0;
+}
+
 /*
  * 'cpu' is going away. splice any existing rq_list entries from this
  * software queue to the hw queue dispatch list, and ensure that it
@@ -2348,6 +2444,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 	enum hctx_type type;
 
 	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
 	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
 	type = hctx->type;
@@ -2371,6 +2470,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 
 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
 {
+	if (!(hctx->flags & BLK_MQ_F_STACKING))
+		cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+						    &hctx->cpuhp_online);
 	cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
 					    &hctx->cpuhp_dead);
 }
@@ -2430,6 +2532,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
 {
 	hctx->queue_num = hctx_idx;
 
+	if (!(hctx->flags & BLK_MQ_F_STACKING))
+		cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+				&hctx->cpuhp_online);
 	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
 	hctx->tags = set->tags[hctx_idx];
@@ -3684,6 +3789,9 @@ static int __init blk_mq_init(void)
 {
 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
 				blk_mq_hctx_notify_dead);
+	cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
+				blk_mq_hctx_notify_online,
+				blk_mq_hctx_notify_offline);
 	return 0;
 }
 subsys_initcall(blk_mq_init);
...
@@ -2037,7 +2037,7 @@ static int loop_add(struct loop_device **l, int i)
 	lo->tag_set.queue_depth = 128;
 	lo->tag_set.numa_node = NUMA_NO_NODE;
 	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
 	lo->tag_set.driver_data = lo;
 
 	err = blk_mq_alloc_tag_set(&lo->tag_set);
...
@@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
 	md->tag_set->ops = &dm_mq_ops;
 	md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
 	md->tag_set->numa_node = md->numa_node_id;
-	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
+	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
 	md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
 	md->tag_set->driver_data = md;
...
@@ -140,6 +140,8 @@ struct blk_mq_hw_ctx {
 	 */
 	atomic_t		nr_active;
 
+	/** @cpuhp_online: List to store request if CPU is going to die */
+	struct hlist_node	cpuhp_online;
 	/** @cpuhp_dead: List to store request if some CPU die. */
 	struct hlist_node	cpuhp_dead;
 	/** @kobj: Kernel object for sysfs. */
@@ -391,6 +393,11 @@ struct blk_mq_ops {
 enum {
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_TAG_SHARED	= 1 << 1,
+	/*
+	 * Set when this device requires an underlying blk-mq device for
+	 * completing IO.
+	 */
+	BLK_MQ_F_STACKING	= 1 << 2,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
 	BLK_MQ_F_NO_SCHED	= 1 << 6,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
@@ -400,6 +407,9 @@ enum {
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
 
+	/* hw queue is inactive after all its CPUs become offline */
+	BLK_MQ_S_INACTIVE	= 3,
+
 	BLK_MQ_MAX_DEPTH	= 10240,
 
 	BLK_MQ_CPU_WORK_BATCH	= 8,
...
@@ -152,6 +152,7 @@ enum cpuhp_state {
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
+	CPUHP_AP_BLK_MQ_ONLINE,
 	CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
 	CPUHP_AP_X86_INTEL_EPB_ONLINE,
 	CPUHP_AP_PERF_ONLINE,
...