Commit 704b914f authored by Ming Lei, committed by Jens Axboe

blk-mq: move srcu from blk_mq_hw_ctx to request_queue

In case of BLK_MQ_F_BLOCKING, a per-hctx srcu instance is used to protect the
dispatch critical section. However, this srcu instance sits at the end of the
hctx structure and usually occupies a standalone, often cold, cacheline.

Inside srcu_read_lock() and srcu_read_unlock(), writes always go to the
indirect percpu variables, which are allocated from the heap rather than
embedded in the structure, and srcu->srcu_idx is only read in
srcu_read_lock(). So it does not matter whether the srcu structure lives in
the hctx or in the request queue.

So switch to a per-request-queue srcu instance for protecting dispatch. This
also simplifies quiesce a lot, not to mention that quiesce is always done
queue-wide anyway.
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20211203131534.3668411-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 2a904d00
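For orientation before the hunks: the sketch below is an illustrative-only rendering, written as an ordinary function rather than the dispatch-ops macro that the block/blk-mq.h hunk updates, of what dispatch protection amounts to once the SRCU instance lives in the request queue. The helper name blk_mq_dispatch_protected() and its function-pointer signature are made up for illustration; blocking hctxs take the read lock on q->srcu, non-blocking ones keep using plain RCU.

/*
 * Illustrative sketch only -- not code from this patch.  It spells out,
 * as a plain function, the protection scheme the updated dispatch-ops
 * macro implements: per-request-queue SRCU for BLK_MQ_F_BLOCKING hctxs,
 * plain RCU otherwise.
 */
static inline void blk_mq_dispatch_protected(struct blk_mq_hw_ctx *hctx,
		void (*dispatch)(struct blk_mq_hw_ctx *hctx))
{
	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
		/* non-blocking drivers: dispatch under plain RCU */
		rcu_read_lock();
		dispatch(hctx);
		rcu_read_unlock();
	} else {
		int srcu_idx;

		might_sleep();
		/* blocking drivers: read-lock the per-queue SRCU instance */
		srcu_idx = srcu_read_lock(hctx->queue->srcu);
		dispatch(hctx);
		srcu_read_unlock(hctx->queue->srcu, srcu_idx);
	}
}

Quiescing then only has to look at the queue: blk_queue_has_srcu(q) selects between synchronize_srcu(q->srcu) and synchronize_rcu(), with no per-hctx iteration left, as the blk_mq_wait_quiesce_done() hunk below shows.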
@@ -66,6 +66,7 @@ DEFINE_IDA(blk_queue_ida);
  * For queue allocation
  */
 struct kmem_cache *blk_requestq_cachep;
+struct kmem_cache *blk_requestq_srcu_cachep;
 
 /*
  * Controlling structure to kblockd
@@ -437,21 +438,27 @@ static void blk_timeout_work(struct work_struct *work)
 {
 }
 
-struct request_queue *blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 {
 	struct request_queue *q;
 	int ret;
 
-	q = kmem_cache_alloc_node(blk_requestq_cachep,
+	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
 			GFP_KERNEL | __GFP_ZERO, node_id);
 	if (!q)
 		return NULL;
 
+	if (alloc_srcu) {
+		blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
+		if (init_srcu_struct(q->srcu) != 0)
+			goto fail_q;
+	}
+
 	q->last_merge = NULL;
 
 	q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
 	if (q->id < 0)
-		goto fail_q;
+		goto fail_srcu;
 
 	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
 	if (ret)
@@ -508,8 +515,11 @@ struct request_queue *blk_alloc_queue(int node_id)
 	bioset_exit(&q->bio_split);
 fail_id:
 	ida_simple_remove(&blk_queue_ida, q->id);
+fail_srcu:
+	if (alloc_srcu)
+		cleanup_srcu_struct(q->srcu);
 fail_q:
-	kmem_cache_free(blk_requestq_cachep, q);
+	kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
 	return NULL;
 }
@@ -1301,6 +1311,9 @@ int __init blk_dev_init(void)
 			sizeof_field(struct request, cmd_flags));
 	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
 			sizeof_field(struct bio, bi_opf));
+	BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
+			   __alignof__(struct request_queue)) !=
+		     sizeof(struct request_queue));
 
 	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
 	kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1311,6 +1324,10 @@ int __init blk_dev_init(void)
 	blk_requestq_cachep = kmem_cache_create("request_queue",
 			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
+	blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
+			sizeof(struct request_queue) +
+			sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
+
 	blk_debugfs_root = debugfs_create_dir("block", NULL);
 
 	return 0;
......
@@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
 	struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
 						  kobj);
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		cleanup_srcu_struct(hctx->srcu);
 	blk_free_flush_queue(hctx->fq);
 	sbitmap_free(&hctx->ctx_map);
 	free_cpumask_var(hctx->cpumask);
......
@@ -260,17 +260,9 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
  */
 void blk_mq_wait_quiesce_done(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
-	bool rcu = false;
-
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->flags & BLK_MQ_F_BLOCKING)
-			synchronize_srcu(hctx->srcu);
-		else
-			rcu = true;
-	}
-
-	if (rcu)
+	if (blk_queue_has_srcu(q))
+		synchronize_srcu(q->srcu);
+	else
 		synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
@@ -3400,20 +3392,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
 	}
 }
 
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
-	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
-	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
-			   __alignof__(struct blk_mq_hw_ctx)) !=
-		     sizeof(struct blk_mq_hw_ctx));
-
-	if (tag_set->flags & BLK_MQ_F_BLOCKING)
-		hw_ctx_size += sizeof(struct srcu_struct);
-
-	return hw_ctx_size;
-}
-
 static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -3451,7 +3429,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	struct blk_mq_hw_ctx *hctx;
 	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
 
-	hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+	hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
 	if (!hctx)
 		goto fail_alloc_hctx;
@@ -3493,8 +3471,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	if (!hctx->fq)
 		goto free_bitmap;
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		init_srcu_struct(hctx->srcu);
-
 	blk_mq_hctx_kobj_init(hctx);
 
 	return hctx;
@@ -3830,7 +3806,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 	struct request_queue *q;
 	int ret;
 
-	q = blk_alloc_queue(set->numa_node);
+	q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
 	if (!q)
 		return ERR_PTR(-ENOMEM);
 	q->queuedata = queuedata;
@@ -3979,6 +3955,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		struct request_queue *q)
 {
+	WARN_ON_ONCE(blk_queue_has_srcu(q) !=
+			!!(set->flags & BLK_MQ_F_BLOCKING));
+
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
......
@@ -385,9 +385,9 @@ do {								\
 		int srcu_idx;					\
 								\
 		might_sleep();					\
-		srcu_idx = srcu_read_lock((hctx)->srcu);	\
+		srcu_idx = srcu_read_lock((hctx)->queue->srcu);	\
 		(dispatch_ops);					\
-		srcu_read_unlock((hctx)->srcu, srcu_idx);	\
+		srcu_read_unlock((hctx)->queue->srcu, srcu_idx); \
 	}							\
 } while (0)
......
@@ -735,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 {
 	struct request_queue *q = container_of(rcu_head, struct request_queue,
 					       rcu_head);
-	kmem_cache_free(blk_requestq_cachep, q);
+
+	kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
 }
 
 /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
......
@@ -27,6 +27,7 @@ struct blk_flush_queue {
 };
 
 extern struct kmem_cache *blk_requestq_cachep;
+extern struct kmem_cache *blk_requestq_srcu_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
@@ -424,7 +425,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		struct page *page, unsigned int len, unsigned int offset,
 		unsigned int max_sectors, bool *same_page);
 
-struct request_queue *blk_alloc_queue(int node_id);
+static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
+{
+	if (srcu)
+		return blk_requestq_srcu_cachep;
+	return blk_requestq_cachep;
+}
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
 
 int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
 int disk_alloc_events(struct gendisk *disk);
......
@@ -1338,7 +1338,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
 	struct request_queue *q;
 	struct gendisk *disk;
 
-	q = blk_alloc_queue(node);
+	q = blk_alloc_queue(node, false);
 	if (!q)
 		return NULL;
......
@@ -4,7 +4,6 @@
 
 #include <linux/blkdev.h>
 #include <linux/sbitmap.h>
-#include <linux/srcu.h>
 #include <linux/lockdep.h>
 #include <linux/scatterlist.h>
 #include <linux/prefetch.h>
@@ -375,13 +374,6 @@ struct blk_mq_hw_ctx {
 	 * q->unused_hctx_list.
 	 */
 	struct list_head	hctx_list;
-
-	/**
-	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
-	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
-	 * blk_mq_hw_ctx_size().
-	 */
-	struct srcu_struct	srcu[];
 };
 
 /**
......
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/blkzoned.h>
 #include <linux/sbitmap.h>
+#include <linux/srcu.h>
 
 struct module;
 struct request_queue;
@@ -373,11 +374,18 @@ struct request_queue {
 	 * devices that do not have multiple independent access ranges.
 	 */
 	struct blk_independent_access_ranges *ia_ranges;
+
+	/**
+	 * @srcu: Sleepable RCU. Use as lock when type of the request queue
+	 * is blocking (BLK_MQ_F_BLOCKING). Must be the last member
+	 */
+	struct srcu_struct	srcu[];
 };
 
 /* Keep blk_queue_flag_name[] in sync with the definitions below */
 #define QUEUE_FLAG_STOPPED	0	/* queue is stopped */
 #define QUEUE_FLAG_DYING	1	/* queue being torn down */
+#define QUEUE_FLAG_HAS_SRCU	2	/* SRCU is allocated */
 #define QUEUE_FLAG_NOMERGES	3	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP	4	/* complete on same CPU-group */
 #define QUEUE_FLAG_FAIL_IO	5	/* fake timeout */
@@ -415,6 +423,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
+#define blk_queue_has_srcu(q)	test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags)
 #define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
 #define blk_queue_init_done(q)	test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
......