Commit 28d65729 authored by Salman Qazi, committed by Jens Axboe

block: Limit number of items taken from the I/O scheduler in one go

Flushes bypass the I/O scheduler and get added to hctx->dispatch
in blk_mq_sched_bypass_insert.  This can happen while a kworker is running
the hctx->run_work work item and is past the point in
blk_mq_sched_dispatch_requests where hctx->dispatch is checked.

The blk_mq_do_dispatch_sched call is not guaranteed to end in bounded time,
because the I/O scheduler can feed an arbitrary number of commands.

Since we have only one hctx->run_work, the commands waiting in
hctx->dispatch will wait an arbitrary length of time for run_work to be
rerun.

A similar phenomenon exists with dispatches from the software queue.

The solution is to poll hctx->dispatch in blk_mq_do_dispatch_sched and
blk_mq_do_dispatch_ctx and, when it is found non-empty, return from the
run_work handler so that it can be rerun.
Signed-off-by: Salman Qazi <sqazi@google.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 895d4775
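
Before the diff, here is a minimal userspace sketch of the bail-out pattern the patch introduces. This is illustrative standalone C, not kernel code: sched_remaining stands in for the I/O scheduler's backlog, dispatch_pending for hctx->dispatch, and -EAGAIN tells the caller to re-arm the work item instead of draining the scheduler to completion. All names here are invented for the example.

#include <errno.h>
#include <stdio.h>

static int sched_remaining = 1000;	/* scheduler can feed arbitrarily many commands */
static int dispatch_pending;		/* stand-in for hctx->dispatch */

/* Dequeue one command from the scheduler; 0 means it ran dry. */
static int sched_dequeue(void)
{
	return sched_remaining > 0 ? sched_remaining-- : 0;
}

/* Drain the scheduler, but yield as soon as priority work appears. */
static int do_dispatch_sched(void)
{
	do {
		if (dispatch_pending)
			return -EAGAIN;	/* let the worker requeue itself */
	} while (sched_dequeue());

	return 0;
}

int main(void)
{
	dispatch_pending = 1;	/* a flush arrived mid-dispatch */
	if (do_dispatch_sched() == -EAGAIN)
		printf("requeue run_work; %d commands left in scheduler\n",
		       sched_remaining);
	return 0;
}
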
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -86,12 +86,16 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again.  This is necessary to avoid starving flushes.
  */
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
+	int ret = 0;
 
 	do {
 		struct request *rq;
@@ -99,6 +103,11 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 			break;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = -EAGAIN;
+			break;
+		}
+
 		if (!blk_mq_get_dispatch_budget(hctx))
 			break;
 
@@ -123,6 +132,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		 */
 		list_add(&rq->queuelist, &rq_list);
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+	return ret;
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -140,16 +151,25 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
  * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again.  This is necessary to avoid starving flushes.
  */
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	LIST_HEAD(rq_list);
 	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+	int ret = 0;
 
 	do {
 		struct request *rq;
 
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = -EAGAIN;
+			break;
+		}
+
 		if (!sbitmap_any_bit_set(&hctx->ctx_map))
 			break;
 
@@ -183,21 +203,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
 
 	WRITE_ONCE(hctx->dispatch_from, ctx);
+	return ret;
 }
 
-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+	int ret = 0;
 	LIST_HEAD(rq_list);
 
-	/* RCU or SRCU read lock is needed before checking quiesced flag */
-	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
-		return;
-
-	hctx->run++;
-
 	/*
 	 * If we have previous entries on our dispatch list, grab them first for
 	 * more fair dispatch.
@@ -226,19 +242,41 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 		blk_mq_sched_mark_restart_hctx(hctx);
 		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
 			if (has_sched_dispatch)
-				blk_mq_do_dispatch_sched(hctx);
+				ret = blk_mq_do_dispatch_sched(hctx);
 			else
-				blk_mq_do_dispatch_ctx(hctx);
+				ret = blk_mq_do_dispatch_ctx(hctx);
 		}
 	} else if (has_sched_dispatch) {
-		blk_mq_do_dispatch_sched(hctx);
+		ret = blk_mq_do_dispatch_sched(hctx);
 	} else if (hctx->dispatch_busy) {
 		/* dequeue request one by one from sw queue if queue is busy */
-		blk_mq_do_dispatch_ctx(hctx);
+		ret = blk_mq_do_dispatch_ctx(hctx);
 	} else {
 		blk_mq_flush_busy_ctxs(hctx, &rq_list);
 		blk_mq_dispatch_rq_list(q, &rq_list, false);
 	}
+
+	return ret;
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+
+	/* RCU or SRCU read lock is needed before checking quiesced flag */
+	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
+		return;
+
+	hctx->run++;
+
+	/*
+	 * A return of -EAGAIN is an indication that hctx->dispatch is not
+	 * empty and we must run again in order to avoid starving flushes.
+	 */
+	if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
+		if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
+			blk_mq_run_hw_queue(hctx, true);
+	}
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
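
Note the shape of the retry in the new blk_mq_sched_dispatch_requests wrapper above: when the first pass returns -EAGAIN, __blk_mq_sched_dispatch_requests is run once more synchronously, and only if that second pass again finds hctx->dispatch non-empty is run_work requeued asynchronously via blk_mq_run_hw_queue(hctx, true). The likely intent is that the common case of a single flush slipping in is resolved without an extra worker round-trip, while a steady stream of bypass insertions still yields the CPU back to the workqueue.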