Commit 42211f6c authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
 "A set of fixes in the area of block IO, that should go into the next
  -rc release. This contains:

   - An OOPS fix from Dmitry, fixing a regression with the bio integrity
     code in this series.

   - Fix truncation of elevator io context cache name, from Eric
     Biggers.

   - NVMe pull from Christoph includes FC fixes from James, APST
     fixes/tweaks from Kai-Heng, removal fix from Rakesh, and an RDMA
     fix from Sagi.

   - Two tweaks for the block throttling code. One from Joseph Qi,
     fixing an oops from the timer code, and one from Shaohua, improving
     the behavior on rotational storage.

   - Two blk-mq fixes from Ming, fixing corner cases with the direct
     issue code.

   - Locking fix for bfq cgroups from Paolo"

* 'for-linus' of git://git.kernel.dk/linux-block:
  block, bfq: access and cache blkg data only when safe
  Fix loop device flush before configure v3
  blk-throttle: set default latency baseline for harddisk
  blk-throttle: fix NULL pointer dereference in throtl_schedule_pending_timer
  nvme: relax APST default max latency to 100ms
  nvme: only consider exit latency when choosing useful non-op power states
  nvme-fc: fix missing put reference on controller create failure
  nvme-fc: on lldd/transport io error, terminate association
  nvme-rdma: fast fail incoming requests while we reconnect
  nvme-pci: fix multiple ctrl removal scheduling
  nvme: fix hang in remove path
  elevator: fix truncation of icq_cache_name
  blk-mq: fix direct issue
  blk-mq: pass correct hctx to blk_mq_try_issue_directly
  bio-integrity: Do not allocate integrity context for bio w/o data
parents 39e4edfd 8f9bebc3
...@@ -52,7 +52,7 @@ BFQG_FLAG_FNS(idling) ...@@ -52,7 +52,7 @@ BFQG_FLAG_FNS(idling)
BFQG_FLAG_FNS(empty) BFQG_FLAG_FNS(empty)
#undef BFQG_FLAG_FNS #undef BFQG_FLAG_FNS
/* This should be called with the queue_lock held. */ /* This should be called with the scheduler lock held. */
static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
{ {
unsigned long long now; unsigned long long now;
...@@ -67,7 +67,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) ...@@ -67,7 +67,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
bfqg_stats_clear_waiting(stats); bfqg_stats_clear_waiting(stats);
} }
/* This should be called with the queue_lock held. */ /* This should be called with the scheduler lock held. */
static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
struct bfq_group *curr_bfqg) struct bfq_group *curr_bfqg)
{ {
...@@ -81,7 +81,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, ...@@ -81,7 +81,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
bfqg_stats_mark_waiting(stats); bfqg_stats_mark_waiting(stats);
} }
/* This should be called with the queue_lock held. */ /* This should be called with the scheduler lock held. */
static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
{ {
unsigned long long now; unsigned long long now;
...@@ -203,12 +203,30 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq) ...@@ -203,12 +203,30 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
static void bfqg_get(struct bfq_group *bfqg) static void bfqg_get(struct bfq_group *bfqg)
{ {
return blkg_get(bfqg_to_blkg(bfqg)); bfqg->ref++;
} }
void bfqg_put(struct bfq_group *bfqg) void bfqg_put(struct bfq_group *bfqg)
{ {
return blkg_put(bfqg_to_blkg(bfqg)); bfqg->ref--;
if (bfqg->ref == 0)
kfree(bfqg);
}
static void bfqg_and_blkg_get(struct bfq_group *bfqg)
{
/* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
bfqg_get(bfqg);
blkg_get(bfqg_to_blkg(bfqg));
}
void bfqg_and_blkg_put(struct bfq_group *bfqg)
{
bfqg_put(bfqg);
blkg_put(bfqg_to_blkg(bfqg));
} }
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
...@@ -312,7 +330,11 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) ...@@ -312,7 +330,11 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
if (bfqq) { if (bfqq) {
bfqq->ioprio = bfqq->new_ioprio; bfqq->ioprio = bfqq->new_ioprio;
bfqq->ioprio_class = bfqq->new_ioprio_class; bfqq->ioprio_class = bfqq->new_ioprio_class;
bfqg_get(bfqg); /*
* Make sure that bfqg and its associated blkg do not
* disappear before entity.
*/
bfqg_and_blkg_get(bfqg);
} }
entity->parent = bfqg->my_entity; /* NULL for root group */ entity->parent = bfqg->my_entity; /* NULL for root group */
entity->sched_data = &bfqg->sched_data; entity->sched_data = &bfqg->sched_data;
...@@ -399,6 +421,8 @@ struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) ...@@ -399,6 +421,8 @@ struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
return NULL; return NULL;
} }
/* see comments in bfq_bic_update_cgroup for why refcounting */
bfqg_get(bfqg);
return &bfqg->pd; return &bfqg->pd;
} }
...@@ -426,7 +450,7 @@ void bfq_pd_free(struct blkg_policy_data *pd) ...@@ -426,7 +450,7 @@ void bfq_pd_free(struct blkg_policy_data *pd)
struct bfq_group *bfqg = pd_to_bfqg(pd); struct bfq_group *bfqg = pd_to_bfqg(pd);
bfqg_stats_exit(&bfqg->stats); bfqg_stats_exit(&bfqg->stats);
return kfree(bfqg); bfqg_put(bfqg);
} }
void bfq_pd_reset_stats(struct blkg_policy_data *pd) void bfq_pd_reset_stats(struct blkg_policy_data *pd)
...@@ -496,9 +520,10 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, ...@@ -496,9 +520,10 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
* Move @bfqq to @bfqg, deactivating it from its old group and reactivating * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
* it on the new one. Avoid putting the entity on the old group idle tree. * it on the new one. Avoid putting the entity on the old group idle tree.
* *
* Must be called under the queue lock; the cgroup owning @bfqg must * Must be called under the scheduler lock, to make sure that the blkg
* not disappear (by now this just means that we are called under * owning @bfqg does not disappear (see comments in
* rcu_read_lock()). * bfq_bic_update_cgroup on guaranteeing the consistency of blkg
* objects).
*/ */
void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_group *bfqg) struct bfq_group *bfqg)
...@@ -519,16 +544,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ...@@ -519,16 +544,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_deactivate_bfqq(bfqd, bfqq, false, false); bfq_deactivate_bfqq(bfqd, bfqq, false, false);
else if (entity->on_st) else if (entity->on_st)
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
bfqg_put(bfqq_group(bfqq)); bfqg_and_blkg_put(bfqq_group(bfqq));
/*
* Here we use a reference to bfqg. We don't need a refcounter
* as the cgroup reference will not be dropped, so that its
* destroy() callback will not be invoked.
*/
entity->parent = bfqg->my_entity; entity->parent = bfqg->my_entity;
entity->sched_data = &bfqg->sched_data; entity->sched_data = &bfqg->sched_data;
bfqg_get(bfqg); /* pin down bfqg and its associated blkg */
bfqg_and_blkg_get(bfqg);
if (bfq_bfqq_busy(bfqq)) { if (bfq_bfqq_busy(bfqq)) {
bfq_pos_tree_add_move(bfqd, bfqq); bfq_pos_tree_add_move(bfqd, bfqq);
...@@ -545,8 +566,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ...@@ -545,8 +566,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* @bic: the bic to move. * @bic: the bic to move.
* @blkcg: the blk-cgroup to move to. * @blkcg: the blk-cgroup to move to.
* *
* Move bic to blkcg, assuming that bfqd->queue is locked; the caller * Move bic to blkcg, assuming that bfqd->lock is held; which makes
* has to make sure that the reference to cgroup is valid across the call. * sure that the reference to cgroup is valid across the call (see
* comments in bfq_bic_update_cgroup on this issue)
* *
* NOTE: an alternative approach might have been to store the current * NOTE: an alternative approach might have been to store the current
* cgroup in bfqq and getting a reference to it, reducing the lookup * cgroup in bfqq and getting a reference to it, reducing the lookup
...@@ -604,6 +626,57 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) ...@@ -604,6 +626,57 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
goto out; goto out;
bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
/*
* Update blkg_path for bfq_log_* functions. We cache this
* path, and update it here, for the following
* reasons. Operations on blkg objects in blk-cgroup are
* protected with the request_queue lock, and not with the
* lock that protects the instances of this scheduler
* (bfqd->lock). This exposes BFQ to the following sort of
* race.
*
* The blkg_lookup performed in bfq_get_queue, protected
* through rcu, may happen to return the address of a copy of
* the original blkg. If this is the case, then the
* bfqg_and_blkg_get performed in bfq_get_queue, to pin down
* the blkg, is useless: it does not prevent blk-cgroup code
* from destroying both the original blkg and all objects
* directly or indirectly referred by the copy of the
* blkg.
*
* On the bright side, destroy operations on a blkg invoke, as
* a first step, hooks of the scheduler associated with the
* blkg. And these hooks are executed with bfqd->lock held for
* BFQ. As a consequence, for any blkg associated with the
* request queue this instance of the scheduler is attached
* to, we are guaranteed that such a blkg is not destroyed, and
* that all the pointers it contains are consistent, while we
* are holding bfqd->lock. A blkg_lookup performed with
* bfqd->lock held then returns a fully consistent blkg, which
* remains consistent as long as this lock is held.
*
* Thanks to the last fact, and to the fact that: (1) bfqg has
* been obtained through a blkg_lookup in the above
* assignment, and (2) bfqd->lock is being held, here we can
* safely use the policy data for the involved blkg (i.e., the
* field bfqg->pd) to get to the blkg associated with bfqg,
* and then we can safely use any field of blkg. After we
* release bfqd->lock, even just getting blkg through this
* bfqg may cause dangling references to be traversed, as
* bfqg->pd may not exist any more.
*
* In view of the above facts, here we cache, in the bfqg, any
* blkg data we may need for this bic, and for its associated
* bfq_queue. As of now, we need to cache only the path of the
* blkg, which is used in the bfq_log_* functions.
*
* Finally, note that bfqg itself needs to be protected from
* destruction on the blkg_free of the original blkg (which
* invokes bfq_pd_free). We use an additional private
* refcounter for bfqg, to let it disappear only after no
* bfq_queue refers to it any longer.
*/
blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path));
bic->blkcg_serial_nr = serial_nr; bic->blkcg_serial_nr = serial_nr;
out: out:
rcu_read_unlock(); rcu_read_unlock();
...@@ -640,8 +713,6 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, ...@@ -640,8 +713,6 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
* @bfqd: the device data structure with the root group. * @bfqd: the device data structure with the root group.
* @bfqg: the group to move from. * @bfqg: the group to move from.
* @st: the service tree with the entities. * @st: the service tree with the entities.
*
* Needs queue_lock to be taken and reference to be valid over the call.
*/ */
static void bfq_reparent_active_entities(struct bfq_data *bfqd, static void bfq_reparent_active_entities(struct bfq_data *bfqd,
struct bfq_group *bfqg, struct bfq_group *bfqg,
...@@ -692,8 +763,7 @@ void bfq_pd_offline(struct blkg_policy_data *pd) ...@@ -692,8 +763,7 @@ void bfq_pd_offline(struct blkg_policy_data *pd)
/* /*
* The idle tree may still contain bfq_queues belonging * The idle tree may still contain bfq_queues belonging
* to exited task because they never migrated to a different * to exited task because they never migrated to a different
* cgroup from the one being destroyed now. No one else * cgroup from the one being destroyed now.
* can access them so it's safe to act without any lock.
*/ */
bfq_flush_idle_tree(st); bfq_flush_idle_tree(st);
......
...@@ -3665,7 +3665,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) ...@@ -3665,7 +3665,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
kmem_cache_free(bfq_pool, bfqq); kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED #ifdef CONFIG_BFQ_GROUP_IOSCHED
bfqg_put(bfqg); bfqg_and_blkg_put(bfqg);
#endif #endif
} }
......
...@@ -759,6 +759,12 @@ struct bfq_group { ...@@ -759,6 +759,12 @@ struct bfq_group {
/* must be the first member */ /* must be the first member */
struct blkg_policy_data pd; struct blkg_policy_data pd;
/* cached path for this blkg (see comments in bfq_bic_update_cgroup) */
char blkg_path[128];
/* reference counter (see comments in bfq_bic_update_cgroup) */
int ref;
struct bfq_entity entity; struct bfq_entity entity;
struct bfq_sched_data sched_data; struct bfq_sched_data sched_data;
...@@ -838,7 +844,7 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, ...@@ -838,7 +844,7 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
void bfqg_put(struct bfq_group *bfqg); void bfqg_and_blkg_put(struct bfq_group *bfqg);
#ifdef CONFIG_BFQ_GROUP_IOSCHED #ifdef CONFIG_BFQ_GROUP_IOSCHED
extern struct cftype bfq_blkcg_legacy_files[]; extern struct cftype bfq_blkcg_legacy_files[];
...@@ -910,20 +916,13 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); ...@@ -910,20 +916,13 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char __pbuf[128]; \ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid,\
\
blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
__pbuf, ##args); \ bfqq_group(bfqq)->blkg_path, ##args); \
} while (0) } while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) \
char __pbuf[128]; \ blk_add_trace_msg((bfqd)->queue, "%s " fmt, (bfqg)->blkg_path, ##args)
\
blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
} while (0)
#else /* CONFIG_BFQ_GROUP_IOSCHED */ #else /* CONFIG_BFQ_GROUP_IOSCHED */
......
...@@ -175,6 +175,9 @@ bool bio_integrity_enabled(struct bio *bio) ...@@ -175,6 +175,9 @@ bool bio_integrity_enabled(struct bio *bio)
if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
return false; return false;
if (!bio_sectors(bio))
return false;
/* Already protected? */ /* Already protected? */
if (bio_integrity(bio)) if (bio_integrity(bio))
return false; return false;
......
...@@ -1461,22 +1461,28 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) ...@@ -1461,22 +1461,28 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
} }
static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
bool may_sleep) struct request *rq,
blk_qc_t *cookie, bool may_sleep)
{ {
struct request_queue *q = rq->q; struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = { struct blk_mq_queue_data bd = {
.rq = rq, .rq = rq,
.last = true, .last = true,
}; };
struct blk_mq_hw_ctx *hctx;
blk_qc_t new_cookie; blk_qc_t new_cookie;
int ret; int ret;
bool run_queue = true;
if (blk_mq_hctx_stopped(hctx)) {
run_queue = false;
goto insert;
}
if (q->elevator) if (q->elevator)
goto insert; goto insert;
if (!blk_mq_get_driver_tag(rq, &hctx, false)) if (!blk_mq_get_driver_tag(rq, NULL, false))
goto insert; goto insert;
new_cookie = request_to_qc_t(hctx, rq); new_cookie = request_to_qc_t(hctx, rq);
...@@ -1500,7 +1506,7 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, ...@@ -1500,7 +1506,7 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
__blk_mq_requeue_request(rq); __blk_mq_requeue_request(rq);
insert: insert:
blk_mq_sched_insert_request(rq, false, true, false, may_sleep); blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
} }
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
...@@ -1508,7 +1514,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, ...@@ -1508,7 +1514,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
{ {
if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
rcu_read_lock(); rcu_read_lock();
__blk_mq_try_issue_directly(rq, cookie, false); __blk_mq_try_issue_directly(hctx, rq, cookie, false);
rcu_read_unlock(); rcu_read_unlock();
} else { } else {
unsigned int srcu_idx; unsigned int srcu_idx;
...@@ -1516,7 +1522,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, ...@@ -1516,7 +1522,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
might_sleep(); might_sleep();
srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
__blk_mq_try_issue_directly(rq, cookie, true); __blk_mq_try_issue_directly(hctx, rq, cookie, true);
srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
} }
} }
...@@ -1619,9 +1625,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) ...@@ -1619,9 +1625,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_put_ctx(data.ctx); blk_mq_put_ctx(data.ctx);
if (same_queue_rq) if (same_queue_rq) {
data.hctx = blk_mq_map_queue(q,
same_queue_rq->mq_ctx->cpu);
blk_mq_try_issue_directly(data.hctx, same_queue_rq, blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie); &cookie);
}
} else if (q->nr_hw_queues > 1 && is_sync) { } else if (q->nr_hw_queues > 1 && is_sync) {
blk_mq_put_ctx(data.ctx); blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio); blk_mq_bio_to_request(rq, bio);
......
...@@ -27,6 +27,13 @@ static int throtl_quantum = 32; ...@@ -27,6 +27,13 @@ static int throtl_quantum = 32;
#define MIN_THROTL_IOPS (10) #define MIN_THROTL_IOPS (10)
#define DFL_LATENCY_TARGET (-1L) #define DFL_LATENCY_TARGET (-1L)
#define DFL_IDLE_THRESHOLD (0) #define DFL_IDLE_THRESHOLD (0)
#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
#define LATENCY_FILTERED_SSD (0)
/*
* For HD, very small latency comes from sequential IO. Such IO is helpless to
* help determine if its IO is impacted by others, hence we ignore the IO
*/
#define LATENCY_FILTERED_HD (1000L) /* 1ms */
#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) #define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
...@@ -212,6 +219,7 @@ struct throtl_data ...@@ -212,6 +219,7 @@ struct throtl_data
struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
struct latency_bucket __percpu *latency_buckets; struct latency_bucket __percpu *latency_buckets;
unsigned long last_calculate_time; unsigned long last_calculate_time;
unsigned long filtered_latency;
bool track_bio_latency; bool track_bio_latency;
}; };
...@@ -698,7 +706,7 @@ static void throtl_dequeue_tg(struct throtl_grp *tg) ...@@ -698,7 +706,7 @@ static void throtl_dequeue_tg(struct throtl_grp *tg)
static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
unsigned long expires) unsigned long expires)
{ {
unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice; unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice;
/* /*
* Since we are adjusting the throttle limit dynamically, the sleep * Since we are adjusting the throttle limit dynamically, the sleep
...@@ -2281,7 +2289,7 @@ void blk_throtl_bio_endio(struct bio *bio) ...@@ -2281,7 +2289,7 @@ void blk_throtl_bio_endio(struct bio *bio)
throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat), throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
bio_op(bio), lat); bio_op(bio), lat);
if (tg->latency_target) { if (tg->latency_target && lat >= tg->td->filtered_latency) {
int bucket; int bucket;
unsigned int threshold; unsigned int threshold;
...@@ -2417,14 +2425,20 @@ void blk_throtl_exit(struct request_queue *q) ...@@ -2417,14 +2425,20 @@ void blk_throtl_exit(struct request_queue *q)
void blk_throtl_register_queue(struct request_queue *q) void blk_throtl_register_queue(struct request_queue *q)
{ {
struct throtl_data *td; struct throtl_data *td;
int i;
td = q->td; td = q->td;
BUG_ON(!td); BUG_ON(!td);
if (blk_queue_nonrot(q)) if (blk_queue_nonrot(q)) {
td->throtl_slice = DFL_THROTL_SLICE_SSD; td->throtl_slice = DFL_THROTL_SLICE_SSD;
else td->filtered_latency = LATENCY_FILTERED_SSD;
} else {
td->throtl_slice = DFL_THROTL_SLICE_HD; td->throtl_slice = DFL_THROTL_SLICE_HD;
td->filtered_latency = LATENCY_FILTERED_HD;
for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
}
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
/* if no low limit, use previous default */ /* if no low limit, use previous default */
td->throtl_slice = DFL_THROTL_SLICE_HD; td->throtl_slice = DFL_THROTL_SLICE_HD;
......
...@@ -608,6 +608,9 @@ static int loop_switch(struct loop_device *lo, struct file *file) ...@@ -608,6 +608,9 @@ static int loop_switch(struct loop_device *lo, struct file *file)
*/ */
static int loop_flush(struct loop_device *lo) static int loop_flush(struct loop_device *lo)
{ {
/* loop not yet configured, no running thread, nothing to flush */
if (lo->lo_state != Lo_bound)
return 0;
return loop_switch(lo, NULL); return loop_switch(lo, NULL);
} }
......
...@@ -56,7 +56,7 @@ MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); ...@@ -56,7 +56,7 @@ MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
static int nvme_char_major; static int nvme_char_major;
module_param(nvme_char_major, int, 0); module_param(nvme_char_major, int, 0);
static unsigned long default_ps_max_latency_us = 25000; static unsigned long default_ps_max_latency_us = 100000;
module_param(default_ps_max_latency_us, ulong, 0644); module_param(default_ps_max_latency_us, ulong, 0644);
MODULE_PARM_DESC(default_ps_max_latency_us, MODULE_PARM_DESC(default_ps_max_latency_us,
"max power saving latency for new devices; use PM QOS to change per device"); "max power saving latency for new devices; use PM QOS to change per device");
...@@ -1342,7 +1342,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl) ...@@ -1342,7 +1342,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
* transitioning between power states. Therefore, when running * transitioning between power states. Therefore, when running
* in any given state, we will enter the next lower-power * in any given state, we will enter the next lower-power
* non-operational state after waiting 50 * (enlat + exlat) * non-operational state after waiting 50 * (enlat + exlat)
* microseconds, as long as that state's total latency is under * microseconds, as long as that state's exit latency is under
* the requested maximum latency. * the requested maximum latency.
* *
* We will not autonomously enter any non-operational state for * We will not autonomously enter any non-operational state for
...@@ -1387,7 +1387,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl) ...@@ -1387,7 +1387,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
* lowest-power state, not the number of states. * lowest-power state, not the number of states.
*/ */
for (state = (int)ctrl->npss; state >= 0; state--) { for (state = (int)ctrl->npss; state >= 0; state--) {
u64 total_latency_us, transition_ms; u64 total_latency_us, exit_latency_us, transition_ms;
if (target) if (target)
table->entries[state] = target; table->entries[state] = target;
...@@ -1408,12 +1408,15 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl) ...@@ -1408,12 +1408,15 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl)
NVME_PS_FLAGS_NON_OP_STATE)) NVME_PS_FLAGS_NON_OP_STATE))
continue; continue;
total_latency_us = exit_latency_us =
(u64)le32_to_cpu(ctrl->psd[state].entry_lat) + (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
+ le32_to_cpu(ctrl->psd[state].exit_lat); if (exit_latency_us > ctrl->ps_max_latency_us)
if (total_latency_us > ctrl->ps_max_latency_us)
continue; continue;
total_latency_us =
exit_latency_us +
le32_to_cpu(ctrl->psd[state].entry_lat);
/* /*
* This state is good. Use it as the APST idle * This state is good. Use it as the APST idle
* target for higher power states. * target for higher power states.
...@@ -2438,6 +2441,10 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) ...@@ -2438,6 +2441,10 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
struct nvme_ns *ns; struct nvme_ns *ns;
mutex_lock(&ctrl->namespaces_mutex); mutex_lock(&ctrl->namespaces_mutex);
/* Forcibly start all queues to avoid having stuck requests */
blk_mq_start_hw_queues(ctrl->admin_q);
list_for_each_entry(ns, &ctrl->namespaces, list) { list_for_each_entry(ns, &ctrl->namespaces, list) {
/* /*
* Revalidating a dead namespace sets capacity to 0. This will * Revalidating a dead namespace sets capacity to 0. This will
......
...@@ -1139,6 +1139,7 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) ...@@ -1139,6 +1139,7 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
/* *********************** NVME Ctrl Routines **************************** */ /* *********************** NVME Ctrl Routines **************************** */
static void __nvme_fc_final_op_cleanup(struct request *rq); static void __nvme_fc_final_op_cleanup(struct request *rq);
static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
static int static int
nvme_fc_reinit_request(void *data, struct request *rq) nvme_fc_reinit_request(void *data, struct request *rq)
...@@ -1265,7 +1266,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) ...@@ -1265,7 +1266,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
struct nvme_command *sqe = &op->cmd_iu.sqe; struct nvme_command *sqe = &op->cmd_iu.sqe;
__le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1); __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
union nvme_result result; union nvme_result result;
bool complete_rq; bool complete_rq, terminate_assoc = true;
/* /*
* WARNING: * WARNING:
...@@ -1294,6 +1295,14 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) ...@@ -1294,6 +1295,14 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
* fabricate a CQE, the following fields will not be set as they * fabricate a CQE, the following fields will not be set as they
* are not referenced: * are not referenced:
* cqe.sqid, cqe.sqhd, cqe.command_id * cqe.sqid, cqe.sqhd, cqe.command_id
*
* Failure or error of an individual i/o, in a transport
* detected fashion unrelated to the nvme completion status,
* potentially cause the initiator and target sides to get out
* of sync on SQ head/tail (aka outstanding io count allowed).
* Per FC-NVME spec, failure of an individual command requires
* the connection to be terminated, which in turn requires the
* association to be terminated.
*/ */
fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma, fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma,
...@@ -1359,6 +1368,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) ...@@ -1359,6 +1368,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
goto done; goto done;
} }
terminate_assoc = false;
done: done:
if (op->flags & FCOP_FLAGS_AEN) { if (op->flags & FCOP_FLAGS_AEN) {
nvme_complete_async_event(&queue->ctrl->ctrl, status, &result); nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
...@@ -1366,7 +1377,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) ...@@ -1366,7 +1377,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
atomic_set(&op->state, FCPOP_STATE_IDLE); atomic_set(&op->state, FCPOP_STATE_IDLE);
op->flags = FCOP_FLAGS_AEN; /* clear other flags */ op->flags = FCOP_FLAGS_AEN; /* clear other flags */
nvme_fc_ctrl_put(ctrl); nvme_fc_ctrl_put(ctrl);
return; goto check_error;
} }
complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op);
...@@ -1379,6 +1390,10 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) ...@@ -1379,6 +1390,10 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
nvme_end_request(rq, status, result); nvme_end_request(rq, status, result);
} else } else
__nvme_fc_final_op_cleanup(rq); __nvme_fc_final_op_cleanup(rq);
check_error:
if (terminate_assoc)
nvme_fc_error_recovery(ctrl, "transport detected io error");
} }
static int static int
...@@ -2791,6 +2806,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ...@@ -2791,6 +2806,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->ctrl.opts = NULL; ctrl->ctrl.opts = NULL;
/* initiate nvme ctrl ref counting teardown */ /* initiate nvme ctrl ref counting teardown */
nvme_uninit_ctrl(&ctrl->ctrl); nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
/* as we're past the point where we transition to the ref /* as we're past the point where we transition to the ref
* counting teardown path, if we return a bad pointer here, * counting teardown path, if we return a bad pointer here,
......
...@@ -1367,7 +1367,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) ...@@ -1367,7 +1367,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
/* If there is a reset ongoing, we shouldn't reset again. */ /* If there is a reset ongoing, we shouldn't reset again. */
if (work_busy(&dev->reset_work)) if (dev->ctrl.state == NVME_CTRL_RESETTING)
return false; return false;
/* We shouldn't reset unless the controller is on fatal error state /* We shouldn't reset unless the controller is on fatal error state
...@@ -1903,7 +1903,7 @@ static void nvme_reset_work(struct work_struct *work) ...@@ -1903,7 +1903,7 @@ static void nvme_reset_work(struct work_struct *work)
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result = -ENODEV; int result = -ENODEV;
if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING)) if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
goto out; goto out;
/* /*
...@@ -1913,9 +1913,6 @@ static void nvme_reset_work(struct work_struct *work) ...@@ -1913,9 +1913,6 @@ static void nvme_reset_work(struct work_struct *work)
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false); nvme_dev_disable(dev, false);
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
goto out;
result = nvme_pci_enable(dev); result = nvme_pci_enable(dev);
if (result) if (result)
goto out; goto out;
...@@ -2009,8 +2006,8 @@ static int nvme_reset(struct nvme_dev *dev) ...@@ -2009,8 +2006,8 @@ static int nvme_reset(struct nvme_dev *dev)
{ {
if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
return -ENODEV; return -ENODEV;
if (work_busy(&dev->reset_work)) if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
return -ENODEV; return -EBUSY;
if (!queue_work(nvme_workq, &dev->reset_work)) if (!queue_work(nvme_workq, &dev->reset_work))
return -EBUSY; return -EBUSY;
return 0; return 0;
...@@ -2136,6 +2133,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) ...@@ -2136,6 +2133,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result) if (result)
goto release_pools; goto release_pools;
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
queue_work(nvme_workq, &dev->reset_work); queue_work(nvme_workq, &dev->reset_work);
...@@ -2179,6 +2177,7 @@ static void nvme_remove(struct pci_dev *pdev) ...@@ -2179,6 +2177,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
cancel_work_sync(&dev->reset_work);
pci_set_drvdata(pdev, NULL); pci_set_drvdata(pdev, NULL);
if (!pci_device_is_present(pdev)) { if (!pci_device_is_present(pdev)) {
......
...@@ -753,28 +753,26 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ...@@ -753,28 +753,26 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
if (ret) if (ret)
goto requeue; goto requeue;
blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
ret = nvmf_connect_admin_queue(&ctrl->ctrl); ret = nvmf_connect_admin_queue(&ctrl->ctrl);
if (ret) if (ret)
goto stop_admin_q; goto requeue;
set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
if (ret) if (ret)
goto stop_admin_q; goto requeue;
nvme_start_keep_alive(&ctrl->ctrl); nvme_start_keep_alive(&ctrl->ctrl);
if (ctrl->queue_count > 1) { if (ctrl->queue_count > 1) {
ret = nvme_rdma_init_io_queues(ctrl); ret = nvme_rdma_init_io_queues(ctrl);
if (ret) if (ret)
goto stop_admin_q; goto requeue;
ret = nvme_rdma_connect_io_queues(ctrl); ret = nvme_rdma_connect_io_queues(ctrl);
if (ret) if (ret)
goto stop_admin_q; goto requeue;
} }
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
...@@ -782,7 +780,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ...@@ -782,7 +780,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
ctrl->ctrl.opts->nr_reconnects = 0; ctrl->ctrl.opts->nr_reconnects = 0;
if (ctrl->queue_count > 1) { if (ctrl->queue_count > 1) {
nvme_start_queues(&ctrl->ctrl);
nvme_queue_scan(&ctrl->ctrl); nvme_queue_scan(&ctrl->ctrl);
nvme_queue_async_events(&ctrl->ctrl); nvme_queue_async_events(&ctrl->ctrl);
} }
...@@ -791,8 +788,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ...@@ -791,8 +788,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
return; return;
stop_admin_q:
blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
requeue: requeue:
dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
ctrl->ctrl.opts->nr_reconnects); ctrl->ctrl.opts->nr_reconnects);
...@@ -823,6 +818,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) ...@@ -823,6 +818,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl); nvme_cancel_request, &ctrl->ctrl);
/*
* queues are not live anymore, so restart the queues to fail fast
* new IO
*/
blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_reconnect_or_remove(ctrl); nvme_rdma_reconnect_or_remove(ctrl);
} }
...@@ -1433,7 +1435,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved) ...@@ -1433,7 +1435,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
/* /*
* We cannot accept any other command until the Connect command has completed. * We cannot accept any other command until the Connect command has completed.
*/ */
static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
struct request *rq) struct request *rq)
{ {
if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) { if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
...@@ -1441,11 +1443,22 @@ static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, ...@@ -1441,11 +1443,22 @@ static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
if (!blk_rq_is_passthrough(rq) || if (!blk_rq_is_passthrough(rq) ||
cmd->common.opcode != nvme_fabrics_command || cmd->common.opcode != nvme_fabrics_command ||
cmd->fabrics.fctype != nvme_fabrics_type_connect) cmd->fabrics.fctype != nvme_fabrics_type_connect) {
return false; /*
* reconnecting state means transport disruption, which
* can take a long time and even might fail permanently,
* so we can't let incoming I/O be requeued forever.
* fail it fast to allow upper layers a chance to
* failover.
*/
if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING)
return -EIO;
else
return -EAGAIN;
}
} }
return true; return 0;
} }
static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
...@@ -1463,8 +1476,9 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, ...@@ -1463,8 +1476,9 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
WARN_ON_ONCE(rq->tag < 0); WARN_ON_ONCE(rq->tag < 0);
if (!nvme_rdma_queue_is_ready(queue, rq)) ret = nvme_rdma_queue_is_ready(queue, rq);
return BLK_MQ_RQ_QUEUE_BUSY; if (unlikely(ret))
goto err;
dev = queue->device->dev; dev = queue->device->dev;
ib_dma_sync_single_for_cpu(dev, sqe->dma, ib_dma_sync_single_for_cpu(dev, sqe->dma,
......
...@@ -153,7 +153,7 @@ struct elevator_type ...@@ -153,7 +153,7 @@ struct elevator_type
#endif #endif
/* managed by elevator core */ /* managed by elevator core */
char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */ char icq_cache_name[ELV_NAME_MAX + 6]; /* elvname + "_io_cq" */
struct list_head list; struct list_head list;
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment