Commit 36805aae authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-3.11/core' of git://git.kernel.dk/linux-block

Pull core block IO updates from Jens Axboe:
 "Here are the core IO block bits for 3.11. It contains:

   - A tweak to the reserved tag logic from Jan, for weirdo devices with
     just 3 free tags.  But for those it improves things substantially
     for random writes.

   - Periodic writeback fix from Jan.  Marked for stable as well.

   - Fix for a race condition in IO scheduler switching from Jianpeng.

   - The hierarchical blk-cgroup support from Tejun.  This is the grunt
     of the series.

   - blk-throttle fix from Vivek.

  Just a note that I'm in the middle of a relocation, whole family is
  flying out tomorrow.  Hence I will be AWOL the remainder of this week,
  but back at work again on Monday the 15th.  CC'ing Tejun, since any
  potential "surprises" will most likely be from the blk-cgroup work.
  But it's been brewing for a while and sitting in my tree and
  linux-next for a long time, so should be solid."

* 'for-3.11/core' of git://git.kernel.dk/linux-block: (36 commits)
  elevator: Fix a race in elevator switching
  block: Reserve only one queue tag for sync IO if only 3 tags are available
  writeback: Fix periodic writeback after fs mount
  blk-throttle: implement proper hierarchy support
  blk-throttle: implement throtl_grp->has_rules[]
  blk-throttle: Account for child group's start time in parent while bio climbs up
  blk-throttle: add throtl_qnode for dispatch fairness
  blk-throttle: make throtl_pending_timer_fn() ready for hierarchy
  blk-throttle: make tg_dispatch_one_bio() ready for hierarchy
  blk-throttle: make blk_throtl_bio() ready for hierarchy
  blk-throttle: make blk_throtl_drain() ready for hierarchy
  blk-throttle: dispatch from throtl_pending_timer_fn()
  blk-throttle: implement dispatch looping
  blk-throttle: separate out throtl_service_queue->pending_timer from throtl_data->dispatch_work
  blk-throttle: set REQ_THROTTLED from throtl_charge_bio() and gate stats update with it
  blk-throttle: implement sq_to_tg(), sq_to_td() and throtl_log()
  blk-throttle: add throtl_service_queue->parent_sq
  blk-throttle: generalize update_disptime optimization in blk_throtl_bio()
  blk-throttle: dispatch to throtl_data->service_queue.bio_lists[]
  blk-throttle: move bio_lists[] and friends to throtl_service_queue
  ...
parents 6d2fa9e1 d50235b7
...@@ -94,11 +94,13 @@ Throttling/Upper Limit policy ...@@ -94,11 +94,13 @@ Throttling/Upper Limit policy
Hierarchical Cgroups Hierarchical Cgroups
==================== ====================
- Currently only CFQ supports hierarchical groups. For throttling,
cgroup interface does allow creation of hierarchical cgroups and
internally it treats them as flat hierarchy.
If somebody created a hierarchy like as follows. Both CFQ and throttling implement hierarchy support; however,
throttling's hierarchy support is enabled iff "sane_behavior" is
enabled from cgroup side, which currently is a development option and
not publicly available.
If somebody created a hierarchy like as follows.
root root
/ \ / \
...@@ -106,21 +108,20 @@ Hierarchical Cgroups ...@@ -106,21 +108,20 @@ Hierarchical Cgroups
| |
test3 test3
CFQ will handle the hierarchy correctly but and throttling will CFQ by default and throttling with "sane_behavior" will handle the
practically treat all groups at same level. For details on CFQ hierarchy correctly. For details on CFQ hierarchy support, refer to
hierarchy support, refer to Documentation/block/cfq-iosched.txt. Documentation/block/cfq-iosched.txt. For throttling, all limits apply
Throttling will treat the hierarchy as if it looks like the to the whole subtree while all statistics are local to the IOs
following. directly generated by tasks in that cgroup.
Throttling without "sane_behavior" enabled from cgroup side will
practically treat all groups at same level as if it looks like the
following.
pivot pivot
/ / \ \ / / \ \
root test1 test2 test3 root test1 test2 test3
Nesting cgroups, while allowed, isn't officially supported and blkio
generates a warning when cgroups nest. Once throttling implements
hierarchy support, hierarchy will be supported and the warning will
be removed.
Various user visible config options Various user visible config options
=================================== ===================================
CONFIG_BLK_CGROUP CONFIG_BLK_CGROUP
......
...@@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root); ...@@ -32,26 +32,6 @@ EXPORT_SYMBOL_GPL(blkcg_root);
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
struct request_queue *q, bool update_hint);
/**
* blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
* @pos_cgrp: used for iteration
* @p_blkg: target blkg to walk descendants of
*
* Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU
* read locked. If called under either blkcg or queue lock, the iteration
* is guaranteed to include all and only online blkgs. The caller may
* update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
* subtree.
*/
#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
(p_blkg)->q, false)))
static bool blkcg_policy_enabled(struct request_queue *q, static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol) const struct blkcg_policy *pol)
{ {
...@@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg) ...@@ -71,18 +51,8 @@ static void blkg_free(struct blkcg_gq *blkg)
if (!blkg) if (!blkg)
return; return;
for (i = 0; i < BLKCG_MAX_POLS; i++) { for (i = 0; i < BLKCG_MAX_POLS; i++)
struct blkcg_policy *pol = blkcg_policy[i]; kfree(blkg->pd[i]);
struct blkg_policy_data *pd = blkg->pd[i];
if (!pd)
continue;
if (pol && pol->pd_exit_fn)
pol->pd_exit_fn(blkg);
kfree(pd);
}
blk_exit_rl(&blkg->rl); blk_exit_rl(&blkg->rl);
kfree(blkg); kfree(blkg);
...@@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, ...@@ -134,10 +104,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->pd[i] = pd; blkg->pd[i] = pd;
pd->blkg = blkg; pd->blkg = blkg;
pd->plid = i; pd->plid = i;
/* invoke per-policy init */
if (pol->pd_init_fn)
pol->pd_init_fn(blkg);
} }
return blkg; return blkg;
...@@ -158,8 +124,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, ...@@ -158,8 +124,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
* @q's bypass state. If @update_hint is %true, the caller should be * @q's bypass state. If @update_hint is %true, the caller should be
* holding @q->queue_lock and lookup hint is updated on success. * holding @q->queue_lock and lookup hint is updated on success.
*/ */
static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
struct request_queue *q, bool update_hint) bool update_hint)
{ {
struct blkcg_gq *blkg; struct blkcg_gq *blkg;
...@@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, ...@@ -234,16 +200,25 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
} }
blkg = new_blkg; blkg = new_blkg;
/* link parent and insert */ /* link parent */
if (blkcg_parent(blkcg)) { if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) { if (WARN_ON_ONCE(!blkg->parent)) {
blkg = ERR_PTR(-EINVAL); ret = -EINVAL;
goto err_put_css; goto err_put_css;
} }
blkg_get(blkg->parent); blkg_get(blkg->parent);
} }
/* invoke per-policy init */
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_init_fn)
pol->pd_init_fn(blkg);
}
/* insert */
spin_lock(&blkcg->lock); spin_lock(&blkcg->lock);
ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
if (likely(!ret)) { if (likely(!ret)) {
...@@ -394,30 +369,38 @@ static void blkg_destroy_all(struct request_queue *q) ...@@ -394,30 +369,38 @@ static void blkg_destroy_all(struct request_queue *q)
q->root_rl.blkg = NULL; q->root_rl.blkg = NULL;
} }
static void blkg_rcu_free(struct rcu_head *rcu_head) /*
* A group is RCU protected, but having an rcu lock does not mean that one
* can access all the fields of blkg and assume these are valid. For
* example, don't try to follow throtl_data and request queue links.
*
* Having a reference to blkg under an rcu allows accesses to only values
* local to groups like group stats and group rate limits.
*/
void __blkg_release_rcu(struct rcu_head *rcu_head)
{ {
blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head)); struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
} int i;
/* tell policies that this one is being freed */
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
if (blkg->pd[i] && pol->pd_exit_fn)
pol->pd_exit_fn(blkg);
}
void __blkg_release(struct blkcg_gq *blkg)
{
/* release the blkcg and parent blkg refs this blkg has been holding */ /* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css); css_put(&blkg->blkcg->css);
if (blkg->parent) if (blkg->parent) {
spin_lock_irq(blkg->q->queue_lock);
blkg_put(blkg->parent); blkg_put(blkg->parent);
spin_unlock_irq(blkg->q->queue_lock);
}
/* blkg_free(blkg);
* A group is freed in rcu manner. But having an rcu lock does not
* mean that one can access all the fields of blkg and assume these
* are valid. For example, don't try to follow throtl_data and
* request queue links.
*
* Having a reference to blkg under an rcu allows access to only
* values local to groups like group stats and group rate limits
*/
call_rcu(&blkg->rcu_head, blkg_rcu_free);
} }
EXPORT_SYMBOL_GPL(__blkg_release); EXPORT_SYMBOL_GPL(__blkg_release_rcu);
/* /*
* The next function used by blk_queue_for_each_rl(). It's a bit tricky * The next function used by blk_queue_for_each_rl(). It's a bit tricky
...@@ -928,14 +911,6 @@ struct cgroup_subsys blkio_subsys = { ...@@ -928,14 +911,6 @@ struct cgroup_subsys blkio_subsys = {
.subsys_id = blkio_subsys_id, .subsys_id = blkio_subsys_id,
.base_cftypes = blkcg_files, .base_cftypes = blkcg_files,
.module = THIS_MODULE, .module = THIS_MODULE,
/*
* blkio subsystem is utterly broken in terms of hierarchy support.
* It treats all cgroups equally regardless of where they're
* located in the hierarchy - all cgroups are treated as if they're
* right below the root. Fix it and remove the following.
*/
.broken_hierarchy = true,
}; };
EXPORT_SYMBOL_GPL(blkio_subsys); EXPORT_SYMBOL_GPL(blkio_subsys);
......
...@@ -266,7 +266,7 @@ static inline void blkg_get(struct blkcg_gq *blkg) ...@@ -266,7 +266,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
blkg->refcnt++; blkg->refcnt++;
} }
void __blkg_release(struct blkcg_gq *blkg); void __blkg_release_rcu(struct rcu_head *rcu);
/** /**
* blkg_put - put a blkg reference * blkg_put - put a blkg reference
...@@ -279,9 +279,43 @@ static inline void blkg_put(struct blkcg_gq *blkg) ...@@ -279,9 +279,43 @@ static inline void blkg_put(struct blkcg_gq *blkg)
lockdep_assert_held(blkg->q->queue_lock); lockdep_assert_held(blkg->q->queue_lock);
WARN_ON_ONCE(blkg->refcnt <= 0); WARN_ON_ONCE(blkg->refcnt <= 0);
if (!--blkg->refcnt) if (!--blkg->refcnt)
__blkg_release(blkg); call_rcu(&blkg->rcu_head, __blkg_release_rcu);
} }
struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
bool update_hint);
/**
 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_cgrp: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU
 * read locked. If called under either blkcg or queue lock, the iteration
 * is guaranteed to include all and only online blkgs. The caller may
 * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip
 * subtree.
 *
 * The macro expands to cgroup_for_each_descendant_pre() over the cgroup of
 * @p_blkg's blkcg, followed by an "if" that assigns @d_blkg from
 * __blkg_lookup() on @p_blkg's queue (without updating the lookup hint).
 * Cgroups for which the lookup returns NULL are skipped.  The statement
 * following the macro is the body of that "if".
 */
#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \
cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
(p_blkg)->q, false)))
/**
 * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_cgrp: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Similar to blkg_for_each_descendant_pre() but performs post-order
 * traversal instead. Synchronization rules are the same.
 *
 * The macro expands to cgroup_for_each_descendant_post() over the cgroup
 * of @p_blkg's blkcg, followed by an "if" that assigns @d_blkg from
 * __blkg_lookup() on @p_blkg's queue (without updating the lookup hint).
 * Cgroups for which the lookup returns NULL are skipped.
 */
#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \
cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
(p_blkg)->q, false)))
/** /**
* blk_get_rl - get request_list to use * blk_get_rl - get request_list to use
* @q: request_queue of interest * @q: request_queue of interest
......
...@@ -348,9 +348,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) ...@@ -348,9 +348,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
*/ */
max_depth = bqt->max_depth; max_depth = bqt->max_depth;
if (!rq_is_sync(rq) && max_depth > 1) { if (!rq_is_sync(rq) && max_depth > 1) {
max_depth -= 2; switch (max_depth) {
if (!max_depth) case 2:
max_depth = 1; max_depth = 1;
break;
case 3:
max_depth = 2;
break;
default:
max_depth -= 2;
}
if (q->in_flight[BLK_RW_ASYNC] > max_depth) if (q->in_flight[BLK_RW_ASYNC] > max_depth)
return 1; return 1;
} }
......
...@@ -25,18 +25,61 @@ static struct blkcg_policy blkcg_policy_throtl; ...@@ -25,18 +25,61 @@ static struct blkcg_policy blkcg_policy_throtl;
/* A workqueue to queue throttle related work */ /* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue; static struct workqueue_struct *kthrotld_workqueue;
static void throtl_schedule_delayed_work(struct throtl_data *td,
unsigned long delay); /*
* To implement hierarchical throttling, throtl_grps form a tree and bios
struct throtl_rb_root { * are dispatched upwards level by level until they reach the top and get
struct rb_root rb; * issued. When dispatching bios from the children and local group at each
struct rb_node *left; * level, if the bios are dispatched into a single bio_list, there's a risk
unsigned int count; * of a local or child group which can queue many bios at once filling up
unsigned long min_disptime; * the list starving others.
*
* To avoid such starvation, dispatched bios are queued separately
* according to where they came from. When they are again dispatched to
* the parent, they're popped in round-robin order so that no single source
* hogs the dispatch window.
*
* throtl_qnode is used to keep the queued bios separated by their sources.
* Bios are queued to throtl_qnode which in turn is queued to
* throtl_service_queue and then dispatched in round-robin order.
*
* It's also used to track the reference counts on blkg's. A qnode always
* belongs to a throtl_grp and gets queued on itself or the parent, so
* incrementing the reference of the associated throtl_grp when a qnode is
* queued and decrementing when dequeued is enough to keep the whole blkg
* tree pinned while bios are in flight.
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
struct bio_list bios; /* queued bios */
struct throtl_grp *tg; /* tg this qnode belongs to */
}; };
#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ struct throtl_service_queue {
.count = 0, .min_disptime = 0} struct throtl_service_queue *parent_sq; /* the parent service_queue */
/*
* Bios queued directly to this service_queue or dispatched from
* children throtl_grp's.
*/
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
unsigned int nr_queued[2]; /* number of queued bios */
/*
* RB tree of active children throtl_grp's, which are sorted by
* their ->disptime.
*/
struct rb_root pending_tree; /* RB tree of active tgs */
struct rb_node *first_pending; /* first node in the tree */
unsigned int nr_pending; /* # queued in the tree */
unsigned long first_pending_disptime; /* disptime of the first tg */
struct timer_list pending_timer; /* fires on first_pending_disptime */
};
/*
 * Bit flags kept in throtl_grp->flags.  THROTL_TG_PENDING is set by
 * __throtl_enqueue_tg() and cleared by __throtl_dequeue_tg().
 */
enum tg_state_flags {
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
};
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
...@@ -52,9 +95,26 @@ struct throtl_grp { ...@@ -52,9 +95,26 @@ struct throtl_grp {
/* must be the first member */ /* must be the first member */
struct blkg_policy_data pd; struct blkg_policy_data pd;
/* active throtl group service_tree member */ /* active throtl group service_queue member */
struct rb_node rb_node; struct rb_node rb_node;
/* throtl_data this group belongs to */
struct throtl_data *td;
/* this group's service queue */
struct throtl_service_queue service_queue;
/*
* qnode_on_self is used when bios are directly queued to this
* throtl_grp so that local bios compete fairly with bios
* dispatched from children. qnode_on_parent is used when bios are
* dispatched from this throtl_grp into its parent and will compete
* with the sibling qnode_on_parents and the parent's
* qnode_on_self.
*/
struct throtl_qnode qnode_on_self[2];
struct throtl_qnode qnode_on_parent[2];
/* /*
* Dispatch time in jiffies. This is the estimated time when group * Dispatch time in jiffies. This is the estimated time when group
* will unthrottle and is ready to dispatch more bio. It is used as * will unthrottle and is ready to dispatch more bio. It is used as
...@@ -64,11 +124,8 @@ struct throtl_grp { ...@@ -64,11 +124,8 @@ struct throtl_grp {
unsigned int flags; unsigned int flags;
/* Two lists for READ and WRITE */ /* are there any throtl rules between this group and td? */
struct bio_list bio_lists[2]; bool has_rules[2];
/* Number of queued bios on READ and WRITE lists */
unsigned int nr_queued[2];
/* bytes per second rate limits */ /* bytes per second rate limits */
uint64_t bps[2]; uint64_t bps[2];
...@@ -85,9 +142,6 @@ struct throtl_grp { ...@@ -85,9 +142,6 @@ struct throtl_grp {
unsigned long slice_start[2]; unsigned long slice_start[2];
unsigned long slice_end[2]; unsigned long slice_end[2];
/* Some throttle limits got updated for the group */
int limits_changed;
/* Per cpu stats pointer */ /* Per cpu stats pointer */
struct tg_stats_cpu __percpu *stats_cpu; struct tg_stats_cpu __percpu *stats_cpu;
...@@ -98,7 +152,7 @@ struct throtl_grp { ...@@ -98,7 +152,7 @@ struct throtl_grp {
struct throtl_data struct throtl_data
{ {
/* service tree for active throtl groups */ /* service tree for active throtl groups */
struct throtl_rb_root tg_service_tree; struct throtl_service_queue service_queue;
struct request_queue *queue; struct request_queue *queue;
...@@ -111,9 +165,7 @@ struct throtl_data ...@@ -111,9 +165,7 @@ struct throtl_data
unsigned int nr_undestroyed_grps; unsigned int nr_undestroyed_grps;
/* Work for dispatching throttled bios */ /* Work for dispatching throttled bios */
struct delayed_work throtl_work; struct work_struct dispatch_work;
int limits_changed;
}; };
/* list and work item to allocate percpu group stats */ /* list and work item to allocate percpu group stats */
...@@ -123,6 +175,8 @@ static LIST_HEAD(tg_stats_alloc_list); ...@@ -123,6 +175,8 @@ static LIST_HEAD(tg_stats_alloc_list);
static void tg_stats_alloc_fn(struct work_struct *); static void tg_stats_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
static void throtl_pending_timer_fn(unsigned long arg);
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{ {
return pd ? container_of(pd, struct throtl_grp, pd) : NULL; return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
...@@ -143,41 +197,65 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td) ...@@ -143,41 +197,65 @@ static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
return blkg_to_tg(td->queue->root_blkg); return blkg_to_tg(td->queue->root_blkg);
} }
enum tg_state_flags { /**
THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ * sq_to_tg - return the throtl_grp the specified service queue belongs to
}; * @sq: the throtl_service_queue of interest
*
#define THROTL_TG_FNS(name) \ * Return the throtl_grp @sq belongs to. If @sq is the top-level one
static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ * embedded in throtl_data, %NULL is returned.
{ \ */
(tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
} \ {
static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ if (sq && sq->parent_sq)
{ \ return container_of(sq, struct throtl_grp, service_queue);
(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ else
} \ return NULL;
static inline int throtl_tg_##name(const struct throtl_grp *tg) \
{ \
return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
} }
THROTL_TG_FNS(on_rr); /**
* sq_to_td - return throtl_data the specified service queue belongs to
* @sq: the throtl_service_queue of interest
*
* A service_queue can be embedded in either a throtl_grp or throtl_data.
* Determine the associated throtl_data accordingly and return it.
*/
static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
{
struct throtl_grp *tg = sq_to_tg(sq);
#define throtl_log_tg(td, tg, fmt, args...) do { \ if (tg)
char __pbuf[128]; \ return tg->td;
else
return container_of(sq, struct throtl_data, service_queue);
}
/**
* throtl_log - log debug message via blktrace
* @sq: the service_queue being reported
* @fmt: printf format string
* @args: printf args
*
* The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
* throtl_grp; otherwise, just "throtl".
*
* TODO: this should be made a function and name formatting should happen
* after testing whether blktrace is enabled.
*/
#define throtl_log(sq, fmt, args...) do { \
struct throtl_grp *__tg = sq_to_tg((sq)); \
struct throtl_data *__td = sq_to_td((sq)); \
\
(void)__td; \
if ((__tg)) { \
char __pbuf[128]; \
\ \
blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf)); \ blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \
blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \ blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
} else { \
blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
} \
} while (0) } while (0)
#define throtl_log(td, fmt, args...) \
blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
static inline unsigned int total_nr_queued(struct throtl_data *td)
{
return td->nr_queued[0] + td->nr_queued[1];
}
/* /*
* Worker for allocating per cpu stat for tgs. This is scheduled on the * Worker for allocating per cpu stat for tgs. This is scheduled on the
* system_wq once there are some groups on the alloc_list waiting for * system_wq once there are some groups on the alloc_list waiting for
...@@ -215,15 +293,141 @@ static void tg_stats_alloc_fn(struct work_struct *work) ...@@ -215,15 +293,141 @@ static void tg_stats_alloc_fn(struct work_struct *work)
goto alloc_stats; goto alloc_stats;
} }
/*
 * Set up @qn as an empty qnode owned by @tg: no bios queued and its list
 * node self-linked, i.e. not on any service_queue->queued[] list.
 */
static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
	qn->tg = tg;
	bio_list_init(&qn->bios);
	INIT_LIST_HEAD(&qn->node);
}
/**
 * throtl_qnode_add_bio - queue a bio on a throtl_qnode, activating it if idle
 * @bio: bio to queue
 * @qn: qnode receiving @bio
 * @queued: the service_queue->queued[] list @qn is linked on when activated
 *
 * @bio is appended to @qn->bios.  If @qn is not currently linked on a list,
 * it is added to the tail of @queued and a reference is taken on the blkg
 * of @qn->tg.  An already linked @qn keeps its position and no additional
 * reference is taken.
 */
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
				 struct list_head *queued)
{
	bio_list_add(&qn->bios, bio);

	/* already active: list position and refcount stay as they are */
	if (!list_empty(&qn->node))
		return;

	list_add_tail(&qn->node, queued);
	blkg_get(tg_to_blkg(qn->tg));
}
/**
 * throtl_peek_queued - look at the next bio on a qnode list without popping
 * @queued: the qnode list to inspect
 *
 * Returns the first bio of the first qnode on @queued, or %NULL when
 * @queued is empty.  Warns once if the first qnode carries no bio.
 */
static struct bio *throtl_peek_queued(struct list_head *queued)
{
	struct throtl_qnode *first;
	struct bio *head;

	if (list_empty(queued))
		return NULL;

	first = list_first_entry(queued, struct throtl_qnode, node);
	head = bio_list_peek(&first->bios);
	WARN_ON_ONCE(!head);

	return head;
}
/**
 * throtl_pop_queued - pop the first bio from a qnode list
 * @queued: the qnode list to pop a bio from
 * @tg_to_put: optional out argument for the throtl_grp to put
 *
 * Removes and returns the first bio of the first qnode on @queued, or
 * returns %NULL if @queued is empty.  If that qnode still holds bios
 * afterwards it is moved to the tail of @queued, which makes successive
 * pops round-robin across qnodes.  If it became empty it is unlinked from
 * @queued and the reference on its throtl_grp has to be dropped: with
 * @tg_to_put %NULL this function puts it itself; otherwise the throtl_grp
 * is stored in *@tg_to_put and putting it is left to the caller.
 *
 * *@tg_to_put is written only when a qnode is unlinked.
 */
static struct bio *throtl_pop_queued(struct list_head *queued,
				     struct throtl_grp **tg_to_put)
{
	struct throtl_qnode *first;
	struct bio *popped;

	if (list_empty(queued))
		return NULL;

	first = list_first_entry(queued, struct throtl_qnode, node);
	popped = bio_list_pop(&first->bios);
	WARN_ON_ONCE(!popped);

	if (!bio_list_empty(&first->bios)) {
		/* more to dispatch from this source: rotate it to the back */
		list_move_tail(&first->node, queued);
		return popped;
	}

	/* drained: deactivate the qnode and release or hand over its ref */
	list_del_init(&first->node);
	if (tg_to_put)
		*tg_to_put = first->tg;
	else
		blkg_put(tg_to_blkg(first->tg));

	return popped;
}
/*
 * Initialize service_queue @sq with @parent_sq as its parent.  The caller
 * is assumed to have zeroed @sq; this sets up both queued[] lists, the
 * pending tree and the pending timer, whose callback receives @sq.
 */
static void throtl_service_queue_init(struct throtl_service_queue *sq,
				      struct throtl_service_queue *parent_sq)
{
	int idx;

	sq->parent_sq = parent_sq;
	sq->pending_tree = RB_ROOT;

	for (idx = 0; idx < 2; idx++)
		INIT_LIST_HEAD(&sq->queued[idx]);

	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
		    (unsigned long)sq);
}
/* tear down a service_queue: delete its pending_timer via del_timer_sync() */
static void throtl_service_queue_exit(struct throtl_service_queue *sq)
{
del_timer_sync(&sq->pending_timer);
}
static void throtl_pd_init(struct blkcg_gq *blkg) static void throtl_pd_init(struct blkcg_gq *blkg)
{ {
struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_grp *tg = blkg_to_tg(blkg);
struct throtl_data *td = blkg->q->td;
struct throtl_service_queue *parent_sq;
unsigned long flags; unsigned long flags;
int rw;
/*
* If sane_hierarchy is enabled, we switch to properly hierarchical
* behavior where limits on a given throtl_grp are applied to the
* whole subtree rather than just the group itself. e.g. If 16M
* read_bps limit is set on the root group, the whole system can't
* exceed 16M for the device.
*
* If sane_hierarchy is not enabled, the broken flat hierarchy
* behavior is retained where all throtl_grps are treated as if
* they're all separate root groups right below throtl_data.
* Limits of a group don't interact with limits of other groups
* regardless of the position of the group in the hierarchy.
*/
parent_sq = &td->service_queue;
if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent)
parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
throtl_service_queue_init(&tg->service_queue, parent_sq);
for (rw = READ; rw <= WRITE; rw++) {
throtl_qnode_init(&tg->qnode_on_self[rw], tg);
throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
}
RB_CLEAR_NODE(&tg->rb_node); RB_CLEAR_NODE(&tg->rb_node);
bio_list_init(&tg->bio_lists[0]); tg->td = td;
bio_list_init(&tg->bio_lists[1]);
tg->limits_changed = false;
tg->bps[READ] = -1; tg->bps[READ] = -1;
tg->bps[WRITE] = -1; tg->bps[WRITE] = -1;
...@@ -241,6 +445,30 @@ static void throtl_pd_init(struct blkcg_gq *blkg) ...@@ -241,6 +445,30 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
} }
/*
* Set has_rules[] if @tg or any of its parents have limits configured.
* This doesn't require walking up to the top of the hierarchy as the
* parent's has_rules[] is guaranteed to be correct.
*/
static void tg_update_has_rules(struct throtl_grp *tg)
{
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
int rw;
for (rw = READ; rw <= WRITE; rw++)
tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
(tg->bps[rw] != -1 || tg->iops[rw] != -1);
}
/*
 * Refresh has_rules[] of the throtl_grp behind @blkg once the new group
 * has been brought online, so that it cannot escape the limits of its
 * ancestors.
 */
static void throtl_pd_online(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);

	tg_update_has_rules(tg);
}
static void throtl_pd_exit(struct blkcg_gq *blkg) static void throtl_pd_exit(struct blkcg_gq *blkg)
{ {
struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_grp *tg = blkg_to_tg(blkg);
...@@ -251,6 +479,8 @@ static void throtl_pd_exit(struct blkcg_gq *blkg) ...@@ -251,6 +479,8 @@ static void throtl_pd_exit(struct blkcg_gq *blkg)
spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
free_percpu(tg->stats_cpu); free_percpu(tg->stats_cpu);
throtl_service_queue_exit(&tg->service_queue);
} }
static void throtl_pd_reset_stats(struct blkcg_gq *blkg) static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
...@@ -309,17 +539,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, ...@@ -309,17 +539,18 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
return tg; return tg;
} }
static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) static struct throtl_grp *
throtl_rb_first(struct throtl_service_queue *parent_sq)
{ {
/* Service tree is empty */ /* Service tree is empty */
if (!root->count) if (!parent_sq->nr_pending)
return NULL; return NULL;
if (!root->left) if (!parent_sq->first_pending)
root->left = rb_first(&root->rb); parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
if (root->left) if (parent_sq->first_pending)
return rb_entry_tg(root->left); return rb_entry_tg(parent_sq->first_pending);
return NULL; return NULL;
} }
...@@ -330,29 +561,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root) ...@@ -330,29 +561,30 @@ static void rb_erase_init(struct rb_node *n, struct rb_root *root)
RB_CLEAR_NODE(n); RB_CLEAR_NODE(n);
} }
static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) static void throtl_rb_erase(struct rb_node *n,
struct throtl_service_queue *parent_sq)
{ {
if (root->left == n) if (parent_sq->first_pending == n)
root->left = NULL; parent_sq->first_pending = NULL;
rb_erase_init(n, &root->rb); rb_erase_init(n, &parent_sq->pending_tree);
--root->count; --parent_sq->nr_pending;
} }
static void update_min_dispatch_time(struct throtl_rb_root *st) static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
{ {
struct throtl_grp *tg; struct throtl_grp *tg;
tg = throtl_rb_first(st); tg = throtl_rb_first(parent_sq);
if (!tg) if (!tg)
return; return;
st->min_disptime = tg->disptime; parent_sq->first_pending_disptime = tg->disptime;
} }
static void static void tg_service_queue_add(struct throtl_grp *tg)
tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
{ {
struct rb_node **node = &st->rb.rb_node; struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
struct rb_node **node = &parent_sq->pending_tree.rb_node;
struct rb_node *parent = NULL; struct rb_node *parent = NULL;
struct throtl_grp *__tg; struct throtl_grp *__tg;
unsigned long key = tg->disptime; unsigned long key = tg->disptime;
...@@ -371,89 +603,135 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) ...@@ -371,89 +603,135 @@ tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
} }
if (left) if (left)
st->left = &tg->rb_node; parent_sq->first_pending = &tg->rb_node;
rb_link_node(&tg->rb_node, parent, node); rb_link_node(&tg->rb_node, parent, node);
rb_insert_color(&tg->rb_node, &st->rb); rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
} }
static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) static void __throtl_enqueue_tg(struct throtl_grp *tg)
{ {
struct throtl_rb_root *st = &td->tg_service_tree; tg_service_queue_add(tg);
tg->flags |= THROTL_TG_PENDING;
tg->service_queue.parent_sq->nr_pending++;
}
tg_service_tree_add(st, tg); static void throtl_enqueue_tg(struct throtl_grp *tg)
throtl_mark_tg_on_rr(tg); {
st->count++; if (!(tg->flags & THROTL_TG_PENDING))
__throtl_enqueue_tg(tg);
} }
static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) static void __throtl_dequeue_tg(struct throtl_grp *tg)
{ {
if (!throtl_tg_on_rr(tg)) throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
__throtl_enqueue_tg(td, tg); tg->flags &= ~THROTL_TG_PENDING;
} }
static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) static void throtl_dequeue_tg(struct throtl_grp *tg)
{ {
throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); if (tg->flags & THROTL_TG_PENDING)
throtl_clear_tg_on_rr(tg); __throtl_dequeue_tg(tg);
} }
static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) /* Call with queue lock held */
static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
unsigned long expires)
{ {
if (throtl_tg_on_rr(tg)) mod_timer(&sq->pending_timer, expires);
__throtl_dequeue_tg(td, tg); throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
expires - jiffies, jiffies);
} }
static void throtl_schedule_next_dispatch(struct throtl_data *td) /**
* throtl_schedule_next_dispatch - schedule the next dispatch cycle
* @sq: the service_queue to schedule dispatch for
* @force: force scheduling
*
* Arm @sq->pending_timer so that the next dispatch cycle starts on the
* dispatch time of the first pending child. Returns %true if either timer
* is armed or there's no pending child left. %false if the current
* dispatch window is still open and the caller should continue
* dispatching.
*
* If @force is %true, the dispatch timer is always scheduled and this
* function is guaranteed to return %true. This is to be used when the
* caller can't dispatch itself and needs to invoke pending_timer
* unconditionally. Note that forced scheduling is likely to induce short
* delay before dispatch starts even if @sq->first_pending_disptime is not
* in the future and thus shouldn't be used in hot paths.
*/
static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
bool force)
{ {
struct throtl_rb_root *st = &td->tg_service_tree; /* any pending children left? */
if (!sq->nr_pending)
return true;
/* update_min_dispatch_time(sq);
* If there are more bios pending, schedule more work.
*/
if (!total_nr_queued(td))
return;
BUG_ON(!st->count); /* is the next dispatch time in the future? */
if (force || time_after(sq->first_pending_disptime, jiffies)) {
throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
return true;
}
update_min_dispatch_time(st); /* tell the caller to continue dispatching */
return false;
}
if (time_before_eq(st->min_disptime, jiffies)) static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
throtl_schedule_delayed_work(td, 0); bool rw, unsigned long start)
else {
throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
/*
* Previous slice has expired. We must have trimmed it after last
* bio dispatch. That means since start of last slice, we never used
* that bandwidth. Do try to make use of that bandwidth while giving
* credit.
*/
if (time_after_eq(start, tg->slice_start[rw]))
tg->slice_start[rw] = start;
tg->slice_end[rw] = jiffies + throtl_slice;
throtl_log(&tg->service_queue,
"[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
tg->slice_end[rw], jiffies);
} }
static inline void static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{ {
tg->bytes_disp[rw] = 0; tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0; tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies; tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + throtl_slice; tg->slice_end[rw] = jiffies + throtl_slice;
throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", throtl_log(&tg->service_queue,
rw == READ ? 'R' : 'W', tg->slice_start[rw], "[%c] new slice start=%lu end=%lu jiffies=%lu",
tg->slice_end[rw], jiffies); rw == READ ? 'R' : 'W', tg->slice_start[rw],
tg->slice_end[rw], jiffies);
} }
static inline void throtl_set_slice_end(struct throtl_data *td, static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
struct throtl_grp *tg, bool rw, unsigned long jiffy_end) unsigned long jiffy_end)
{ {
tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
} }
static inline void throtl_extend_slice(struct throtl_data *td, static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
struct throtl_grp *tg, bool rw, unsigned long jiffy_end) unsigned long jiffy_end)
{ {
tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", throtl_log(&tg->service_queue,
rw == READ ? 'R' : 'W', tg->slice_start[rw], "[%c] extend slice start=%lu end=%lu jiffies=%lu",
tg->slice_end[rw], jiffies); rw == READ ? 'R' : 'W', tg->slice_start[rw],
tg->slice_end[rw], jiffies);
} }
/* Determine if previously allocated or extended slice is complete or not */ /* Determine if previously allocated or extended slice is complete or not */
static bool static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{ {
if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
return 0; return 0;
...@@ -462,8 +740,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) ...@@ -462,8 +740,7 @@ throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
} }
/* Trim the used slices and adjust slice start accordingly */ /* Trim the used slices and adjust slice start accordingly */
static inline void static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{ {
unsigned long nr_slices, time_elapsed, io_trim; unsigned long nr_slices, time_elapsed, io_trim;
u64 bytes_trim, tmp; u64 bytes_trim, tmp;
...@@ -475,7 +752,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) ...@@ -475,7 +752,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
* renewed. Don't try to trim the slice if slice is used. A new * renewed. Don't try to trim the slice if slice is used. A new
* slice will start when appropriate. * slice will start when appropriate.
*/ */
if (throtl_slice_used(td, tg, rw)) if (throtl_slice_used(tg, rw))
return; return;
/* /*
...@@ -486,7 +763,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) ...@@ -486,7 +763,7 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
* is bad because it does not allow new slice to start. * is bad because it does not allow new slice to start.
*/ */
throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
time_elapsed = jiffies - tg->slice_start[rw]; time_elapsed = jiffies - tg->slice_start[rw];
...@@ -515,14 +792,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) ...@@ -515,14 +792,14 @@ throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
tg->slice_start[rw] += nr_slices * throtl_slice; tg->slice_start[rw] += nr_slices * throtl_slice;
throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" throtl_log(&tg->service_queue,
" start=%lu end=%lu jiffies=%lu", "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
tg->slice_start[rw], tg->slice_end[rw], jiffies); tg->slice_start[rw], tg->slice_end[rw], jiffies);
} }
static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
struct bio *bio, unsigned long *wait) unsigned long *wait)
{ {
bool rw = bio_data_dir(bio); bool rw = bio_data_dir(bio);
unsigned int io_allowed; unsigned int io_allowed;
...@@ -571,8 +848,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, ...@@ -571,8 +848,8 @@ static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
return 0; return 0;
} }
static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
struct bio *bio, unsigned long *wait) unsigned long *wait)
{ {
bool rw = bio_data_dir(bio); bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes, tmp; u64 bytes_allowed, extra_bytes, tmp;
...@@ -613,18 +890,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, ...@@ -613,18 +890,12 @@ static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
return 0; return 0;
} }
static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
return 1;
return 0;
}
/* /*
* Returns whether one can dispatch a bio or not. Also returns approx number * Returns whether one can dispatch a bio or not. Also returns approx number
* of jiffies to wait before this bio is with-in IO rate and can be dispatched * of jiffies to wait before this bio is with-in IO rate and can be dispatched
*/ */
static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
struct bio *bio, unsigned long *wait) unsigned long *wait)
{ {
bool rw = bio_data_dir(bio); bool rw = bio_data_dir(bio);
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
...@@ -635,7 +906,8 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, ...@@ -635,7 +906,8 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
* this function with a different bio if there are other bios * this function with a different bio if there are other bios
* queued. * queued.
*/ */
BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); BUG_ON(tg->service_queue.nr_queued[rw] &&
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */ /* If tg->bps = -1, then BW is unlimited */
if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
...@@ -649,15 +921,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, ...@@ -649,15 +921,15 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
* existing slice to make sure it is at least throtl_slice interval * existing slice to make sure it is at least throtl_slice interval
* long since now. * long since now.
*/ */
if (throtl_slice_used(td, tg, rw)) if (throtl_slice_used(tg, rw))
throtl_start_new_slice(td, tg, rw); throtl_start_new_slice(tg, rw);
else { else {
if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); throtl_extend_slice(tg, rw, jiffies + throtl_slice);
} }
if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
&& tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { tg_with_in_iops_limit(tg, bio, &iops_wait)) {
if (wait) if (wait)
*wait = 0; *wait = 0;
return 1; return 1;
...@@ -669,7 +941,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, ...@@ -669,7 +941,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
*wait = max_wait; *wait = max_wait;
if (time_before(tg->slice_end[rw], jiffies + max_wait)) if (time_before(tg->slice_end[rw], jiffies + max_wait))
throtl_extend_slice(td, tg, rw, jiffies + max_wait); throtl_extend_slice(tg, rw, jiffies + max_wait);
return 0; return 0;
} }
...@@ -708,65 +980,136 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) ...@@ -708,65 +980,136 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
tg->bytes_disp[rw] += bio->bi_size; tg->bytes_disp[rw] += bio->bi_size;
tg->io_disp[rw]++; tg->io_disp[rw]++;
throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw); /*
* REQ_THROTTLED is used to prevent the same bio from being throttled
* more than once as a throttled bio will go through blk-throtl the
* second time when it eventually gets issued. Set it when a bio
* is being charged to a tg.
*
* Dispatch stats aren't recursive and each @bio should only be
* accounted by the @tg it was originally associated with. Let's
* update the stats when setting REQ_THROTTLED for the first time
* which is guaranteed to be for the @bio's original tg.
*/
if (!(bio->bi_rw & REQ_THROTTLED)) {
bio->bi_rw |= REQ_THROTTLED;
throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size,
bio->bi_rw);
}
} }
static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, /**
struct bio *bio) * throtl_add_bio_tg - add a bio to the specified throtl_grp
* @bio: bio to add
* @qn: qnode to use
* @tg: the target throtl_grp
*
* Add @bio to @tg's service_queue using @qn. If @qn is not specified,
* tg->qnode_on_self[] is used.
*/
static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
struct throtl_grp *tg)
{ {
struct throtl_service_queue *sq = &tg->service_queue;
bool rw = bio_data_dir(bio); bool rw = bio_data_dir(bio);
bio_list_add(&tg->bio_lists[rw], bio); if (!qn)
/* Take a bio reference on tg */ qn = &tg->qnode_on_self[rw];
blkg_get(tg_to_blkg(tg));
tg->nr_queued[rw]++; /*
td->nr_queued[rw]++; * If @tg doesn't currently have any bios queued in the same
throtl_enqueue_tg(td, tg); * direction, queueing @bio can change when @tg should be
* dispatched. Mark that @tg was empty. This is automatically
* cleared on the next tg_update_disptime().
*/
if (!sq->nr_queued[rw])
tg->flags |= THROTL_TG_WAS_EMPTY;
throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
sq->nr_queued[rw]++;
throtl_enqueue_tg(tg);
} }
static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) static void tg_update_disptime(struct throtl_grp *tg)
{ {
struct throtl_service_queue *sq = &tg->service_queue;
unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
struct bio *bio; struct bio *bio;
if ((bio = bio_list_peek(&tg->bio_lists[READ]))) if ((bio = throtl_peek_queued(&sq->queued[READ])))
tg_may_dispatch(td, tg, bio, &read_wait); tg_may_dispatch(tg, bio, &read_wait);
if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
tg_may_dispatch(td, tg, bio, &write_wait); tg_may_dispatch(tg, bio, &write_wait);
min_wait = min(read_wait, write_wait); min_wait = min(read_wait, write_wait);
disptime = jiffies + min_wait; disptime = jiffies + min_wait;
/* Update dispatch time */ /* Update dispatch time */
throtl_dequeue_tg(td, tg); throtl_dequeue_tg(tg);
tg->disptime = disptime; tg->disptime = disptime;
throtl_enqueue_tg(td, tg); throtl_enqueue_tg(tg);
/* see throtl_add_bio_tg() */
tg->flags &= ~THROTL_TG_WAS_EMPTY;
} }
static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
bool rw, struct bio_list *bl) struct throtl_grp *parent_tg, bool rw)
{ {
struct bio *bio; if (throtl_slice_used(parent_tg, rw)) {
throtl_start_new_slice_with_credit(parent_tg, rw,
child_tg->slice_start[rw]);
}
}
bio = bio_list_pop(&tg->bio_lists[rw]); static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
tg->nr_queued[rw]--; {
/* Drop bio reference on blkg */ struct throtl_service_queue *sq = &tg->service_queue;
blkg_put(tg_to_blkg(tg)); struct throtl_service_queue *parent_sq = sq->parent_sq;
struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
struct throtl_grp *tg_to_put = NULL;
struct bio *bio;
BUG_ON(td->nr_queued[rw] <= 0); /*
td->nr_queued[rw]--; * @bio is being transferred from @tg to @parent_sq. Popping a bio
* from @tg may put its reference and @parent_sq might end up
* getting released prematurely. Remember the tg to put and put it
* after @bio is transferred to @parent_sq.
*/
bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
sq->nr_queued[rw]--;
throtl_charge_bio(tg, bio); throtl_charge_bio(tg, bio);
bio_list_add(bl, bio);
bio->bi_rw |= REQ_THROTTLED;
throtl_trim_slice(td, tg, rw); /*
* If our parent is another tg, we just need to transfer @bio to
* the parent using throtl_add_bio_tg(). If our parent is
* @td->service_queue, @bio is ready to be issued. Put it on its
* bio_lists[] and decrease total number queued. The caller is
* responsible for issuing these bios.
*/
if (parent_tg) {
throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
start_parent_slice_with_credit(tg, parent_tg, rw);
} else {
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
&parent_sq->queued[rw]);
BUG_ON(tg->td->nr_queued[rw] <= 0);
tg->td->nr_queued[rw]--;
}
throtl_trim_slice(tg, rw);
if (tg_to_put)
blkg_put(tg_to_blkg(tg_to_put));
} }
static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, static int throtl_dispatch_tg(struct throtl_grp *tg)
struct bio_list *bl)
{ {
struct throtl_service_queue *sq = &tg->service_queue;
unsigned int nr_reads = 0, nr_writes = 0; unsigned int nr_reads = 0, nr_writes = 0;
unsigned int max_nr_reads = throtl_grp_quantum*3/4; unsigned int max_nr_reads = throtl_grp_quantum*3/4;
unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
...@@ -774,20 +1117,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, ...@@ -774,20 +1117,20 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
/* Try to dispatch 75% READS and 25% WRITES */ /* Try to dispatch 75% READS and 25% WRITES */
while ((bio = bio_list_peek(&tg->bio_lists[READ])) while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
&& tg_may_dispatch(td, tg, bio, NULL)) { tg_may_dispatch(tg, bio, NULL)) {
tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); tg_dispatch_one_bio(tg, bio_data_dir(bio));
nr_reads++; nr_reads++;
if (nr_reads >= max_nr_reads) if (nr_reads >= max_nr_reads)
break; break;
} }
while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
&& tg_may_dispatch(td, tg, bio, NULL)) { tg_may_dispatch(tg, bio, NULL)) {
tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); tg_dispatch_one_bio(tg, bio_data_dir(bio));
nr_writes++; nr_writes++;
if (nr_writes >= max_nr_writes) if (nr_writes >= max_nr_writes)
...@@ -797,14 +1140,13 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, ...@@ -797,14 +1140,13 @@ static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
return nr_reads + nr_writes; return nr_reads + nr_writes;
} }
static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
{ {
unsigned int nr_disp = 0; unsigned int nr_disp = 0;
struct throtl_grp *tg;
struct throtl_rb_root *st = &td->tg_service_tree;
while (1) { while (1) {
tg = throtl_rb_first(st); struct throtl_grp *tg = throtl_rb_first(parent_sq);
struct throtl_service_queue *sq = &tg->service_queue;
if (!tg) if (!tg)
break; break;
...@@ -812,14 +1154,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) ...@@ -812,14 +1154,12 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
if (time_before(jiffies, tg->disptime)) if (time_before(jiffies, tg->disptime))
break; break;
throtl_dequeue_tg(td, tg); throtl_dequeue_tg(tg);
nr_disp += throtl_dispatch_tg(td, tg, bl); nr_disp += throtl_dispatch_tg(tg);
if (tg->nr_queued[0] || tg->nr_queued[1]) { if (sq->nr_queued[0] || sq->nr_queued[1])
tg_update_disptime(td, tg); tg_update_disptime(tg);
throtl_enqueue_tg(td, tg);
}
if (nr_disp >= throtl_quantum) if (nr_disp >= throtl_quantum)
break; break;
...@@ -828,111 +1168,111 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) ...@@ -828,111 +1168,111 @@ static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
return nr_disp; return nr_disp;
} }
static void throtl_process_limit_change(struct throtl_data *td) /**
* throtl_pending_timer_fn - timer function for service_queue->pending_timer
* @arg: the throtl_service_queue being serviced
*
* This timer is armed when a child throtl_grp with active bio's becomes
* pending and queued on the service_queue's pending_tree and expires when
* the first child throtl_grp should be dispatched. This function
* dispatches bio's from the children throtl_grps to the parent
* service_queue.
*
* If the parent's parent is another throtl_grp, dispatching is propagated
* by either arming its pending_timer or repeating dispatch directly. If
* the top-level service_tree is reached, throtl_data->dispatch_work is
* kicked so that the ready bio's are issued.
*/
static void throtl_pending_timer_fn(unsigned long arg)
{ {
struct throtl_service_queue *sq = (void *)arg;
struct throtl_grp *tg = sq_to_tg(sq);
struct throtl_data *td = sq_to_td(sq);
struct request_queue *q = td->queue; struct request_queue *q = td->queue;
struct blkcg_gq *blkg, *n; struct throtl_service_queue *parent_sq;
bool dispatched;
if (!td->limits_changed) int ret;
return;
xchg(&td->limits_changed, false);
throtl_log(td, "limits changed");
list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
struct throtl_grp *tg = blkg_to_tg(blkg);
if (!tg->limits_changed) spin_lock_irq(q->queue_lock);
continue; again:
parent_sq = sq->parent_sq;
dispatched = false;
while (true) {
throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
sq->nr_queued[READ] + sq->nr_queued[WRITE],
sq->nr_queued[READ], sq->nr_queued[WRITE]);
ret = throtl_select_dispatch(sq);
if (ret) {
throtl_log(sq, "bios disp=%u", ret);
dispatched = true;
}
if (!xchg(&tg->limits_changed, false)) if (throtl_schedule_next_dispatch(sq, false))
continue; break;
throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" /* this dispatch window is still open, relax and repeat */
" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], spin_unlock_irq(q->queue_lock);
tg->iops[READ], tg->iops[WRITE]); cpu_relax();
spin_lock_irq(q->queue_lock);
}
/* if (!dispatched)
* Restart the slices for both READ and WRITES. It goto out_unlock;
* might happen that a group's limit are dropped
* suddenly and we don't want to account recently
* dispatched IO with new low rate
*/
throtl_start_new_slice(td, tg, 0);
throtl_start_new_slice(td, tg, 1);
if (throtl_tg_on_rr(tg)) if (parent_sq) {
tg_update_disptime(td, tg); /* @parent_sq is another throtl_grp, propagate dispatch */
if (tg->flags & THROTL_TG_WAS_EMPTY) {
tg_update_disptime(tg);
if (!throtl_schedule_next_dispatch(parent_sq, false)) {
/* window is already open, repeat dispatching */
sq = parent_sq;
tg = sq_to_tg(sq);
goto again;
}
}
} else {
/* reached the top-level, queue issuing */
queue_work(kthrotld_workqueue, &td->dispatch_work);
} }
out_unlock:
spin_unlock_irq(q->queue_lock);
} }
/* Dispatch throttled bios. Should be called without queue lock held. */ /**
static int throtl_dispatch(struct request_queue *q) * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
* @work: work item being executed
*
* This function is queued for execution when bio's reach the bio_lists[]
* of throtl_data->service_queue. Those bio's are ready and issued by this
* function.
*/
void blk_throtl_dispatch_work_fn(struct work_struct *work)
{ {
struct throtl_data *td = q->td; struct throtl_data *td = container_of(work, struct throtl_data,
unsigned int nr_disp = 0; dispatch_work);
struct throtl_service_queue *td_sq = &td->service_queue;
struct request_queue *q = td->queue;
struct bio_list bio_list_on_stack; struct bio_list bio_list_on_stack;
struct bio *bio; struct bio *bio;
struct blk_plug plug; struct blk_plug plug;
int rw;
spin_lock_irq(q->queue_lock);
throtl_process_limit_change(td);
if (!total_nr_queued(td))
goto out;
bio_list_init(&bio_list_on_stack); bio_list_init(&bio_list_on_stack);
throtl_log(td, "dispatch nr_queued=%u read=%u write=%u", spin_lock_irq(q->queue_lock);
total_nr_queued(td), td->nr_queued[READ], for (rw = READ; rw <= WRITE; rw++)
td->nr_queued[WRITE]); while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
bio_list_add(&bio_list_on_stack, bio);
nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
if (nr_disp)
throtl_log(td, "bios disp=%u", nr_disp);
throtl_schedule_next_dispatch(td);
out:
spin_unlock_irq(q->queue_lock); spin_unlock_irq(q->queue_lock);
/* if (!bio_list_empty(&bio_list_on_stack)) {
* If we dispatched some requests, unplug the queue to make sure
* immediate dispatch
*/
if (nr_disp) {
blk_start_plug(&plug); blk_start_plug(&plug);
while((bio = bio_list_pop(&bio_list_on_stack))) while((bio = bio_list_pop(&bio_list_on_stack)))
generic_make_request(bio); generic_make_request(bio);
blk_finish_plug(&plug); blk_finish_plug(&plug);
} }
return nr_disp;
}
void blk_throtl_work(struct work_struct *work)
{
struct throtl_data *td = container_of(work, struct throtl_data,
throtl_work.work);
struct request_queue *q = td->queue;
throtl_dispatch(q);
}
/* Call with queue lock held */
static void
throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
{
struct delayed_work *dwork = &td->throtl_work;
/* schedule work if limits changed even if no bio is queued */
if (total_nr_queued(td) || td->limits_changed) {
mod_delayed_work(kthrotld_workqueue, dwork, delay);
throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
delay, jiffies);
}
} }
static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
...@@ -1007,7 +1347,9 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, ...@@ -1007,7 +1347,9 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
struct blkcg *blkcg = cgroup_to_blkcg(cgrp); struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
struct blkg_conf_ctx ctx; struct blkg_conf_ctx ctx;
struct throtl_grp *tg; struct throtl_grp *tg;
struct throtl_data *td; struct throtl_service_queue *sq;
struct blkcg_gq *blkg;
struct cgroup *pos_cgrp;
int ret; int ret;
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
...@@ -1015,7 +1357,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, ...@@ -1015,7 +1357,7 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
return ret; return ret;
tg = blkg_to_tg(ctx.blkg); tg = blkg_to_tg(ctx.blkg);
td = ctx.blkg->q->td; sq = &tg->service_queue;
if (!ctx.v) if (!ctx.v)
ctx.v = -1; ctx.v = -1;
...@@ -1025,10 +1367,37 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, ...@@ -1025,10 +1367,37 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
else else
*(unsigned int *)((void *)tg + cft->private) = ctx.v; *(unsigned int *)((void *)tg + cft->private) = ctx.v;
/* XXX: we don't need the following deferred processing */ throtl_log(&tg->service_queue,
xchg(&tg->limits_changed, true); "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
xchg(&td->limits_changed, true); tg->bps[READ], tg->bps[WRITE],
throtl_schedule_delayed_work(td, 0); tg->iops[READ], tg->iops[WRITE]);
/*
* Update has_rules[] flags for the updated tg's subtree. A tg is
* considered to have rules if either the tg itself or any of its
* ancestors has rules. This identifies groups without any
* restrictions in the whole hierarchy and allows them to bypass
* blk-throttle.
*/
tg_update_has_rules(tg);
blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg)
tg_update_has_rules(blkg_to_tg(blkg));
/*
* We're already holding queue_lock and know @tg is valid. Let's
* apply the new config directly.
*
* Restart the slices for both READ and WRITES. It might happen
* that a group's limit are dropped suddenly and we don't want to
* account recently dispatched IO with new low rate.
*/
throtl_start_new_slice(tg, 0);
throtl_start_new_slice(tg, 1);
if (tg->flags & THROTL_TG_PENDING) {
tg_update_disptime(tg);
throtl_schedule_next_dispatch(sq->parent_sq, true);
}
blkg_conf_finish(&ctx); blkg_conf_finish(&ctx);
return 0; return 0;
...@@ -1092,7 +1461,7 @@ static void throtl_shutdown_wq(struct request_queue *q) ...@@ -1092,7 +1461,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
{ {
struct throtl_data *td = q->td; struct throtl_data *td = q->td;
cancel_delayed_work_sync(&td->throtl_work); cancel_work_sync(&td->dispatch_work);
} }
static struct blkcg_policy blkcg_policy_throtl = { static struct blkcg_policy blkcg_policy_throtl = {
...@@ -1100,6 +1469,7 @@ static struct blkcg_policy blkcg_policy_throtl = { ...@@ -1100,6 +1469,7 @@ static struct blkcg_policy blkcg_policy_throtl = {
.cftypes = throtl_files, .cftypes = throtl_files,
.pd_init_fn = throtl_pd_init, .pd_init_fn = throtl_pd_init,
.pd_online_fn = throtl_pd_online,
.pd_exit_fn = throtl_pd_exit, .pd_exit_fn = throtl_pd_exit,
.pd_reset_stats_fn = throtl_pd_reset_stats, .pd_reset_stats_fn = throtl_pd_reset_stats,
}; };
...@@ -1107,15 +1477,16 @@ static struct blkcg_policy blkcg_policy_throtl = { ...@@ -1107,15 +1477,16 @@ static struct blkcg_policy blkcg_policy_throtl = {
bool blk_throtl_bio(struct request_queue *q, struct bio *bio) bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{ {
struct throtl_data *td = q->td; struct throtl_data *td = q->td;
struct throtl_qnode *qn = NULL;
struct throtl_grp *tg; struct throtl_grp *tg;
bool rw = bio_data_dir(bio), update_disptime = true; struct throtl_service_queue *sq;
bool rw = bio_data_dir(bio);
struct blkcg *blkcg; struct blkcg *blkcg;
bool throttled = false; bool throttled = false;
if (bio->bi_rw & REQ_THROTTLED) { /* see throtl_charge_bio() */
bio->bi_rw &= ~REQ_THROTTLED; if (bio->bi_rw & REQ_THROTTLED)
goto out; goto out;
}
/* /*
* A throtl_grp pointer retrieved under rcu can be used to access * A throtl_grp pointer retrieved under rcu can be used to access
...@@ -1126,7 +1497,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) ...@@ -1126,7 +1497,7 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
blkcg = bio_blkcg(bio); blkcg = bio_blkcg(bio);
tg = throtl_lookup_tg(td, blkcg); tg = throtl_lookup_tg(td, blkcg);
if (tg) { if (tg) {
if (tg_no_rule_group(tg, rw)) { if (!tg->has_rules[rw]) {
throtl_update_dispatch_stats(tg_to_blkg(tg), throtl_update_dispatch_stats(tg_to_blkg(tg),
bio->bi_size, bio->bi_rw); bio->bi_size, bio->bi_rw);
goto out_unlock_rcu; goto out_unlock_rcu;
...@@ -1142,18 +1513,18 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) ...@@ -1142,18 +1513,18 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
if (unlikely(!tg)) if (unlikely(!tg))
goto out_unlock; goto out_unlock;
if (tg->nr_queued[rw]) { sq = &tg->service_queue;
/*
* There is already another bio queued in same dir. No
* need to update dispatch time.
*/
update_disptime = false;
goto queue_bio;
} while (true) {
/* throtl is FIFO - if bios are already queued, should queue */
if (sq->nr_queued[rw])
break;
/* if above limits, break to queue */
if (!tg_may_dispatch(tg, bio, NULL))
break;
/* Bio is with-in rate limit of group */ /* within limits, let's charge and dispatch directly */
if (tg_may_dispatch(td, tg, bio, NULL)) {
throtl_charge_bio(tg, bio); throtl_charge_bio(tg, bio);
/* /*
...@@ -1167,25 +1538,41 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) ...@@ -1167,25 +1538,41 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
* *
* So keep on trimming slice even if bio is not queued. * So keep on trimming slice even if bio is not queued.
*/ */
throtl_trim_slice(td, tg, rw); throtl_trim_slice(tg, rw);
goto out_unlock;
/*
* @bio passed through this layer without being throttled.
* Climb up the ladder. If we're already at the top, it
* can be executed directly.
*/
qn = &tg->qnode_on_parent[rw];
sq = sq->parent_sq;
tg = sq_to_tg(sq);
if (!tg)
goto out_unlock;
} }
queue_bio: /* out-of-limit, queue to @tg */
throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu" throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
" iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W',
rw == READ ? 'R' : 'W', tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], tg->io_disp[rw], tg->iops[rw],
tg->io_disp[rw], tg->iops[rw], sq->nr_queued[READ], sq->nr_queued[WRITE]);
tg->nr_queued[READ], tg->nr_queued[WRITE]);
bio_associate_current(bio); bio_associate_current(bio);
throtl_add_bio_tg(q->td, tg, bio); tg->td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
throttled = true; throttled = true;
if (update_disptime) { /*
tg_update_disptime(td, tg); * Update @tg's dispatch time and force schedule dispatch if @tg
throtl_schedule_next_dispatch(td); * was empty before @bio. The forced scheduling isn't likely to
* cause undue delay as @bio is likely to be dispatched directly if
* its @tg's disptime is not in the future.
*/
if (tg->flags & THROTL_TG_WAS_EMPTY) {
tg_update_disptime(tg);
throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
} }
out_unlock: out_unlock:
...@@ -1193,9 +1580,38 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) ...@@ -1193,9 +1580,38 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
out_unlock_rcu: out_unlock_rcu:
rcu_read_unlock(); rcu_read_unlock();
out: out:
/*
* As multiple blk-throtls may stack in the same issue path, we
* don't want bios to leave with the flag set. Clear the flag if
* being issued.
*/
if (!throttled)
bio->bi_rw &= ~REQ_THROTTLED;
return throttled; return throttled;
} }
/*
 * Flush every child tg that is still active on @parent_sq.
 *
 * Keeps fetching the tg returned by throtl_rb_first(@parent_sq), dequeues
 * it and hands each of its queued bios - READ queue first, then WRITE - to
 * tg_dispatch_one_bio().  The walk stops once throtl_rb_first() returns
 * NULL, so on return @parent_sq is guaranteed to not have any active
 * children tg's and all bios from previously active tg's are on
 * @parent_sq.
 */
static void tg_drain_bios(struct throtl_service_queue *parent_sq)
{
	struct throtl_grp *tg;
	int rw;

	for (tg = throtl_rb_first(parent_sq); tg;
	     tg = throtl_rb_first(parent_sq)) {
		struct throtl_service_queue *sq = &tg->service_queue;

		/* take @tg off @parent_sq before emptying its queues */
		throtl_dequeue_tg(tg);

		/* same order as before: all READs, then all WRITEs */
		for (rw = READ; rw <= WRITE; rw++) {
			struct bio *bio;

			while ((bio = throtl_peek_queued(&sq->queued[rw])))
				tg_dispatch_one_bio(tg, bio_data_dir(bio));
		}
	}
}
/** /**
* blk_throtl_drain - drain throttled bios * blk_throtl_drain - drain throttled bios
* @q: request_queue to drain throttled bios for * @q: request_queue to drain throttled bios for
...@@ -1206,27 +1622,36 @@ void blk_throtl_drain(struct request_queue *q) ...@@ -1206,27 +1622,36 @@ void blk_throtl_drain(struct request_queue *q)
__releases(q->queue_lock) __acquires(q->queue_lock) __releases(q->queue_lock) __acquires(q->queue_lock)
{ {
struct throtl_data *td = q->td; struct throtl_data *td = q->td;
struct throtl_rb_root *st = &td->tg_service_tree; struct blkcg_gq *blkg;
struct throtl_grp *tg; struct cgroup *pos_cgrp;
struct bio_list bl;
struct bio *bio; struct bio *bio;
int rw;
queue_lockdep_assert_held(q); queue_lockdep_assert_held(q);
rcu_read_lock();
/*
* Drain each tg while doing post-order walk on the blkg tree, so
* that all bios are propagated to td->service_queue. It'd be
* better to walk service_queue tree directly but blkg walk is
* easier.
*/
blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg)
tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
bio_list_init(&bl); tg_drain_bios(&td_root_tg(td)->service_queue);
while ((tg = throtl_rb_first(st))) { /* finally, transfer bios from top-level tg's into the td */
throtl_dequeue_tg(td, tg); tg_drain_bios(&td->service_queue);
while ((bio = bio_list_peek(&tg->bio_lists[READ]))) rcu_read_unlock();
tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
}
spin_unlock_irq(q->queue_lock); spin_unlock_irq(q->queue_lock);
while ((bio = bio_list_pop(&bl))) /* all bios now should be in td->service_queue, issue them */
generic_make_request(bio); for (rw = READ; rw <= WRITE; rw++)
while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
NULL)))
generic_make_request(bio);
spin_lock_irq(q->queue_lock); spin_lock_irq(q->queue_lock);
} }
...@@ -1240,9 +1665,8 @@ int blk_throtl_init(struct request_queue *q) ...@@ -1240,9 +1665,8 @@ int blk_throtl_init(struct request_queue *q)
if (!td) if (!td)
return -ENOMEM; return -ENOMEM;
td->tg_service_tree = THROTL_RB_ROOT; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
td->limits_changed = false; throtl_service_queue_init(&td->service_queue, NULL);
INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
q->td = td; q->td = td;
td->queue = q; td->queue = q;
......
...@@ -4347,18 +4347,28 @@ static void cfq_exit_queue(struct elevator_queue *e) ...@@ -4347,18 +4347,28 @@ static void cfq_exit_queue(struct elevator_queue *e)
kfree(cfqd); kfree(cfqd);
} }
static int cfq_init_queue(struct request_queue *q) static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
{ {
struct cfq_data *cfqd; struct cfq_data *cfqd;
struct blkcg_gq *blkg __maybe_unused; struct blkcg_gq *blkg __maybe_unused;
int i, ret; int i, ret;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!cfqd) if (!cfqd) {
kobject_put(&eq->kobj);
return -ENOMEM; return -ENOMEM;
}
eq->elevator_data = cfqd;
cfqd->queue = q; cfqd->queue = q;
q->elevator->elevator_data = cfqd; spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
/* Init root service tree */ /* Init root service tree */
cfqd->grp_service_tree = CFQ_RB_ROOT; cfqd->grp_service_tree = CFQ_RB_ROOT;
...@@ -4433,6 +4443,7 @@ static int cfq_init_queue(struct request_queue *q) ...@@ -4433,6 +4443,7 @@ static int cfq_init_queue(struct request_queue *q)
out_free: out_free:
kfree(cfqd); kfree(cfqd);
kobject_put(&eq->kobj);
return ret; return ret;
} }
......
...@@ -337,13 +337,21 @@ static void deadline_exit_queue(struct elevator_queue *e) ...@@ -337,13 +337,21 @@ static void deadline_exit_queue(struct elevator_queue *e)
/* /*
* initialize elevator private data (deadline_data). * initialize elevator private data (deadline_data).
*/ */
static int deadline_init_queue(struct request_queue *q) static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
{ {
struct deadline_data *dd; struct deadline_data *dd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node); dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!dd) if (!dd) {
kobject_put(&eq->kobj);
return -ENOMEM; return -ENOMEM;
}
eq->elevator_data = dd;
INIT_LIST_HEAD(&dd->fifo_list[READ]); INIT_LIST_HEAD(&dd->fifo_list[READ]);
INIT_LIST_HEAD(&dd->fifo_list[WRITE]); INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
...@@ -355,7 +363,9 @@ static int deadline_init_queue(struct request_queue *q) ...@@ -355,7 +363,9 @@ static int deadline_init_queue(struct request_queue *q)
dd->front_merges = 1; dd->front_merges = 1;
dd->fifo_batch = fifo_batch; dd->fifo_batch = fifo_batch;
q->elevator->elevator_data = dd; spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0; return 0;
} }
......
...@@ -150,7 +150,7 @@ void __init load_default_elevator_module(void) ...@@ -150,7 +150,7 @@ void __init load_default_elevator_module(void)
static struct kobj_type elv_ktype; static struct kobj_type elv_ktype;
static struct elevator_queue *elevator_alloc(struct request_queue *q, struct elevator_queue *elevator_alloc(struct request_queue *q,
struct elevator_type *e) struct elevator_type *e)
{ {
struct elevator_queue *eq; struct elevator_queue *eq;
...@@ -170,6 +170,7 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q, ...@@ -170,6 +170,7 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
elevator_put(e); elevator_put(e);
return NULL; return NULL;
} }
EXPORT_SYMBOL(elevator_alloc);
static void elevator_release(struct kobject *kobj) static void elevator_release(struct kobject *kobj)
{ {
...@@ -221,16 +222,7 @@ int elevator_init(struct request_queue *q, char *name) ...@@ -221,16 +222,7 @@ int elevator_init(struct request_queue *q, char *name)
} }
} }
q->elevator = elevator_alloc(q, e); err = e->ops.elevator_init_fn(q, e);
if (!q->elevator)
return -ENOMEM;
err = e->ops.elevator_init_fn(q);
if (err) {
kobject_put(&q->elevator->kobj);
return err;
}
return 0; return 0;
} }
EXPORT_SYMBOL(elevator_init); EXPORT_SYMBOL(elevator_init);
...@@ -935,16 +927,9 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) ...@@ -935,16 +927,9 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
spin_unlock_irq(q->queue_lock); spin_unlock_irq(q->queue_lock);
/* allocate, init and register new elevator */ /* allocate, init and register new elevator */
err = -ENOMEM; err = new_e->ops.elevator_init_fn(q, new_e);
q->elevator = elevator_alloc(q, new_e); if (err)
if (!q->elevator)
goto fail_init;
err = new_e->ops.elevator_init_fn(q);
if (err) {
kobject_put(&q->elevator->kobj);
goto fail_init; goto fail_init;
}
if (registered) { if (registered) {
err = elv_register_queue(q); err = elv_register_queue(q);
......
...@@ -59,16 +59,27 @@ noop_latter_request(struct request_queue *q, struct request *rq) ...@@ -59,16 +59,27 @@ noop_latter_request(struct request_queue *q, struct request *rq)
return list_entry(rq->queuelist.next, struct request, queuelist); return list_entry(rq->queuelist.next, struct request, queuelist);
} }
static int noop_init_queue(struct request_queue *q) static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
{ {
struct noop_data *nd; struct noop_data *nd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
if (!nd) if (!nd) {
kobject_put(&eq->kobj);
return -ENOMEM; return -ENOMEM;
}
eq->elevator_data = nd;
INIT_LIST_HEAD(&nd->queue); INIT_LIST_HEAD(&nd->queue);
q->elevator->elevator_data = nd;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0; return 0;
} }
......
...@@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode, ...@@ -58,17 +58,24 @@ static void bdev_inode_switch_bdi(struct inode *inode,
struct backing_dev_info *dst) struct backing_dev_info *dst)
{ {
struct backing_dev_info *old = inode->i_data.backing_dev_info; struct backing_dev_info *old = inode->i_data.backing_dev_info;
bool wakeup_bdi = false;
if (unlikely(dst == old)) /* deadlock avoidance */ if (unlikely(dst == old)) /* deadlock avoidance */
return; return;
bdi_lock_two(&old->wb, &dst->wb); bdi_lock_two(&old->wb, &dst->wb);
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
inode->i_data.backing_dev_info = dst; inode->i_data.backing_dev_info = dst;
if (inode->i_state & I_DIRTY) if (inode->i_state & I_DIRTY) {
if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
wakeup_bdi = true;
list_move(&inode->i_wb_list, &dst->wb.b_dirty); list_move(&inode->i_wb_list, &dst->wb.b_dirty);
}
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
spin_unlock(&old->wb.list_lock); spin_unlock(&old->wb.list_lock);
spin_unlock(&dst->wb.list_lock); spin_unlock(&dst->wb.list_lock);
if (wakeup_bdi)
bdi_wakeup_thread_delayed(dst);
} }
/* Kill _all_ buffers and pagecache , dirty or not.. */ /* Kill _all_ buffers and pagecache , dirty or not.. */
......
...@@ -278,6 +278,8 @@ enum { ...@@ -278,6 +278,8 @@ enum {
* *
* - memcg: use_hierarchy is on by default and the cgroup file for * - memcg: use_hierarchy is on by default and the cgroup file for
* the flag is not created. * the flag is not created.
*
* - blkcg: blk-throttle becomes properly hierarchical.
*/ */
CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#ifdef CONFIG_BLOCK #ifdef CONFIG_BLOCK
struct io_cq; struct io_cq;
struct elevator_type;
typedef int (elevator_merge_fn) (struct request_queue *, struct request **, typedef int (elevator_merge_fn) (struct request_queue *, struct request **,
struct bio *); struct bio *);
...@@ -35,7 +36,8 @@ typedef void (elevator_put_req_fn) (struct request *); ...@@ -35,7 +36,8 @@ typedef void (elevator_put_req_fn) (struct request *);
typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
typedef int (elevator_init_fn) (struct request_queue *); typedef int (elevator_init_fn) (struct request_queue *,
struct elevator_type *e);
typedef void (elevator_exit_fn) (struct elevator_queue *); typedef void (elevator_exit_fn) (struct elevator_queue *);
struct elevator_ops struct elevator_ops
...@@ -155,6 +157,8 @@ extern int elevator_init(struct request_queue *, char *); ...@@ -155,6 +157,8 @@ extern int elevator_init(struct request_queue *, char *);
extern void elevator_exit(struct elevator_queue *); extern void elevator_exit(struct elevator_queue *);
extern int elevator_change(struct request_queue *, const char *); extern int elevator_change(struct request_queue *, const char *);
extern bool elv_rq_merge_ok(struct request *, struct bio *); extern bool elv_rq_merge_ok(struct request *, struct bio *);
extern struct elevator_queue *elevator_alloc(struct request_queue *,
struct elevator_type *);
/* /*
* Helper functions. * Helper functions.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment