Commit 93f7d2db authored by Tejun Heo, committed by Jens Axboe

blk-iocost: restructure surplus donation logic

The way the surplus donation logic is structured isn't great. There are two
separate paths - one for starting/increasing donations and one for decreasing
them - which makes the logic harder to follow and prone to unnecessary
behavior differences.

In preparation for improved donation handling, this patch restructures the
code so that

* All donors - new, increasing and decreasing - are funneled through the
  same code path.

* The target donation calculation is factored into hweight_after_donation()
  which is called once from the same spot for all possible donors.

* Actual inuse adjustment is factored into transfer_surpluses().

This change introduces a few behavior differences - e.g. donation amount
reduction now uses the max usage of the recent three periods just like new
and increasing donations, and inuse now gets adjusted upwards the same way
it gets adjusted downwards. These differences are unlikely to have severely
negative implications and the whole logic will be revamped soon.

This patch also removes two tracepoints. The existing TPs don't quite fit
the new implementation. A later patch will update and reinstate them.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 065655c8
...@@ -494,6 +494,7 @@ struct ioc_gq { ...@@ -494,6 +494,7 @@ struct ioc_gq {
int hweight_gen; int hweight_gen;
u32 hweight_active; u32 hweight_active;
u32 hweight_inuse; u32 hweight_inuse;
u32 hweight_after_donation;
struct list_head walk_list; struct list_head walk_list;
struct list_head surplus_list; struct list_head surplus_list;
...@@ -1070,6 +1071,32 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep ...@@ -1070,6 +1071,32 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep
*hw_inusep = iocg->hweight_inuse; *hw_inusep = iocg->hweight_inuse;
} }
/*
 * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
 * other weights stay unchanged.
 *
 * "Max @inuse" here is @iocg->active: the walk starts with the hypothetical
 * inuse set to iocg->active and climbs the ancestors[] array from
 * level - 1 down to index 0, rescaling the running hweight at each level.
 *
 * Must be called with iocg->ioc->lock held (asserted via lockdep).
 *
 * Returns the computed hweight, never less than 1.
 */
static u32 current_hweight_max(struct ioc_gq *iocg)
{
/* running result; starts at the full-scale value and is scaled down per level */
u32 hwm = WEIGHT_ONE;
/* hypothetical inuse of the node being considered at the current level */
u32 inuse = iocg->active;
u64 child_inuse_sum;
int lvl;
lockdep_assert_held(&iocg->ioc->lock);
for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
struct ioc_gq *parent = iocg->ancestors[lvl];
/* NOTE(review): presumably @iocg itself on the first iteration - confirm ancestors[level] == iocg */
struct ioc_gq *child = iocg->ancestors[lvl + 1];
/* parent's sum of children inuse with @child's current inuse swapped for the hypothetical one */
child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
/* scale by the share @child would hold among its siblings (rounds down) */
hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
/*
 * Hypothetical inuse for @parent at the next level up: parent->active
 * scaled by child_inuse_sum / child_active_sum, rounded up. Must be
 * computed after hwm above, which consumes the previous inuse value.
 */
inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
parent->child_active_sum);
}
/* the integer divisions above can reach 0; clamp so callers always get >= 1 */
return max_t(u32, hwm, 1);
}
static void weight_updated(struct ioc_gq *iocg) static void weight_updated(struct ioc_gq *iocg)
{ {
struct ioc *ioc = iocg->ioc; struct ioc *ioc = iocg->ioc;
...@@ -1488,20 +1515,58 @@ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) ...@@ -1488,20 +1515,58 @@ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
} }
} }
/* returns usage with margin added if surplus is large enough */ /*
static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse) * Determine what @iocg's hweight_inuse should be after donating unused
* capacity. @hwm is the upper bound and used to signal no donation. This
* function also throws away @iocg's excess budget.
*/
static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
struct ioc_now *now)
{ {
struct ioc *ioc = iocg->ioc;
u64 vtime = atomic64_read(&iocg->vtime);
s64 excess;
/* see whether minimum margin requirement is met */
if (waitqueue_active(&iocg->waitq) ||
time_after64(vtime, now->vnow - ioc->margins.min))
return hwm;
/* throw away excess above max */
excess = now->vnow - vtime - ioc->margins.max;
if (excess > 0) {
atomic64_add(excess, &iocg->vtime);
atomic64_add(excess, &iocg->done_vtime);
vtime += excess;
}
/* add margin */ /* add margin */
usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
usage += SURPLUS_SCALE_ABS; usage += SURPLUS_SCALE_ABS;
/* don't bother if the surplus is too small */ /* don't bother if the surplus is too small */
if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse) if (usage + SURPLUS_MIN_ADJ_DELTA > hwm)
return 0; return hwm;
return usage; return usage;
} }
/*
 * Apply the donation targets recorded on each iocg in @surpluses.
 *
 * For every iocg linked on @surpluses via its surplus_list member, read the
 * current hweight_inuse, take the target stored in
 * iocg->hweight_after_donation, scale iocg->inuse by the ratio of the two and
 * hand the result to __propagate_weights().
 *
 * @surpluses: list head of iocgs linked through ->surplus_list
 * @now: not referenced in this body
 */
static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
{
struct ioc_gq *iocg;
list_for_each_entry(iocg, surpluses, surplus_list) {
u32 old_hwi, new_hwi, new_inuse;
/* only the inuse hweight is needed; the active out-pointer is passed as NULL */
current_hweight(iocg, NULL, &old_hwi);
new_hwi = iocg->hweight_after_donation;
/* new_inuse = inuse * new_hwi / old_hwi, rounded up; 64-bit to avoid overflowing the product */
new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
old_hwi);
/* weight (active) is left as is; only inuse is changed */
__propagate_weights(iocg, iocg->weight, new_inuse);
}
}
static void ioc_timer_fn(struct timer_list *timer) static void ioc_timer_fn(struct timer_list *timer)
{ {
struct ioc *ioc = container_of(timer, struct ioc, timer); struct ioc *ioc = container_of(timer, struct ioc, timer);
...@@ -1560,9 +1625,9 @@ static void ioc_timer_fn(struct timer_list *timer) ...@@ -1560,9 +1625,9 @@ static void ioc_timer_fn(struct timer_list *timer)
/* calc usages and see whether some weights need to be moved around */ /* calc usages and see whether some weights need to be moved around */
list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
u64 vdone, vtime, usage_us, vmin; u64 vdone, vtime, usage_us;
u32 hw_active, hw_inuse, usage; u32 hw_active, hw_inuse, usage;
int uidx; int uidx, nr_valid;
/* /*
* Collect unused and wind vtime closer to vnow to prevent * Collect unused and wind vtime closer to vnow to prevent
...@@ -1618,92 +1683,54 @@ static void ioc_timer_fn(struct timer_list *timer) ...@@ -1618,92 +1683,54 @@ static void ioc_timer_fn(struct timer_list *timer)
started_at = ioc->period_at; started_at = ioc->period_at;
dur = max_t(u64, now.now - started_at, 1); dur = max_t(u64, now.now - started_at, 1);
usage = clamp_t(u32,
iocg->usage_idx = uidx;
iocg->usages[uidx] = clamp_t(u32,
DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur), DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur),
1, WEIGHT_ONE); 1, WEIGHT_ONE);
}
iocg->usage_idx = uidx; /* base the decision on max historical usage */
iocg->usages[uidx] = usage; for (i = 0, usage = 0, nr_valid = 0; i < NR_USAGE_SLOTS; i++) {
} else { if (iocg->usages[i]) {
usage = 0; usage = max(usage, iocg->usages[i]);
nr_valid++;
}
} }
if (nr_valid < MIN_VALID_USAGES)
usage = WEIGHT_ONE;
/* see whether there's surplus vtime */ /* see whether there's surplus vtime */
vmin = now.vnow - ioc->margins.max;
WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
if (!waitqueue_active(&iocg->waitq) && if (hw_inuse < hw_active ||
time_before64(vtime, vmin)) { (!waitqueue_active(&iocg->waitq) &&
u64 delta = vmin - vtime; time_before64(vtime, now.vnow - ioc->margins.max))) {
u32 hwm, new_hwi;
/* throw away surplus vtime */
atomic64_add(delta, &iocg->vtime);
atomic64_add(delta, &iocg->done_vtime);
/* if usage is sufficiently low, maybe it can donate */
if (surplus_adjusted_hweight_inuse(usage, hw_inuse))
list_add(&iocg->surplus_list, &surpluses);
} else if (hw_inuse < hw_active) {
u32 new_hwi, new_inuse;
/* was donating but might need to take back some */ /*
if (waitqueue_active(&iocg->waitq)) { * Already donating or accumulated enough to start.
new_hwi = hw_active; * Determine the donation amount.
*/
hwm = current_hweight_max(iocg);
new_hwi = hweight_after_donation(iocg, hwm, usage,
&now);
if (new_hwi < hwm) {
iocg->hweight_after_donation = new_hwi;
list_add(&iocg->surplus_list, &surpluses);
} else { } else {
new_hwi = max(hw_inuse, __propagate_weights(iocg, iocg->active,
usage * SURPLUS_SCALE_PCT / 100 + iocg->active);
SURPLUS_SCALE_ABS); nr_shortages++;
}
new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
hw_inuse);
new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
if (new_inuse > iocg->inuse) {
TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
iocg->inuse, new_inuse,
hw_inuse, new_hwi);
__propagate_weights(iocg, iocg->weight,
new_inuse);
} }
} else { } else {
/* genuninely out of vtime */ /* genuinely short on vtime */
nr_shortages++; nr_shortages++;
} }
} }
if (!nr_shortages || list_empty(&surpluses)) if (!list_empty(&surpluses) && nr_shortages)
goto skip_surplus_transfers; transfer_surpluses(&surpluses, &now);
/* there are both shortages and surpluses, transfer surpluses */
list_for_each_entry(iocg, &surpluses, surplus_list) {
u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
int nr_valid = 0;
/* base the decision on max historical usage */
for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
if (iocg->usages[i]) {
usage = max(usage, iocg->usages[i]);
nr_valid++;
}
}
if (nr_valid < MIN_VALID_USAGES)
continue;
current_hweight(iocg, &hw_active, &hw_inuse);
new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
if (!new_hwi)
continue;
new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
hw_inuse);
if (new_inuse < iocg->inuse) {
TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
iocg->inuse, new_inuse,
hw_inuse, new_hwi);
__propagate_weights(iocg, iocg->weight, new_inuse);
}
}
skip_surplus_transfers:
commit_weights(ioc); commit_weights(ioc);
/* surplus list should be dissolved after use */ /* surplus list should be dissolved after use */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment