Commit f1de2439 authored by Tejun Heo, committed by Jens Axboe

blk-iocost: revamp donation amount determination

iocost has various safety nets to combat inuse adjustment calculation
inaccuracies. With Andy's method implemented in transfer_surpluses(), inuse
adjustment calculations are now accurate and we can make donation amount
determinations accurate too.

* Stop keeping track of past usage history and using the maximum. Act on the
  immediate usage information.

* Remove donation constraints defined by SURPLUS_* constants. Donate
  whatever isn't used.

* Determine the donation amount so that the iocg will end up with
  MARGIN_TARGET_PCT budget at the end of the coming period assuming the same
  usage as the previous period. TARGET is set at 50% of period, which is the
  previous maximum. This provides smooth convergence for most repetitive IO
  patterns.

* Apply donation logic early at 20% budget. There's no risk in doing so as
  the calculation is based on the delta between the current budget and the
  target budget at the end of the coming period.

* Remove preemptive iocg activation for zero cost IOs. As donation can reach
  near zero now, the mere activation doesn't provide any protection anymore.
  In the unlikely case that this becomes a problem, the right solution is
  assigning appropriate costs for such IOs.

This significantly improves the donation determination logic while also
simplifying it. Now all donations are immediate, exact and smooth.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andy Newell <newella@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent e08d02aa
...@@ -217,12 +217,14 @@ enum { ...@@ -217,12 +217,14 @@ enum {
MAX_PERIOD = USEC_PER_SEC, MAX_PERIOD = USEC_PER_SEC,
/* /*
* A cgroup's vtime can run 50% behind the device vtime, which * iocg->vtime is targeted at 50% behind the device vtime, which
* serves as its IO credit buffer. Surplus weight adjustment is * serves as its IO credit buffer. Surplus weight adjustment is
* immediately canceled if the vtime margin runs below 10%. * immediately canceled if the vtime margin runs below 10%.
*/ */
MARGIN_MIN_PCT = 10, MARGIN_MIN_PCT = 10,
MARGIN_MAX_PCT = 50, MARGIN_LOW_PCT = 20,
MARGIN_TARGET_PCT = 50,
MARGIN_MAX_PCT = 100,
/* Have some play in timer operations */ /* Have some play in timer operations */
TIMER_SLACK_PCT = 1, TIMER_SLACK_PCT = 1,
...@@ -234,17 +236,6 @@ enum { ...@@ -234,17 +236,6 @@ enum {
*/ */
VTIME_VALID_DUR = 300 * USEC_PER_SEC, VTIME_VALID_DUR = 300 * USEC_PER_SEC,
/*
* Remember the past three non-zero usages and use the max for
* surplus calculation. Three slots guarantee that we remember one
* full period usage from the last active stretch even after
* partial deactivation and re-activation periods. Don't start
* giving away weight before collecting two data points to prevent
* hweight adjustments based on one partial activation period.
*/
NR_USAGE_SLOTS = 3,
MIN_VALID_USAGES = 2,
/* 1/64k is granular enough and can easily be handled w/ u32 */ /* 1/64k is granular enough and can easily be handled w/ u32 */
WEIGHT_ONE = 1 << 16, WEIGHT_ONE = 1 << 16,
...@@ -280,14 +271,6 @@ enum { ...@@ -280,14 +271,6 @@ enum {
/* don't let cmds which take a very long time pin lagging for too long */ /* don't let cmds which take a very long time pin lagging for too long */
MAX_LAGGING_PERIODS = 10, MAX_LAGGING_PERIODS = 10,
/*
* If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
* donate the surplus.
*/
SURPLUS_SCALE_PCT = 125, /* * 125% */
SURPLUS_SCALE_ABS = WEIGHT_ONE / 50, /* + 2% */
SURPLUS_MIN_ADJ_DELTA = WEIGHT_ONE / 33, /* 3% */
/* switch iff the conditions are met for longer than this */ /* switch iff the conditions are met for longer than this */
AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
...@@ -376,6 +359,8 @@ struct ioc_params { ...@@ -376,6 +359,8 @@ struct ioc_params {
struct ioc_margins { struct ioc_margins {
s64 min; s64 min;
s64 low;
s64 target;
s64 max; s64 max;
}; };
...@@ -514,11 +499,7 @@ struct ioc_gq { ...@@ -514,11 +499,7 @@ struct ioc_gq {
struct iocg_stat desc_stat; struct iocg_stat desc_stat;
struct iocg_stat last_stat; struct iocg_stat last_stat;
u64 last_stat_abs_vusage; u64 last_stat_abs_vusage;
u64 usage_delta_us;
/* usage is recorded as fractions of WEIGHT_ONE */
u32 usage_delta_us;
int usage_idx;
u32 usages[NR_USAGE_SLOTS];
/* this iocg's depth in the hierarchy and ancestors including self */ /* this iocg's depth in the hierarchy and ancestors including self */
int level; int level;
...@@ -737,6 +718,8 @@ static void ioc_refresh_margins(struct ioc *ioc) ...@@ -737,6 +718,8 @@ static void ioc_refresh_margins(struct ioc *ioc)
u64 vrate = atomic64_read(&ioc->vtime_rate); u64 vrate = atomic64_read(&ioc->vtime_rate);
margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate; margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate;
} }
...@@ -1228,7 +1211,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) ...@@ -1228,7 +1211,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
return false; return false;
} }
if (!atomic_read(&blkg->use_delay) && if (!atomic_read(&blkg->use_delay) &&
time_before_eq64(vtime, now->vnow + ioc->margins.max)) time_before_eq64(vtime, now->vnow + ioc->margins.target))
return false; return false;
/* use delay */ /* use delay */
...@@ -1527,7 +1510,7 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, ...@@ -1527,7 +1510,7 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
{ {
struct ioc *ioc = iocg->ioc; struct ioc *ioc = iocg->ioc;
u64 vtime = atomic64_read(&iocg->vtime); u64 vtime = atomic64_read(&iocg->vtime);
s64 excess; s64 excess, delta, target, new_hwi;
/* see whether minimum margin requirement is met */ /* see whether minimum margin requirement is met */
if (waitqueue_active(&iocg->waitq) || if (waitqueue_active(&iocg->waitq) ||
...@@ -1542,15 +1525,28 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, ...@@ -1542,15 +1525,28 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
vtime += excess; vtime += excess;
} }
/* add margin */ /*
usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); * Let's say the distance between iocg's and device's vtimes as a
usage += SURPLUS_SCALE_ABS; * fraction of period duration is delta. Assuming that the iocg will
* consume the usage determined above, we want to determine new_hwi so
/* don't bother if the surplus is too small */ * that delta equals MARGIN_TARGET at the end of the next period.
if (usage + SURPLUS_MIN_ADJ_DELTA > hwm) *
return hwm; * We need to execute usage worth of IOs while spending the sum of the
* new budget (1 - MARGIN_TARGET) and the leftover from the last period
* (delta):
*
* usage = (1 - MARGIN_TARGET + delta) * new_hwi
*
* Therefore, the new_hwi is:
*
* new_hwi = usage / (1 - MARGIN_TARGET + delta)
*/
delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
now->vnow - ioc->period_at_vtime);
target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
return usage; return clamp_t(s64, new_hwi, 1, hwm);
} }
/* /*
...@@ -1812,7 +1808,7 @@ static void ioc_timer_fn(struct timer_list *timer) ...@@ -1812,7 +1808,7 @@ static void ioc_timer_fn(struct timer_list *timer)
u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
u32 missed_ppm[2], rq_wait_pct; u32 missed_ppm[2], rq_wait_pct;
u64 period_vtime; u64 period_vtime;
int prev_busy_level, i; int prev_busy_level;
/* how were the latencies during the period? */ /* how were the latencies during the period? */
ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
...@@ -1857,11 +1853,10 @@ static void ioc_timer_fn(struct timer_list *timer) ...@@ -1857,11 +1853,10 @@ static void ioc_timer_fn(struct timer_list *timer)
} }
commit_weights(ioc); commit_weights(ioc);
/* calc usages and see whether some weights need to be moved around */ /* calc usage and see whether some weights need to be moved around */
list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
u64 vdone, vtime, usage_us; u64 vdone, vtime, usage_us, usage_dur;
u32 hw_active, hw_inuse, usage; u32 usage, hw_active, hw_inuse;
int uidx, nr_valid;
/* /*
* Collect unused and wind vtime closer to vnow to prevent * Collect unused and wind vtime closer to vnow to prevent
...@@ -1886,15 +1881,11 @@ static void ioc_timer_fn(struct timer_list *timer) ...@@ -1886,15 +1881,11 @@ static void ioc_timer_fn(struct timer_list *timer)
nr_lagging++; nr_lagging++;
/* /*
* Determine absolute usage factoring in pending and in-flight * Determine absolute usage factoring in in-flight IOs to avoid
* IOs to avoid stalls and high-latency completions appearing as * high-latency completions appearing as idle.
* idle.
*/ */
usage_us = iocg->usage_delta_us; usage_us = iocg->usage_delta_us;
if (waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow))
usage_us += DIV64_U64_ROUND_UP(
cost_to_abs_cost(now.vnow - vtime, hw_inuse),
now.vrate);
if (vdone != vtime) { if (vdone != vtime) {
u64 inflight_us = DIV64_U64_ROUND_UP( u64 inflight_us = DIV64_U64_ROUND_UP(
cost_to_abs_cost(vtime - vdone, hw_inuse), cost_to_abs_cost(vtime - vdone, hw_inuse),
...@@ -1902,43 +1893,22 @@ static void ioc_timer_fn(struct timer_list *timer) ...@@ -1902,43 +1893,22 @@ static void ioc_timer_fn(struct timer_list *timer)
usage_us = max(usage_us, inflight_us); usage_us = max(usage_us, inflight_us);
} }
/* convert to hweight based usage ratio and record */ /* convert to hweight based usage ratio */
uidx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; if (time_after64(iocg->activated_at, ioc->period_at))
usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
if (time_after64(vtime, now.vnow - ioc->margins.min)) { else
iocg->usage_idx = uidx; usage_dur = max_t(u64, now.now - ioc->period_at, 1);
iocg->usages[uidx] = WEIGHT_ONE;
} else if (usage_us) {
u64 started_at, dur;
if (time_after64(iocg->activated_at, ioc->period_at))
started_at = iocg->activated_at;
else
started_at = ioc->period_at;
dur = max_t(u64, now.now - started_at, 1);
iocg->usage_idx = uidx; usage = clamp_t(u32,
iocg->usages[uidx] = clamp_t(u32, DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur), usage_dur),
1, WEIGHT_ONE); 1, WEIGHT_ONE);
}
/* base the decision on max historical usage */
for (i = 0, usage = 0, nr_valid = 0; i < NR_USAGE_SLOTS; i++) {
if (iocg->usages[i]) {
usage = max(usage, iocg->usages[i]);
nr_valid++;
}
}
if (nr_valid < MIN_VALID_USAGES)
usage = WEIGHT_ONE;
/* see whether there's surplus vtime */ /* see whether there's surplus vtime */
WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
if (hw_inuse < hw_active || if (hw_inuse < hw_active ||
(!waitqueue_active(&iocg->waitq) && (!waitqueue_active(&iocg->waitq) &&
time_before64(vtime, now.vnow - ioc->margins.max))) { time_before64(vtime, now.vnow - ioc->margins.low))) {
u32 hwa, hwm, new_hwi; u32 hwa, hwm, new_hwi;
/* /*
...@@ -2175,15 +2145,14 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) ...@@ -2175,15 +2145,14 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
if (!ioc->enabled || !iocg->level) if (!ioc->enabled || !iocg->level)
return; return;
/* always activate so that even 0 cost IOs get protected to some level */
if (!iocg_activate(iocg, &now))
return;
/* calculate the absolute vtime cost */ /* calculate the absolute vtime cost */
abs_cost = calc_vtime_cost(bio, iocg, false); abs_cost = calc_vtime_cost(bio, iocg, false);
if (!abs_cost) if (!abs_cost)
return; return;
if (!iocg_activate(iocg, &now))
return;
iocg->cursor = bio_end_sector(bio); iocg->cursor = bio_end_sector(bio);
vtime = atomic64_read(&iocg->vtime); vtime = atomic64_read(&iocg->vtime);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment