Commit 11137d38 authored by Vincent Guittot, committed by Ingo Molnar

sched/fair: Simplify util_est

With UTIL_EST_FASTUP now being permanent, we can take advantage of the
fact that the ewma jumps directly to a higher utilization at dequeue to
simplify util_est and remove the enqueued field.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Hongyan Xia <hongyan.xia2@arm.com>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/20231201161652.1241695-3-vincent.guittot@linaro.org
parent 7736ae55
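
For orientation before the diff: a minimal user-space sketch of the storage change (illustrative only; UTIL_AVG_UNCHANGED, the 1024 bound and the max(ewma, enqueued) read come from the patch, the surrounding harness is made up). Because the EWMA is bumped straight up to the dequeued utilization whenever it would otherwise be lower, it can never end up below the old 'enqueued' snapshot, so max(ewma, enqueued) is always just ewma and a single unsigned int per entity is enough.

#include <stdio.h>

#define UTIL_AVG_UNCHANGED	0x80000000u	/* MSB is free: util_est is at most 1024 */

/* Old layout: two fields that had to be read and combined. */
struct util_est_old {
	unsigned int enqueued;	/* task_util() snapshot at last dequeue (+ MSB flag) */
	unsigned int ewma;	/* smoothed estimate */
};

static unsigned int task_util_est_old(struct util_est_old ue)
{
	unsigned int enq = ue.enqueued & ~UTIL_AVG_UNCHANGED;

	return ue.ewma > enq ? ue.ewma : enq;	/* max(ewma, enqueued) */
}

/* New layout: one word carries the EWMA value plus the MSB flag. */
static unsigned int task_util_est_new(unsigned int util_est)
{
	return util_est & ~UTIL_AVG_UNCHANGED;
}

int main(void)
{
	struct util_est_old old = { .enqueued = 600 | UTIL_AVG_UNCHANGED, .ewma = 600 };
	unsigned int est = 600 | UTIL_AVG_UNCHANGED;

	printf("old read: %u, new read: %u\n", task_util_est_old(old), task_util_est_new(est));
	return 0;
}
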
@@ -415,42 +415,6 @@ struct load_weight {
 	u32			inv_weight;
 };
-/**
- * struct util_est - Estimation utilization of FAIR tasks
- * @enqueued: instantaneous estimated utilization of a task/cpu
- * @ewma:     the Exponential Weighted Moving Average (EWMA)
- *            utilization of a task
- *
- * Support data structure to track an Exponential Weighted Moving Average
- * (EWMA) of a FAIR task's utilization. New samples are added to the moving
- * average each time a task completes an activation. Sample's weight is chosen
- * so that the EWMA will be relatively insensitive to transient changes to the
- * task's workload.
- *
- * The enqueued attribute has a slightly different meaning for tasks and cpus:
- * - task:   the task's util_avg at last task dequeue time
- * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
- * Thus, the util_est.enqueued of a task represents the contribution on the
- * estimated utilization of the CPU where that task is currently enqueued.
- *
- * Only for tasks we track a moving average of the past instantaneous
- * estimated utilization. This allows to absorb sporadic drops in utilization
- * of an otherwise almost periodic task.
- *
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue
- * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
- * for a task) it is safe to use MSB.
- */
-struct util_est {
-	unsigned int		enqueued;
-	unsigned int		ewma;
-#define UTIL_EST_WEIGHT_SHIFT	2
-#define UTIL_AVG_UNCHANGED	0x80000000
-} __attribute__((__aligned__(sizeof(u64))));
 /*
  * The load/runnable/util_avg accumulates an infinite geometric series
  * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -505,9 +469,20 @@ struct sched_avg {
 	unsigned long		load_avg;
 	unsigned long		runnable_avg;
 	unsigned long		util_avg;
-	struct util_est		util_est;
+	unsigned int		util_est;
 } ____cacheline_aligned;
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est at dequeue time.
+ * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
+ * it is safe to use MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT	2
+#define UTIL_AVG_UNCHANGED	0x80000000
 struct sched_statistics {
 #ifdef CONFIG_SCHEDSTATS
 	u64			wait_start;
...
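
The comment added in the hunk above documents the MSB flag; here is a hedged sketch of its round trip (plain C, illustrative; the two helper names are made up, only UTIL_AVG_UNCHANGED mirrors the kernel). util_est_update() sets the flag when it stores a fresh estimate at dequeue, cfs_se_util_change() clears it as soon as util_avg changes, and the next dequeue skips the update while the flag is still set.

#include <stdio.h>

#define UTIL_AVG_UNCHANGED	0x80000000u

/* Hypothetical helpers modelling the flag's life cycle. */
static unsigned int store_estimate(unsigned int ewma)
{
	/* as in util_est_update(): remember "util_avg unchanged since dequeue" */
	return ewma | UTIL_AVG_UNCHANGED;
}

static unsigned int util_avg_changed(unsigned int util_est)
{
	/* as in cfs_se_util_change(): util_avg moved, re-enable updates */
	return util_est & ~UTIL_AVG_UNCHANGED;
}

int main(void)
{
	unsigned int util_est = store_estimate(512);

	if (util_est & UTIL_AVG_UNCHANGED)
		printf("dequeue now: skip the util_est update\n");

	util_est = util_avg_changed(util_est);
	if (!(util_est & UTIL_AVG_UNCHANGED))
		printf("dequeue after a PELT update: recompute, value=%u\n", util_est);
	return 0;
}
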
@@ -684,8 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->avg.runnable_avg);
 	SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
-	SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
-			cfs_rq->avg.util_est.enqueued);
+	SEQ_printf(m, " .%-30s: %u\n", "util_est",
+			cfs_rq->avg.util_est);
 	SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
 			cfs_rq->removed.load_avg);
 	SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
@@ -1075,8 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	P(se.avg.runnable_avg);
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
-	P(se.avg.util_est.ewma);
-	PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
+	PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
 #endif
 #ifdef CONFIG_UCLAMP_TASK
 	__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
...
@@ -4781,9 +4781,7 @@ static inline unsigned long task_runnable(struct task_struct *p)
 static inline unsigned long _task_util_est(struct task_struct *p)
 {
-	struct util_est ue = READ_ONCE(p->se.avg.util_est);
-	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+	return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
 }
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -4800,9 +4798,9 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 		return;
 	/* Update root cfs_rq's estimated utilization */
-	enqueued  = cfs_rq->avg.util_est.enqueued;
+	enqueued  = cfs_rq->avg.util_est;
 	enqueued += _task_util_est(p);
-	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
 	trace_sched_util_est_cfs_tp(cfs_rq);
 }
@@ -4816,34 +4814,20 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
 		return;
 	/* Update root cfs_rq's estimated utilization */
-	enqueued  = cfs_rq->avg.util_est.enqueued;
+	enqueued  = cfs_rq->avg.util_est;
 	enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
-	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
 	trace_sched_util_est_cfs_tp(cfs_rq);
 }
 #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
-/*
- * Check if a (signed) value is within a specified (unsigned) margin,
- * based on the observation that:
- *
- *	abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
- *
- * NOTE: this only works when value + margin < INT_MAX.
- */
-static inline bool within_margin(int value, int margin)
-{
-	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
-}
 static inline void util_est_update(struct cfs_rq *cfs_rq,
 				   struct task_struct *p,
 				   bool task_sleep)
 {
-	long last_ewma_diff, last_enqueued_diff;
-	struct util_est ue;
+	unsigned int ewma, dequeued, last_ewma_diff;
 	if (!sched_feat(UTIL_EST))
 		return;
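
The within_margin() helper removed above can go because, in the rewritten flow, the margin check only runs after the "ewma <= dequeued" fast-up branch has been handled, so last_ewma_diff = ewma - dequeued is a plain non-negative unsigned value and a direct "< UTIL_EST_MARGIN" comparison is enough; the second margin check on the enqueued value disappears together with the enqueued field. For reference, a small standalone check of the signed-abs identity the old helper relied on (illustrative only):

#include <assert.h>
#include <stdlib.h>

/* abs(x) < y  <=>  (unsigned)(x + y - 1) < (2 * y - 1), valid while x + y < INT_MAX */
static int within_margin(int value, int margin)
{
	return (unsigned int)(value + margin - 1) < (unsigned int)(2 * margin - 1);
}

int main(void)
{
	int margin = 1024 / 100;	/* same value as UTIL_EST_MARGIN */

	for (int x = -50; x <= 50; x++)
		assert(within_margin(x, margin) == (abs(x) < margin));
	return 0;
}
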
@@ -4855,23 +4839,25 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	if (!task_sleep)
 		return;
+	/* Get current estimate of utilization */
+	ewma = READ_ONCE(p->se.avg.util_est);
 	/*
 	 * If the PELT values haven't changed since enqueue time,
 	 * skip the util_est update.
 	 */
-	ue = p->se.avg.util_est;
-	if (ue.enqueued & UTIL_AVG_UNCHANGED)
+	if (ewma & UTIL_AVG_UNCHANGED)
 		return;
-	last_enqueued_diff = ue.enqueued;
+	/* Get utilization at dequeue */
+	dequeued = task_util(p);
 	/*
 	 * Reset EWMA on utilization increases, the moving average is used only
 	 * to smooth utilization decreases.
 	 */
-	ue.enqueued = task_util(p);
-	if (ue.ewma < ue.enqueued) {
-		ue.ewma = ue.enqueued;
+	if (ewma <= dequeued) {
+		ewma = dequeued;
 		goto done;
 	}
@@ -4879,27 +4865,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * Skip update of task's estimated utilization when its members are
 	 * already ~1% close to its last activation value.
 	 */
-	last_ewma_diff = ue.enqueued - ue.ewma;
-	last_enqueued_diff -= ue.enqueued;
-	if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
-		if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
-			goto done;
-		return;
-	}
+	last_ewma_diff = ewma - dequeued;
+	if (last_ewma_diff < UTIL_EST_MARGIN)
+		goto done;
 	/*
 	 * To avoid overestimation of actual task utilization, skip updates if
 	 * we cannot grant there is idle time in this CPU.
 	 */
-	if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
+	if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
 		return;
 	/*
 	 * To avoid underestimate of task utilization, skip updates of EWMA if
 	 * we cannot grant that thread got all CPU time it wanted.
 	 */
-	if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p))
+	if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
 		goto done;
@@ -4907,25 +4888,24 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * Update Task's estimated utilization
 	 *
 	 * When *p completes an activation we can consolidate another sample
-	 * of the task size. This is done by storing the current PELT value
-	 * as ue.enqueued and by using this value to update the Exponential
-	 * Weighted Moving Average (EWMA):
+	 * of the task size. This is done by using this value to update the
+	 * Exponential Weighted Moving Average (EWMA):
 	 *
 	 *  ewma(t) =  w *  task_util(p) + (1-w) * ewma(t-1)
 	 *          =  w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
 	 *          =  w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
-	 *          =  w * (      last_ewma_diff            ) +     ewma(t-1)
-	 *          =  w * (last_ewma_diff  +  ewma(t-1) / w)
+	 *          =  w * (     -last_ewma_diff            ) +     ewma(t-1)
+	 *          =  w * (-last_ewma_diff +  ewma(t-1) / w)
 	 *
 	 * Where 'w' is the weight of new samples, which is configured to be
 	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
 	 */
-	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
-	ue.ewma  += last_ewma_diff;
-	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+	ewma <<= UTIL_EST_WEIGHT_SHIFT;
+	ewma  -= last_ewma_diff;
+	ewma >>= UTIL_EST_WEIGHT_SHIFT;
 done:
-	ue.enqueued |= UTIL_AVG_UNCHANGED;
-	WRITE_ONCE(p->se.avg.util_est, ue);
+	ewma |= UTIL_AVG_UNCHANGED;
+	WRITE_ONCE(p->se.avg.util_est, ewma);
 	trace_sched_util_est_se_tp(&p->se);
 }
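
The block comment above derives ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) with w = 1/4 implemented by shifts. Below is a compact standalone sketch of that arithmetic plus the fast-up rule, mirroring the patch's shift trick but not the kernel function itself: the UTIL_AVG_UNCHANGED flag and the capacity/runnable checks are left out for brevity, and the harness around it is hypothetical.

#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT	2		/* w = 1/4 */
#define UTIL_EST_MARGIN		(1024 / 100)	/* ~1% of SCHED_CAPACITY_SCALE */

/* Jump up immediately, decay by w of the gap on the way down,
 * and skip the update when already within the margin. */
static unsigned int util_est_update_sketch(unsigned int ewma, unsigned int dequeued)
{
	unsigned int last_ewma_diff;

	if (ewma <= dequeued)
		return dequeued;		/* fast up: reset EWMA on increase */

	last_ewma_diff = ewma - dequeued;
	if (last_ewma_diff < UTIL_EST_MARGIN)
		return ewma;			/* close enough, keep the estimate */

	/* ewma(t) = w * (-last_ewma_diff) + ewma(t-1) = (4 * ewma - diff) / 4 */
	ewma <<= UTIL_EST_WEIGHT_SHIFT;
	ewma  -= last_ewma_diff;
	ewma >>= UTIL_EST_WEIGHT_SHIFT;
	return ewma;
}

int main(void)
{
	unsigned int ewma = 0;
	unsigned int samples[] = { 800, 760, 200, 200, 200 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		ewma = util_est_update_sketch(ewma, samples[i]);
		printf("dequeued=%u -> ewma=%u\n", samples[i], ewma);
	}
	return 0;
}
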
@@ -7653,16 +7633,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
 	if (sched_feat(UTIL_EST)) {
 		unsigned long util_est;
-		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+		util_est = READ_ONCE(cfs_rq->avg.util_est);
 		/*
 		 * During wake-up @p isn't enqueued yet and doesn't contribute
-		 * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+		 * to any cpu_rq(cpu)->cfs.avg.util_est.
 		 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
 		 * has been enqueued.
 		 *
 		 * During exec (@dst_cpu = -1) @p is enqueued and does
-		 * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+		 * contribute to cpu_rq(cpu)->cfs.util_est.
 		 * Remove it to "simulate" cpu_util without @p's contribution.
 		 *
 		 * Despite the task_on_rq_queued(@p) check there is still a
...
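
The cpu_util() comment above describes "simulating" @p's presence or absence when reading the root cfs_rq estimate. A hedged, simplified model of that adjustment (the kernel function does considerably more, e.g. util_avg handling and the boost path; the helper below is hypothetical):

#include <stdio.h>

/* Add @p's estimate when simulating it enqueued on @cpu (wake-up, dst_cpu == cpu),
 * subtract it when it is already enqueued and we simulate its absence (exec, dst_cpu == -1). */
static unsigned long cpu_util_est_sketch(unsigned long rq_util_est,
					 unsigned long p_util_est,
					 int cpu, int dst_cpu, int p_queued_here)
{
	if (dst_cpu == cpu)
		rq_util_est += p_util_est;
	else if (p_queued_here)
		rq_util_est -= p_util_est < rq_util_est ? p_util_est : rq_util_est;

	return rq_util_est;
}

int main(void)
{
	/* Wake-up view of CPU 2: what would util_est be with @p (est. 300) placed there? */
	printf("wake-up view: %lu\n", cpu_util_est_sketch(400, 300, 2, 2, 0));
	/* Exec view: CPU 2's estimate without the already-enqueued @p. */
	printf("exec view:    %lu\n", cpu_util_est_sketch(400, 300, 2, -1, 1));
	return 0;
}
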
@@ -52,13 +52,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 		return;
 	/* Avoid store if the flag has been already reset */
-	enqueued = avg->util_est.enqueued;
+	enqueued = avg->util_est;
 	if (!(enqueued & UTIL_AVG_UNCHANGED))
 		return;
 	/* Reset flag to report util_avg has been updated */
 	enqueued &= ~UTIL_AVG_UNCHANGED;
-	WRITE_ONCE(avg->util_est.enqueued, enqueued);
+	WRITE_ONCE(avg->util_est, enqueued);
 }
 static inline u64 rq_clock_pelt(struct rq *rq)