Commit 6ae71436 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Borislav Petkov:
 "Mostly minor things this time; some highlights:

   - core-sched: Add 'Forced Idle' accounting; this allows tracking how
     much CPU time is 'lost' due to core scheduling constraints.

   - psi: Fix for MEM_FULL; a task running reclaim would be counted as a
     runnable task and prevent MEM_FULL from being reported.

   - cpuacct: Long standing fixes for some cgroup accounting issues.

   - rt: Bandwidth timer could, under unusual circumstances, fail to be
     armed, leading to indefinite throttling."

[ Description above by Peter Zijlstra ]

* tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs()
  sched/fair: Cleanup task_util and capacity type
  sched/rt: Try to restart rt period timer when rt runtime exceeded
  sched/fair: Document the slow path and fast path in select_task_rq_fair
  sched/fair: Fix per-CPU kthread and wakee stacking for asym CPU capacity
  sched/fair: Fix detection of per-CPU kthreads waking a task
  sched/cpuacct: Make user/system times in cpuacct.stat more precise
  sched/cpuacct: Fix user/system in shown cpuacct.usage*
  cpuacct: Convert BUG_ON() to WARN_ON_ONCE()
  cputime, cpuacct: Include guest time in user time in cpuacct.stat
  psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
  sched/core: Forced idle accounting
  psi: Add a missing SPDX license header
  psi: Remove repeated verbose comment
parents 01367e86 82762d2a
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PSI_H
#define _LINUX_PSI_H
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PSI_TYPES_H
#define _LINUX_PSI_TYPES_H
......@@ -21,7 +22,17 @@ enum psi_task_count {
* don't have to special case any state tracking for it.
*/
NR_ONCPU,
NR_PSI_TASK_COUNTS = 4,
/*
* For IO and CPU stalls the presence of running/oncpu tasks
* in the domain means a partial rather than a full stall.
* For memory it's not so simple because of page reclaimers:
* they are running/oncpu while representing a stall. To tell
* whether a domain has productivity left or not, we need to
* distinguish between regular running (i.e. productive)
* threads and memstall ones.
*/
NR_MEMSTALL_RUNNING,
NR_PSI_TASK_COUNTS = 5,
};
/* Task state bitmasks */
......@@ -29,6 +40,7 @@ enum psi_task_count {
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
#define TSK_RUNNING (1 << NR_RUNNING)
#define TSK_ONCPU (1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)
/* Resources that workloads could be stalled on */
enum psi_res {
......
......@@ -523,7 +523,11 @@ struct sched_statistics {
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
#ifdef CONFIG_SCHED_CORE
u64 core_forceidle_sum;
#endif
#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;
struct sched_entity {
......
......@@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;
/* flip prio, so high prio is leftmost */
if (prio_less(b, a, task_rq(a)->core->core_forceidle))
if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
return true;
return false;
......@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}
void sched_core_dequeue(struct rq *rq, struct task_struct *p)
void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;
if (!sched_core_enqueued(p))
return;
if (sched_core_enqueued(p)) {
rb_erase(&p->core_node, &rq->core_tree);
RB_CLEAR_NODE(&p->core_node);
}
rb_erase(&p->core_node, &rq->core_tree);
RB_CLEAR_NODE(&p->core_node);
/*
* Migrating the last task off the cpu, with the cpu in forced idle
* state. Reschedule to create an accounting edge for forced idle,
* and re-examine whether the core is still in forced idle state.
*/
if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
rq->core->core_forceidle_count && rq->curr == rq->idle)
resched_curr(rq);
}
/*
......@@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;
cpu_rq(cpu)->core->core_forceidle_start = 0;
sched_core_unlock(cpu, &flags);
cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
......@@ -364,7 +374,8 @@ void sched_core_put(void)
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
......@@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
sched_core_dequeue(rq, p);
sched_core_dequeue(rq, p, flags);
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
......@@ -5244,6 +5255,7 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
rq_unlock(rq, &rf);
......@@ -5656,6 +5668,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;
......@@ -5708,10 +5721,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* reset state */
rq->core->core_cookie = 0UL;
if (rq->core->core_forceidle) {
if (rq->core->core_forceidle_count) {
if (!core_clock_updated) {
update_rq_clock(rq->core);
core_clock_updated = true;
}
sched_core_account_forceidle(rq);
/* reset after accounting force idle */
rq->core->core_forceidle_start = 0;
rq->core->core_forceidle_count = 0;
rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
rq->core->core_forceidle = false;
}
/*
......@@ -5753,7 +5774,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);
if (i != cpu)
/*
* Current cpu always has its clock updated on entrance to
* pick_next_task(). If the current cpu is not the core,
* the core may also have been updated above.
*/
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
p = rq_i->core_pick = pick_task(rq_i);
......@@ -5783,7 +5809,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (p == rq_i->idle) {
if (rq_i->nr_running) {
rq->core->core_forceidle = true;
rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}
......@@ -5792,6 +5818,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
}
if (schedstat_enabled() && rq->core->core_forceidle_count) {
if (cookie)
rq->core->core_forceidle_start = rq_clock(rq->core);
rq->core->core_forceidle_occupation = occ;
}
rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
......@@ -5828,8 +5860,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* 1 0 1
* 1 1 0
*/
if (!(fi_before && rq->core->core_forceidle))
task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
if (!(fi_before && rq->core->core_forceidle_count))
task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
rq_i->core_pick->core_occupation = occ;
......@@ -6033,11 +6065,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
goto unlock;
/* copy the shared state to the new leader */
core_rq->core_task_seq = rq->core_task_seq;
core_rq->core_pick_seq = rq->core_pick_seq;
core_rq->core_cookie = rq->core_cookie;
core_rq->core_forceidle = rq->core_forceidle;
core_rq->core_forceidle_seq = rq->core_forceidle_seq;
core_rq->core_task_seq = rq->core_task_seq;
core_rq->core_pick_seq = rq->core_pick_seq;
core_rq->core_cookie = rq->core_cookie;
core_rq->core_forceidle_count = rq->core_forceidle_count;
core_rq->core_forceidle_seq = rq->core_forceidle_seq;
core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
/*
* Accounting edge for forced idle is handled in pick_next_task().
* Don't need another one here, since the hotplug thread shouldn't
* have a cookie.
*/
core_rq->core_forceidle_start = 0;
/* install new leader */
for_each_cpu(t, smt_mask) {
......@@ -7126,7 +7166,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long sched_cpu_util(int cpu, unsigned long max)
{
return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */
......@@ -9409,7 +9449,9 @@ void __init sched_init(void)
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
rq->core_forceidle = false;
rq->core_forceidle_count = 0;
rq->core_forceidle_occupation = 0;
rq->core_forceidle_start = 0;
rq->core_cookie = 0UL;
#endif
......
......@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
enqueued = sched_core_enqueued(p);
if (enqueued)
sched_core_dequeue(rq, p);
sched_core_dequeue(rq, p, DEQUEUE_SAVE);
old_cookie = p->core_cookie;
p->core_cookie = cookie;
......@@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
* If task is currently running, it may not be compatible anymore after
* the cookie change, so enter the scheduler on its CPU to schedule it
* away.
*
* Note that it is possible that as a result of this cookie change, the
* core has now entered/left forced idle state. Defer accounting to the
* next scheduling edge, rather than always forcing a reschedule here.
*/
if (task_running(rq, p))
resched_curr(rq);
......@@ -232,3 +236,63 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
return err;
}
#ifdef CONFIG_SCHEDSTATS
/*
 * Charge the forced idle time accumulated since core_forceidle_start to the
 * cookied tasks picked (or currently running) on the SMT siblings of @rq.
 *
 * REQUIRES: rq->core's clock recently updated.
 *
 * The rq lock must be held (checked via lockdep_assert_rq_held()). A
 * core_forceidle_start of 0 is treated as "nothing to account" and the
 * function returns without charging anything.
 */
void __sched_core_account_forceidle(struct rq *rq)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
u64 delta, now = rq_clock(rq->core);
struct rq *rq_i;
struct task_struct *p;
int i;
lockdep_assert_rq_held(rq);
/* Callers are expected to invoke this only while the core is in forced idle. */
WARN_ON_ONCE(!rq->core->core_forceidle_count);
if (rq->core->core_forceidle_start == 0)
return;
delta = now - rq->core->core_forceidle_start;
/* Ignore a zero or negative (when viewed as signed) interval. */
if (unlikely((s64)delta <= 0))
return;
/* Restart the interval so the next call only charges time from here on. */
rq->core->core_forceidle_start = now;
if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
/* can't be forced idle without a running task */
} else if (rq->core->core_forceidle_count > 1 ||
rq->core->core_forceidle_occupation > 1) {
/*
* For larger SMT configurations, we need to scale the charged
* forced idle amount since there can be more than one forced
* idle sibling and more than one running cookied task.
*/
delta *= rq->core->core_forceidle_count;
delta = div_u64(delta, rq->core->core_forceidle_occupation);
}
/*
 * Walk every sibling in the SMT mask; use its core_pick if set, otherwise
 * its current task, and charge only tasks that carry a non-zero cookie.
 */
for_each_cpu(i, smt_mask) {
rq_i = cpu_rq(i);
p = rq_i->core_pick ?: rq_i->curr;
if (!p->core_cookie)
continue;
__schedstat_add(p->stats.core_forceidle_sum, delta);
}
}
/*
 * Tick-time hook for forced idle accounting: if the core of @rq is currently
 * in forced idle (core_forceidle_count != 0), account the time elapsed so far.
 */
void __sched_core_tick(struct rq *rq)
{
if (!rq->core->core_forceidle_count)
return;
/*
 * The accounting below reads rq->core's clock; refresh it when @rq is not
 * itself the core rq. NOTE(review): presumably @rq's own clock has already
 * been updated by the tick path -- confirm against the caller.
 */
if (rq != rq->core)
update_rq_clock(rq->core);
__sched_core_account_forceidle(rq);
}
#endif /* CONFIG_SCHEDSTATS */
......@@ -21,15 +21,11 @@ static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_SYSTEM] = "system",
};
/* Usage counters, one u64 slot per stat index below CPUACCT_STAT_NSTATS. */
struct cpuacct_usage {
u64 usages[CPUACCT_STAT_NSTATS];
};
/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every CPU */
struct cpuacct_usage __percpu *cpuusage;
u64 __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
};
......@@ -49,7 +45,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
return css_ca(ca->css.parent);
}
static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
static struct cpuacct root_cpuacct = {
.cpustat = &kernel_cpustat,
.cpuusage = &root_cpuacct_cpuusage,
......@@ -68,7 +64,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
if (!ca)
goto out;
ca->cpuusage = alloc_percpu(struct cpuacct_usage);
ca->cpuusage = alloc_percpu(u64);
if (!ca->cpuusage)
goto out_free_ca;
......@@ -99,14 +95,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
enum cpuacct_stat_index index)
{
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
u64 data;
/*
* We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of usages.
*/
BUG_ON(index > CPUACCT_STAT_NSTATS);
if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS))
return 0;
#ifndef CONFIG_64BIT
/*
......@@ -115,14 +113,17 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
if (index == CPUACCT_STAT_NSTATS) {
int i = 0;
data = 0;
for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
data += cpuusage->usages[i];
} else {
data = cpuusage->usages[index];
switch (index) {
case CPUACCT_STAT_USER:
data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
break;
case CPUACCT_STAT_SYSTEM:
data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] +
cpustat[CPUTIME_SOFTIRQ];
break;
case CPUACCT_STAT_NSTATS:
data = *cpuusage;
break;
}
#ifndef CONFIG_64BIT
......@@ -132,10 +133,14 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
return data;
}
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu)
{
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
int i;
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
/* Don't allow to reset global kernel_cpustat */
if (ca == &root_cpuacct)
return;
#ifndef CONFIG_64BIT
/*
......@@ -143,9 +148,10 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
*/
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
cpuusage->usages[i] = val;
*cpuusage = 0;
cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0;
cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0;
cpustat[CPUTIME_SOFTIRQ] = 0;
#ifndef CONFIG_64BIT
raw_spin_rq_unlock_irq(cpu_rq(cpu));
......@@ -196,7 +202,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
return -EINVAL;
for_each_possible_cpu(cpu)
cpuacct_cpuusage_write(ca, cpu, 0);
cpuacct_cpuusage_write(ca, cpu);
return 0;
}
......@@ -243,25 +249,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
seq_puts(m, "\n");
for_each_possible_cpu(cpu) {
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
seq_printf(m, "%d", cpu);
for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit read safe on 32-bit
* platforms.
*/
raw_spin_rq_lock_irq(cpu_rq(cpu));
#endif
seq_printf(m, " %llu", cpuusage->usages[index]);
#ifndef CONFIG_64BIT
raw_spin_rq_unlock_irq(cpu_rq(cpu));
#endif
}
for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
seq_printf(m, " %llu",
cpuacct_cpuusage_read(ca, cpu, index));
seq_puts(m, "\n");
}
return 0;
......@@ -270,25 +261,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
s64 val[CPUACCT_STAT_NSTATS];
struct task_cputime cputime;
u64 val[CPUACCT_STAT_NSTATS];
int cpu;
int stat;
memset(val, 0, sizeof(val));
memset(&cputime, 0, sizeof(cputime));
for_each_possible_cpu(cpu) {
u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
cputime.utime += cpustat[CPUTIME_USER];
cputime.utime += cpustat[CPUTIME_NICE];
cputime.stime += cpustat[CPUTIME_SYSTEM];
cputime.stime += cpustat[CPUTIME_IRQ];
cputime.stime += cpustat[CPUTIME_SOFTIRQ];
cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu);
}
cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime,
&val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]);
for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
seq_printf(sf, "%s %lld\n",
cpuacct_stat_desc[stat],
(long long)nsec_to_clock_t(val[stat]));
seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat],
nsec_to_clock_t(val[stat]));
}
return 0;
......@@ -339,16 +335,11 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
int index = CPUACCT_STAT_SYSTEM;
struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk);
if (regs && user_mode(regs))
index = CPUACCT_STAT_USER;
rcu_read_lock();
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
__this_cpu_add(ca->cpuusage->usages[index], cputime);
__this_cpu_add(*ca->cpuusage, cputime);
rcu_read_unlock();
}
......
......@@ -168,7 +168,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max,
FREQUENCY_UTIL, NULL);
}
......
......@@ -148,10 +148,10 @@ void account_guest_time(struct task_struct *p, u64 cputime)
/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
cpustat[CPUTIME_NICE] += cputime;
task_group_account_field(p, CPUTIME_NICE, cputime);
cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
cpustat[CPUTIME_USER] += cputime;
task_group_account_field(p, CPUTIME_USER, cputime);
cpustat[CPUTIME_GUEST] += cputime;
}
}
......
......@@ -1023,6 +1023,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PN(avg_atom);
__PN(avg_per_cpu);
#ifdef CONFIG_SCHED_CORE
PN_SCHEDSTAT(core_forceidle_sum);
#endif
}
__P(nr_switches);
......
......@@ -1502,7 +1502,6 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
static inline long adjust_numa_imbalance(int imbalance,
int dst_running, int dst_weight);
......@@ -1569,7 +1568,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util(cpu);
ns->util += cpu_util_cfs(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
......@@ -3240,7 +3239,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
* As is, the util number is not freq-invariant (we'd have to
* implement arch_scale_freq_capacity() for that).
*
* See cpu_util().
* See cpu_util_cfs().
*/
cpufreq_update_util(rq, flags);
}
......@@ -4070,7 +4069,8 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
trace_sched_util_est_se_tp(&p->se);
}
static inline int task_fits_capacity(struct task_struct *p, long capacity)
static inline int task_fits_capacity(struct task_struct *p,
unsigned long capacity)
{
return fits_capacity(uclamp_task_util(p), capacity);
}
......@@ -5509,11 +5509,9 @@ static inline void hrtick_update(struct rq *rq)
#endif
#ifdef CONFIG_SMP
static inline unsigned long cpu_util(int cpu);
static inline bool cpu_overutilized(int cpu)
{
return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
}
static inline void update_overutilized_status(struct rq *rq)
......@@ -6345,7 +6343,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}
static inline bool asym_fits_capacity(int task_util, int cpu)
static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
{
if (static_branch_unlikely(&sched_asym_cpucapacity))
return fits_capacity(task_util, capacity_of(cpu));
......@@ -6398,8 +6396,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* pattern is IO completions.
*/
if (is_per_cpu_kthread(current) &&
in_task() &&
prev == smp_processor_id() &&
this_rq()->nr_running <= 1) {
this_rq()->nr_running <= 1 &&
asym_fits_capacity(task_util, prev)) {
return prev;
}
......@@ -6456,58 +6456,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
/**
* cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
* @cpu: the CPU to get the utilization of
*
* The unit of the return value must be the one of capacity so we can compare
* the utilization with the capacity of the CPU that is available for CFS task
* (ie cpu_capacity).
*
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on a CPU. It represents
* the amount of utilization of a CPU in the range [0..capacity_orig] where
* capacity_orig is the cpu_capacity available at the highest frequency
* (arch_scale_freq_capacity()).
* The utilization of a CPU converges towards a sum equal to or less than the
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
* the running time on this CPU scaled by capacity_curr.
*
* The estimated utilization of a CPU is defined to be the maximum between its
* cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
* currently RUNNABLE on that CPU.
* This allows to properly represent the expected utilization of a CPU which
* has just got a big task running since a long sleep period. At the same time
* however it preserves the benefits of the "blocked utilization" in
* describing the potential for other tasks waking up on the same CPU.
*
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
* higher than capacity_orig because of unfortunate rounding in
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
* the average stabilizes with the new running time. We need to check that the
* utilization stays within the range of [0..capacity_orig] and cap it if
* necessary. Without utilization capping, a group could be seen as overloaded
* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
* available capacity. We allow utilization to overshoot capacity_curr (but not
* capacity_orig) as it useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
*
* Return: the (estimated) utilization for the specified CPU
*/
static inline unsigned long cpu_util(int cpu)
{
struct cfs_rq *cfs_rq;
unsigned int util;
/* Utilization is read from the CFS runqueue of the requested CPU. */
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
/* With the UTIL_EST feature on, use the larger of util_avg and the enqueued estimate. */
if (sched_feat(UTIL_EST))
util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
/* Cap the result at capacity_orig_of(cpu). */
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
......@@ -6528,7 +6476,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_util(cpu);
return cpu_util_cfs(cpu);
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
......@@ -6592,7 +6540,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
* clamp to the maximum CPU capacity to ensure consistency with
* the cpu_util call.
* cpu_util.
*/
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
......@@ -6624,7 +6572,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* During wake-up, the task isn't enqueued yet and doesn't
* appear in the cfs_rq->avg.util_est.enqueued of any rq,
* so just add it (if needed) to "simulate" what will be
* cpu_util() after the task has been enqueued.
* cpu_util after the task has been enqueued.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
......@@ -6915,6 +6863,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
break;
}
/*
* Usually only true for WF_EXEC and WF_FORK, as sched_domains
* usually do not have SD_BALANCE_WAKE set. That means wakeup
* will usually go to the fast path.
*/
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)
......@@ -8681,7 +8634,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct rq *rq = cpu_rq(i);
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
......@@ -9699,7 +9652,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
util = cpu_util(cpu_of(rq));
util = cpu_util_cfs(i);
/*
* Don't try to pull utilization from a CPU with one
......@@ -11068,7 +11021,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
......
// SPDX-License-Identifier: GPL-2.0
/*
* Pressure stall information for CPU, memory and IO
*
......@@ -34,13 +35,19 @@
* delayed on that resource such that nobody is advancing and the CPU
* goes idle. This leaves both workload and CPU unproductive.
*
* Naturally, the FULL state doesn't exist for the CPU resource at the
* system level, but exist at the cgroup level, means all non-idle tasks
* in a cgroup are delayed on the CPU resource which used by others outside
* of the cgroup or throttled by the cgroup cpu.max configuration.
*
* SOME = nr_delayed_tasks != 0
* FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
* FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
*
* What it means for a task to be productive is defined differently
* for each resource. For IO, productive means a running task. For
* memory, productive means a running task that isn't a reclaimer. For
* CPU, productive means an oncpu task.
*
* Naturally, the FULL state doesn't exist for the CPU resource at the
* system level, but exists at the cgroup level. At the cgroup level,
* FULL means all non-idle tasks in the cgroup are delayed on the CPU
* resource which is being used by others outside of the cgroup or
* throttled by the cgroup cpu.max configuration.
*
* The percentage of wallclock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,
......@@ -81,13 +88,13 @@
*
* threads = min(nr_nonidle_tasks, nr_cpus)
* SOME = min(nr_delayed_tasks / threads, 1)
* FULL = (threads - min(nr_running_tasks, threads)) / threads
* FULL = (threads - min(nr_productive_tasks, threads)) / threads
*
* For the 257 number crunchers on 256 CPUs, this yields:
*
* threads = min(257, 256)
* SOME = min(1 / 256, 1) = 0.4%
* FULL = (256 - min(257, 256)) / 256 = 0%
* FULL = (256 - min(256, 256)) / 256 = 0%
*
* For the 1 out of 4 memory-delayed tasks, this yields:
*
......@@ -112,7 +119,7 @@
* For each runqueue, we track:
*
* tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
* tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
* tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
* tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
*
* and then periodically aggregate:
......@@ -233,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
case PSI_MEM_SOME:
return unlikely(tasks[NR_MEMSTALL]);
case PSI_MEM_FULL:
return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
return unlikely(tasks[NR_MEMSTALL] &&
tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
case PSI_CPU_FULL:
......@@ -710,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
groupc->tasks[3], clear, set);
groupc->tasks[3], groupc->tasks[4],
clear, set);
psi_bug = 1;
}
}
......@@ -833,7 +842,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
/*
* When switching between tasks that have an identical
* runtime state, the cgroup that contains both tasks
* does not change: we can stop updating the tree once
* we reach the first common ancestor. Iterate @next's
* ancestors only until we encounter @prev's ONCPU.
*/
......@@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
int clear = TSK_ONCPU, set = 0;
/*
* When we're going to sleep, psi_dequeue() lets us handle
* TSK_RUNNING and TSK_IOWAIT here, where we can combine it
* with TSK_ONCPU and save walking common ancestors twice.
* When we're going to sleep, psi_dequeue() lets us
* handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
* TSK_IOWAIT here, where we can combine it with
* TSK_ONCPU and save walking common ancestors twice.
*/
if (sleep) {
clear |= TSK_RUNNING;
if (prev->in_memstall)
clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
}
......@@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 1;
psi_task_change(current, 0, TSK_MEMSTALL);
psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
rq_unlock_irq(rq, &rf);
}
......@@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 0;
psi_task_change(current, TSK_MEMSTALL, 0);
psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
rq_unlock_irq(rq, &rf);
}
......
......@@ -52,11 +52,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1;
......@@ -75,6 +72,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
/*
 * Start the period timer for @rt_b unless RT bandwidth control is disabled
 * or @rt_b has unlimited runtime (RUNTIME_INF); the actual arming is done by
 * do_start_rt_bandwidth().
 */
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
/* Nothing to enforce in either case, so skip arming the timer. */
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
do_start_rt_bandwidth(rt_b);
}
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
......@@ -1031,13 +1036,17 @@ static void update_curr_rt(struct rq *rq)
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
int exceeded;
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
if (sched_rt_runtime_exceeded(rt_rq))
exceeded = sched_rt_runtime_exceeded(rt_rq);
if (exceeded)
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
if (exceeded)
do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
}
......@@ -2911,8 +2920,12 @@ static int sched_rt_global_validate(void)
/*
 * Copy the global RT runtime and period settings into def_rt_bandwidth.
 * Both fields are written under def_rt_bandwidth.rt_runtime_lock with
 * interrupts disabled, so they are updated together with respect to other
 * holders of that lock.
 */
static void sched_rt_do_global(void)
{
unsigned long flags;
raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
def_rt_bandwidth.rt_runtime = global_rt_runtime();
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}
int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
......
......@@ -1111,8 +1111,10 @@ struct rq {
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
unsigned char core_forceidle;
unsigned int core_forceidle_count;
unsigned int core_forceidle_seq;
unsigned int core_forceidle_occupation;
u64 core_forceidle_start;
#endif
};
......@@ -1253,7 +1255,7 @@ static inline bool sched_core_enqueued(struct task_struct *p)
}
extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
extern void sched_core_get(void);
extern void sched_core_put(void);
......@@ -1854,6 +1856,32 @@ static inline void flush_smp_call_function_from_idle(void) { }
#include "stats.h"
#include "autogroup.h"
#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
extern void __sched_core_account_forceidle(struct rq *rq);

/*
 * Forward @rq to __sched_core_account_forceidle(), but only when
 * schedstat_enabled() is true; otherwise return without doing anything.
 */
static inline void sched_core_account_forceidle(struct rq *rq)
{
	if (!schedstat_enabled())
		return;

	__sched_core_account_forceidle(rq);
}
extern void __sched_core_tick(struct rq *rq);

/*
 * Forward @rq to __sched_core_tick() when both conditions hold:
 * sched_core_enabled(rq) is checked first, then schedstat_enabled().
 * If either check fails the function returns without calling it.
 */
static inline void sched_core_tick(struct rq *rq)
{
	if (!sched_core_enabled(rq))
		return;

	if (!schedstat_enabled())
		return;

	__sched_core_tick(rq);
}
#else
/*
 * No-op stubs, compiled when CONFIG_SCHED_CORE and CONFIG_SCHEDSTATS are
 * not both defined, so callers need no #ifdef of their own.
 */
static inline void sched_core_account_forceidle(struct rq *rq) {}
static inline void sched_core_tick(struct rq *rq) {}
#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
#ifdef CONFIG_CGROUP_SCHED
/*
......@@ -2938,16 +2966,52 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
return READ_ONCE(rq->avg_dl.util_avg);
}
static inline unsigned long cpu_util_cfs(struct rq *rq)
/**
* cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
* @cpu: the CPU to get the utilization for.
*
* The unit of the return value must be the same as the one of CPU capacity
* so that CPU utilization can be compared with CPU capacity.
*
* CPU utilization is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on that CPU.
* It represents the amount of CPU capacity currently used by CFS tasks in
* the range [0..max CPU capacity] with max CPU capacity being the CPU
* capacity at f_max.
*
* The estimated CPU utilization is defined as the maximum between CPU
* utilization and sum of the estimated utilization of the currently
* runnable tasks on that CPU. It preserves a utilization "snapshot" of
* previously-executed tasks, which helps better deduce how busy a CPU will
* be when a long-sleeping task wakes up. The contribution to CPU utilization
* of such a task would be significantly decayed at this point of time.
*
* CPU utilization can be higher than the current CPU capacity
* (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
* of rounding errors as well as task migrations or wakeups of new tasks.
* CPU utilization has to be capped to fit into the [0..max CPU capacity]
* range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
* could be seen as over-utilized even though CPU1 has 20% of spare CPU
* capacity. CPU utilization is allowed to overshoot current CPU capacity
* though since this is useful for predicting the CPU capacity required
* after task migrations (scheduler-driven DVFS).
*
* Return: (Estimated) utilization for the specified CPU.
*/
static inline unsigned long cpu_util_cfs(int cpu)
{
unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
struct cfs_rq *cfs_rq;
unsigned long util;
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
if (sched_feat(UTIL_EST)) {
util = max_t(unsigned long, util,
READ_ONCE(rq->cfs.avg.util_est.enqueued));
READ_ONCE(cfs_rq->avg.util_est.enqueued));
}
return util;
return min(util, capacity_orig_of(cpu));
}
static inline unsigned long cpu_util_rt(struct rq *rq)
......
......@@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
if (static_branch_likely(&psi_disabled))
return;
if (p->in_memstall)
set |= TSK_MEMSTALL_RUNNING;
if (!wakeup || p->sched_psi_wake_requeue) {
if (p->in_memstall)
set |= TSK_MEMSTALL;
......@@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
return;
if (p->in_memstall)
clear |= TSK_MEMSTALL;
clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
psi_task_change(p, clear, 0);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment