Commit 77a05940 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The biggest changes in this cycle were:

   - Make kcpustat vtime aware (Frederic Weisbecker)

   - Rework the CFS load_balance() logic (Vincent Guittot)

   - Misc cleanups, smaller enhancements, fixes.

  The load-balancing rework is the most intrusive change: it replaces
  the old heuristics that have become less meaningful after the
  introduction of the PELT metrics, with a grounds-up load-balancing
  algorithm.

  As such it's not really an iterative series, but replaces the old
  load-balancing logic with the new one. We hope there are no
  performance regressions left - but statistically it's highly probable
  that there *is* going to be some workload that is hurting from these
  chnages. If so then we'd prefer to have a look at that workload and
  fix its scheduling, instead of reverting the changes"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (46 commits)
  rackmeter: Use vtime aware kcpustat accessor
  leds: Use all-in-one vtime aware kcpustat accessor
  cpufreq: Use vtime aware kcpustat accessors for user time
  procfs: Use all-in-one vtime aware kcpustat accessor
  sched/vtime: Bring up complete kcpustat accessor
  sched/cputime: Support other fields on kcpustat_field()
  sched/cpufreq: Move the cfs_rq_util_change() call to cpufreq_update_util()
  sched/fair: Add comments for group_type and balancing at SD_NUMA level
  sched/fair: Fix rework of find_idlest_group()
  sched/uclamp: Fix overzealous type replacement
  sched/Kconfig: Fix spelling mistake in user-visible help text
  sched/core: Further clarify sched_class::set_next_task()
  sched/fair: Use mul_u32_u32()
  sched/core: Simplify sched_class::pick_next_task()
  sched/core: Optimize pick_next_task()
  sched/core: Make pick_next_task_idle() more consistent
  sched/fair: Better document newidle_balance()
  leds: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM
  cpufreq: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM
  procfs: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM
  ...
parents 3f59dbca de881a34
......@@ -132,7 +132,7 @@ static __u64 vtime_delta(struct task_struct *tsk)
return delta_stime;
}
void vtime_account_system(struct task_struct *tsk)
void vtime_account_kernel(struct task_struct *tsk)
{
struct thread_info *ti = task_thread_info(tsk);
__u64 stime = vtime_delta(tsk);
......@@ -146,7 +146,7 @@ void vtime_account_system(struct task_struct *tsk)
else
ti->stime += stime;
}
EXPORT_SYMBOL_GPL(vtime_account_system);
EXPORT_SYMBOL_GPL(vtime_account_kernel);
void vtime_account_idle(struct task_struct *tsk)
{
......
......@@ -338,7 +338,7 @@ static unsigned long vtime_delta(struct task_struct *tsk,
return stime;
}
void vtime_account_system(struct task_struct *tsk)
void vtime_account_kernel(struct task_struct *tsk)
{
unsigned long stime, stime_scaled, steal_time;
struct cpu_accounting_data *acct = get_accounting(tsk);
......@@ -366,7 +366,7 @@ void vtime_account_system(struct task_struct *tsk)
#endif
}
}
EXPORT_SYMBOL_GPL(vtime_account_system);
EXPORT_SYMBOL_GPL(vtime_account_kernel);
void vtime_account_idle(struct task_struct *tsk)
{
......@@ -395,7 +395,7 @@ static void vtime_flush_scaled(struct task_struct *tsk,
/*
* Account the whole cputime accumulated in the paca
* Must be called with interrupts disabled.
* Assumes that vtime_account_system/idle() has been called
* Assumes that vtime_account_kernel/idle() has been called
* recently (i.e. since the last entry from usermode) so that
* get_paca()->user_time_scaled is up to date.
*/
......
......@@ -247,9 +247,9 @@ void vtime_account_irq_enter(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
void vtime_account_system(struct task_struct *tsk)
void vtime_account_kernel(struct task_struct *tsk)
__attribute__((alias("vtime_account_irq_enter")));
EXPORT_SYMBOL_GPL(vtime_account_system);
EXPORT_SYMBOL_GPL(vtime_account_kernel);
/*
* Sorted add to a list. List is linear searched until first bigger
......
......@@ -354,7 +354,7 @@ For 32-bit we have the following conventions - kernel is built with
.macro CALL_enter_from_user_mode
#ifdef CONFIG_CONTEXT_TRACKING
#ifdef CONFIG_JUMP_LABEL
STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_key, def=0
#endif
call enter_from_user_mode
.Lafter_call_\@:
......
......@@ -113,18 +113,21 @@ EXPORT_SYMBOL_GPL(get_governor_parent_kobj);
static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
{
u64 idle_time;
struct kernel_cpustat kcpustat;
u64 cur_wall_time;
u64 idle_time;
u64 busy_time;
cur_wall_time = jiffies64_to_nsecs(get_jiffies_64());
busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
kcpustat_cpu_fetch(&kcpustat, cpu);
busy_time = kcpustat.cpustat[CPUTIME_USER];
busy_time += kcpustat.cpustat[CPUTIME_SYSTEM];
busy_time += kcpustat.cpustat[CPUTIME_IRQ];
busy_time += kcpustat.cpustat[CPUTIME_SOFTIRQ];
busy_time += kcpustat.cpustat[CPUTIME_STEAL];
busy_time += kcpustat.cpustat[CPUTIME_NICE];
idle_time = cur_wall_time - busy_time;
if (wall)
......
......@@ -105,7 +105,7 @@ void gov_update_cpu_data(struct dbs_data *dbs_data)
j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_update_time,
dbs_data->io_is_busy);
if (dbs_data->ignore_nice_load)
j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
j_cdbs->prev_cpu_nice = kcpustat_field(&kcpustat_cpu(j), CPUTIME_NICE, j);
}
}
}
......@@ -149,7 +149,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
j_cdbs->prev_cpu_idle = cur_idle_time;
if (ignore_nice) {
u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
u64 cur_nice = kcpustat_field(&kcpustat_cpu(j), CPUTIME_NICE, j);
idle_time += div_u64(cur_nice - j_cdbs->prev_cpu_nice, NSEC_PER_USEC);
j_cdbs->prev_cpu_nice = cur_nice;
......@@ -530,7 +530,7 @@ int cpufreq_dbs_governor_start(struct cpufreq_policy *policy)
j_cdbs->prev_load = 0;
if (ignore_nice)
j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
j_cdbs->prev_cpu_nice = kcpustat_field(&kcpustat_cpu(j), CPUTIME_NICE, j);
}
gov->start(policy);
......
......@@ -57,11 +57,15 @@ static void led_activity_function(struct timer_list *t)
curr_used = 0;
for_each_possible_cpu(i) {
curr_used += kcpustat_cpu(i).cpustat[CPUTIME_USER]
+ kcpustat_cpu(i).cpustat[CPUTIME_NICE]
+ kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]
+ kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]
+ kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
struct kernel_cpustat kcpustat;
kcpustat_cpu_fetch(&kcpustat, i);
curr_used += kcpustat.cpustat[CPUTIME_USER]
+ kcpustat.cpustat[CPUTIME_NICE]
+ kcpustat.cpustat[CPUTIME_SYSTEM]
+ kcpustat.cpustat[CPUTIME_SOFTIRQ]
+ kcpustat.cpustat[CPUTIME_IRQ];
cpus++;
}
......
......@@ -81,13 +81,14 @@ static int rackmeter_ignore_nice;
*/
static inline u64 get_cpu_idle_time(unsigned int cpu)
{
struct kernel_cpustat *kcpustat = &kcpustat_cpu(cpu);
u64 retval;
retval = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE] +
kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
retval = kcpustat->cpustat[CPUTIME_IDLE] +
kcpustat->cpustat[CPUTIME_IOWAIT];
if (rackmeter_ignore_nice)
retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
retval += kcpustat_field(kcpustat, CPUTIME_NICE, cpu);
return retval;
}
......
......@@ -120,18 +120,21 @@ static int show_stat(struct seq_file *p, void *v)
getboottime64(&boottime);
for_each_possible_cpu(i) {
struct kernel_cpustat *kcs = &kcpustat_cpu(i);
user += kcs->cpustat[CPUTIME_USER];
nice += kcs->cpustat[CPUTIME_NICE];
system += kcs->cpustat[CPUTIME_SYSTEM];
idle += get_idle_time(kcs, i);
iowait += get_iowait_time(kcs, i);
irq += kcs->cpustat[CPUTIME_IRQ];
softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
steal += kcs->cpustat[CPUTIME_STEAL];
guest += kcs->cpustat[CPUTIME_GUEST];
guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
kcpustat_cpu_fetch(&kcpustat, i);
user += cpustat[CPUTIME_USER];
nice += cpustat[CPUTIME_NICE];
system += cpustat[CPUTIME_SYSTEM];
idle += get_idle_time(&kcpustat, i);
iowait += get_iowait_time(&kcpustat, i);
irq += cpustat[CPUTIME_IRQ];
softirq += cpustat[CPUTIME_SOFTIRQ];
steal += cpustat[CPUTIME_STEAL];
guest += cpustat[CPUTIME_GUEST];
guest_nice += cpustat[CPUTIME_USER];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
......@@ -157,19 +160,22 @@ static int show_stat(struct seq_file *p, void *v)
seq_putc(p, '\n');
for_each_online_cpu(i) {
struct kernel_cpustat *kcs = &kcpustat_cpu(i);
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
kcpustat_cpu_fetch(&kcpustat, i);
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
user = kcs->cpustat[CPUTIME_USER];
nice = kcs->cpustat[CPUTIME_NICE];
system = kcs->cpustat[CPUTIME_SYSTEM];
idle = get_idle_time(kcs, i);
iowait = get_iowait_time(kcs, i);
irq = kcs->cpustat[CPUTIME_IRQ];
softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
steal = kcs->cpustat[CPUTIME_STEAL];
guest = kcs->cpustat[CPUTIME_GUEST];
guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
user = cpustat[CPUTIME_USER];
nice = cpustat[CPUTIME_NICE];
system = cpustat[CPUTIME_SYSTEM];
idle = get_idle_time(&kcpustat, i);
iowait = get_iowait_time(&kcpustat, i);
irq = cpustat[CPUTIME_IRQ];
softirq = cpustat[CPUTIME_SOFTIRQ];
steal = cpustat[CPUTIME_STEAL];
guest = cpustat[CPUTIME_GUEST];
guest_nice = cpustat[CPUTIME_USER];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
......
......@@ -22,26 +22,26 @@ extern void context_tracking_user_exit(void);
static inline void user_enter(void)
{
if (context_tracking_is_enabled())
if (context_tracking_enabled())
context_tracking_enter(CONTEXT_USER);
}
static inline void user_exit(void)
{
if (context_tracking_is_enabled())
if (context_tracking_enabled())
context_tracking_exit(CONTEXT_USER);
}
/* Called with interrupts disabled. */
static inline void user_enter_irqoff(void)
{
if (context_tracking_is_enabled())
if (context_tracking_enabled())
__context_tracking_enter(CONTEXT_USER);
}
static inline void user_exit_irqoff(void)
{
if (context_tracking_is_enabled())
if (context_tracking_enabled())
__context_tracking_exit(CONTEXT_USER);
}
......@@ -49,7 +49,7 @@ static inline enum ctx_state exception_enter(void)
{
enum ctx_state prev_ctx;
if (!context_tracking_is_enabled())
if (!context_tracking_enabled())
return 0;
prev_ctx = this_cpu_read(context_tracking.state);
......@@ -61,7 +61,7 @@ static inline enum ctx_state exception_enter(void)
static inline void exception_exit(enum ctx_state prev_ctx)
{
if (context_tracking_is_enabled()) {
if (context_tracking_enabled()) {
if (prev_ctx != CONTEXT_KERNEL)
context_tracking_enter(prev_ctx);
}
......@@ -77,7 +77,7 @@ static inline void exception_exit(enum ctx_state prev_ctx)
*/
static inline enum ctx_state ct_state(void)
{
return context_tracking_is_enabled() ?
return context_tracking_enabled() ?
this_cpu_read(context_tracking.state) : CONTEXT_DISABLED;
}
#else
......@@ -90,7 +90,7 @@ static inline void exception_exit(enum ctx_state prev_ctx) { }
static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
#endif /* !CONFIG_CONTEXT_TRACKING */
#define CT_WARN_ON(cond) WARN_ON(context_tracking_is_enabled() && (cond))
#define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
extern void context_tracking_init(void);
......@@ -103,12 +103,12 @@ static inline void context_tracking_init(void) { }
/* must be called with irqs disabled */
static inline void guest_enter_irqoff(void)
{
if (vtime_accounting_cpu_enabled())
if (vtime_accounting_enabled_this_cpu())
vtime_guest_enter(current);
else
current->flags |= PF_VCPU;
if (context_tracking_is_enabled())
if (context_tracking_enabled())
__context_tracking_enter(CONTEXT_GUEST);
/* KVM does not hold any references to rcu protected data when it
......@@ -118,16 +118,16 @@ static inline void guest_enter_irqoff(void)
* one time slice). Lets treat guest mode as quiescent state, just like
* we do with user-mode execution.
*/
if (!context_tracking_cpu_is_enabled())
if (!context_tracking_enabled_this_cpu())
rcu_virt_note_context_switch(smp_processor_id());
}
static inline void guest_exit_irqoff(void)
{
if (context_tracking_is_enabled())
if (context_tracking_enabled())
__context_tracking_exit(CONTEXT_GUEST);
if (vtime_accounting_cpu_enabled())
if (vtime_accounting_enabled_this_cpu())
vtime_guest_exit(current);
else
current->flags &= ~PF_VCPU;
......@@ -141,7 +141,7 @@ static inline void guest_enter_irqoff(void)
* to assume that it's the stime pending cputime
* to flush.
*/
vtime_account_system(current);
vtime_account_kernel(current);
current->flags |= PF_VCPU;
rcu_virt_note_context_switch(smp_processor_id());
}
......@@ -149,7 +149,7 @@ static inline void guest_enter_irqoff(void)
static inline void guest_exit_irqoff(void)
{
/* Flush the guest cputime we spent on the guest */
vtime_account_system(current);
vtime_account_kernel(current);
current->flags &= ~PF_VCPU;
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
......
......@@ -23,17 +23,22 @@ struct context_tracking {
};
#ifdef CONFIG_CONTEXT_TRACKING
extern struct static_key_false context_tracking_enabled;
extern struct static_key_false context_tracking_key;
DECLARE_PER_CPU(struct context_tracking, context_tracking);
static inline bool context_tracking_is_enabled(void)
static inline bool context_tracking_enabled(void)
{
return static_branch_unlikely(&context_tracking_enabled);
return static_branch_unlikely(&context_tracking_key);
}
static inline bool context_tracking_cpu_is_enabled(void)
static inline bool context_tracking_enabled_cpu(int cpu)
{
return __this_cpu_read(context_tracking.active);
return context_tracking_enabled() && per_cpu(context_tracking.active, cpu);
}
static inline bool context_tracking_enabled_this_cpu(void)
{
return context_tracking_enabled() && __this_cpu_read(context_tracking.active);
}
static inline bool context_tracking_in_user(void)
......@@ -42,9 +47,9 @@ static inline bool context_tracking_in_user(void)
}
#else
static inline bool context_tracking_in_user(void) { return false; }
static inline bool context_tracking_active(void) { return false; }
static inline bool context_tracking_is_enabled(void) { return false; }
static inline bool context_tracking_cpu_is_enabled(void) { return false; }
static inline bool context_tracking_enabled(void) { return false; }
static inline bool context_tracking_enabled_cpu(int cpu) { return false; }
static inline bool context_tracking_enabled_this_cpu(void) { return false; }
#endif /* CONFIG_CONTEXT_TRACKING */
#endif
......@@ -78,6 +78,24 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
return kstat_cpu(cpu).irqs_sum;
}
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern u64 kcpustat_field(struct kernel_cpustat *kcpustat,
enum cpu_usage_stat usage, int cpu);
extern void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu);
#else
static inline u64 kcpustat_field(struct kernel_cpustat *kcpustat,
enum cpu_usage_stat usage, int cpu)
{
return kcpustat->cpustat[usage];
}
static inline void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
*dst = kcpustat_cpu(cpu);
}
#endif
extern void account_user_time(struct task_struct *, u64);
extern void account_guest_time(struct task_struct *, u64);
extern void account_system_time(struct task_struct *, int, u64);
......
......@@ -250,16 +250,21 @@ struct prev_cputime {
enum vtime_state {
/* Task is sleeping or running in a CPU with VTIME inactive: */
VTIME_INACTIVE = 0,
/* Task runs in userspace in a CPU with VTIME active: */
VTIME_USER,
/* Task is idle */
VTIME_IDLE,
/* Task runs in kernelspace in a CPU with VTIME active: */
VTIME_SYS,
/* Task runs in userspace in a CPU with VTIME active: */
VTIME_USER,
/* Task runs as guests in a CPU with VTIME active: */
VTIME_GUEST,
};
struct vtime {
seqcount_t seqcount;
unsigned long long starttime;
enum vtime_state state;
unsigned int cpu;
u64 utime;
u64 stime;
u64 gtime;
......
......@@ -174,7 +174,7 @@ extern cpumask_var_t tick_nohz_full_mask;
static inline bool tick_nohz_full_enabled(void)
{
if (!context_tracking_is_enabled())
if (!context_tracking_enabled())
return false;
return tick_nohz_full_running;
......
......@@ -11,11 +11,15 @@
struct task_struct;
/*
* vtime_accounting_cpu_enabled() definitions/declarations
* vtime_accounting_enabled_this_cpu() definitions/declarations
*/
#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE)
static inline bool vtime_accounting_cpu_enabled(void) { return true; }
static inline bool vtime_accounting_enabled_this_cpu(void) { return true; }
extern void vtime_task_switch(struct task_struct *prev);
#elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN)
/*
* Checks if vtime is enabled on some CPU. Cputime readers want to be careful
* in that case and compute the tickless cputime.
......@@ -24,46 +28,43 @@ static inline bool vtime_accounting_cpu_enabled(void) { return true; }
*/
static inline bool vtime_accounting_enabled(void)
{
return context_tracking_is_enabled();
return context_tracking_enabled();
}
static inline bool vtime_accounting_cpu_enabled(void)
static inline bool vtime_accounting_enabled_cpu(int cpu)
{
if (vtime_accounting_enabled()) {
if (context_tracking_cpu_is_enabled())
return true;
}
return false;
return context_tracking_enabled_cpu(cpu);
}
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
static inline bool vtime_accounting_cpu_enabled(void) { return false; }
#endif
static inline bool vtime_accounting_enabled_this_cpu(void)
{
return context_tracking_enabled_this_cpu();
}
/*
* Common vtime APIs
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void vtime_task_switch_generic(struct task_struct *prev);
#ifdef __ARCH_HAS_VTIME_TASK_SWITCH
extern void vtime_task_switch(struct task_struct *prev);
#else
extern void vtime_common_task_switch(struct task_struct *prev);
static inline void vtime_task_switch(struct task_struct *prev)
{
if (vtime_accounting_cpu_enabled())
vtime_common_task_switch(prev);
if (vtime_accounting_enabled_this_cpu())
vtime_task_switch_generic(prev);
}
#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */
extern void vtime_account_system(struct task_struct *tsk);
extern void vtime_account_idle(struct task_struct *tsk);
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
static inline bool vtime_accounting_enabled_cpu(int cpu) {return false; }
static inline bool vtime_accounting_enabled_this_cpu(void) { return false; }
static inline void vtime_task_switch(struct task_struct *prev) { }
static inline void vtime_account_system(struct task_struct *tsk) { }
#endif
/*
* Common vtime APIs
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void vtime_account_kernel(struct task_struct *tsk);
extern void vtime_account_idle(struct task_struct *tsk);
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
static inline void vtime_account_kernel(struct task_struct *tsk) { }
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
......@@ -86,7 +87,7 @@ extern void vtime_account_irq_enter(struct task_struct *tsk);
static inline void vtime_account_irq_exit(struct task_struct *tsk)
{
/* On hard|softirq exit we always account to hard|softirq cputime */
vtime_account_system(tsk);
vtime_account_kernel(tsk);
}
extern void vtime_flush(struct task_struct *tsk);
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
......
......@@ -65,7 +65,7 @@ config PREEMPT_RT
preemptible priority-inheritance aware variants, enforcing
interrupt threading and introducing mechanisms to break up long
non-preemptible sections. This makes the kernel, except for very
low level and critical code pathes (entry code, scheduler, low
low level and critical code paths (entry code, scheduler, low
level interrupt handling) fully preemptible and brings most
execution contexts under scheduler control.
......
......@@ -25,8 +25,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
DEFINE_STATIC_KEY_FALSE(context_tracking_enabled);
EXPORT_SYMBOL_GPL(context_tracking_enabled);
DEFINE_STATIC_KEY_FALSE(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
......@@ -192,7 +192,7 @@ void __init context_tracking_cpu_set(int cpu)
if (!per_cpu(context_tracking.active, cpu)) {
per_cpu(context_tracking.active, cpu) = true;
static_branch_inc(&context_tracking_enabled);
static_branch_inc(&context_tracking_key);
}
if (initialized)
......
......@@ -811,7 +811,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
......@@ -854,7 +854,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
}
static inline
enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
......@@ -919,7 +919,7 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
return uc_req;
}
enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
......@@ -3918,13 +3918,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
prev->sched_class == &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev, rf);
p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
p = idle_sched_class.pick_next_task(rq, prev, rf);
if (!p) {
put_prev_task(rq, prev);
p = pick_next_task_idle(rq);
}
return p;
}
......@@ -3948,7 +3950,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
put_prev_task(rq, prev);
for_each_class(class) {
p = class->pick_next_task(rq, NULL, NULL);
p = class->pick_next_task(rq);
if (p)
return p;
}
......@@ -6217,7 +6219,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq)
struct task_struct *next;
for_each_class(class) {
next = class->pick_next_task(rq, NULL, NULL);
next = class->pick_next_task(rq);
if (next) {
next->sched_class->put_prev_task(rq, next);
return next;
......
......@@ -405,27 +405,25 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
/*
* Use precise platform statistics if available:
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_common_task_switch(struct task_struct *prev)
void vtime_task_switch(struct task_struct *prev)
{
if (is_idle_task(prev))
vtime_account_idle(prev);
else
vtime_account_system(prev);
vtime_account_kernel(prev);
vtime_flush(prev);
arch_vtime_task_switch(prev);
}
# endif
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
* vtime_account_system() and vtime_account_idle(). Archs that
* vtime_account_kernel() and vtime_account_idle(). Archs that
* have other meaning of the idle time (s390 only includes the
* time spent by the CPU when it's in low power mode) must override
* vtime_account().
......@@ -436,7 +434,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
if (!in_interrupt() && is_idle_task(tsk))
vtime_account_idle(tsk);
else
vtime_account_system(tsk);
vtime_account_kernel(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
......@@ -477,7 +475,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
u64 cputime, steal;
struct rq *rq = this_rq();
if (vtime_accounting_cpu_enabled())
if (vtime_accounting_enabled_this_cpu())
return;
if (sched_clock_irqtime) {
......@@ -711,7 +709,7 @@ static u64 get_vtime_delta(struct vtime *vtime)
return delta - other;
}
static void __vtime_account_system(struct task_struct *tsk,
static void vtime_account_system(struct task_struct *tsk,
struct vtime *vtime)
{
vtime->stime += get_vtime_delta(vtime);
......@@ -731,7 +729,17 @@ static void vtime_account_guest(struct task_struct *tsk,
}
}
void vtime_account_system(struct task_struct *tsk)
static void __vtime_account_kernel(struct task_struct *tsk,
struct vtime *vtime)
{
/* We might have scheduled out from guest path */
if (vtime->state == VTIME_GUEST)
vtime_account_guest(tsk, vtime);
else
vtime_account_system(tsk, vtime);
}
void vtime_account_kernel(struct task_struct *tsk)
{
struct vtime *vtime = &tsk->vtime;
......@@ -739,11 +747,7 @@ void vtime_account_system(struct task_struct *tsk)
return;
write_seqcount_begin(&vtime->seqcount);
/* We might have scheduled out from guest path */
if (tsk->flags & PF_VCPU)
vtime_account_guest(tsk, vtime);
else
__vtime_account_system(tsk, vtime);
__vtime_account_kernel(tsk, vtime);
write_seqcount_end(&vtime->seqcount);
}
......@@ -752,7 +756,7 @@ void vtime_user_enter(struct task_struct *tsk)
struct vtime *vtime = &tsk->vtime;
write_seqcount_begin(&vtime->seqcount);
__vtime_account_system(tsk, vtime);
vtime_account_system(tsk, vtime);
vtime->state = VTIME_USER;
write_seqcount_end(&vtime->seqcount);
}
......@@ -782,8 +786,9 @@ void vtime_guest_enter(struct task_struct *tsk)
* that can thus safely catch up with a tickless delta.
*/
write_seqcount_begin(&vtime->seqcount);
__vtime_account_system(tsk, vtime);
vtime_account_system(tsk, vtime);
tsk->flags |= PF_VCPU;
vtime->state = VTIME_GUEST;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
......@@ -795,6 +800,7 @@ void vtime_guest_exit(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
vtime_account_guest(tsk, vtime);
tsk->flags &= ~PF_VCPU;
vtime->state = VTIME_SYS;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
......@@ -804,19 +810,30 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(get_vtime_delta(&tsk->vtime));
}
void arch_vtime_task_switch(struct task_struct *prev)
void vtime_task_switch_generic(struct task_struct *prev)
{
struct vtime *vtime = &prev->vtime;
write_seqcount_begin(&vtime->seqcount);
if (vtime->state == VTIME_IDLE)
vtime_account_idle(prev);
else
__vtime_account_kernel(prev, vtime);
vtime->state = VTIME_INACTIVE;
vtime->cpu = -1;
write_seqcount_end(&vtime->seqcount);
vtime = &current->vtime;
write_seqcount_begin(&vtime->seqcount);
if (is_idle_task(current))
vtime->state = VTIME_IDLE;
else if (current->flags & PF_VCPU)
vtime->state = VTIME_GUEST;
else
vtime->state = VTIME_SYS;
vtime->starttime = sched_clock();
vtime->cpu = smp_processor_id();
write_seqcount_end(&vtime->seqcount);
}
......@@ -827,8 +844,9 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags);
write_seqcount_begin(&vtime->seqcount);
vtime->state = VTIME_SYS;
vtime->state = VTIME_IDLE;
vtime->starttime = sched_clock();
vtime->cpu = cpu;
write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
......@@ -846,7 +864,7 @@ u64 task_gtime(struct task_struct *t)
seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
if (vtime->state == VTIME_GUEST)
gtime += vtime->gtime + vtime_delta(vtime);
} while (read_seqcount_retry(&vtime->seqcount, seq));
......@@ -877,20 +895,230 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
*utime = t->utime;
*stime = t->stime;
/* Task is sleeping, nothing to add */
if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
/* Task is sleeping or idle, nothing to add */
if (vtime->state < VTIME_SYS)
continue;
delta = vtime_delta(vtime);
/*
* Task runs either in user or kernel space, add pending nohz time to
* the right place.
* Task runs either in user (including guest) or kernel space,
* add pending nohz time to the right place.
*/
if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
*utime += vtime->utime + delta;
else if (vtime->state == VTIME_SYS)
if (vtime->state == VTIME_SYS)
*stime += vtime->stime + delta;
else
*utime += vtime->utime + delta;
} while (read_seqcount_retry(&vtime->seqcount, seq));
}
static int vtime_state_check(struct vtime *vtime, int cpu)
{
/*
* We raced against a context switch, fetch the
* kcpustat task again.
*/
if (vtime->cpu != cpu && vtime->cpu != -1)
return -EAGAIN;
/*
* Two possible things here:
* 1) We are seeing the scheduling out task (prev) or any past one.
* 2) We are seeing the scheduling in task (next) but it hasn't
* passed though vtime_task_switch() yet so the pending
* cputime of the prev task may not be flushed yet.
*
* Case 1) is ok but 2) is not. So wait for a safe VTIME state.
*/
if (vtime->state == VTIME_INACTIVE)
return -EAGAIN;
return 0;
}
static u64 kcpustat_user_vtime(struct vtime *vtime)
{
if (vtime->state == VTIME_USER)
return vtime->utime + vtime_delta(vtime);
else if (vtime->state == VTIME_GUEST)
return vtime->gtime + vtime_delta(vtime);
return 0;
}
static int kcpustat_field_vtime(u64 *cpustat,
struct task_struct *tsk,
enum cpu_usage_stat usage,
int cpu, u64 *val)
{
struct vtime *vtime = &tsk->vtime;
unsigned int seq;
int err;
do {
seq = read_seqcount_begin(&vtime->seqcount);
err = vtime_state_check(vtime, cpu);
if (err < 0)
return err;
*val = cpustat[usage];
/*
* Nice VS unnice cputime accounting may be inaccurate if
* the nice value has changed since the last vtime update.
* But proper fix would involve interrupting target on nice
* updates which is a no go on nohz_full (although the scheduler
* may still interrupt the target if rescheduling is needed...)
*/
switch (usage) {
case CPUTIME_SYSTEM:
if (vtime->state == VTIME_SYS)
*val += vtime->stime + vtime_delta(vtime);
break;
case CPUTIME_USER:
if (task_nice(tsk) <= 0)
*val += kcpustat_user_vtime(vtime);
break;
case CPUTIME_NICE:
if (task_nice(tsk) > 0)
*val += kcpustat_user_vtime(vtime);
break;
case CPUTIME_GUEST:
if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0)
*val += vtime->gtime + vtime_delta(vtime);
break;
case CPUTIME_GUEST_NICE:
if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0)
*val += vtime->gtime + vtime_delta(vtime);
break;
default:
break;
}
} while (read_seqcount_retry(&vtime->seqcount, seq));
return 0;
}
u64 kcpustat_field(struct kernel_cpustat *kcpustat,
enum cpu_usage_stat usage, int cpu)
{
u64 *cpustat = kcpustat->cpustat;
struct rq *rq;
u64 val;
int err;
if (!vtime_accounting_enabled_cpu(cpu))
return cpustat[usage];
rq = cpu_rq(cpu);
for (;;) {
struct task_struct *curr;
rcu_read_lock();
curr = rcu_dereference(rq->curr);
if (WARN_ON_ONCE(!curr)) {
rcu_read_unlock();
return cpustat[usage];
}
err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
rcu_read_unlock();
if (!err)
return val;
cpu_relax();
}
}
EXPORT_SYMBOL_GPL(kcpustat_field);
static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
const struct kernel_cpustat *src,
struct task_struct *tsk, int cpu)
{
struct vtime *vtime = &tsk->vtime;
unsigned int seq;
int err;
do {
u64 *cpustat;
u64 delta;
seq = read_seqcount_begin(&vtime->seqcount);
err = vtime_state_check(vtime, cpu);
if (err < 0)
return err;
*dst = *src;
cpustat = dst->cpustat;
/* Task is sleeping, dead or idle, nothing to add */
if (vtime->state < VTIME_SYS)
continue;
delta = vtime_delta(vtime);
/*
* Task runs either in user (including guest) or kernel space,
* add pending nohz time to the right place.
*/
if (vtime->state == VTIME_SYS) {
cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
} else if (vtime->state == VTIME_USER) {
if (task_nice(tsk) > 0)
cpustat[CPUTIME_NICE] += vtime->utime + delta;
else
cpustat[CPUTIME_USER] += vtime->utime + delta;
} else {
WARN_ON_ONCE(vtime->state != VTIME_GUEST);
if (task_nice(tsk) > 0) {
cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
cpustat[CPUTIME_NICE] += vtime->gtime + delta;
} else {
cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
cpustat[CPUTIME_USER] += vtime->gtime + delta;
}
}
} while (read_seqcount_retry(&vtime->seqcount, seq));
return err;
}
void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
struct rq *rq;
int err;
if (!vtime_accounting_enabled_cpu(cpu)) {
*dst = *src;
return;
}
rq = cpu_rq(cpu);
for (;;) {
struct task_struct *curr;
rcu_read_lock();
curr = rcu_dereference(rq->curr);
if (WARN_ON_ONCE(!curr)) {
rcu_read_unlock();
*dst = *src;
return;
}
err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
rcu_read_unlock();
if (!err)
return;
cpu_relax();
}
}
EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
......@@ -1743,13 +1743,16 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
static void set_next_task_dl(struct rq *rq, struct task_struct *p)
static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
if (!first)
return;
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
......@@ -1770,22 +1773,19 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
static struct task_struct *
pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
static struct task_struct *pick_next_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
WARN_ON_ONCE(prev || rf);
if (!sched_dl_runnable(rq))
return NULL;
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
set_next_task_dl(rq, p);
set_next_task_dl(rq, p, true);
return p;
}
......
......@@ -229,8 +229,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
}
}
/* hint to use a 32x32->64 mul */
fact = (u64)(u32)fact * lw->inv_weight;
fact = mul_u32_u32(fact, lw->inv_weight);
while (fact >> 32) {
fact >>= 1;
......@@ -1474,7 +1473,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
static unsigned long cpu_runnable_load(struct rq *rq);
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
static unsigned long cpu_runnable_load(struct rq *rq)
{
return cfs_rq_runnable_load_avg(&rq->cfs);
}
/* Cached statistics for all CPUs within a node */
struct numa_stats {
......@@ -3504,9 +3508,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
if (decayed)
cfs_rq_util_change(cfs_rq, 0);
return decayed;
}
......@@ -3616,8 +3617,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
update_tg_load_avg(cfs_rq, 0);
} else if (decayed && (flags & UPDATE_TG))
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
if (flags & UPDATE_TG)
update_tg_load_avg(cfs_rq, 0);
}
}
#ifndef CONFIG_64BIT
......@@ -3763,11 +3768,22 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
if (ue.enqueued & UTIL_AVG_UNCHANGED)
return;
/*
* Reset EWMA on utilization increases, the moving average is used only
* to smooth utilization decreases.
*/
ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
if (sched_feat(UTIL_EST_FASTUP)) {
if (ue.ewma < ue.enqueued) {
ue.ewma = ue.enqueued;
goto done;
}
}
/*
* Skip update of task's estimated utilization when its EWMA is
* already ~1% close to its last activation value.
*/
ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
last_ewma_diff = ue.enqueued - ue.ewma;
if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
return;
......@@ -3800,6 +3816,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
ue.ewma += last_ewma_diff;
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
WRITE_ONCE(p->se.avg.util_est, ue);
}
......@@ -5370,26 +5387,45 @@ static int sched_idle_cpu(int cpu)
rq->nr_running);
}
static unsigned long cpu_runnable_load(struct rq *rq)
static unsigned long cpu_load(struct rq *rq)
{
return cfs_rq_runnable_load_avg(&rq->cfs);
return cfs_rq_load_avg(&rq->cfs);
}
static unsigned long capacity_of(int cpu)
/*
* cpu_load_without - compute CPU load without any contributions from *p
* @cpu: the CPU which load is requested
* @p: the task which load should be discounted
*
* The load of a CPU is defined by the load of tasks currently enqueued on that
* CPU as well as tasks which are currently sleeping after an execution on that
* CPU.
*
* This method returns the load of the specified CPU by discounting the load of
* the specified task, whenever the task is currently contributing to the CPU
* load.
*/
static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
{
return cpu_rq(cpu)->cpu_capacity;
}
struct cfs_rq *cfs_rq;
unsigned int load;
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
unsigned long load_avg = cpu_runnable_load(rq);
/* Task has no contribution or is new */
if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_load(rq);
cfs_rq = &rq->cfs;
load = READ_ONCE(cfs_rq->avg.load_avg);
if (nr_running)
return load_avg / nr_running;
/* Discount task's util from CPU's util */
lsub_positive(&load, task_h_load(p));
return 0;
return load;
}
static unsigned long capacity_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity;
}
static void record_wakee(struct task_struct *p)
......@@ -5482,7 +5518,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
this_eff_load = cpu_load(cpu_rq(this_cpu));
if (sync) {
unsigned long current_load = task_h_load(current);
......@@ -5500,7 +5536,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
prev_eff_load = cpu_load(cpu_rq(prev_cpu));
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
......@@ -5538,149 +5574,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return target;
}
static unsigned long cpu_util_without(int cpu, struct task_struct *p);
static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
{
return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
}
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
struct sched_group *most_spare_sg = NULL;
unsigned long min_runnable_load = ULONG_MAX;
unsigned long this_runnable_load = ULONG_MAX;
unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
(sd->imbalance_pct-100) / 100;
do {
unsigned long load, avg_load, runnable_load;
unsigned long spare_cap, max_spare_cap;
int local_group;
int i;
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_span(group),
p->cpus_ptr))
continue;
local_group = cpumask_test_cpu(this_cpu,
sched_group_span(group));
/*
* Tally up the load of all CPUs in the group and find
* the group containing the CPU with most spare capacity.
*/
avg_load = 0;
runnable_load = 0;
max_spare_cap = 0;
for_each_cpu(i, sched_group_span(group)) {
load = cpu_runnable_load(cpu_rq(i));
runnable_load += load;
avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
spare_cap = capacity_spare_without(i, p);
if (spare_cap > max_spare_cap)
max_spare_cap = spare_cap;
}
/* Adjust by relative CPU capacity of the group */
avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
group->sgc->capacity;
runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
group->sgc->capacity;
if (local_group) {
this_runnable_load = runnable_load;
this_avg_load = avg_load;
this_spare = max_spare_cap;
} else {
if (min_runnable_load > (runnable_load + imbalance)) {
/*
* The runnable load is significantly smaller
* so we can pick this new CPU:
*/
min_runnable_load = runnable_load;
min_avg_load = avg_load;
idlest = group;
} else if ((runnable_load < (min_runnable_load + imbalance)) &&
(100*min_avg_load > imbalance_scale*avg_load)) {
/*
* The runnable loads are close so take the
* blocked load into account through avg_load:
*/
min_avg_load = avg_load;
idlest = group;
}
if (most_spare < max_spare_cap) {
most_spare = max_spare_cap;
most_spare_sg = group;
}
}
} while (group = group->next, group != sd->groups);
/*
* The cross-over point between using spare capacity or least load
* is too conservative for high utilization tasks on partially
* utilized systems if we require spare_capacity > task_util(p),
* so we allow for some task stuffing by using
* spare_capacity > task_util(p)/2.
*
* Spare capacity can't be used for fork because the utilization has
* not been set yet, we must first select a rq to compute the initial
* utilization.
*/
if (sd_flag & SD_BALANCE_FORK)
goto skip_spare;
if (this_spare > task_util(p) / 2 &&
imbalance_scale*this_spare > 100*most_spare)
return NULL;
if (most_spare > task_util(p) / 2)
return most_spare_sg;
skip_spare:
if (!idlest)
return NULL;
/*
* When comparing groups across NUMA domains, it's possible for the
* local domain to be very lightly loaded relative to the remote
* domains but "imbalance" skews the comparison making remote CPUs
* look much more favourable. When considering cross-domain, add
* imbalance to the runnable load on the remote node and consider
* staying local.
*/
if ((sd->flags & SD_NUMA) &&
min_runnable_load + imbalance >= this_runnable_load)
return NULL;
if (min_runnable_load > (this_runnable_load + imbalance))
return NULL;
if ((this_runnable_load < (min_runnable_load + imbalance)) &&
(100*this_avg_load < imbalance_scale*min_avg_load))
return NULL;
return idlest;
}
int this_cpu, int sd_flag);
/*
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
......@@ -5729,7 +5625,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
continue;
}
load = cpu_runnable_load(cpu_rq(i));
load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
......@@ -5753,7 +5649,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return prev_cpu;
/*
* We need task's util for capacity_spare_without, sync it up to
* We need task's util for cpu_util_without, sync it up to
* prev_cpu's last_update_time.
*/
if (!(sd_flag & SD_BALANCE_FORK))
......@@ -6746,7 +6642,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
set_last_buddy(se);
}
static struct task_struct *
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
......@@ -6890,6 +6786,11 @@ done: __maybe_unused;
return NULL;
}
static struct task_struct *__pick_next_task_fair(struct rq *rq)
{
return pick_next_task_fair(rq, NULL, NULL);
}
/*
* Account for a descheduled task:
*/
......@@ -7079,11 +6980,49 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
/*
* 'group_type' describes the group of CPUs at the moment of load balancing.
*
* The enum is ordered by pulling priority, with the group with lowest priority
* first so the group_type can simply be compared when selecting the busiest
* group. See update_sd_pick_busiest().
*/
enum group_type {
group_other = 0,
/* The group has spare capacity that can be used to run more tasks. */
group_has_spare = 0,
/*
* The group is fully used and the tasks don't compete for more CPU
* cycles. Nevertheless, some tasks might wait before running.
*/
group_fully_busy,
/*
* SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
* and must be migrated to a more powerful CPU.
*/
group_misfit_task,
/*
* SD_ASYM_PACKING only: One local CPU with higher capacity is available,
* and the task should be migrated to it instead of running on the
* current CPU.
*/
group_asym_packing,
/*
* The tasks' affinity constraints previously prevented the scheduler
* from balancing the load across the system.
*/
group_imbalanced,
group_overloaded,
/*
* The CPU is overloaded and can't provide expected CPU cycles to all
* tasks.
*/
group_overloaded
};
enum migration_type {
migrate_load = 0,
migrate_util,
migrate_task,
migrate_misfit
};
#define LBF_ALL_PINNED 0x01
......@@ -7116,7 +7055,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
enum group_type src_grp_type;
enum migration_type migration_type;
struct list_head tasks;
};
......@@ -7339,7 +7278,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
static const unsigned int sched_nr_migrate_break = 32;
/*
* detach_tasks() -- tries to detach up to imbalance runnable load from
* detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Returns number of detached tasks if successful and 0 otherwise.
......@@ -7347,8 +7286,8 @@ static const unsigned int sched_nr_migrate_break = 32;
static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
unsigned long util, load;
struct task_struct *p;
unsigned long load;
int detached = 0;
lockdep_assert_held(&env->src_rq->lock);
......@@ -7381,19 +7320,46 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
switch (env->migration_type) {
case migrate_load:
load = task_h_load(p);
if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
if (sched_feat(LB_MIN) &&
load < 16 && !env->sd->nr_balance_failed)
goto next;
if (load/2 > env->imbalance)
goto next;
if ((load / 2) > env->imbalance)
env->imbalance -= load;
break;
case migrate_util:
util = task_util_est(p);
if (util > env->imbalance)
goto next;
env->imbalance -= util;
break;
case migrate_task:
env->imbalance--;
break;
case migrate_misfit:
/* This is not a misfit task */
if (task_fits_capacity(p, capacity_of(env->src_cpu)))
goto next;
env->imbalance = 0;
break;
}
detach_task(p, env);
list_add(&p->se.group_node, &env->tasks);
detached++;
env->imbalance -= load;
#ifdef CONFIG_PREEMPTION
/*
......@@ -7407,7 +7373,7 @@ static int detach_tasks(struct lb_env *env)
/*
* We only want to steal up to the prescribed amount of
* runnable load.
* load/util/tasks.
*/
if (env->imbalance <= 0)
break;
......@@ -7517,6 +7483,28 @@ static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
#endif
static bool __update_blocked_others(struct rq *rq, bool *done)
{
const struct sched_class *curr_class;
u64 now = rq_clock_pelt(rq);
bool decayed;
/*
* update_load_avg() can call cpufreq_update_util(). Make sure that RT,
* DL and IRQ signals have been updated before updating CFS.
*/
curr_class = rq->curr->sched_class;
decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
update_irq_load_avg(rq, 0);
if (others_have_blocked(rq))
*done = false;
return decayed;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
......@@ -7536,29 +7524,11 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
return true;
}
static void update_blocked_averages(int cpu)
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
const struct sched_class *curr_class;
struct rq_flags rf;
bool done = true;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
/*
* update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
* that RT, DL and IRQ signals have been updated before updating CFS.
*/
curr_class = rq->curr->sched_class;
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
done = false;
bool decayed = false;
int cpu = cpu_of(rq);
/*
* Iterates the task_group tree in a bottom up fashion, see
......@@ -7567,9 +7537,13 @@ static void update_blocked_averages(int cpu)
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq, 0);
if (cfs_rq == &rq->cfs)
decayed = true;
}
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
......@@ -7584,11 +7558,10 @@ static void update_blocked_averages(int cpu)
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
done = false;
*done = false;
}
update_blocked_load_status(rq, !done);
rq_unlock_irqrestore(rq, &rf);
return decayed;
}
/*
......@@ -7638,29 +7611,16 @@ static unsigned long task_h_load(struct task_struct *p)
cfs_rq_load_avg(cfs_rq) + 1);
}
#else
static inline void update_blocked_averages(int cpu)
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
const struct sched_class *curr_class;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
bool decayed;
/*
* update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
* that RT, DL and IRQ signals have been updated before updating CFS.
*/
curr_class = rq->curr->sched_class;
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
if (cfs_rq_has_blocked(cfs_rq))
*done = false;
update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
rq_unlock_irqrestore(rq, &rf);
return decayed;
}
static unsigned long task_h_load(struct task_struct *p)
......@@ -7669,6 +7629,24 @@ static unsigned long task_h_load(struct task_struct *p)
}
#endif
static void update_blocked_averages(int cpu)
{
bool decayed = false, done = true;
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
decayed |= __update_blocked_others(rq, &done);
decayed |= __update_blocked_fair(rq, &done);
update_blocked_load_status(rq, !done);
if (decayed)
cpufreq_update_util(rq, 0);
rq_unlock_irqrestore(rq, &rf);
}
/********** Helpers for find_busiest_group ************************/
/*
......@@ -7677,14 +7655,14 @@ static unsigned long task_h_load(struct task_struct *p)
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long load_per_task;
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
unsigned int sum_nr_running; /* Nr of tasks running in the group */
unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
int group_no_capacity;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
......@@ -7699,10 +7677,10 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
unsigned int prefer_sibling; /* tasks should go to sibling first */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
struct sg_lb_stats local_stat; /* Statistics of the local group */
......@@ -7713,19 +7691,18 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
/*
* Skimp on the clearing to avoid duplicate work. We can avoid clearing
* local_stat because update_sg_lb_stats() does a full clear/assignment.
* We must however clear busiest_stat::avg_load because
* update_sd_pick_busiest() reads this before assignment.
* We must however set busiest_stat::group_type and
* busiest_stat::idle_cpus to the worst busiest group because
* update_sd_pick_busiest() reads these before assignment.
*/
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
.total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
.avg_load = 0UL,
.sum_nr_running = 0,
.group_type = group_other,
.idle_cpus = UINT_MAX,
.group_type = group_has_spare,
},
};
}
......@@ -7913,13 +7890,13 @@ static inline int sg_imbalanced(struct sched_group *group)
* any benefit for the load balance.
*/
static inline bool
group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running < sgs->group_weight)
return true;
if ((sgs->group_capacity * 100) >
(sgs->group_util * env->sd->imbalance_pct))
(sgs->group_util * imbalance_pct))
return true;
return false;
......@@ -7934,13 +7911,13 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
* false.
*/
static inline bool
group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running <= sgs->group_weight)
return false;
if ((sgs->group_capacity * 100) <
(sgs->group_util * env->sd->imbalance_pct))
(sgs->group_util * imbalance_pct))
return true;
return false;
......@@ -7967,19 +7944,26 @@ group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
}
static inline enum
group_type group_classify(struct sched_group *group,
group_type group_classify(unsigned int imbalance_pct,
struct sched_group *group,
struct sg_lb_stats *sgs)
{
if (sgs->group_no_capacity)
if (group_is_overloaded(imbalance_pct, sgs))
return group_overloaded;
if (sg_imbalanced(group))
return group_imbalanced;
if (sgs->group_asym_packing)
return group_asym_packing;
if (sgs->group_misfit_task_load)
return group_misfit_task;
return group_other;
if (!group_has_capacity(imbalance_pct, sgs))
return group_fully_busy;
return group_has_spare;
}
static bool update_nohz_stats(struct rq *rq, bool force)
......@@ -8016,21 +8000,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sg_lb_stats *sgs,
int *sg_status)
{
int i, nr_running;
int i, nr_running, local_group;
memset(sgs, 0, sizeof(*sgs));
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
env->flags |= LBF_NOHZ_AGAIN;
sgs->group_load += cpu_runnable_load(rq);
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
if (nr_running > 1)
*sg_status |= SG_OVERLOAD;
......@@ -8044,9 +8032,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
/*
* No need to call idle_cpu() if nr_running is not 0
*/
if (!nr_running && idle_cpu(i))
if (!nr_running && idle_cpu(i)) {
sgs->idle_cpus++;
/* Idle cpu can't have misfit task */
continue;
}
if (local_group)
continue;
/* Check for a misfit task on the cpu */
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
......@@ -8054,17 +8049,24 @@ static inline void update_sg_lb_stats(struct lb_env *env,
}
}
/* Adjust by relative CPU capacity of the group */
sgs->group_capacity = group->sgc->capacity;
sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
/* Check if dst CPU is idle and preferred to this group */
if (env->sd->flags & SD_ASYM_PACKING &&
env->idle != CPU_NOT_IDLE &&
sgs->sum_h_nr_running &&
sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
sgs->group_asym_packing = 1;
}
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
sgs->group_no_capacity = group_is_overloaded(env, sgs);
sgs->group_type = group_classify(group, sgs);
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
/* Computing avg_load makes sense only when group is overloaded */
if (sgs->group_type == group_overloaded)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
sgs->group_capacity;
}
/**
......@@ -8087,6 +8089,10 @@ static bool update_sd_pick_busiest(struct lb_env *env,
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
/* Make sure that there is at least one task to pull */
if (!sgs->sum_h_nr_running)
return false;
/*
* Don't try to pull misfit tasks we can't help.
* We can use max_capacity here as reduction in capacity on some
......@@ -8095,7 +8101,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
*/
if (sgs->group_type == group_misfit_task &&
(!group_smaller_max_cpu_capacity(sg, sds->local) ||
!group_has_capacity(env, &sds->local_stat)))
sds->local_stat.group_type != group_has_spare))
return false;
if (sgs->group_type > busiest->group_type)
......@@ -8104,62 +8110,88 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->group_type < busiest->group_type)
return false;
/*
* The candidate and the current busiest group are the same type of
* group. Let check which one is the busiest according to the type.
*/
switch (sgs->group_type) {
case group_overloaded:
/* Select the overloaded group with highest avg_load. */
if (sgs->avg_load <= busiest->avg_load)
return false;
break;
if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
goto asym_packing;
case group_imbalanced:
/*
* Candidate sg has no more than one task per CPU and
* has higher per-CPU capacity. Migrating tasks to less
* capable CPUs may harm throughput. Maximize throughput,
* power/energy consequences are not considered.
* Select the 1st imbalanced group as we don't have any way to
* choose one more than another.
*/
if (sgs->sum_nr_running <= sgs->group_weight &&
group_smaller_min_cpu_capacity(sds->local, sg))
return false;
case group_asym_packing:
/* Prefer to move from lowest priority CPU's work */
if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
return false;
break;
case group_misfit_task:
/*
* If we have more than one misfit sg go with the biggest misfit.
* If we have more than one misfit sg go with the biggest
* misfit.
*/
if (sgs->group_type == group_misfit_task &&
sgs->group_misfit_task_load < busiest->group_misfit_task_load)
if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
return false;
break;
asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
/* No ASYM_PACKING if target CPU is already busy */
if (env->idle == CPU_NOT_IDLE)
return true;
case group_fully_busy:
/*
* ASYM_PACKING needs to move all the work to the highest
* prority CPUs in the group, therefore mark all groups
* of lower priority than ourself as busy.
* Select the fully busy group with highest avg_load. In
* theory, there is no need to pull task from such kind of
* group because tasks have all compute capacity that they need
* but we can still improve the overall throughput by reducing
* contention when accessing shared HW resources.
*
* XXX for now avg_load is not computed and always 0 so we
* select the 1st one.
*/
if (sgs->sum_nr_running &&
sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
if (!sds->busiest)
return true;
/* Prefer to move from lowest priority CPU's work */
if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
sg->asym_prefer_cpu))
return true;
}
if (sgs->avg_load <= busiest->avg_load)
return false;
}
break;
case group_has_spare:
/*
* Select not overloaded group with lowest number of
* idle cpus. We could also compare the spare capacity
* which is more stable but it can end up that the
* group has less spare capacity but finally more idle
* CPUs which means less opportunity to pull tasks.
*/
if (sgs->idle_cpus >= busiest->idle_cpus)
return false;
break;
}
/*
* Candidate sg has no more than one task per CPU and has higher
* per-CPU capacity. Migrating tasks to less capable CPUs may harm
* throughput. Maximize throughput, power/energy consequences are not
* considered.
*/
if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
(sgs->group_type <= group_fully_busy) &&
(group_smaller_min_cpu_capacity(sds->local, sg)))
return false;
return true;
}
#ifdef CONFIG_NUMA_BALANCING
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
if (sgs->sum_nr_running > sgs->nr_numa_running)
if (sgs->sum_h_nr_running > sgs->nr_numa_running)
return regular;
if (sgs->sum_nr_running > sgs->nr_preferred_running)
if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
return remote;
return all;
}
......@@ -8184,18 +8216,310 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
}
#endif /* CONFIG_NUMA_BALANCING */
struct sg_lb_stats;
/*
* task_running_on_cpu - return 1 if @p is running on @cpu.
*/
static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
{
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return 0;
if (task_on_rq_queued(p))
return 1;
return 0;
}
/**
* idle_cpu_without - would a given CPU be idle without p ?
* @cpu: the processor on which idleness is tested.
* @p: task which should be ignored.
*
* Return: 1 if the CPU would be idle. 0 otherwise.
*/
static int idle_cpu_without(int cpu, struct task_struct *p)
{
struct rq *rq = cpu_rq(cpu);
if (rq->curr != rq->idle && rq->curr != p)
return 0;
/*
* rq->nr_running can't be used but an updated version without the
* impact of p on cpu must be used instead. The updated nr_running
* be computed and tested before calling idle_cpu_without().
*/
#ifdef CONFIG_SMP
if (!llist_empty(&rq->wake_list))
return 0;
#endif
return 1;
}
/*
* update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
* @sd: The sched_domain level to look for idlest group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @p: The task for which we look for the idlest group/CPU.
*/
static inline void update_sg_wakeup_stats(struct sched_domain *sd,
struct sched_group *group,
struct sg_lb_stats *sgs,
struct task_struct *p)
{
int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
for_each_cpu(i, sched_group_span(group)) {
struct rq *rq = cpu_rq(i);
unsigned int local;
sgs->group_load += cpu_load_without(rq, p);
sgs->group_util += cpu_util_without(i, p);
local = task_running_on_cpu(i, p);
sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
/*
* No need to call idle_cpu_without() if nr_running is not 0
*/
if (!nr_running && idle_cpu_without(i, p))
sgs->idle_cpus++;
}
/* Check if task fits in the group */
if (sd->flags & SD_ASYM_CPUCAPACITY &&
!task_fits_capacity(p, group->sgc->max_capacity)) {
sgs->group_misfit_task_load = 1;
}
sgs->group_capacity = group->sgc->capacity;
sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
/*
* Computing avg_load makes sense only when group is fully busy or
* overloaded
*/
if (sgs->group_type < group_fully_busy)
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
sgs->group_capacity;
}
static bool update_pick_idlest(struct sched_group *idlest,
struct sg_lb_stats *idlest_sgs,
struct sched_group *group,
struct sg_lb_stats *sgs)
{
if (sgs->group_type < idlest_sgs->group_type)
return true;
if (sgs->group_type > idlest_sgs->group_type)
return false;
/*
* The candidate and the current idlest group are the same type of
* group. Let check which one is the idlest according to the type.
*/
switch (sgs->group_type) {
case group_overloaded:
case group_fully_busy:
/* Select the group with lowest avg_load. */
if (idlest_sgs->avg_load <= sgs->avg_load)
return false;
break;
case group_imbalanced:
case group_asym_packing:
/* Those types are not used in the slow wakeup path */
return false;
case group_misfit_task:
/* Select group with the highest max capacity */
if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
return false;
break;
case group_has_spare:
/* Select group with most idle CPUs */
if (idlest_sgs->idle_cpus >= sgs->idle_cpus)
return false;
break;
}
return true;
}
/*
* find_idlest_group() finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
struct sg_lb_stats local_sgs, tmp_sgs;
struct sg_lb_stats *sgs;
unsigned long imbalance;
struct sg_lb_stats idlest_sgs = {
.avg_load = UINT_MAX,
.group_type = group_overloaded,
};
imbalance = scale_load_down(NICE_0_LOAD) *
(sd->imbalance_pct-100) / 100;
do {
int local_group;
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_span(group),
p->cpus_ptr))
continue;
local_group = cpumask_test_cpu(this_cpu,
sched_group_span(group));
if (local_group) {
sgs = &local_sgs;
local = group;
} else {
sgs = &tmp_sgs;
}
update_sg_wakeup_stats(sd, group, sgs, p);
if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
idlest = group;
idlest_sgs = *sgs;
}
} while (group = group->next, group != sd->groups);
/* There is no idlest group to push tasks to */
if (!idlest)
return NULL;
/*
* If the local group is idler than the selected idlest group
* don't try and push the task.
*/
if (local_sgs.group_type < idlest_sgs.group_type)
return NULL;
/*
* If the local group is busier than the selected idlest group
* try and push the task.
*/
if (local_sgs.group_type > idlest_sgs.group_type)
return idlest;
switch (local_sgs.group_type) {
case group_overloaded:
case group_fully_busy:
/*
* When comparing groups across NUMA domains, it's possible for
* the local domain to be very lightly loaded relative to the
* remote domains but "imbalance" skews the comparison making
* remote CPUs look much more favourable. When considering
* cross-domain, add imbalance to the load on the remote node
* and consider staying local.
*/
if ((sd->flags & SD_NUMA) &&
((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
return NULL;
/*
* If the local group is less loaded than the selected
* idlest group don't try and push any tasks.
*/
if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
return NULL;
if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
return NULL;
break;
case group_imbalanced:
case group_asym_packing:
/* Those type are not used in the slow wakeup path */
return NULL;
case group_misfit_task:
/* Select group with the highest max capacity */
if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
return NULL;
break;
case group_has_spare:
if (sd->flags & SD_NUMA) {
#ifdef CONFIG_NUMA_BALANCING
int idlest_cpu;
/*
* If there is spare capacity at NUMA, try to select
* the preferred node
*/
if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
return NULL;
idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest;
#endif
/*
* Otherwise, keep the task on this node to stay close
* its wakeup source and improve locality. If there is
* a real need of migration, periodic load balance will
* take care of it.
*/
if (local_sgs.idle_cpus)
return NULL;
}
/*
* Select group with highest number of idle CPUs. We could also
* compare the utilization which is more stable but it can end
* up that the group has less spare capacity but finally more
* idle CPUs which means more opportunity to run task.
*/
if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
return NULL;
break;
}
return idlest;
}
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
int sg_status = 0;
#ifdef CONFIG_NO_HZ_COMMON
......@@ -8222,22 +8546,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (local_group)
goto next_group;
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
* these excess tasks. The extra check prevents the case where
* you always pull from the heaviest group when it is already
* under-utilized (possible with a large weight task outweighs
* the tasks on the system).
*/
if (prefer_sibling && sds->local &&
group_has_capacity(env, local) &&
(sgs->sum_nr_running > local->sum_nr_running + 1)) {
sgs->group_no_capacity = 1;
sgs->group_type = group_classify(sg, sgs);
}
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
......@@ -8246,13 +8554,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
/* Tag domain that child domain prefers tasks go to siblings first */
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
#ifdef CONFIG_NO_HZ_COMMON
if ((env->flags & LBF_NOHZ_AGAIN) &&
cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
......@@ -8283,203 +8593,160 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
/**
* check_asym_packing - Check to see if the group is packed into the
* sched domain.
*
* This is primarily intended to used at the sibling level. Some
* cores like POWER7 prefer to use lower numbered SMT threads. In the
* case of POWER7, it can move to lower SMT modes only when higher
* threads are idle. When in lower SMT modes, the threads will
* perform better since they share less core resources. Hence when we
* have idle threads, we want them to be the higher ones.
*
* This packing function is run on idle threads. It checks to see if
* the busiest CPU in this domain (core in the P7 case) has a higher
* CPU number than the packing function is being run on. Here we are
* assuming lower CPU number will be equivalent to lower a SMT thread
* number.
*
* Return: 1 when packing is required and a task should be moved to
* this CPU. The amount of the imbalance is returned in env->imbalance.
*
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain which is to be packed
*/
static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
{
int busiest_cpu;
if (!(env->sd->flags & SD_ASYM_PACKING))
return 0;
if (env->idle == CPU_NOT_IDLE)
return 0;
if (!sds->busiest)
return 0;
busiest_cpu = sds->busiest->asym_prefer_cpu;
if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
return 0;
env->imbalance = sds->busiest_stat.group_load;
return 1;
}
/**
* fix_small_imbalance - Calculate the minor imbalance that exists
* amongst the groups of a sched_domain, during
* load balancing.
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
* @env: load balance environment
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
*/
static inline
void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long tmp, capa_now = 0, capa_move = 0;
unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
if (!local->sum_nr_running)
local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
else if (busiest->load_per_task > local->load_per_task)
imbn = 1;
scaled_busy_load_per_task =
(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
busiest->group_capacity;
if (busiest->avg_load + scaled_busy_load_per_task >=
local->avg_load + (scaled_busy_load_per_task * imbn)) {
env->imbalance = busiest->load_per_task;
if (busiest->group_type == group_misfit_task) {
/* Set imbalance to allow misfit tasks to be balanced. */
env->migration_type = migrate_misfit;
env->imbalance = 1;
return;
}
if (busiest->group_type == group_asym_packing) {
/*
* OK, we don't have enough imbalance to justify moving tasks,
* however we may be able to increase total CPU capacity used by
* moving them.
* In case of asym capacity, we will try to migrate all load to
* the preferred CPU.
*/
capa_now += busiest->group_capacity *
min(busiest->load_per_task, busiest->avg_load);
capa_now += local->group_capacity *
min(local->load_per_task, local->avg_load);
capa_now /= SCHED_CAPACITY_SCALE;
/* Amount of load we'd subtract */
if (busiest->avg_load > scaled_busy_load_per_task) {
capa_move += busiest->group_capacity *
min(busiest->load_per_task,
busiest->avg_load - scaled_busy_load_per_task);
env->migration_type = migrate_task;
env->imbalance = busiest->sum_h_nr_running;
return;
}
/* Amount of load we'd add */
if (busiest->avg_load * busiest->group_capacity <
busiest->load_per_task * SCHED_CAPACITY_SCALE) {
tmp = (busiest->avg_load * busiest->group_capacity) /
local->group_capacity;
} else {
tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
local->group_capacity;
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure CPU-load equilibrium, try to move any task to fix
* the imbalance. The next load balance will take care of
* balancing back the system.
*/
env->migration_type = migrate_task;
env->imbalance = 1;
return;
}
capa_move += local->group_capacity *
min(local->load_per_task, local->avg_load + tmp);
capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */
if (capa_move > capa_now)
env->imbalance = busiest->load_per_task;
}
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
* @env: load balance environment
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
/*
* Try to use spare capacity of local group without overloading it or
* emptying busiest.
* XXX Spreading tasks across NUMA nodes is not always the best policy
* and special care should be taken for SD_NUMA domain level before
* spreading the tasks. For now, load_balance() fully relies on
* NUMA_BALANCING and fbq_classify_group/rq to override the decision.
*/
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long max_pull, load_above_capacity = ~0UL;
struct sg_lb_stats *local, *busiest;
local = &sds->local_stat;
busiest = &sds->busiest_stat;
if (local->group_type == group_has_spare) {
if (busiest->group_type > group_fully_busy) {
/*
* If busiest is overloaded, try to fill spare
* capacity. This might end up creating spare capacity
* in busiest or busiest still being overloaded but
* there is no simple way to directly compute the
* amount of load to migrate in order to balance the
* system.
*/
env->migration_type = migrate_util;
env->imbalance = max(local->group_capacity, local->group_util) -
local->group_util;
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure CPU-load equilibrium, look at wider averages. XXX
* In some cases, the group's utilization is max or even
* higher than capacity because of migrations but the
* local CPU is (newly) idle. There is at least one
* waiting task in this overloaded busiest group. Let's
* try to pull it.
*/
busiest->load_per_task =
min(busiest->load_per_task, sds->avg_load);
if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
env->migration_type = migrate_task;
env->imbalance = 1;
}
return;
}
if (busiest->group_weight == 1 || sds->prefer_sibling) {
unsigned int nr_diff = busiest->sum_nr_running;
/*
* Avg load of busiest sg can be less and avg load of local sg can
* be greater than avg load across all sgs of sd because avg load
* factors in sg capacity and sgs with smaller group_type are
* skipped when updating the busiest sg:
* When prefer sibling, evenly spread running tasks on
* groups.
*/
if (busiest->group_type != group_misfit_task &&
(busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load)) {
env->imbalance = 0;
return fix_small_imbalance(env, sds);
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
env->imbalance = nr_diff >> 1;
return;
}
/*
* If there aren't any idle CPUs, avoid creating some.
* If there is no overload, we just want to even the number of
* idle cpus.
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
if (load_above_capacity > busiest->group_capacity) {
load_above_capacity -= busiest->group_capacity;
load_above_capacity *= scale_load_down(NICE_0_LOAD);
load_above_capacity /= busiest->group_capacity;
} else
load_above_capacity = ~0UL;
env->migration_type = migrate_task;
env->imbalance = max_t(long, 0, (local->idle_cpus -
busiest->idle_cpus) >> 1);
return;
}
/*
* We're trying to get all the CPUs to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded CPU below the average load. At the same time,
* we also don't want to reduce the group load below the group
* capacity. Thus we look for the minimum possible imbalance.
* Local is fully busy but has to take more load to relieve the
* busiest group
*/
if (local->group_type < group_overloaded) {
/*
* Local will become overloaded so the avg_load metrics are
* finally needed.
*/
max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
env->imbalance = min(
max_pull * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
local->group_capacity;
/* Boost imbalance to allow misfit task to be balanced. */
if (busiest->group_type == group_misfit_task) {
env->imbalance = max_t(long, env->imbalance,
busiest->group_misfit_task_load);
sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
sds->total_capacity;
}
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
* a think about bumping its value to force at least one task to be
* moved
* Both group are or will become overloaded and we're trying to get all
* the CPUs to the average_load, so we don't want to push ourselves
* above the average load, nor do we wish to reduce the max loaded CPU
* below the average load. At the same time, we also don't want to
* reduce the group load below the group capacity. Thus we look for
* the minimum possible imbalance.
*/
if (env->imbalance < busiest->load_per_task)
return fix_small_imbalance(env, sds);
env->migration_type = migrate_load;
env->imbalance = min(
(busiest->avg_load - sds->avg_load) * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
}
/******* find_busiest_group() helpers end here *********************/
/*
* Decision matrix according to the local and busiest group type:
*
* busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
* has_spare nr_idle balanced N/A N/A balanced balanced
* fully_busy nr_idle nr_idle N/A N/A balanced balanced
* misfit_task force N/A N/A N/A force force
* asym_packing force force N/A N/A force force
* imbalanced force force N/A N/A force force
* overloaded force force N/A N/A force avg_load
*
* N/A : Not Applicable because already filtered while updating
* statistics.
* balanced : The system is balanced for these 2 groups.
* force : Calculate the imbalance as load migration is probably needed.
* avg_load : Only if imbalance is significant enough.
* nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
* different in groups.
*/
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
......@@ -8499,7 +8766,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
init_sd_lb_stats(&sds);
/*
* Compute the various statistics relavent for load balancing at
* Compute the various statistics relevant for load balancing at
* this level.
*/
update_sd_lb_stats(env, &sds);
......@@ -8514,17 +8781,17 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
local = &sds.local_stat;
busiest = &sds.busiest_stat;
/* ASYM feature bypasses nice load balance check */
if (check_asym_packing(env, &sds))
return sds.busiest;
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest || busiest->sum_nr_running == 0)
if (!sds.busiest)
goto out_balanced;
/* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
/* Misfit tasks should be dealt with regardless of the avg load */
if (busiest->group_type == group_misfit_task)
goto force_balance;
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
goto force_balance;
/*
* If the busiest group is imbalanced the below checks don't
......@@ -8535,55 +8802,80 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/*
* When dst_cpu is idle, prevent SMP nice and/or asymmetric group
* capacities from resulting in underutilization due to avg_load.
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
/* Misfit tasks should be dealt with regardless of the avg load */
if (busiest->group_type == group_misfit_task)
goto force_balance;
if (local->group_type > busiest->group_type)
goto out_balanced;
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
* When groups are overloaded, use the avg_load to ensure fairness
* between tasks.
*/
if (local->group_type == group_overloaded) {
/*
* If the local group is more loaded than the selected
* busiest group don't try to pull any tasks.
*/
if (local->avg_load >= busiest->avg_load)
goto out_balanced;
/* XXX broken for overlapping NUMA groups */
sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
sds.total_capacity;
/*
* Don't pull any tasks if this group is already above the domain
* average load.
* Don't pull any tasks if this group is already above the
* domain average load.
*/
if (local->avg_load >= sds.avg_load)
goto out_balanced;
if (env->idle == CPU_IDLE) {
/*
* This CPU is idle. If the busiest group is not overloaded
* and there is no imbalance between this and busiest group
* wrt idle CPUs, it is balanced. The imbalance becomes
* significant if the diff is greater than 1 otherwise we
* might end up to just move the imbalance on another group
* If the busiest group is more loaded, use imbalance_pct to be
* conservative.
*/
if ((busiest->group_type != group_overloaded) &&
(local->idle_cpus <= (busiest->idle_cpus + 1)))
if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
} else {
}
/* Try to move all excess tasks to child's sibling domain */
if (sds.prefer_sibling && local->group_type == group_has_spare &&
busiest->sum_nr_running > local->sum_nr_running + 1)
goto force_balance;
if (busiest->group_type != group_overloaded) {
if (env->idle == CPU_NOT_IDLE)
/*
* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
* imbalance_pct to be conservative.
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
* busy, let another idle CPU try to pull task.
*/
goto out_balanced;
if (busiest->group_weight > 1 &&
local->idle_cpus <= (busiest->idle_cpus + 1))
/*
* If the busiest group is not overloaded
* and there is no imbalance between this and busiest
* group wrt idle CPUs, it is balanced. The imbalance
* becomes significant if the diff is greater than 1
* otherwise we might end up to just move the imbalance
* on another group. Of course this applies only if
* there is more than 1 CPU per group.
*/
goto out_balanced;
if (busiest->sum_h_nr_running == 1)
/*
* busiest doesn't have any tasks waiting to run
*/
if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
force_balance:
/* Looks like there is an imbalance. Compute it */
env->src_grp_type = busiest->group_type;
calculate_imbalance(env, &sds);
return env->imbalance ? sds.busiest : NULL;
......@@ -8599,11 +8891,13 @@ static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
unsigned long busiest_load = 0, busiest_capacity = 1;
unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
unsigned int busiest_nr = 0;
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
unsigned long capacity, load;
unsigned long capacity, load, util;
unsigned int nr_running;
enum fbq_type rt;
rq = cpu_rq(i);
......@@ -8631,20 +8925,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
/*
* For ASYM_CPUCAPACITY domains with misfit tasks we simply
* seek the "biggest" misfit task.
*/
if (env->src_grp_type == group_misfit_task) {
if (rq->misfit_task_load > busiest_load) {
busiest_load = rq->misfit_task_load;
busiest = rq;
}
continue;
}
capacity = capacity_of(i);
nr_running = rq->cfs.h_nr_running;
/*
* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
......@@ -8654,36 +8936,70 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
capacity_of(env->dst_cpu) < capacity &&
rq->nr_running == 1)
nr_running == 1)
continue;
load = cpu_runnable_load(rq);
switch (env->migration_type) {
case migrate_load:
/*
* When comparing with imbalance, use cpu_runnable_load()
* When comparing with load imbalance, use cpu_load()
* which is not scaled with the CPU capacity.
*/
load = cpu_load(rq);
if (rq->nr_running == 1 && load > env->imbalance &&
if (nr_running == 1 && load > env->imbalance &&
!check_cpu_capacity(rq, env->sd))
continue;
break;
/*
* For the load comparisons with the other CPU's, consider
* the cpu_runnable_load() scaled with the CPU capacity, so
* that the load can be moved away from the CPU that is
* potentially running at a lower capacity.
* For the load comparisons with the other CPUs,
* consider the cpu_load() scaled with the CPU
* capacity, so that the load can be moved away
* from the CPU that is potentially running at a
* lower capacity.
*
* Thus we're looking for max(load_i / capacity_i), crosswise
* multiplication to rid ourselves of the division works out
* to: load_i * capacity_j > load_j * capacity_i; where j is
* our previous maximum.
* Thus we're looking for max(load_i / capacity_i),
* crosswise multiplication to rid ourselves of the
* division works out to:
* load_i * capacity_j > load_j * capacity_i;
* where j is our previous maximum.
*/
if (load * busiest_capacity > busiest_load * capacity) {
busiest_load = load;
busiest_capacity = capacity;
busiest = rq;
}
break;
case migrate_util:
util = cpu_util(cpu_of(rq));
if (busiest_util < util) {
busiest_util = util;
busiest = rq;
}
break;
case migrate_task:
if (busiest_nr < nr_running) {
busiest_nr = nr_running;
busiest = rq;
}
break;
case migrate_misfit:
/*
* For ASYM_CPUCAPACITY domains with misfit tasks we
* simply seek the "biggest" misfit task.
*/
if (rq->misfit_task_load > busiest_load) {
busiest_load = rq->misfit_task_load;
busiest = rq;
}
break;
}
}
return busiest;
......@@ -8728,7 +9044,7 @@ voluntary_active_balance(struct lb_env *env)
return 1;
}
if (env->src_grp_type == group_misfit_task)
if (env->migration_type == migrate_misfit)
return 1;
return 0;
......@@ -9757,6 +10073,11 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*
* Returns:
* < 0 - we released the lock and there are !fair tasks present
* 0 - failed, no new tasks
* > 0 - success, new (fair) tasks present
*/
int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
......@@ -10151,7 +10472,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
static void set_next_task_fair(struct rq *rq, struct task_struct *p)
static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
struct sched_entity *se = &p->se;
......@@ -10433,7 +10754,7 @@ const struct sched_class fair_sched_class = {
.check_preempt_curr = check_preempt_wakeup,
.pick_next_task = pick_next_task_fair,
.pick_next_task = __pick_next_task_fair,
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
......
......@@ -89,3 +89,4 @@ SCHED_FEAT(WA_BIAS, true)
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(UTIL_EST_FASTUP, true)
......@@ -385,21 +385,17 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next)
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
}
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
if (prev)
put_prev_task(rq, prev);
set_next_task_idle(rq, next);
set_next_task_idle(rq, next, true);
return next;
}
......
......@@ -1515,13 +1515,16 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
if (!first)
return;
/*
* If prev task was rt, put_prev_task() has already updated the
* utilization. We only care of the case where we start to schedule a
......@@ -1564,18 +1567,15 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}
static struct task_struct *
pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
static struct task_struct *pick_next_task_rt(struct rq *rq)
{
struct task_struct *p;
WARN_ON_ONCE(prev || rf);
if (!sched_rt_runnable(rq))
return NULL;
p = _pick_next_task_rt(rq);
set_next_task_rt(rq, p);
set_next_task_rt(rq, p, true);
return p;
}
......
......@@ -1713,22 +1713,10 @@ struct sched_class {
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
/*
* Both @prev and @rf are optional and may be NULL, in which case the
* caller must already have invoked put_prev_task(rq, prev, rf).
*
* Otherwise it is the responsibility of the pick_next_task() to call
* put_prev_task() on the @prev task or something equivalent, IFF it
* returns a next task.
*
* In that case (@rf != NULL) it may return RETRY_TASK when it finds a
* higher prio class has runnable tasks.
*/
struct task_struct * (*pick_next_task)(struct rq *rq,
struct task_struct *prev,
struct rq_flags *rf);
struct task_struct *(*pick_next_task)(struct rq *rq);
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
void (*set_next_task)(struct rq *rq, struct task_struct *p);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
......@@ -1780,7 +1768,7 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
WARN_ON_ONCE(rq->curr != next);
next->sched_class->set_next_task(rq, next);
next->sched_class->set_next_task(rq, next, false);
}
#ifdef CONFIG_SMP
......@@ -1821,6 +1809,9 @@ static inline bool sched_fair_runnable(struct rq *rq)
return rq->cfs.nr_running > 0;
}
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_next_task_idle(struct rq *rq);
#ifdef CONFIG_SMP
extern void update_group_capacity(struct sched_domain *sd, int cpu);
......@@ -2309,7 +2300,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static __always_inline
unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
......
......@@ -29,20 +29,17 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool first)
{
stop->se.exec_start = rq_clock_task(rq);
}
static struct task_struct *
pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
static struct task_struct *pick_next_task_stop(struct rq *rq)
{
WARN_ON_ONCE(prev || rf);
if (!sched_stop_runnable(rq))
return NULL;
set_next_task_stop(rq, rq->stop);
set_next_task_stop(rq, rq->stop, true);
return rq->stop;
}
......
......@@ -1201,16 +1201,13 @@ static void set_domain_attribute(struct sched_domain *sd,
if (!attr || attr->relax_domain_level < 0) {
if (default_relax_domain_level < 0)
return;
else
request = default_relax_domain_level;
} else
request = attr->relax_domain_level;
if (request < sd->level) {
if (sd->level > request) {
/* Turn off idle balance on this domain: */
sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
} else {
/* Turn on idle balance on this domain: */
sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
}
}
......
......@@ -1119,7 +1119,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
unsigned long ticks;
if (vtime_accounting_cpu_enabled())
if (vtime_accounting_enabled_this_cpu())
return;
/*
* We stopped the tick in idle. Update process times would miss the
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment