Commit 10e2f1ac authored by Peter Zijlstra's avatar Peter Zijlstra Committed by Ingo Molnar

sched/core: Rewrite and improve select_idle_siblings()

select_idle_siblings() is a known pain point for a number of
workloads; it either does too much or not enough and sometimes just
does plain wrong.

This rewrite attempts to address a number of issues (but sadly not
all).

The current code does an unconditional sched_domain iteration; with
the intent of finding an idle core (on SMT hardware). The problems
which this patch tries to address are:

 - its pointless to look for idle cores if the machine is real busy;
   at which point you're just wasting cycles.

 - it's behaviour is inconsistent between SMT and !SMT hardware in
   that !SMT hardware ends up doing a scan for any idle CPU in the LLC
   domain, while SMT hardware does a scan for idle cores and if that
   fails, falls back to a scan for idle threads on the 'target' core.

The new code replaces the sched_domain scan with 3 explicit scans:

 1) search for an idle core in the LLC
 2) search for an idle CPU in the LLC
 3) search for an idle thread in the 'target' core

where 1 and 3 are conditional on SMT support and 1 and 2 have runtime
heuristics to skip the step.

Step 1) is conditional on sd_llc_shared->has_idle_cores; when a cpu
goes idle and sd_llc_shared->has_idle_cores is false, we scan all SMT
siblings of the CPU going idle. Similarly, we clear
sd_llc_shared->has_idle_cores when we fail to find an idle core.

Step 2) tracks the average cost of the scan and compares this to the
average idle time guestimate for the CPU doing the wakeup. There is a
significant fudge factor involved to deal with the variability of the
averages. Esp. hackbench was sensitive to this.

Step 3) is unconditional; we assume (also per step 1) that scanning
all SMT siblings in a core is 'cheap'.

With this; SMT systems gain step 2, which cures a few benchmarks --
notably one from Facebook.

One 'feature' of the sched_domain iteration, which we preserve in the
new code, is that it would start scanning from the 'target' CPU,
instead of scanning the cpumask in cpu id order. This avoids multiple
CPUs in the LLC scanning for idle to gang up and find the same CPU
quite as much. The down side is that tasks can end up hopping across
the LLC for no apparent reason.
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent 0e369d75
...@@ -1070,6 +1070,7 @@ struct sched_group; ...@@ -1070,6 +1070,7 @@ struct sched_group;
struct sched_domain_shared { struct sched_domain_shared {
atomic_t ref; atomic_t ref;
atomic_t nr_busy_cpus; atomic_t nr_busy_cpus;
int has_idle_cores;
}; };
struct sched_domain { struct sched_domain {
...@@ -1102,6 +1103,8 @@ struct sched_domain { ...@@ -1102,6 +1103,8 @@ struct sched_domain {
u64 max_newidle_lb_cost; u64 max_newidle_lb_cost;
unsigned long next_decay_max_lb_cost; unsigned long next_decay_max_lb_cost;
u64 avg_scan_cost; /* select_idle_sibling */
#ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */ /* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_count[CPU_MAX_IDLE_TYPES];
......
...@@ -7478,6 +7478,7 @@ static struct kmem_cache *task_group_cache __read_mostly; ...@@ -7478,6 +7478,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
#endif #endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
void __init sched_init(void) void __init sched_init(void)
{ {
...@@ -7514,6 +7515,8 @@ void __init sched_init(void) ...@@ -7514,6 +7515,8 @@ void __init sched_init(void)
for_each_possible_cpu(i) { for_each_possible_cpu(i) {
per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i)); cpumask_size(), GFP_KERNEL, cpu_to_node(i));
per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
} }
#endif /* CONFIG_CPUMASK_OFFSTACK */ #endif /* CONFIG_CPUMASK_OFFSTACK */
......
...@@ -1582,9 +1582,16 @@ static void task_numa_compare(struct task_numa_env *env, ...@@ -1582,9 +1582,16 @@ static void task_numa_compare(struct task_numa_env *env,
* One idle CPU per node is evaluated for a task numa move. * One idle CPU per node is evaluated for a task numa move.
* Call select_idle_sibling to maybe find a better one. * Call select_idle_sibling to maybe find a better one.
*/ */
if (!cur) if (!cur) {
/*
* select_idle_siblings() uses an per-cpu cpumask that
* can be used from IRQ context.
*/
local_irq_disable();
env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
env->dst_cpu); env->dst_cpu);
local_irq_enable();
}
assign: assign:
task_numa_assign(env, cur, imp); task_numa_assign(env, cur, imp);
...@@ -4616,6 +4623,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ...@@ -4616,6 +4623,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
} }
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
/* Working cpumask for: load_balance, load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
#ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON
/* /*
* per rq 'load' arrray crap; XXX kill this. * per rq 'load' arrray crap; XXX kill this.
...@@ -5280,65 +5292,231 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) ...@@ -5280,65 +5292,231 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
} }
/* /*
* Try and locate an idle CPU in the sched_domain. * Implement a for_each_cpu() variant that starts the scan at a given cpu
* (@start), and wraps around.
*
* This is used to scan for idle CPUs; such that not all CPUs looking for an
* idle CPU find the same CPU. The down-side is that tasks tend to cycle
* through the LLC domain.
*
* Especially tbench is found sensitive to this.
*/
static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
{
int next;
again:
next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
if (*wrapped) {
if (next >= start)
return nr_cpumask_bits;
} else {
if (next >= nr_cpumask_bits) {
*wrapped = 1;
n = -1;
goto again;
}
}
return next;
}
#define for_each_cpu_wrap(cpu, mask, start, wrap) \
for ((wrap) = 0, (cpu) = (start)-1; \
(cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
(cpu) < nr_cpumask_bits; )
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
{
struct sched_domain_shared *sds;
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
WRITE_ONCE(sds->has_idle_cores, val);
}
static inline bool test_idle_cores(int cpu, bool def)
{
struct sched_domain_shared *sds;
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
return READ_ONCE(sds->has_idle_cores);
return def;
}
/*
* Scans the local SMT mask to see if the entire core is idle, and records this
* information in sd_llc_shared->has_idle_cores.
*
* Since SMT siblings share all cache levels, inspecting this limited remote
* state should be fairly cheap.
*/
void update_idle_core(struct rq *rq)
{
int core = cpu_of(rq);
int cpu;
rcu_read_lock();
if (test_idle_cores(core, true))
goto unlock;
for_each_cpu(cpu, cpu_smt_mask(core)) {
if (cpu == core)
continue;
if (!idle_cpu(cpu))
goto unlock;
}
set_idle_cores(core, 1);
unlock:
rcu_read_unlock();
}
/*
* Scan the entire LLC domain for idle cores; this dynamically switches off if
* there are no idle cores left in the system; tracked through
* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
*/
static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int core, cpu, wrap;
if (!test_idle_cores(target, false))
return -1;
cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
for_each_cpu_wrap(core, cpus, target, wrap) {
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
cpumask_clear_cpu(cpu, cpus);
if (!idle_cpu(cpu))
idle = false;
}
if (idle)
return core;
}
/*
* Failed to find an idle core; stop looking for one.
*/
set_idle_cores(target, 0);
return -1;
}
/*
* Scan the local SMT mask for idle CPUs.
*/
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu;
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
continue;
if (idle_cpu(cpu))
return cpu;
}
return -1;
}
#else /* CONFIG_SCHED_SMT */
static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
#endif /* CONFIG_SCHED_SMT */
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
* average idle time for this rq (as found in rq->avg_idle).
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
u64 avg_idle = this_rq()->avg_idle;
u64 avg_cost = this_sd->avg_scan_cost;
u64 time, cost;
s64 delta;
int cpu, wrap;
/*
* Due to large variance we need a large fuzz factor; hackbench in
* particularly is sensitive here.
*/
if ((avg_idle / 512) < avg_cost)
return -1;
time = local_clock();
for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
continue;
if (idle_cpu(cpu))
break;
}
time = local_clock() - time;
cost = this_sd->avg_scan_cost;
delta = (s64)(time - cost) / 8;
this_sd->avg_scan_cost += delta;
return cpu;
}
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/ */
static int select_idle_sibling(struct task_struct *p, int prev, int target) static int select_idle_sibling(struct task_struct *p, int prev, int target)
{ {
struct sched_domain *sd; struct sched_domain *sd;
struct sched_group *sg; int i;
if (idle_cpu(target)) if (idle_cpu(target))
return target; return target;
/* /*
* If the prevous cpu is cache affine and idle, don't be stupid. * If the previous cpu is cache affine and idle, don't be stupid.
*/ */
if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
return prev; return prev;
/*
* Otherwise, iterate the domains and find an eligible idle cpu.
*
* A completely idle sched group at higher domains is more
* desirable than an idle group at a lower level, because lower
* domains have smaller groups and usually share hardware
* resources which causes tasks to contend on them, e.g. x86
* hyperthread siblings in the lowest domain (SMT) can contend
* on the shared cpu pipeline.
*
* However, while we prefer idle groups at higher domains
* finding an idle cpu at the lowest domain is still better than
* returning 'target', which we've already established, isn't
* idle.
*/
sd = rcu_dereference(per_cpu(sd_llc, target)); sd = rcu_dereference(per_cpu(sd_llc, target));
for_each_lower_domain(sd) { if (!sd)
sg = sd->groups; return target;
do {
int i;
if (!cpumask_intersects(sched_group_cpus(sg), i = select_idle_core(p, sd, target);
tsk_cpus_allowed(p))) if ((unsigned)i < nr_cpumask_bits)
goto next; return i;
/* Ensure the entire group is idle */ i = select_idle_cpu(p, sd, target);
for_each_cpu(i, sched_group_cpus(sg)) { if ((unsigned)i < nr_cpumask_bits)
if (i == target || !idle_cpu(i)) return i;
goto next;
} i = select_idle_smt(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
/*
* It doesn't matter which cpu we pick, the
* whole group is idle.
*/
target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
goto done;
next:
sg = sg->next;
} while (sg != sd->groups);
}
done:
return target; return target;
} }
...@@ -7397,9 +7575,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, ...@@ -7397,9 +7575,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/ */
#define MAX_PINNED_INTERVAL 512 #define MAX_PINNED_INTERVAL 512
/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static int need_active_balance(struct lb_env *env) static int need_active_balance(struct lb_env *env)
{ {
struct sched_domain *sd = env->sd; struct sched_domain *sd = env->sd;
......
...@@ -27,7 +27,7 @@ static struct task_struct * ...@@ -27,7 +27,7 @@ static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{ {
put_prev_task(rq, prev); put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle); schedstat_inc(rq->sched_goidle);
return rq->idle; return rq->idle;
} }
......
...@@ -36,6 +36,12 @@ extern void cpu_load_update_active(struct rq *this_rq); ...@@ -36,6 +36,12 @@ extern void cpu_load_update_active(struct rq *this_rq);
static inline void cpu_load_update_active(struct rq *this_rq) { } static inline void cpu_load_update_active(struct rq *this_rq) { }
#endif #endif
#ifdef CONFIG_SCHED_SMT
extern void update_idle_core(struct rq *rq);
#else
static inline void update_idle_core(struct rq *rq) { }
#endif
/* /*
* Helpers for converting nanosecond timing to jiffy resolution * Helpers for converting nanosecond timing to jiffy resolution
*/ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment