Commit 970e1789 authored by Mike Galbraith, committed by Ingo Molnar

sched: Improve scalability via 'CPU buddies', which withstand random perturbations

Traversing an entire package is not only expensive, it also leads to tasks
bouncing all over a partially idle and possibly quite large package.  Fix
that up by assigning each CPU a 'buddy' CPU to try to motivate.  Each CPU may
try to motivate that one other CPU; if it's busy, tough, it may then try its
SMT sibling, but that's all this optimization is allowed to cost.

Sibling cache buddies are cross-wired to prevent bouncing.
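To make the cross-wiring concrete, here is a minimal userspace sketch (not kernel
code) of the MC-level buddy assignment for one hypothetical LLC of four cores with
two SMT threads each; the topology and the consecutive CPU numbering are assumptions
made purely for illustration, but the hop counting and direction switching mirror
the update_top_cache_domain() walk in the diff below.

/* Userspace sketch of the buddy assignment; the fixed 4-core x 2-SMT
 * topology with consecutively numbered CPUs is assumed for illustration. */
#include <stdio.h>

#define NR_GROUPS 4			/* cores sharing the LLC */
#define NR_CPUS   (NR_GROUPS * 2)	/* two SMT siblings per core */

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		int group = cpu / 2;	/* core (sched_group) holding this cpu */
		int sg = 0, prev = 0;	/* start at the group of the domain's first cpu */
		int right = 1;

		/* Count hops to cpu's group, switching direction on each hop. */
		while (sg != group) {
			prev = sg;
			sg = (sg + 1) % NR_GROUPS;
			right = !right;
		}

		/* Never point the last group back at the domain start. */
		if (right && (sg + 1) % NR_GROUPS == 0)
			right = 0;

		sg = right ? (sg + 1) % NR_GROUPS : prev;
		printf("cpu %d -> MC-level idle_buddy %d\n", cpu, sg * 2);
	}
	return 0;
}

With this layout CPUs 0 and 1 get buddy 2 while CPUs 2 and 3 get buddy 0 (likewise
4/5 point at 6 and 6/7 point at 4), i.e. neighbouring cores point at each other, so
a task perturbed off its CPU gets pulled straight back rather than drifting across
the package.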

4 socket 40 core + SMT Westmere box, single 30 sec tbench runs, higher is better:

 clients     1       2       4        8       16       32       64      128
 ..........................................................................
 pre        30      41     118      645     3769     6214    12233    14312
 post      299     603    1211     2418     4697     6847    11606    14557

A nice increase in performance, roughly 10x at the low end of the client range.
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1339471112.7352.32.camel@marge.simpson.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent a1cd2b13
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -949,6 +949,7 @@ struct sched_domain {
 	unsigned int smt_gain;
 	int flags;			/* See SD_* */
 	int level;
+	int idle_buddy;			/* cpu assigned to select_idle_sibling() */
 
 	/* Runtime fields. */
 	unsigned long last_balance;	/* init to jiffies. units in jiffies */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-	if (sd)
+	if (sd) {
+		struct sched_domain *tmp = sd;
+		struct sched_group *sg, *prev;
+		bool right;
+
+		/*
+		 * Traverse to first CPU in group, and count hops
+		 * to cpu from there, switching direction on each
+		 * hop, never ever pointing the last CPU rightward.
+		 */
+		do {
+			id = cpumask_first(sched_domain_span(tmp));
+			prev = sg = tmp->groups;
+			right = 1;
+
+			while (cpumask_first(sched_group_cpus(sg)) != id)
+				sg = sg->next;
+
+			while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+				prev = sg;
+				sg = sg->next;
+				right = !right;
+			}
+
+			/* A CPU went down, never point back to domain start. */
+			if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+				right = false;
+
+			sg = right ? sg->next : prev;
+			tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+		} while ((tmp = tmp->child));
+
 		id = cpumask_first(sched_domain_span(sd));
+	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_id, cpu) = id;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
-	struct sched_group *sg;
-	int i;
 
 	/*
 	 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		return prev_cpu;
 
 	/*
-	 * Otherwise, iterate the domains and find an elegible idle cpu.
+	 * Otherwise, check assigned siblings to find an elegible idle cpu.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
-
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (!idle_cpu(i))
-					goto next;
-			}
-
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
-next:
-			sg = sg->next;
-		} while (sg != sd->groups);
+
+	for_each_lower_domain(sd) {
+		if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(sd->idle_buddy))
+			return sd->idle_buddy;
 	}
-done:
+
 	return target;
 }
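For a rough feel of the wakeup-time side, the following standalone sketch walks a
per-CPU buddy table from the LLC level downward, checking exactly one candidate per
level and falling back to the original target; the two-level table, the idle array
and the pick_idle_buddy() helper are illustrative assumptions, not kernel data
structures, but the control flow matches the new select_idle_sibling() loop above.

/* Userspace sketch of the buddy lookup at wakeup: one candidate per domain
 * level, checked top (MC) down (SMT); all data here is made up. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS   8
#define NR_LEVELS 2			/* 0 = MC (cross-wired core), 1 = SMT sibling */

/* idle_buddy[level][cpu], as the assignment sketch above would fill it. */
static const int idle_buddy[NR_LEVELS][NR_CPUS] = {
	{ 2, 2, 0, 0, 6, 6, 4, 4 },	/* MC level  */
	{ 1, 0, 3, 2, 5, 4, 7, 6 },	/* SMT level */
};

static int pick_idle_buddy(int target, const bool *idle, const bool *allowed)
{
	for (int level = 0; level < NR_LEVELS; level++) {
		int buddy = idle_buddy[level][target];

		if (allowed[buddy] && idle[buddy])
			return buddy;	/* one shot per level, no group scan */
	}
	return target;			/* nobody convenient is idle, stay put */
}

int main(void)
{
	bool idle[NR_CPUS]    = { false, false, false, false, true, true, false, true };
	bool allowed[NR_CPUS] = { true,  true,  true,  true,  true, true, true,  true };

	/* CPU 5's MC buddy (6) is busy, but its SMT buddy (4) is idle: prints 4. */
	printf("target 5 -> %d\n", pick_idle_buddy(5, idle, allowed));
	/* CPU 1's buddies (2, then 0) are both busy: stays on 1. */
	printf("target 1 -> %d\n", pick_idle_buddy(1, idle, allowed));
	return 0;
}

The point of the change is visible in the loop shape: the old code scanned every
group of every lower domain, while the new one touches at most one CPU per level,
so the cost of the optimization is bounded regardless of package size.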