Commit 3dfa303d authored by Andrew Morton, committed by Linus Torvalds

[PATCH] scheduler domain balancing improvements

From: Nick Piggin <piggin@cyberone.com.au>

This patch gets the sched_domain scheduler working better WRT balancing.
It has been tested on the NUMAQ.  Among other things, it changes the way SMT
load calculation works so that it doesn't do active load balances when it
shouldn't.

It still has a problem with SMT and NUMA: it will put a task on each
sibling in a node before moving tasks to another node.  It should probably
start moving tasks after each *physical* CPU is filled.

To fix this, you need to know "how much CPU power is in this domain?"  At the
moment we approximate # runqueues == CPU power, and hack around it at the
physical CPU domain by counting all sibling runqueues as 1.

It isn't hard to work the CPU power out correctly, but once CPU hotplug is in
the equation it becomes much harder to keep it up to date across hotplug
events.  If anyone is actually interested in getting this fixed, that is.
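
As a very rough sketch of that direction (illustration only, not part of this
patch: group_cpu_power() and SMT_SIBLING_POWER are hypothetical names), each
sched_group could carry an explicit power value instead of "one runqueue ==
one unit of power":

        /*
         * Hypothetical sketch: the first logical CPU in a package counts as
         * a full SCHED_LOAD_SCALE of power, and each extra SMT sibling adds
         * a smaller, tunable increment (70% here, matching the "load of 1.7"
         * example in the comment this patch adds to find_busiest_group()).
         */
        #define SMT_SIBLING_POWER	((7 * SCHED_LOAD_SCALE) / 10)

        static unsigned long group_cpu_power(unsigned int nr_siblings)
        {
                /* one full CPU plus a partial contribution per sibling */
                return SCHED_LOAD_SCALE + (nr_siblings - 1) * SMT_SIBLING_POWER;
        }

find_busiest_group() could then divide each group's load by its power rather
than by its runqueue count, and a hotplug event would only have to recompute
the power of the groups it touches.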
parent b45bb339
@@ -736,7 +736,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 		/* Passive load balancing */
 		load = get_low_cpu_load(cpu, 1);
-		this_load = get_high_cpu_load(this_cpu, 1);
+		this_load = get_high_cpu_load(this_cpu, 1) + SCHED_LOAD_SCALE;
 		if (load > this_load) {
 			new_cpu = sched_balance_wake(this_cpu, p);
 			set_task_cpu(p, new_cpu);
@@ -1201,6 +1201,9 @@ void sched_balance_exec(void)
 	if (numnodes == 1)
 		return;
+	if (this_rq()->nr_running <= 1)
+		return;
+
 	while (domain->parent && !(domain->flags & SD_FLAG_EXEC))
 		domain = domain->parent;
@@ -1367,7 +1370,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
 		unsigned long *imbalance, enum idle_type idle)
 {
 	unsigned long max_load, avg_load, total_load, this_load;
-	int modify, total_nr_cpus, busiest_nr_cpus = 0;
+	int modify, total_nr_cpus, busiest_nr_cpus, this_nr_cpus;
 	enum idle_type package_idle = IDLE;
 	struct sched_group *busiest = NULL, *group = domain->groups;
@@ -1375,6 +1378,8 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
 	this_load = 0;
 	total_load = 0;
 	total_nr_cpus = 0;
+	busiest_nr_cpus = 0;
+	this_nr_cpus = 0;
 
 	if (group == NULL)
 		goto out_balanced;
@@ -1418,14 +1423,30 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
 			goto nextgroup;
 
 		total_load += avg_load;
-		total_nr_cpus += nr_cpus;
-		avg_load /= nr_cpus;
+
+		/*
+		 * Load is cumulative over SD_FLAG_IDLE domains, but
+		 * spread over !SD_FLAG_IDLE domains. For example, 2
+		 * processes running on an SMT CPU puts a load of 2 on
+		 * that CPU, however 2 processes running on 2 CPUs puts
+		 * a load of 1 on that domain.
+		 *
+		 * This should be configurable so as SMT siblings become
+		 * more powerful, they can "spread" more load - for example,
+		 * the above case might only count as a load of 1.7.
+		 */
+		if (!(domain->flags & SD_FLAG_IDLE)) {
+			avg_load /= nr_cpus;
+			total_nr_cpus += nr_cpus;
+		} else
+			total_nr_cpus++;
 
 		if (avg_load > max_load)
 			max_load = avg_load;
 
 		if (local_group) {
 			this_load = avg_load;
+			this_nr_cpus = nr_cpus;
 		} else if (avg_load >= max_load) {
 			busiest = group;
 			busiest_nr_cpus = nr_cpus;
@@ -1438,7 +1459,8 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
 		goto out_balanced;
 
 	avg_load = total_load / total_nr_cpus;
-	if (idle == NOT_IDLE && this_load >= avg_load)
+
+	if (this_load >= avg_load)
 		goto out_balanced;
 
 	if (idle == NOT_IDLE && 100*max_load <= domain->imbalance_pct*this_load)
@@ -1455,21 +1477,16 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
 	 * by pulling tasks to us. Be careful of negative numbers as they'll
 	 * appear as very large values with unsigned longs.
 	 */
-	if (avg_load >= this_load) {
-		*imbalance = min(max_load - avg_load, avg_load - this_load);
-		/* Get rid of the scaling factor, rounding *up* as we divide */
-		*imbalance = (*imbalance + SCHED_LOAD_SCALE - 1)
-				>> SCHED_LOAD_SHIFT;
-	} else
-		*imbalance = 0;
+	*imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2;
+	/* Get rid of the scaling factor, rounding *up* as we divide */
+	*imbalance = (*imbalance + SCHED_LOAD_SCALE/2 + 1)
+			>> SCHED_LOAD_SHIFT;
 
-	if (*imbalance == 0) {
-		if (package_idle != NOT_IDLE && domain->flags & SD_FLAG_IDLE
-			&& max_load * busiest_nr_cpus > (3*SCHED_LOAD_SCALE/2))
-			*imbalance = 1;
-		else
-			busiest = NULL;
-	}
+	if (*imbalance == 0)
+		goto out_balanced;
+
+	/* How many tasks to actually move to equalise the imbalance */
+	*imbalance *= min(busiest_nr_cpus, this_nr_cpus);
 
 	return busiest;
...
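
To get a feel for the new imbalance calculation, here is a worked example with
made-up numbers (it assumes SCHED_LOAD_SCALE == 128 and SCHED_LOAD_SHIFT == 7;
check the values in the tree this applies to), for a two-group domain with one
CPU per group:

        /* Illustrative values: one task locally, four on the busiest group. */
        unsigned long this_load = 1 * 128;	/* 1 * SCHED_LOAD_SCALE */
        unsigned long max_load = 4 * 128;	/* busiest group */
        unsigned long avg_load = (this_load + max_load) / 2;	/* 320 */
        unsigned long imbalance;

        /* Half of the smaller gap, as in the patched code: min(192, 192). */
        imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2;	/* 96 */
        /* Strip the scaling factor, rounding up: (96 + 64 + 1) >> 7 == 1 */
        imbalance = (imbalance + 128 / 2 + 1) >> 7;
        /* min(busiest_nr_cpus, this_nr_cpus) == 1, so move one task */
        imbalance *= 1;

so the balancing pass pulls a single task towards this CPU.  Aiming for half
the gap each pass avoids over-correcting, and the final multiplication turns a
per-CPU load delta into a task count when the groups contain more than one CPU.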