Commit f16c759b authored by Andrew Theurer, committed by Linus Torvalds

[PATCH] sched: more aggressive wake_idle()

This patch addresses some problems with wake_idle().  Currently wake_idle()
will wake a task on an alternate cpu if:

1) task->cpu is not idle
2) an idle cpu can be found

However the span of cpus to look for is very limited (only the task->cpu's
sibling).  The scheduler should find the closest idle cpu, starting with
the lowest level domain, then going to higher level domains if allowed
(domain has flag SD_WAKE_IDLE).  This patch does this.

This and the other two patches (also to be submitted) combined have
provided as much as a 5% improvement on that "online transaction DB workload"
and 2% on the industry standard J2EE workload.

I asked Martin Bligh to test these for regressions, and he did not find any.
I would like to submit these for inclusion in -mm and, barring any problems,
eventually in mainline.

Signed-off-by: <habanero@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 39a488d1
...@@ -80,6 +80,7 @@ static inline cpumask_t pcibus_to_cpumask(int bus) ...@@ -80,6 +80,7 @@ static inline cpumask_t pcibus_to_cpumask(int bus)
.per_cpu_gain = 100, \ .per_cpu_gain = 100, \
.flags = SD_LOAD_BALANCE \ .flags = SD_LOAD_BALANCE \
| SD_BALANCE_EXEC \ | SD_BALANCE_EXEC \
| SD_WAKE_IDLE \
| SD_WAKE_BALANCE, \ | SD_WAKE_BALANCE, \
.last_balance = jiffies, \ .last_balance = jiffies, \
.balance_interval = 1, \ .balance_interval = 1, \
......
...@@ -56,6 +56,7 @@ void build_cpu_to_node_map(void); ...@@ -56,6 +56,7 @@ void build_cpu_to_node_map(void);
.per_cpu_gain = 100, \ .per_cpu_gain = 100, \
.flags = SD_LOAD_BALANCE \ .flags = SD_LOAD_BALANCE \
| SD_BALANCE_EXEC \ | SD_BALANCE_EXEC \
| SD_WAKE_IDLE \
| SD_WAKE_BALANCE, \ | SD_WAKE_BALANCE, \
.last_balance = jiffies, \ .last_balance = jiffies, \
.balance_interval = 1, \ .balance_interval = 1, \
......
...@@ -51,6 +51,7 @@ static inline int node_to_first_cpu(int node) ...@@ -51,6 +51,7 @@ static inline int node_to_first_cpu(int node)
.per_cpu_gain = 100, \ .per_cpu_gain = 100, \
.flags = SD_LOAD_BALANCE \ .flags = SD_LOAD_BALANCE \
| SD_BALANCE_EXEC \ | SD_BALANCE_EXEC \
| SD_WAKE_IDLE \
| SD_WAKE_BALANCE, \ | SD_WAKE_BALANCE, \
.last_balance = jiffies, \ .last_balance = jiffies, \
.balance_interval = 1, \ .balance_interval = 1, \
......
...@@ -53,6 +53,7 @@ static inline cpumask_t __pcibus_to_cpumask(int bus) ...@@ -53,6 +53,7 @@ static inline cpumask_t __pcibus_to_cpumask(int bus)
.per_cpu_gain = 100, \ .per_cpu_gain = 100, \
.flags = SD_LOAD_BALANCE \ .flags = SD_LOAD_BALANCE \
| SD_BALANCE_EXEC \ | SD_BALANCE_EXEC \
| SD_WAKE_IDLE \
| SD_WAKE_BALANCE, \ | SD_WAKE_BALANCE, \
.last_balance = jiffies, \ .last_balance = jiffies, \
.balance_interval = 1, \ .balance_interval = 1, \
......
...@@ -123,6 +123,7 @@ static inline int __next_node_with_cpus(int node) ...@@ -123,6 +123,7 @@ static inline int __next_node_with_cpus(int node)
| SD_BALANCE_NEWIDLE \ | SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \ | SD_BALANCE_EXEC \
| SD_WAKE_AFFINE \ | SD_WAKE_AFFINE \
| SD_WAKE_IDLE \
| SD_WAKE_BALANCE, \ | SD_WAKE_BALANCE, \
.last_balance = jiffies, \ .last_balance = jiffies, \
.balance_interval = 1, \ .balance_interval = 1, \
......
...@@ -935,9 +935,10 @@ static inline unsigned long target_load(int cpu) ...@@ -935,9 +935,10 @@ static inline unsigned long target_load(int cpu)
#endif #endif
/* /*
* wake_idle() is useful especially on SMT architectures to wake a * wake_idle() will wake a task on an idle cpu if task->cpu is
* task onto an idle sibling if we would otherwise wake it onto a * not idle and an idle cpu is available. The span of cpus to
* busy sibling. * search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu.
* *
* Returns the CPU we should wake onto. * Returns the CPU we should wake onto.
*/ */
...@@ -945,24 +946,23 @@ static inline unsigned long target_load(int cpu) ...@@ -945,24 +946,23 @@ static inline unsigned long target_load(int cpu)
static int wake_idle(int cpu, task_t *p) static int wake_idle(int cpu, task_t *p)
{ {
cpumask_t tmp; cpumask_t tmp;
runqueue_t *rq = cpu_rq(cpu);
struct sched_domain *sd; struct sched_domain *sd;
int i; int i;
if (idle_cpu(cpu)) if (idle_cpu(cpu))
return cpu; return cpu;
sd = rq->sd; for_each_domain(cpu, sd) {
if (!(sd->flags & SD_WAKE_IDLE)) if (sd->flags & SD_WAKE_IDLE) {
return cpu; cpus_and(tmp, sd->span, cpu_online_map);
cpus_and(tmp, tmp, p->cpus_allowed);
cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) { for_each_cpu_mask(i, tmp) {
if (idle_cpu(i)) if (idle_cpu(i))
return i; return i;
} }
}
else break;
}
return cpu; return cpu;
} }
#else #else
...@@ -1074,7 +1074,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) ...@@ -1074,7 +1074,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
out_set_cpu: out_set_cpu:
schedstat_inc(rq, ttwu_attempts); schedstat_inc(rq, ttwu_attempts);
new_cpu = wake_idle(new_cpu, p); new_cpu = wake_idle(new_cpu, p);
if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) { if (new_cpu != cpu) {
schedstat_inc(rq, ttwu_moved); schedstat_inc(rq, ttwu_moved);
set_task_cpu(p, new_cpu); set_task_cpu(p, new_cpu);
task_rq_unlock(rq, &flags); task_rq_unlock(rq, &flags);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment