Commit 5517d86b authored by Eric Dumazet, committed by Linus Torvalds

Speed up divides by cpu_power in scheduler

I noticed expensive divides done in try_to_wake_up() and
find_busiest_group() on a machine with two dual-core Opterons (4 cores total),
moderately loaded (15,000 context switches per second).

oprofile numbers:

CPU: AMD64 processors, speed 2600.05 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit
mask of 0x00 (No unit mask) count 50000
samples  %        symbol name
...
613914    1.0498  try_to_wake_up
    834  0.0013 :ffffffff80227ae1:   div    %rcx
77513  0.1191 :ffffffff80227ae4:   mov    %rax,%r11

608893    1.0413  find_busiest_group
   1841  0.0031 :ffffffff802260bf:       div    %rdi
140109  0.2394 :ffffffff802260c2:       test   %sil,%sil

Some of these divides can use the reciprocal divides we introduced some
time ago (currently used in slab, AFAIK).
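
As a reminder of how the trick works, here is a minimal userspace sketch of
the same idea as include/linux/reciprocal_div.h (not the kernel code itself):
dividing by a "constant" k becomes one multiply by a precomputed ~2^32/k plus
a shift.

#include <stdint.h>
#include <stdio.h>

/* Precompute ceil(2^32 / k); same idea as the kernel's reciprocal_value(). */
static uint32_t recip_value(uint32_t k)
{
        return (uint32_t)(((1ULL << 32) + k - 1) / k);
}

/* a / k turns into a 32x32->64 multiply plus a shift, like reciprocal_divide(). */
static uint32_t recip_divide(uint32_t a, uint32_t r)
{
        return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
        uint32_t cpu_power = 128;               /* e.g. SCHED_LOAD_SCALE */
        uint32_t r = recip_value(cpu_power);    /* recomputed only when cpu_power changes */
        uint32_t load = 100000;

        /* prints "781 vs 781": the reciprocal path matches the real divide */
        printf("%u vs %u\n", recip_divide(load, r), load / cpu_power);
        return 0;
}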

We can assume a load will fit in a 32-bit number, because with a
SCHED_LOAD_SCALE=128 value there is still a theoretical limit of 33554432.
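
(That number is just 2^32 / 128 = 4294967296 / 128 = 33554432, i.e. the scaled
load stays within 32 bits as long as the unscaled load is below ~33.5 million.)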

When/if we reach this limit one day, CPUs will probably have a fast
hardware divide and we can zap the reciprocal divide trick.

Ingo suggested renaming cpu_power to __cpu_power to make clear that it should
not be modified without updating its reciprocal value too.

I did not convert the divide in cpu_avg_load_per_task(), because tracking
nr_running changes may not be worth it.  We could use a static table of 32
reciprocal values, but that would add a conditional branch and a table lookup
(a rough sketch of that alternative follows below).
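
For illustration only, a hypothetical sketch of that rejected table approach
(names like nr_running_recip and table_avg_load_per_task are made up, not from
this patch); it shows the extra branch and lookup it would put on the fast path:

/* Hypothetical, not part of this patch.  Relies on reciprocal_value() /
 * reciprocal_divide() from include/linux/reciprocal_div.h. */
static u32 nr_running_recip[33];        /* [n] = reciprocal_value(n), filled once at boot */

static inline u32 table_avg_load_per_task(u32 load, unsigned long nr_running)
{
        if (!nr_running)
                return 0;
        if (nr_running <= 32)           /* the extra conditional branch ... */
                return reciprocal_divide(load,
                                nr_running_recip[nr_running]);  /* ... plus table lookup */
        return load / nr_running;       /* rare slow path: real divide */
}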

[akpm@linux-foundation.org: !SMP build fix]
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 46cb4b7c
include/linux/sched.h

@@ -680,8 +680,14 @@ struct sched_group {
 	/*
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 	 * single CPU. This is read only (except for setup, hotplug CPU).
+	 * Note : Never change cpu_power without recompute its reciprocal
 	 */
-	unsigned long cpu_power;
+	unsigned int __cpu_power;
+	/*
+	 * reciprocal value of cpu_power to avoid expensive divides
+	 * (see include/linux/reciprocal_div.h)
+	 */
+	u32 reciprocal_cpu_power;
 };
 
 struct sched_domain {
kernel/sched.c

@@ -52,8 +52,9 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
+#include <linux/reciprocal_div.h>
 #include <asm/tlb.h>
 #include <asm/unistd.h>
 
 /*

@@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio)
 	return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 }
 
+#ifdef CONFIG_SMP
+/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+	return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+	sg->__cpu_power += val;
+	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+#endif
+
 /*
  * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
  * to time slice values: [800ms ... 100ms ... 5ms]

@@ -1256,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	}
 
 	/* Adjust by relative CPU power of the group */
-	avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+	avg_load = sg_div_cpu_power(group,
+			avg_load * SCHED_LOAD_SCALE);
 
 	if (local_group) {
 		this_load = avg_load;

@@ -2367,12 +2390,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		}
 
 		total_load += avg_load;
-		total_pwr += group->cpu_power;
+		total_pwr += group->__cpu_power;
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+		avg_load = sg_div_cpu_power(group,
+				avg_load * SCHED_LOAD_SCALE);
 
-		group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
 		if (local_group) {
 			this_load = avg_load;

@@ -2483,8 +2507,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * busiest->cpu_power,
-			(avg_load - this_load) * this->cpu_power)
+	*imbalance = min(max_pull * busiest->__cpu_power,
+			(avg_load - this_load) * this->__cpu_power)
 			/ SCHED_LOAD_SCALE;
 
 	/*

@@ -2518,27 +2542,28 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * moving them.
 		 */
 
-		pwr_now += busiest->cpu_power *
+		pwr_now += busiest->__cpu_power *
 			min(busiest_load_per_task, max_load);
-		pwr_now += this->cpu_power *
+		pwr_now += this->__cpu_power *
 			min(this_load_per_task, this_load);
 		pwr_now /= SCHED_LOAD_SCALE;
 
 		/* Amount of load we'd subtract */
-		tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
-			busiest->cpu_power;
+		tmp = sg_div_cpu_power(busiest,
+				busiest_load_per_task * SCHED_LOAD_SCALE);
 		if (max_load > tmp)
-			pwr_move += busiest->cpu_power *
+			pwr_move += busiest->__cpu_power *
 				min(busiest_load_per_task, max_load - tmp);
 
 		/* Amount of load we'd add */
-		if (max_load * busiest->cpu_power <
+		if (max_load * busiest->__cpu_power <
 				busiest_load_per_task * SCHED_LOAD_SCALE)
-			tmp = max_load * busiest->cpu_power / this->cpu_power;
+			tmp = sg_div_cpu_power(this,
+					max_load * busiest->__cpu_power);
 		else
-			tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
-				this->cpu_power;
-		pwr_move += this->cpu_power *
+			tmp = sg_div_cpu_power(this,
+					busiest_load_per_task * SCHED_LOAD_SCALE);
+		pwr_move += this->__cpu_power *
 			min(this_load_per_task, this_load + tmp);
 		pwr_move /= SCHED_LOAD_SCALE;

@@ -5501,7 +5526,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 			break;
 		}
 
-		if (!group->cpu_power) {
+		if (!group->__cpu_power) {
 			printk("\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");

@@ -5678,7 +5703,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
 			continue;
 
 		sg->cpumask = CPU_MASK_NONE;
-		sg->cpu_power = 0;
+		sg->__cpu_power = 0;
 
 		for_each_cpu_mask(j, span) {
 			if (group_fn(j, cpu_map, NULL) != group)

@@ -6367,7 +6392,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 				continue;
 			}
 
-			sg->cpu_power += sd->groups->cpu_power;
+			sg_inc_cpu_power(sg, sd->groups->__cpu_power);
 		}
 		sg = sg->next;
 		if (sg != group_head)

@@ -6442,6 +6467,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 	child = sd->child;
 
+	sd->groups->__cpu_power = 0;
+
 	/*
 	 * For perf policy, if the groups in child domain share resources
 	 * (for example cores sharing some portions of the cache hierarchy

@@ -6452,18 +6479,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
 		       (child->flags &
 			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
 		return;
 	}
 
-	sd->groups->cpu_power = 0;
-
 	/*
 	 * add cpu_power of each child group to this groups cpu_power
 	 */
 	group = child->groups;
 	do {
-		sd->groups->cpu_power += group->cpu_power;
+		sg_inc_cpu_power(sd->groups, group->__cpu_power);
 		group = group->next;
 	} while (group != child->groups);
 }

@@ -6623,7 +6648,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 			sd = &per_cpu(node_domains, j);
 			sd->groups = sg;
 		}
-		sg->cpu_power = 0;
+		sg->__cpu_power = 0;
 		sg->cpumask = nodemask;
 		sg->next = sg;
 		cpus_or(covered, covered, nodemask);

@@ -6651,7 +6676,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 				"Can not alloc domain group for node %d\n", j);
 			goto error;
 		}
-		sg->cpu_power = 0;
+		sg->__cpu_power = 0;
 		sg->cpumask = tmp;
 		sg->next = prev->next;
 		cpus_or(covered, covered, tmp);