Commit 80b19256 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] sched: fix setup races

From: Nick Piggin <nickpiggin@yahoo.com.au>

De-racify the sched domain setup code.  This involves creating a dummy
"init" domain during sched_init (which is called early).

When topology information becomes available, the sched domains are then
built and attached.  The attach mechanism is asynchronous and uses the
migration threads, which perform the switch with interrupts off.  This is a
quiescent state, so domains can still be lockless on the read side.  It
It also allows us to change the domains at runtime without much more work.
This is something SGI is interested in, since it would let them do soft
partitioning of their systems elegantly without having to use hard CPU
affinities (which cause balancing problems of their own).
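
The attach step can be pictured roughly as follows (a sketch only:
attach_domain_on_cpu is a made-up name, and cpu_rq()/rq->sd are assumed to be
the usual kernel/sched.c internals of this era). It runs on the target CPU in
migration-thread context, so with interrupts off nothing on that CPU can be
walking the old domain tree, and the switch is a single pointer update:

    /* Illustrative only -- the real logic is cpu_attach_domain() in kernel/sched.c. */
    static void attach_domain_on_cpu(struct sched_domain *sd, int cpu)
    {
            runqueue_t *rq = cpu_rq(cpu);
            unsigned long flags;

            /* Called on 'cpu' from its migration thread: a quiescent state
             * for the lockless readers of the domain tree. */
            spin_lock_irqsave(&rq->lock, flags);
            rq->sd = sd;            /* publish the newly built domain tree */
            spin_unlock_irqrestore(&rq->lock, flags);
    }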

The current setup code also has a race somewhere: it is unable to boot on a
384-CPU system.



From: Anton Blanchard <anton@samba.org>

   This is basically a mindless ppc64 merge of the x86 changes to the sched
   domain init code.

   Actually, if I produce a sibling_map[] then the x86 and ppc64 code will be
   identical.  Maybe we can merge it.
parent 17d66773
@@ -1130,16 +1130,17 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
static struct sched_group sched_group_cpus[NR_CPUS];
static struct sched_group sched_group_phys[NR_CPUS];
static struct sched_group sched_group_nodes[MAX_NUMNODES];
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static DEFINE_PER_CPU(struct sched_domain, node_domains);
__init void arch_init_sched_domains(void)
{
int i;
struct sched_group *first_cpu = NULL, *last_cpu = NULL;
struct sched_group *first = NULL, *last = NULL;
/* Set up domains */
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
struct sched_domain *node_domain = &per_cpu(node_domains, i);
int node = cpu_to_node(i);
@@ -1147,26 +1148,27 @@ __init void arch_init_sched_domains(void)
*cpu_domain = SD_SIBLING_INIT;
cpu_domain->span = cpu_sibling_map[i];
cpu_domain->parent = phys_domain;
cpu_domain->groups = &sched_group_cpus[i];
*phys_domain = SD_CPU_INIT;
phys_domain->span = nodemask;
phys_domain->parent = node_domain;
phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
*node_domain = SD_NODE_INIT;
node_domain->span = cpu_possible_map;
node_domain->groups = &sched_group_nodes[cpu_to_node(i)];
}
/* Set up CPU (sibling) groups */
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
int j;
first_cpu = last_cpu = NULL;
first = last = NULL;
if (i != first_cpu(cpu_domain->span)) {
cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER;
cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
SD_SHARE_CPUPOWER;
if (i != first_cpu(cpu_domain->span))
continue;
}
for_each_cpu_mask(j, cpu_domain->span) {
struct sched_group *cpu = &sched_group_cpus[j];
@@ -1175,13 +1177,13 @@ __init void arch_init_sched_domains(void)
cpu_set(j, cpu->cpumask);
cpu->cpu_power = SCHED_LOAD_SCALE;
if (!first_cpu)
first_cpu = cpu;
if (last_cpu)
last_cpu->next = cpu;
last_cpu = cpu;
if (!first)
first = cpu;
if (last)
last->next = cpu;
last = cpu;
}
last_cpu->next = first_cpu;
last->next = first;
}
for (i = 0; i < MAX_NUMNODES; i++) {
@@ -1193,10 +1195,10 @@ __init void arch_init_sched_domains(void)
if (cpus_empty(nodemask))
continue;
first_cpu = last_cpu = NULL;
first = last = NULL;
/* Set up physical groups */
for_each_cpu_mask(j, nodemask) {
struct sched_domain *cpu_domain = cpu_sched_domain(j);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j);
struct sched_group *cpu = &sched_group_phys[j];
if (j != first_cpu(cpu_domain->span))
@@ -1210,17 +1212,17 @@ __init void arch_init_sched_domains(void)
cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
node->cpu_power += cpu->cpu_power;
if (!first_cpu)
first_cpu = cpu;
if (last_cpu)
last_cpu->next = cpu;
last_cpu = cpu;
if (!first)
first = cpu;
if (last)
last->next = cpu;
last = cpu;
}
last_cpu->next = first_cpu;
last->next = first;
}
/* Set up nodes */
first_cpu = last_cpu = NULL;
first = last = NULL;
for (i = 0; i < MAX_NUMNODES; i++) {
struct sched_group *cpu = &sched_group_nodes[i];
cpumask_t nodemask;
@@ -1232,65 +1234,53 @@ __init void arch_init_sched_domains(void)
cpu->cpumask = nodemask;
/* ->cpu_power already setup */
if (!first_cpu)
first_cpu = cpu;
if (last_cpu)
last_cpu->next = cpu;
last_cpu = cpu;
if (!first)
first = cpu;
if (last)
last->next = cpu;
last = cpu;
}
last_cpu->next = first_cpu;
last->next = first;
mb();
for_each_cpu(i) {
int node = cpu_to_node(i);
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
struct sched_domain *node_domain = &per_cpu(node_domains, i);
struct sched_group *cpu_group = &sched_group_cpus[i];
struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
struct sched_group *node_group = &sched_group_nodes[node];
cpu_domain->parent = phys_domain;
phys_domain->parent = node_domain;
node_domain->groups = node_group;
phys_domain->groups = phys_group;
cpu_domain->groups = cpu_group;
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
cpu_attach_domain(cpu_domain, i);
}
}
#else /* !CONFIG_NUMA */
static struct sched_group sched_group_cpus[NR_CPUS];
static struct sched_group sched_group_phys[NR_CPUS];
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
__init void arch_init_sched_domains(void)
{
int i;
struct sched_group *first_cpu = NULL, *last_cpu = NULL;
struct sched_group *first = NULL, *last = NULL;
/* Set up domains */
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
*cpu_domain = SD_SIBLING_INIT;
cpu_domain->span = cpu_sibling_map[i];
cpu_domain->parent = phys_domain;
cpu_domain->groups = &sched_group_cpus[i];
*phys_domain = SD_CPU_INIT;
phys_domain->span = cpu_possible_map;
phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
}
/* Set up CPU (sibling) groups */
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
int j;
first_cpu = last_cpu = NULL;
first = last = NULL;
if (i != first_cpu(cpu_domain->span)) {
cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER;
cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
SD_SHARE_CPUPOWER;
if (i != first_cpu(cpu_domain->span))
continue;
}
for_each_cpu_mask(j, cpu_domain->span) {
struct sched_group *cpu = &sched_group_cpus[j];
@@ -1299,19 +1289,19 @@ __init void arch_init_sched_domains(void)
cpu_set(j, cpu->cpumask);
cpu->cpu_power = SCHED_LOAD_SCALE;
if (!first_cpu)
first_cpu = cpu;
if (last_cpu)
last_cpu->next = cpu;
last_cpu = cpu;
if (!first)
first = cpu;
if (last)
last->next = cpu;
last = cpu;
}
last_cpu->next = first_cpu;
last->next = first;
}
first_cpu = last_cpu = NULL;
first = last = NULL;
/* Set up physical groups */
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
struct sched_group *cpu = &sched_group_phys[i];
if (i != first_cpu(cpu_domain->span))
@@ -1321,23 +1311,18 @@ __init void arch_init_sched_domains(void)
/* See SMT+NUMA setup for comment */
cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
if (!first_cpu)
first_cpu = cpu;
if (last_cpu)
last_cpu->next = cpu;
last_cpu = cpu;
if (!first)
first = cpu;
if (last)
last->next = cpu;
last = cpu;
}
last_cpu->next = first_cpu;
last->next = first;
mb();
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
struct sched_group *cpu_group = &sched_group_cpus[i];
struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
cpu_domain->parent = phys_domain;
phys_domain->groups = phys_group;
cpu_domain->groups = cpu_group;
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
cpu_attach_domain(cpu_domain, i);
}
}
#endif /* CONFIG_NUMA */
......
@@ -993,16 +993,17 @@ void __init smp_cpus_done(unsigned int max_cpus)
static struct sched_group sched_group_cpus[NR_CPUS];
static struct sched_group sched_group_phys[NR_CPUS];
static struct sched_group sched_group_nodes[MAX_NUMNODES];
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static DEFINE_PER_CPU(struct sched_domain, node_domains);
__init void arch_init_sched_domains(void)
{
int i;
struct sched_group *first_cpu = NULL, *last_cpu = NULL;
struct sched_group *first = NULL, *last = NULL;
/* Set up domains */
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
struct sched_domain *node_domain = &per_cpu(node_domains, i);
int node = cpu_to_node(i);
@@ -1011,32 +1012,31 @@ __init void arch_init_sched_domains(void)
cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1);
*cpu_domain = SD_SIBLING_INIT;
if (__is_processor(PV_POWER5))
if (cur_cpu_spec->cpu_features & CPU_FTR_SMT)
cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask);
else
cpu_domain->span = my_cpumask;
cpu_domain->parent = phys_domain;
cpu_domain->groups = &sched_group_cpus[i];
*phys_domain = SD_CPU_INIT;
phys_domain->span = nodemask;
// phys_domain->cache_hot_time = XXX;
phys_domain->parent = node_domain;
phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
*node_domain = SD_NODE_INIT;
node_domain->span = cpu_possible_map;
// node_domain->cache_hot_time = XXX;
node_domain->groups = &sched_group_nodes[node];
}
/* Set up CPU (sibling) groups */
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
int j;
first_cpu = last_cpu = NULL;
first = last = NULL;
if (i != first_cpu(cpu_domain->span)) {
cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER;
cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
SD_SHARE_CPUPOWER;
if (i != first_cpu(cpu_domain->span))
continue;
}
for_each_cpu_mask(j, cpu_domain->span) {
struct sched_group *cpu = &sched_group_cpus[j];
@@ -1045,13 +1045,13 @@ __init void arch_init_sched_domains(void)
cpu_set(j, cpu->cpumask);
cpu->cpu_power = SCHED_LOAD_SCALE;
if (!first_cpu)
first_cpu = cpu;
if (last_cpu)
last_cpu->next = cpu;
last_cpu = cpu;
if (!first)
first = cpu;
if (last)
last->next = cpu;
last = cpu;
}
last_cpu->next = first_cpu;
last->next = first;
}
for (i = 0; i < MAX_NUMNODES; i++) {
@@ -1059,15 +1059,15 @@ __init void arch_init_sched_domains(void)
cpumask_t nodemask;
struct sched_group *node = &sched_group_nodes[i];
cpumask_t node_cpumask = node_to_cpumask(i);
cpus_and(nodemask, node_cpumask, cpu_online_map);
cpus_and(nodemask, node_cpumask, cpu_possible_map);
if (cpus_empty(nodemask))
continue;
first_cpu = last_cpu = NULL;
first = last = NULL;
/* Set up physical groups */
for_each_cpu_mask(j, nodemask) {
struct sched_domain *cpu_domain = cpu_sched_domain(j);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j);
struct sched_group *cpu = &sched_group_phys[j];
if (j != first_cpu(cpu_domain->span))
@@ -1081,17 +1081,17 @@ __init void arch_init_sched_domains(void)
cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
node->cpu_power += cpu->cpu_power;
if (!first_cpu)
first_cpu = cpu;
if (last_cpu)
last_cpu->next = cpu;
last_cpu = cpu;
if (!first)
first = cpu;
if (last)
last->next = cpu;
last = cpu;
}
last_cpu->next = first_cpu;
last->next = first;
}
/* Set up nodes */
first_cpu = last_cpu = NULL;
first = last = NULL;
for (i = 0; i < MAX_NUMNODES; i++) {
struct sched_group *cpu = &sched_group_nodes[i];
cpumask_t nodemask;
@@ -1104,71 +1104,58 @@ __init void arch_init_sched_domains(void)
cpu->cpumask = nodemask;
/* ->cpu_power already setup */
if (!first_cpu)
first_cpu = cpu;
if (last_cpu)
last_cpu->next = cpu;
last_cpu = cpu;
if (!first)
first = cpu;
if (last)
last->next = cpu;
last = cpu;
}
last_cpu->next = first_cpu;
last->next = first;
mb();
for_each_cpu(i) {
int node = cpu_to_node(i);
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
struct sched_domain *node_domain = &per_cpu(node_domains, i);
struct sched_group *cpu_group = &sched_group_cpus[i];
struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
struct sched_group *node_group = &sched_group_nodes[node];
cpu_domain->parent = phys_domain;
phys_domain->parent = node_domain;
node_domain->groups = node_group;
phys_domain->groups = phys_group;
cpu_domain->groups = cpu_group;
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
cpu_attach_domain(cpu_domain, i);
}
}
#else /* !CONFIG_NUMA */
static struct sched_group sched_group_cpus[NR_CPUS];
static struct sched_group sched_group_phys[NR_CPUS];
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
__init void arch_init_sched_domains(void)
{
int i;
struct sched_group *first_cpu = NULL, *last_cpu = NULL;
struct sched_group *first = NULL, *last = NULL;
/* Set up domains */
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
cpumask_t my_cpumask = cpumask_of_cpu(i);
cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1);
*cpu_domain = SD_SIBLING_INIT;
if (__is_processor(PV_POWER5))
if (cur_cpu_spec->cpu_features & CPU_FTR_SMT)
cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask);
else
cpu_domain->span = my_cpumask;
cpu_domain->parent = phys_domain;
cpu_domain->groups = &sched_group_cpus[i];
*phys_domain = SD_CPU_INIT;
phys_domain->span = cpu_possible_map;
// phys_domain->cache_hot_time = XXX;
phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
}
/* Set up CPU (sibling) groups */
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
int j;
first_cpu = last_cpu = NULL;
first = last = NULL;
if (i != first_cpu(cpu_domain->span)) {
cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER;
cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
SD_SHARE_CPUPOWER;
if (i != first_cpu(cpu_domain->span))
continue;
}
for_each_cpu_mask(j, cpu_domain->span) {
struct sched_group *cpu = &sched_group_cpus[j];
@@ -1177,19 +1164,19 @@ __init void arch_init_sched_domains(void)
cpu_set(j, cpu->cpumask);
cpu->cpu_power = SCHED_LOAD_SCALE;
if (!first_cpu)
first_cpu = cpu;
if (last_cpu)
last_cpu->next = cpu;
last_cpu = cpu;
if (!first)
first = cpu;
if (last)
last->next = cpu;
last = cpu;
}
last_cpu->next = first_cpu;
last->next = first;
}
first_cpu = last_cpu = NULL;
first = last = NULL;
/* Set up physical groups */
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
struct sched_group *cpu = &sched_group_phys[i];
if (i != first_cpu(cpu_domain->span))
@@ -1199,64 +1186,19 @@ __init void arch_init_sched_domains(void)
/* See SMT+NUMA setup for comment */
cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
if (!first_cpu)
first_cpu = cpu;
if (last_cpu)
last_cpu->next = cpu;
last_cpu = cpu;
if (!first)
first = cpu;
if (last)
last->next = cpu;
last = cpu;
}
last_cpu->next = first_cpu;
last->next = first;
mb();
for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i);
struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
struct sched_group *cpu_group = &sched_group_cpus[i];
struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
cpu_domain->parent = phys_domain;
phys_domain->groups = phys_group;
cpu_domain->groups = cpu_group;
struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
cpu_attach_domain(cpu_domain, i);
}
}
#endif /* CONFIG_NUMA */
#else /* !CONFIG_SCHED_SMT */
static struct sched_group sched_group_cpus[NR_CPUS];
__init void arch_init_sched_domains(void)
{
int i;
struct sched_group *first_cpu = NULL, *last_cpu = NULL;
/* Set up domains */
for_each_cpu(i) {
struct sched_domain *cpu_sd = cpu_sched_domain(i);
*cpu_sd = SD_CPU_INIT;
cpu_sd->span = cpu_possible_map;
// cpu_sd->cache_hot_time = XXX;
}
/* Set up CPU groups */
for_each_cpu_mask(i, cpu_possible_map) {
struct sched_group *cpu = &sched_group_cpus[i];
cpus_clear(cpu->cpumask);
cpu_set(i, cpu->cpumask);
cpu->cpu_power = SCHED_LOAD_SCALE;
if (!first_cpu)
first_cpu = cpu;
if (last_cpu)
last_cpu->next = cpu;
last_cpu = cpu;
}
last_cpu->next = first_cpu;
mb();
for_each_cpu(i) {
struct sched_domain *cpu_sd = cpu_sched_domain(i);
cpu_sd->groups = &sched_group_cpus[i];
}
}
#endif
#endif /* CONFIG_SCHED_SMT */
@@ -598,7 +598,8 @@ struct sched_domain {
.per_cpu_gain = 15, \
.flags = SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE \
| SD_WAKE_IDLE, \
| SD_WAKE_IDLE \
| SD_SHARE_CPUPOWER, \
.last_balance = jiffies, \
.balance_interval = 1, \
.nr_balance_failed = 0, \
@@ -643,12 +644,7 @@ struct sched_domain {
}
#endif
DECLARE_PER_CPU(struct sched_domain, base_domains);
#define cpu_sched_domain(cpu) (&per_cpu(base_domains, (cpu)))
#define this_sched_domain() (&__get_cpu_var(base_domains))
#define for_each_domain(cpu, domain) \
for (domain = cpu_sched_domain(cpu); domain; domain = domain->parent)
extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
#else
......
This diff is collapsed.