Commit e761b772 authored by Max Krasnyansky, committed by Ingo Molnar

cpu hotplug, sched: Introduce cpu_active_map and redo sched domain management (take 2)

This is based on Linus' idea of creating a cpu_active_map that prevents the
scheduler load balancer from migrating tasks to a cpu that is going down.
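
As a rough mental model of that, here is a small userspace sketch (not kernel
code; all names and masks in it are made up): the balancer only picks targets
from allowed & active, and the hot-unplug path clears the active bit before the
CPU actually goes offline.

/*
 * Userspace sketch only - illustrative names, not the kernel implementation.
 */
#include <stdio.h>

static unsigned long online_mask = 0x3;	/* CPUs 0 and 1 online */
static unsigned long active_mask = 0x3;	/* CPUs 0 and 1 active */

/* Pick a migration target: must be allowed by the task and still active. */
static int pick_migration_target(unsigned long task_allowed)
{
	unsigned long candidates = task_allowed & active_mask;
	int cpu;

	for (cpu = 0; cpu < (int)(8 * sizeof(unsigned long)); cpu++)
		if (candidates & (1UL << cpu))
			return cpu;
	return -1;	/* no usable target */
}

int main(void)
{
	unsigned long task_allowed = 0x2;	/* task allowed on CPU 1 only */

	printf("target before unplug: %d\n", pick_migration_target(task_allowed));

	/* "cpu_down(1)": clear the active bit first, take the CPU offline later. */
	active_mask &= ~(1UL << 1);
	online_mask &= ~(1UL << 1);

	printf("target while/after CPU 1 goes down: %d\n",
	       pick_migration_target(task_allowed));
	return 0;
}

That ordering is the point of keeping a separate mask: the active bit goes away
at the very start of the teardown, while the online bit only changes later.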

It allows us to simplify the domain management code and avoid unnecessary
domain rebuilds during cpu hotplug event handling.

Please ignore the cpusets part for now; it needs some more work to avoid
crazy lock nesting. I did, however, simplify and unify the domain
reinitialization logic: we now simply call partition_sched_domains() in all
cases, which means we're using exactly the same code paths as in the cpusets
case, and hence the tests below cover cpusets too.
Cpuset changes to make rebuild_sched_domains() callable from various
contexts are in a separate patch (right after this one).
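
For illustration only, here is a hedged userspace sketch of the convention this
relies on (illustrative names, not the kernel code): passing a NULL domain list
to the partitioning call now always means "drop the current partitioning and
rebuild the single fallback domain", which is why one entry point is enough for
hotplug, arch_reinit_sched_domains() and the cpusets-off stub alike.

/*
 * Userspace model of the control flow only; everything here is made up
 * except the "NULL means full rebuild" convention it demonstrates.
 */
#include <stdio.h>
#include <stddef.h>

static int ndoms_cur;	/* how many domains are currently built */

static void partition_domains(int ndoms_new, const int *doms_new)
{
	int i;

	/* NULL request: no explicit new domains, so every old one goes away. */
	if (doms_new == NULL)
		ndoms_new = 0;

	for (i = 0; i < ndoms_cur; i++)
		printf("destroy domain %d\n", i);

	/* NULL request, part two: rebuild the single fallback domain. */
	if (doms_new == NULL) {
		ndoms_new = 1;
		printf("build fallback domain (online & ~isolated cpus)\n");
	} else {
		for (i = 0; i < ndoms_new; i++)
			printf("build domain %d\n", i);
	}

	ndoms_cur = ndoms_new;
}

int main(void)
{
	partition_domains(0, NULL);	/* what the rebuild stub would do */
	partition_domains(0, NULL);	/* a hotplug event does the same call */
	return 0;
}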

This not only boots but also easily handles
	while true; do make clean; make -j 8; done
and
	while true; do on-off-cpu 1; done
at the same time.
(on-off-cpu 1 simply does the echo 0/1 > /sys/.../cpu1/online thing).
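
For completeness, a hypothetical C stand-in for on-off-cpu (the real helper is
just the shell one-liner above; the sysfs path below is an assumption about the
usual location, and it needs root plus a hotplug-capable kernel):

#include <stdio.h>

/* Write a short string to a sysfs file; returns 0 on success. */
static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	const char *online = "/sys/devices/system/cpu/cpu1/online"; /* assumed path */

	if (write_str(online, "0"))	/* take CPU 1 down */
		return 1;
	if (write_str(online, "1"))	/* bring it back up */
		return 1;
	return 0;
}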

Surprisingly, the box (dual-core Core2) is quite usable. In fact I'm typing
this right now in gnome-terminal and things are moving along just fine.

Also, this is running with most of the debug features enabled (lockdep,
mutex debugging, etc.) and there have been no BUG_ONs or lockdep complaints
so far.

I believe I addressed all of Dmitry's comments on Linus' original version.
I changed both the fair and rt balancers to mask out non-active cpus, and
replaced cpu_is_offline() with !cpu_active() in the main scheduler code
where it made sense (to me).
Signed-off-by: Max Krasnyanskiy <maxk@qualcomm.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Gregory Haskins <ghaskins@novell.com>
Cc: dmitry.adamushko@gmail.com
Cc: pj@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent 7ebefa8c
@@ -359,13 +359,14 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
 /*
  * The following particular system cpumasks and operations manage
- * possible, present and online cpus.  Each of them is a fixed size
+ * possible, present, active and online cpus.  Each of them is a fixed size
  * bitmap of size NR_CPUS.
  *
  *  #ifdef CONFIG_HOTPLUG_CPU
  *     cpu_possible_map - has bit 'cpu' set iff cpu is populatable
  *     cpu_present_map  - has bit 'cpu' set iff cpu is populated
  *     cpu_online_map   - has bit 'cpu' set iff cpu available to scheduler
+ *     cpu_active_map   - has bit 'cpu' set iff cpu available to migration
  *  #else
  *     cpu_possible_map - has bit 'cpu' set iff cpu is populated
  *     cpu_present_map  - copy of cpu_possible_map
@@ -416,6 +417,7 @@ static inline void __cpus_fold(cpumask_t *dstp, const cpumask_t *origp,
 extern cpumask_t cpu_possible_map;
 extern cpumask_t cpu_online_map;
 extern cpumask_t cpu_present_map;
+extern cpumask_t cpu_active_map;
 
 #if NR_CPUS > 1
 #define num_online_cpus()	cpus_weight(cpu_online_map)
@@ -424,6 +426,7 @@ extern cpumask_t cpu_present_map;
 #define cpu_online(cpu)		cpu_isset((cpu), cpu_online_map)
 #define cpu_possible(cpu)	cpu_isset((cpu), cpu_possible_map)
 #define cpu_present(cpu)	cpu_isset((cpu), cpu_present_map)
+#define cpu_active(cpu)		cpu_isset((cpu), cpu_active_map)
 #else
 #define num_online_cpus()	1
 #define num_possible_cpus()	1
@@ -431,6 +434,7 @@ extern cpumask_t cpu_present_map;
 #define cpu_online(cpu)		((cpu) == 0)
 #define cpu_possible(cpu)	((cpu) == 0)
 #define cpu_present(cpu)	((cpu) == 0)
+#define cpu_active(cpu)		((cpu) == 0)
 #endif
 
 #define cpu_is_offline(cpu)	unlikely(!cpu_online(cpu))

@@ -78,6 +78,8 @@ extern void cpuset_track_online_nodes(void);
 
 extern int current_cpuset_is_being_rebound(void);
 
+extern void rebuild_sched_domains(void);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -156,6 +158,11 @@ static inline int current_cpuset_is_being_rebound(void)
 	return 0;
 }
 
+static inline void rebuild_sched_domains(void)
+{
+	partition_sched_domains(0, NULL, NULL);
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */

@@ -415,6 +415,13 @@ static void __init smp_init(void)
 {
 	unsigned int cpu;
 
+	/*
+	 * Set up the current CPU as possible to migrate to.
+	 * The other ones will be done by cpu_up/cpu_down()
+	 */
+	cpu = smp_processor_id();
+	cpu_set(cpu, cpu_active_map);
+
 	/* FIXME: This should be done in userspace --RR */
 	for_each_present_cpu(cpu) {
 		if (num_online_cpus() >= setup_max_cpus)
@@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void)
 	cpu_hotplug.refcount = 0;
 }
 
+cpumask_t cpu_active_map;
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 void get_online_cpus(void)
@@ -291,11 +293,20 @@ int __ref cpu_down(unsigned int cpu)
 	int err = 0;
 
 	cpu_maps_update_begin();
-	if (cpu_hotplug_disabled)
+
+	if (cpu_hotplug_disabled) {
 		err = -EBUSY;
-	else
-		err = _cpu_down(cpu, 0);
+		goto out;
+	}
+
+	cpu_clear(cpu, cpu_active_map);
+
+	err = _cpu_down(cpu, 0);
+
+	if (cpu_online(cpu))
+		cpu_set(cpu, cpu_active_map);
 
+out:
 	cpu_maps_update_done();
 	return err;
 }
@@ -355,11 +366,18 @@ int __cpuinit cpu_up(unsigned int cpu)
 	}
 
 	cpu_maps_update_begin();
-	if (cpu_hotplug_disabled)
+
+	if (cpu_hotplug_disabled) {
 		err = -EBUSY;
-	else
-		err = _cpu_up(cpu, 0);
+		goto out;
+	}
+
+	err = _cpu_up(cpu, 0);
+
+	if (cpu_online(cpu))
+		cpu_set(cpu, cpu_active_map);
 
+out:
 	cpu_maps_update_done();
 	return err;
 }

@@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  * partition_sched_domains().
  */
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
 {
 	struct kfifo *q;	/* queue of cpusets to be scanned */
 	struct cpuset *cp;	/* scans q */
@@ -2881,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 	rq = task_rq_lock(p, &flags);
 	if (!cpu_isset(dest_cpu, p->cpus_allowed)
-	    || unlikely(cpu_is_offline(dest_cpu)))
+	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
 	/* force the process onto the specified CPU */
@@ -3849,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick)
 		/*
 		 * If we are going offline and still the leader, give up!
 		 */
-		if (cpu_is_offline(cpu) &&
+		if (!cpu_active(cpu) &&
 		    atomic_read(&nohz.load_balancer) == cpu) {
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
 				BUG();
@@ -5876,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	struct rq *rq_dest, *rq_src;
 	int ret = 0, on_rq;
 
-	if (unlikely(cpu_is_offline(dest_cpu)))
+	if (unlikely(!cpu_active(dest_cpu)))
 		return ret;
 
 	rq_src = cpu_rq(src_cpu);
@@ -7553,18 +7553,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 {
 }
 
-/*
- * Free current domain masks.
- * Called after all cpus are attached to NULL domain.
- */
-static void free_sched_domains(void)
-{
-	ndoms_cur = 0;
-	if (doms_cur != &fallback_doms)
-		kfree(doms_cur);
-	doms_cur = &fallback_doms;
-}
-
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
@@ -7643,7 +7631,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * ownership of it and will kfree it when done with it. If the caller
  * failed the kmalloc call, then it can pass in doms_new == NULL,
  * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms'.
+ * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * Call with hotplug lock held
  */
@@ -7657,12 +7645,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
-	if (doms_new == NULL) {
-		ndoms_new = 1;
-		doms_new = &fallback_doms;
-		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
-	}
+	if (doms_new == NULL)
+		ndoms_new = 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
@@ -7677,6 +7661,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 			;
 	}
 
+	if (doms_new == NULL) {
+		ndoms_cur = 0;
+		ndoms_new = 1;
+		doms_new = &fallback_doms;
+		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+		dattr_new = NULL;
+	}
+
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur; j++) {
@@ -7707,17 +7699,10 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 int arch_reinit_sched_domains(void)
 {
-	int err;
-
 	get_online_cpus();
-	mutex_lock(&sched_domains_mutex);
-	detach_destroy_domains(&cpu_online_map);
-	free_sched_domains();
-	err = arch_init_sched_domains(&cpu_online_map);
-	mutex_unlock(&sched_domains_mutex);
+	rebuild_sched_domains();
 	put_online_cpus();
-
-	return err;
+	return 0;
 }
 
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7783,14 +7768,30 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
+#ifndef CONFIG_CPUSETS
 /*
- * Force a reinitialization of the sched domains hierarchy. The domains
- * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to the NULL domain
- * which will prevent rebalancing while the sched domains are recalculated.
+ * Add online and remove offline CPUs from the scheduler domains.
+ * When cpusets are enabled they take over this function.
 */
 static int update_sched_domains(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
+{
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		partition_sched_domains(0, NULL, NULL);
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+#endif
+
+static int update_runtime(struct notifier_block *nfb,
+				unsigned long action, void *hcpu)
 {
 	int cpu = (int)(long)hcpu;
 
@@ -7798,44 +7799,18 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		disable_runtime(cpu_rq(cpu));
-		/* fall-through */
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		detach_destroy_domains(&cpu_online_map);
-		free_sched_domains();
 		return NOTIFY_OK;
 
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		enable_runtime(cpu_rq(cpu));
-		/* fall-through */
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		/*
-		 * Fall through and re-initialise the domains.
-		 */
-		break;
+		return NOTIFY_OK;
 
 	default:
 		return NOTIFY_DONE;
 	}
-
-#ifndef CONFIG_CPUSETS
-	/*
-	 * Create default domain partitioning if cpusets are disabled.
-	 * Otherwise we let cpusets rebuild the domains based on the
-	 * current setup.
-	 */
-
-	/* The hotplug lock is already held by cpu_up/cpu_down */
-	arch_init_sched_domains(&cpu_online_map);
-#endif
-
-	return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)
@@ -7855,8 +7830,15 @@ void __init sched_init_smp(void)
 	cpu_set(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
+
+#ifndef CONFIG_CPUSETS
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
+#endif
+
+	/* RT runtime code needs to handle some hotplug events */
+	hotcpu_notifier(update_runtime, 0);
+
 	init_hrtick();
 
 	/* Move init over to a non-isolated CPU */
@@ -1004,6 +1004,8 @@ static void yield_task_fair(struct rq *rq)
  * not idle and an idle cpu is available.  The span of cpus to
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map)
  *
  * Returns the CPU we should wake onto.
  */
@@ -1031,6 +1033,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
 			&& !task_hot(p, task_rq(p)->clock, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
+			cpus_and(tmp, tmp, cpu_active_map);
 			for_each_cpu_mask(i, tmp) {
 				if (idle_cpu(i)) {
 					if (i != task_cpu(p)) {
@@ -933,6 +933,13 @@ static int find_lowest_rq(struct task_struct *task)
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
 		return -1; /* No targets found */
 
+	/*
+	 * Only consider CPUs that are usable for migration.
+	 * I guess we might want to change cpupri_find() to ignore those
+	 * in the first place.
+	 */
+	cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+
 	/*
 	 * At this point we have built a mask of cpus representing the
 	 * lowest priority tasks in the system. Now we want to elect