Commit ddea677b authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Move migrate_all_tasks to CPU_DEAD handling

From: Srivatsa Vaddagiri <vatsa@in.ibm.com>

migrate_all_tasks is currently run with the rest of the machine stopped.
It iterates through the complete task table, turning off the cpu affinity of
any task that it finds affine to the dying cpu.  Depending on the task table
size this can take considerable time, and for all of that time the machine is
stopped, doing nothing.

Stopping the machine for such extended periods can be avoided if we do the
task migration from the CPU_DEAD notification instead, and that's precisely
what this patch does.
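
The mechanics are the standard CPU hotplug notifier pattern.  Below is a
condensed, illustrative sketch (the example_* names are made up; the real
handler is migration_call(), modified near the end of the diff).  CPU_DEAD is
delivered after the dead CPU has stopped scheduling, while every other CPU
runs normally, so the potentially long walk over the task table no longer
happens under stop_machine:

static int example_hotplug_notify(struct notifier_block *nfb,
                                  unsigned long action, void *hcpu)
{
        int cpu = (long)hcpu;

        switch (action) {
        case CPU_DEAD:
                /* done at leisure: the rest of the machine is running */
                migrate_all_tasks(cpu);
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block example_hotplug_nb = {
        .notifier_call  = example_hotplug_notify,
        .priority       = 10,   /* run ahead of lower-priority CPU_DEAD users */
};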

The patch puts the idle task at the _front_ of the dying CPU's runqueue at the
highest priority possible.  This causes the idle thread to run _immediately_
after the kstopmachine thread yields.  The idle thread notices that its cpu is
offline and dies quickly.  Task migration can then be done at leisure from the
CPU_DEAD notification, when the rest of the CPUs are running.
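
In scheduler terms the trick is small.  Here is a condensed sketch of what the
new sched_idle_next() (added further down in the diff) does on the dying CPU;
the example_ prefix marks it as illustrative, not the patch's exact code:

static void example_sched_idle_next(runqueue_t *rq)
{
        struct task_struct *idle = rq->idle;
        unsigned long flags;

        spin_lock_irqsave(&rq->lock, flags);
        /* highest RT priority, queued at the *head* of the active array */
        __setscheduler(idle, SCHED_FIFO, MAX_RT_PRIO - 1);
        enqueue_task_head(idle, rq->active);
        rq->nr_running++;
        spin_unlock_irqrestore(&rq->lock, flags);
}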

Some advantages of this approach are:

        - More scalable.  The amount of time that the machine is stopped is
          predictable.
        - No changes to hot path/core code.  We are just exploiting the
          scheduler rule that the next task to run is the highest-priority
          task at the head of the runqueue.  Also, since I put the idle task
          at the _front_ of the runqueue, there are no races when an equally
          high-priority task is woken up and added to the runqueue.  It gets
          in at the back of the runqueue, _after_ the idle task (see the
          enqueue sketch after this list)!
        - The cpu_is_offline check that is presently required in
          try_to_wake_up, idle_balance and rebalance_tick can be removed,
          thus speeding them up a bit.
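
For reference, the head-vs-tail contrast comes down to which list helper the
enqueue path uses.  The sketch below assumes the 2.6 prio_array_t layout and
uses illustrative example_* names; the real enqueue_task() and
enqueue_task_head() appear in the scheduler hunks of the diff:

/* normal wakeup path: a newly runnable task goes to the back of its list */
static inline void example_enqueue_task(struct task_struct *p, prio_array_t *array)
{
        list_add_tail(&p->run_list, array->queue + p->prio);
        __set_bit(p->prio, array->bitmap);
        array->nr_active++;
        p->array = array;
}

/* offline path: the boosted idle task goes to the front, so a concurrent
 * wakeup at the same priority still lands behind it */
static inline void example_enqueue_task_head(struct task_struct *p, prio_array_t *array)
{
        list_add(&p->run_list, array->queue + p->prio);
        __set_bit(p->prio, array->bitmap);
        array->nr_active++;
        p->array = array;
}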

From: Srivatsa Vaddagiri <vatsa@in.ibm.com>

  Rusty mentioned that the unlikely hints around cpu_is_offline are redundant,
  since the macro already has that hint.  The patch below removes those
  redundant hints I added.
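
  (For reference, this is roughly how the macro was defined at the time; the
  branch-prediction hint is built into the macro itself, so wrapping call
  sites in unlikely() again buys nothing.)

  #define cpu_is_offline(cpu)   unlikely(!cpu_online(cpu))
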
parent 4197ad87
@@ -671,8 +671,7 @@ extern void sched_balance_exec(void);
 #define sched_balance_exec() {}
 #endif
 
-/* Move tasks off this (offline) CPU onto another. */
-extern void migrate_all_tasks(void);
+extern void sched_idle_next(void);
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(task_t *p);
 extern int task_nice(task_t *p);
@@ -43,15 +43,16 @@ void unregister_cpu_notifier(struct notifier_block *nb)
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static inline void check_for_tasks(int cpu, struct task_struct *k)
+static inline void check_for_tasks(int cpu)
 {
         struct task_struct *p;
 
         write_lock_irq(&tasklist_lock);
         for_each_process(p) {
-                if (task_cpu(p) == cpu && p != k)
-                        printk(KERN_WARNING "Task %s is on cpu %d\n",
-                                p->comm, cpu);
+                if (task_cpu(p) == cpu && (p->utime != 0 || p->stime != 0))
+                        printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
+                                (state = %ld, flags = %lx) \n",
+                                p->comm, p->pid, cpu, p->state, p->flags);
         }
         write_unlock_irq(&tasklist_lock);
 }
@@ -96,8 +97,9 @@ static int take_cpu_down(void *unused)
         if (err < 0)
                 cpu_set(smp_processor_id(), cpu_online_map);
         else
-                /* Everyone else gets kicked off. */
-                migrate_all_tasks();
+                /* Force idle task to run as soon as we yield: it should
+                   immediately notice cpu is offline and die quickly. */
+                sched_idle_next();
 
         return err;
 }
@@ -106,6 +108,7 @@ int cpu_down(unsigned int cpu)
 {
         int err;
         struct task_struct *p;
+        cpumask_t old_allowed, tmp;
 
         if ((err = lock_cpu_hotplug_interruptible()) != 0)
                 return err;
@@ -120,17 +123,21 @@ int cpu_down(unsigned int cpu)
                 goto out;
         }
 
+        /* Ensure that we are not runnable on dying cpu */
+        old_allowed = current->cpus_allowed;
+        tmp = CPU_MASK_ALL;
+        cpu_clear(cpu, tmp);
+        set_cpus_allowed(current, tmp);
+
         p = __stop_machine_run(take_cpu_down, NULL, cpu);
         if (IS_ERR(p)) {
                 err = PTR_ERR(p);
-                goto out;
+                goto out_allowed;
         }
 
         if (cpu_online(cpu))
                 goto out_thread;
 
-        check_for_tasks(cpu, p);
-
         /* Wait for it to sleep (leaving idle task). */
         while (!idle_cpu(cpu))
                 yield();
@@ -146,10 +153,14 @@ int cpu_down(unsigned int cpu)
             == NOTIFY_BAD)
                 BUG();
 
+        check_for_tasks(cpu);
+
         cpu_run_sbin_hotplug(cpu, "offline");
 
 out_thread:
         err = kthread_stop(p);
+out_allowed:
+        set_cpus_allowed(current, old_allowed);
 out:
         unlock_cpu_hotplug();
         return err;
@@ -26,6 +26,7 @@
 #include <linux/binfmts.h>
 #include <linux/mman.h>
 #include <linux/fs.h>
+#include <linux/cpu.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -1196,8 +1197,15 @@ long do_fork(unsigned long clone_flags,
                         wake_up_forked_thread(p);
                 else
                         wake_up_forked_process(p);
-        } else
+        } else {
+                int cpu = get_cpu();
+
                 p->state = TASK_STOPPED;
+                if (cpu_is_offline(task_cpu(p)))
+                        set_task_cpu(p, cpu);
+
+                put_cpu();
+        }
 
         ++total_forks;
         if (unlikely (trace)) {
@@ -331,7 +331,6 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array)
         p->array = array;
 }
 
-#ifdef CONFIG_SMP
 /*
  * Used by the migration code - we pull tasks from the head of the
  * remote queue so we want these tasks to show up at the head of the
@@ -344,7 +343,6 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
         array->nr_active++;
         p->array = array;
 }
-#endif
 
 /*
  * effective_prio - return the priority that is based on the static
@@ -386,6 +384,15 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
         rq->nr_running++;
 }
 
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+        enqueue_task_head(p, rq->active);
+        rq->nr_running++;
+}
+
 static void recalc_task_prio(task_t *p, unsigned long long now)
 {
         unsigned long long __sleep_time = now - p->timestamp;
@@ -749,7 +756,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
         this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
-        if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
+        if (unlikely(task_running(rq, p)))
                 goto out_activate;
 
         new_cpu = cpu;
@@ -1781,9 +1788,6 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 {
         struct sched_domain *sd;
 
-        if (unlikely(cpu_is_offline(this_cpu)))
-                return;
-
         for_each_domain(this_cpu, sd) {
                 if (sd->flags & SD_BALANCE_NEWIDLE) {
                         if (load_balance_newidle(this_cpu, this_rq, sd)) {
@@ -1871,9 +1875,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
         unsigned long j = jiffies + CPU_OFFSET(this_cpu);
         struct sched_domain *sd;
 
-        if (unlikely(cpu_is_offline(this_cpu)))
-                return;
-
         /* Update our load */
         old_load = this_rq->cpu_load;
         this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
@@ -3325,18 +3326,19 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
  */
-static void __migrate_task(struct task_struct *p, int dest_cpu)
+static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
-        runqueue_t *rq_dest;
+        runqueue_t *rq_dest, *rq_src;
 
         if (unlikely(cpu_is_offline(dest_cpu)))
                 return;
 
+        rq_src = cpu_rq(src_cpu);
         rq_dest = cpu_rq(dest_cpu);
 
-        double_rq_lock(this_rq(), rq_dest);
+        double_rq_lock(rq_src, rq_dest);
         /* Already moved. */
-        if (task_cpu(p) != smp_processor_id())
+        if (task_cpu(p) != src_cpu)
                 goto out;
         /* Affinity changed (again). */
         if (!cpu_isset(dest_cpu, p->cpus_allowed))
@@ -3344,7 +3346,7 @@ static void __migrate_task(struct task_struct *p, int dest_cpu)
 
         set_task_cpu(p, dest_cpu);
         if (p->array) {
-                deactivate_task(p, this_rq());
+                deactivate_task(p, rq_src);
                 activate_task(p, rq_dest);
                 if (TASK_PREEMPTS_CURR(p, rq_dest))
                         resched_task(rq_dest->curr);
@@ -3352,7 +3354,7 @@ static void __migrate_task(struct task_struct *p, int dest_cpu)
         p->timestamp = rq_dest->timestamp_last_tick;
 
 out:
-        double_rq_unlock(this_rq(), rq_dest);
+        double_rq_unlock(rq_src, rq_dest);
 }
 
 /*
@@ -3376,6 +3378,12 @@ static int migration_thread(void * data)
                         refrigerator(PF_FREEZE);
 
                 spin_lock_irq(&rq->lock);
+
+                if (cpu_is_offline(cpu)) {
+                        spin_unlock_irq(&rq->lock);
+                        goto wait_to_die;
+                }
+
                 if (rq->active_balance) {
                         active_load_balance(rq, cpu);
                         rq->active_balance = 0;
@@ -3394,7 +3402,8 @@ static int migration_thread(void * data)
 
                 if (req->type == REQ_MOVE_TASK) {
                         spin_unlock(&rq->lock);
-                        __migrate_task(req->task, req->dest_cpu);
+                        __migrate_task(req->task, smp_processor_id(),
+                                        req->dest_cpu);
                         local_irq_enable();
                 } else if (req->type == REQ_SET_DOMAIN) {
                         rq->sd = req->sd;
@@ -3407,23 +3416,27 @@ static int migration_thread(void * data)
                 complete(&req->done);
         }
         return 0;
+
+wait_to_die:
+        /* Wait for kthread_stop */
+        set_current_state(TASK_INTERRUPTIBLE);
+        while (!kthread_should_stop()) {
+                schedule();
+                set_current_state(TASK_INTERRUPTIBLE);
+        }
+        __set_current_state(TASK_RUNNING);
+        return 0;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-/* migrate_all_tasks - function to migrate all the tasks from the
- * current cpu caller must have already scheduled this to the target
- * cpu via set_cpus_allowed. Machine is stopped. */
-void migrate_all_tasks(void)
+/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */
+static void migrate_all_tasks(int src_cpu)
 {
         struct task_struct *tsk, *t;
-        int dest_cpu, src_cpu;
+        int dest_cpu;
         unsigned int node;
 
-        /* We're nailed to this CPU. */
-        src_cpu = smp_processor_id();
-
-        /* Not required, but here for neatness. */
-        write_lock(&tasklist_lock);
+        write_lock_irq(&tasklist_lock);
 
         /* watch out for per node tasks, let's stay on this node */
         node = cpu_to_node(src_cpu);
@@ -3459,10 +3472,36 @@ void migrate_all_tasks(void)
                                 tsk->pid, tsk->comm, src_cpu);
                 }
 
-                __migrate_task(tsk, dest_cpu);
+                __migrate_task(tsk, src_cpu, dest_cpu);
         } while_each_thread(t, tsk);
 
-        write_unlock(&tasklist_lock);
+        write_unlock_irq(&tasklist_lock);
 }
 
+/* Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible and adding it to
+ * the _front_ of runqueue. Used by CPU offline code.
+ */
+void sched_idle_next(void)
+{
+        int cpu = smp_processor_id();
+        runqueue_t *rq = this_rq();
+        struct task_struct *p = rq->idle;
+        unsigned long flags;
+
+        /* cpu has to be offline */
+        BUG_ON(cpu_online(cpu));
+
+        /* Strictly not necessary since rest of the CPUs are stopped by now
+         * and interrupts disabled on current cpu.
+         */
+        spin_lock_irqsave(&rq->lock, flags);
+
+        __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+        /* Add idle task to _front_ of it's priority queue */
+        __activate_idle_task(p, rq);
+
+        spin_unlock_irqrestore(&rq->lock, flags);
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
@@ -3498,10 +3537,19 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
         case CPU_UP_CANCELED:
                 /* Unbind it from offline cpu so it can run. Fall thru. */
                 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
+                kthread_stop(cpu_rq(cpu)->migration_thread);
+                cpu_rq(cpu)->migration_thread = NULL;
+                break;
         case CPU_DEAD:
+                migrate_all_tasks(cpu);
                 rq = cpu_rq(cpu);
                 kthread_stop(rq->migration_thread);
                 rq->migration_thread = NULL;
-
+                /* Idle task back to normal (off runqueue, low prio) */
+                rq = task_rq_lock(rq->idle, &flags);
+                deactivate_task(rq->idle, rq);
+                __setscheduler(rq->idle, SCHED_NORMAL, MAX_PRIO);
+                task_rq_unlock(rq, &flags);
+                BUG_ON(rq->nr_running != 0);
 
                 /* No need to migrate the tasks: it was best-effort if
@@ -3523,8 +3571,12 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
         return NOTIFY_OK;
 }
 
+/* Register at highest priority so that task migration (migrate_all_tasks)
+ * happens before everything else.
+ */
 static struct notifier_block __devinitdata migration_notifier = {
         .notifier_call = migration_call,
+        .priority = 10
 };
 
 int __init migration_init(void)