Commit 969c7921 authored by Tejun Heo

sched: replace migration_thread with cpu_stop

Currently migration_thread serves three purposes - migration pusher,
context to execute active_load_balance(), and forced context switcher
for expedited RCU synchronize_sched().  All three roles are hardcoded
into migration_thread(), and determining which job to run is slightly
messy.

This patch kills migration_thread and replaces all three uses with
cpu_stop.  The three different roles of migration_thread() are split
into three separate cpu_stop callbacks -
migration_cpu_stop(), active_load_balance_cpu_stop() and
synchronize_sched_expedited_cpu_stop() - and each use case now simply
asks cpu_stop to execute the callback as necessary.
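
For context, a cpu_stop callback (from the cpu_stop API added by the
parent commit) is just an int (*fn)(void *) that the per-cpu stopper
thread runs on the target CPU with preemption disabled.  A minimal
sketch of dispatching one - example_cpu_stop() and kick_example() are
illustrative names, not part of this patch:

    #include <linux/kernel.h>
    #include <linux/smp.h>
    #include <linux/stop_machine.h>

    /* runs on the target CPU, inside its stopper thread */
    static int example_cpu_stop(void *arg)
    {
            pr_info("stopper running on cpu %d\n", smp_processor_id());
            return 0;
    }

    static int kick_example(unsigned int cpu)
    {
            /* queue the callback on @cpu and wait for it to finish */
            return stop_one_cpu(cpu, example_cpu_stop, NULL);
    }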

synchronize_sched_expedited() was implemented with private
preallocated resources and custom multi-cpu queueing and waiting
logic; both are now provided by cpu_stop.
synchronize_sched_expedited_count is made atomic and all other shared
resources, along with the mutex, are dropped.
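
In rough outline - a simplified sketch under the assumptions above,
not the verbatim patch (the full diff is collapsed below) - the
expedited grace period now runs a barrier-only callback on every
online CPU through cpu_stop's multi-cpu interface:

    /* sketch; assumes the usual kernel includes (linux/cpu.h,
     * linux/stop_machine.h, asm/atomic.h) */
    static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);

    /* preempting whatever was running on each CPU acts as a forced
     * quiescent state for that CPU */
    static int synchronize_sched_expedited_cpu_stop(void *data)
    {
            smp_mb();
            return 0;
    }

    void synchronize_sched_expedited(void)
    {
            get_online_cpus();
            /* cpu_stop supplies the multi-cpu queueing and waiting
             * that used to be open-coded; -EAGAIN means the stoppers
             * are busy, so retry (the real code eventually falls
             * back to synchronize_sched()) */
            while (try_stop_cpus(cpu_online_mask,
                                 synchronize_sched_expedited_cpu_stop,
                                 NULL) == -EAGAIN)
                    cpu_relax();
            atomic_inc(&synchronize_sched_expedited_count);
            put_online_cpus();
    }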

synchronize_sched_expedited() also implemented a check to detect
cases where not all the callbacks got executed on their assigned cpus
and fell back to synchronize_sched().  If called with cpu hotplug
blocked, cpu_stop already guarantees that and the condition cannot
happen; otherwise, stop_machine() would break.  However, this patch
preserves the paranoid check using a cpumask to record on which cpus
the stopper ran, so that it can serve as a bisection point if
something actually goes wrong there.
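
Such a check can be as simple as the following sketch (illustrative
names, not the patch's actual code; assumes cpu hotplug is blocked by
the caller): every callback marks its CPU in a shared mask, and any
online CPU missing from the mask afterwards triggers the
synchronize_sched() fallback:

    /* sketch; assumes linux/cpumask.h, linux/rcupdate.h and
     * linux/stop_machine.h */
    static struct cpumask cpus_stopped;

    static int expedited_check_cpu_stop(void *unused)
    {
            cpumask_set_cpu(smp_processor_id(), &cpus_stopped);
            smp_mb();
            return 0;
    }

    static void expedited_with_check(void)
    {
            cpumask_clear(&cpus_stopped);
            if (stop_cpus(cpu_online_mask, expedited_check_cpu_stop, NULL) ||
                !cpumask_equal(&cpus_stopped, cpu_online_mask))
                    synchronize_sched();    /* paranoid fallback */
    }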

Because the internal execution state is no longer visible,
rcu_expedited_torture_stats() is removed.

This patch also renames the cpu_stop threads from "stopper/%d" to
"migration/%d".  The names of these threads ultimately don't matter,
and there's no reason to make unnecessary userland-visible changes.

With this patch applied, stop_machine() and sched now share the same
resources.  stop_machine() is faster without wasting any resources and
sched migration users are much cleaner.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Josh Triplett <josh@freedesktop.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
parent 3fc1f1e2
@@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following:
         sched_expedited-torture: Reader Pipe:  12660320201 95875 0 0 0 0 0 0 0 0 0
         sched_expedited-torture: Reader Batch:  12660424885 0 0 0 0 0 0 0 0 0 0
         sched_expedited-torture: Free-Block Circulation:  1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
-        state: -1 / 0:0 3:0 4:0
-
-As before, the first four lines are similar to those for RCU.
-The last line shows the task-migration state.  The first number is
--1 if synchronize_sched_expedited() is idle, -2 if in the process of
-posting wakeups to the migration kthreads, and N when waiting on CPU N.
-Each of the colon-separated fields following the "/" is a CPU:state pair.
-Valid states are "0" for idle, "1" for waiting for quiescent state,
-"2" for passed through quiescent state, and "3" when a race with a
-CPU-hotplug event forces use of the synchronize_sched() primitive.
 
 USAGE
......
@@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void)
         return 0;
 }
 
-extern int rcu_expedited_torture_stats(char *page);
-
 static inline void rcu_force_quiescent_state(void)
 {
 }
......
@@ -35,7 +35,6 @@ struct notifier_block;
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern int rcu_needs_cpu(int cpu);
-extern int rcu_expedited_torture_stats(char *page);
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
......
@@ -669,7 +669,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
         .sync           = synchronize_sched_expedited,
         .cb_barrier     = NULL,
         .fqs            = rcu_sched_force_quiescent_state,
-        .stats          = rcu_expedited_torture_stats,
+        .stats          = NULL,
         .irq_capable    = 1,
         .name           = "sched_expedited"
 };
......
[diff collapsed]
@@ -2798,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
         return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
+static int active_load_balance_cpu_stop(void *data);
+
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -2887,8 +2889,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
         if (need_active_balance(sd, sd_idle, idle)) {
                 raw_spin_lock_irqsave(&busiest->lock, flags);
 
-                /* don't kick the migration_thread, if the curr
-                 * task on busiest cpu can't be moved to this_cpu
+                /* don't kick the active_load_balance_cpu_stop,
+                 * if the curr task on busiest cpu can't be
+                 * moved to this_cpu
                  */
                 if (!cpumask_test_cpu(this_cpu,
                                       &busiest->curr->cpus_allowed)) {
@@ -2898,14 +2901,22 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         goto out_one_pinned;
                 }
 
+                /*
+                 * ->active_balance synchronizes accesses to
+                 * ->active_balance_work.  Once set, it's cleared
+                 * only after active load balance is finished.
+                 */
                 if (!busiest->active_balance) {
                         busiest->active_balance = 1;
                         busiest->push_cpu = this_cpu;
                         active_balance = 1;
                 }
                 raw_spin_unlock_irqrestore(&busiest->lock, flags);
+
                 if (active_balance)
-                        wake_up_process(busiest->migration_thread);
+                        stop_one_cpu_nowait(cpu_of(busiest),
+                                active_load_balance_cpu_stop, busiest,
+                                &busiest->active_balance_work);
 
                 /*
                  * We've kicked active balancing, reset the failure
@@ -3012,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 }
 
 /*
- * active_load_balance is run by migration threads. It pushes running tasks
- * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
- * running on each physical CPU where possible, and avoids physical /
- * logical imbalances.
- *
- * Called with busiest_rq locked.
+ * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * running tasks off the busiest CPU onto idle CPUs. It requires at
+ * least 1 task to be running on each physical CPU where possible, and
+ * avoids physical / logical imbalances.
  */
-static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
+static int active_load_balance_cpu_stop(void *data)
 {
+        struct rq *busiest_rq = data;
+        int busiest_cpu = cpu_of(busiest_rq);
         int target_cpu = busiest_rq->push_cpu;
-        struct rq *target_rq = cpu_rq(target_cpu);
         struct sched_domain *sd;
+        struct rq *target_rq;
+
+        raw_spin_lock_irq(&busiest_rq->lock);
+
+        /* make sure the requested cpu hasn't gone down in the meantime */
+        if (unlikely(busiest_cpu != smp_processor_id() ||
+                     !busiest_rq->active_balance))
+                goto out_unlock;
 
         /* Is there any task to move? */
         if (busiest_rq->nr_running <= 1)
-                return;
+                goto out_unlock;
+
+        target_rq = cpu_rq(target_cpu);
 
         /*
          * This condition is "impossible", if it occurs
@@ -3058,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
                 schedstat_inc(sd, alb_failed);
         }
         double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+        busiest_rq->active_balance = 0;
+        raw_spin_unlock_irq(&busiest_rq->lock);
+        return 0;
 }
 
 #ifdef CONFIG_NO_HZ
#ifdef CONFIG_NO_HZ #ifdef CONFIG_NO_HZ
......
@@ -301,7 +301,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
         case CPU_UP_PREPARE:
                 BUG_ON(stopper->thread || stopper->enabled ||
                        !list_empty(&stopper->works));
-                p = kthread_create(cpu_stopper_thread, stopper, "stopper/%d",
+                p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
                                    cpu);
                 if (IS_ERR(p))
                         return NOTIFY_BAD;
......