Commit 8c8cfc36 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] sched: balance-on-clone

From: Ingo Molnar <mingo@elte.hu>

Implement balancing during clone().  It does the following things:

- introduces SD_BALANCE_CLONE, which an architecture can use to limit the
  scope of the idlest-CPU search on clone().  E.g. 512-CPU systems should
  probably not enable this.

- uses the highest sd for the imbalance_pct, not this_rq (which didn't
  make sense).

- unifies balance-on-exec and balance-on-clone via the find_idlest_cpu()
  function.  Gets rid of sched_best_cpu(), which was still a bit
  inconsistent IMO: it used 'min_load < load' as the balancing condition,
  while a more correct approach is to use half of the imbalance_pct, as
  passive balancing does (a small sketch of this condition follows below).

- the patch also reintroduces the possibility of doing SD_BALANCE_EXEC on
  SMP systems, and activates it, to get it some testing.

- NOTE: there's one thing in this patch that is slightly unclean: I
  introduced wake_up_forked_thread(). I did this to make it easier to get
  rid of this patch later (wake_up_forked_process() has lots of
  dependencies in various architectures). If this capability remains in
  the kernel then I'll clean it up and introduce one function for
  wake_up_forked_process/thread.

- NOTE2: I added the SD_BALANCE_CLONE flag to the NUMA CPU template too.
  Some NUMA architectures probably want to disable this (see the toy
  illustration below).
parent a690c9b7
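
For reference, a minimal userspace sketch (not part of the patch) of the placement condition that find_idlest_cpu() applies in the diff below: the fresh task is moved only if keeping it on the current CPU would leave that CPU ahead of the idlest CPU by more than half of the domain's usual imbalance_pct margin. The helper name and the imbalance_pct value of 125 are illustrative assumptions, not taken from the patch.

#include <stdio.h>

/*
 * Same arithmetic as the check in find_idlest_cpu() below; in the kernel,
 * this_load already includes +SCHED_LOAD_SCALE to account for the new task.
 */
static int worth_migrating(unsigned long min_load, unsigned long this_load,
                           unsigned int imbalance_pct)
{
        /* half of the usual balancing threshold */
        return min_load * (100 + (imbalance_pct - 100) / 2) < this_load * 100;
}

int main(void)
{
        /* illustrative numbers: imbalance_pct 125 gives a ~12% margin here */
        printf("%d\n", worth_migrating(1000, 1100, 125)); /* 0: within margin */
        printf("%d\n", worth_migrating(1000, 1200, 125)); /* 1: migrate */
        return 0;
}
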
@@ -547,10 +547,11 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
#define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 2 /* Balance on exec */
#define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */
#define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */
#define SD_WAKE_BALANCE 16 /* Perform balancing at task wakeup */
#define SD_SHARE_CPUPOWER 32 /* Domain members share cpu power */
#define SD_BALANCE_CLONE 4 /* Balance on clone */
#define SD_WAKE_IDLE 8 /* Wake to idle CPU on task wakeup */
#define SD_WAKE_AFFINE 16 /* Wake task to waking CPU */
#define SD_WAKE_BALANCE 32 /* Perform balancing at task wakeup */
#define SD_SHARE_CPUPOWER 64 /* Domain members share cpu power */
struct sched_group {
struct sched_group *next; /* Must be a circular list */
@@ -598,6 +599,8 @@ struct sched_domain {
.cache_nice_tries = 0, \
.per_cpu_gain = 15, \
.flags = SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \
| SD_BALANCE_CLONE \
| SD_WAKE_AFFINE \
| SD_WAKE_IDLE \
| SD_SHARE_CPUPOWER, \
@@ -619,6 +622,8 @@ struct sched_domain {
.cache_nice_tries = 1, \
.per_cpu_gain = 100, \
.flags = SD_BALANCE_NEWIDLE \
| SD_BALANCE_EXEC \
| SD_BALANCE_CLONE \
| SD_WAKE_AFFINE \
| SD_WAKE_BALANCE, \
.last_balance = jiffies, \
@@ -640,6 +645,7 @@ struct sched_domain {
.cache_nice_tries = 1, \
.per_cpu_gain = 100, \
.flags = SD_BALANCE_EXEC \
| SD_BALANCE_CLONE \
| SD_WAKE_BALANCE, \
.last_balance = jiffies, \
.balance_interval = 1, \
@@ -659,7 +665,7 @@ static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask)
extern unsigned long long sched_clock(void);
#ifdef CONFIG_NUMA
#ifdef CONFIG_SMP
extern void sched_balance_exec(void);
#else
#define sched_balance_exec() {}
@@ -717,12 +723,17 @@ extern void do_timer(struct pt_regs *);
extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
extern int FASTCALL(wake_up_process(struct task_struct * tsk));
extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
extern void FASTCALL(wake_up_forked_thread(struct task_struct * tsk));
#else
static inline void kick_process(struct task_struct *tsk) { }
static inline void wake_up_forked_thread(struct task_struct * tsk)
{
return wake_up_forked_process(tsk);
}
#endif
extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
extern void FASTCALL(sched_fork(task_t * p));
extern void FASTCALL(sched_exit(task_t * p));
......
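
To make NOTE2 above concrete, here is a toy, standalone C illustration (not kernel code, and not an interface added by this patch) of composing a node-level flags value with and without SD_BALANCE_CLONE, using the bit values defined in the hunk above; the variable names are hypothetical.

#include <stdio.h>

/* Flag values as defined by this patch. */
#define SD_BALANCE_EXEC   2
#define SD_BALANCE_CLONE  4
#define SD_WAKE_BALANCE  32

int main(void)
{
        /* what the patched NUMA node template enables */
        unsigned int node_flags_default =
                SD_BALANCE_EXEC | SD_BALANCE_CLONE | SD_WAKE_BALANCE;
        /* what a NUMA architecture opting out of clone balancing would use */
        unsigned int node_flags_no_clone =
                SD_BALANCE_EXEC | SD_WAKE_BALANCE;

        printf("default: %#x, clone balancing disabled: %#x\n",
               node_flags_default, node_flags_no_clone);
        return 0;
}
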
@@ -1180,9 +1180,23 @@ long do_fork(unsigned long clone_flags,
set_tsk_thread_flag(p, TIF_SIGPENDING);
}
if (!(clone_flags & CLONE_STOPPED))
wake_up_forked_process(p); /* do this last */
if (!(clone_flags & CLONE_STOPPED)) {
/*
* Do the wakeup last. On SMP we treat fork() and
* CLONE_VM separately, because fork() has already
* created cache footprint on this CPU (due to
* copying the pagetables), hence migration would
* probably be costly. Threads on the other hand
* have less traction to the current CPU, and if
* there's an imbalance then the scheduler can
* migrate this fresh thread now, before it
* accumulates a larger cache footprint:
*/
if (clone_flags & CLONE_VM)
wake_up_forked_thread(p);
else
wake_up_forked_process(p);
} else
p->state = TASK_STOPPED;
++total_forks;
......
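
A small userspace sketch (not part of the patch) of the two cases the do_fork() hunk above distinguishes: fork() does not pass CLONE_VM, so the child is woken via wake_up_forked_process() and starts out on the parent's CPU, while pthread_create() passes CLONE_VM, so on a kernel with this patch the new thread is woken via wake_up_forked_thread() and may be placed on the idlest CPU of a SD_BALANCE_CLONE domain. Actual placement depends on load at the time; build with -pthread.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static void *thread_fn(void *arg)
{
        /* a fresh CLONE_VM thread: eligible for balance-on-clone placement */
        printf("thread started\n");
        return NULL;
}

int main(void)
{
        pid_t pid = fork();        /* no CLONE_VM: wake_up_forked_process() path */
        if (pid < 0) {
                perror("fork");
                return EXIT_FAILURE;
        }
        if (pid == 0) {
                printf("forked child started\n");
                _exit(0);
        }
        waitpid(pid, NULL, 0);

        pthread_t tid;             /* CLONE_VM set: wake_up_forked_thread() path */
        if (pthread_create(&tid, NULL, thread_fn, NULL) != 0) {
                perror("pthread_create");
                return EXIT_FAILURE;
        }
        pthread_join(tid, NULL);
        return 0;
}
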
@@ -1156,7 +1156,133 @@ enum idle_type
};
#ifdef CONFIG_SMP
#ifdef CONFIG_NUMA
/*
* find_idlest_cpu - find the least busy runqueue.
*/
static int find_idlest_cpu(struct task_struct *p, int this_cpu,
struct sched_domain *sd)
{
unsigned long load, min_load, this_load;
int i, min_cpu;
cpumask_t mask;
min_cpu = UINT_MAX;
min_load = ULONG_MAX;
cpus_and(mask, sd->span, cpu_online_map);
cpus_and(mask, mask, p->cpus_allowed);
for_each_cpu_mask(i, mask) {
load = target_load(i);
if (load < min_load) {
min_cpu = i;
min_load = load;
/* break out early on an idle CPU: */
if (!min_load)
break;
}
}
/* add +1 to account for the new task */
this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
/*
* With the addition of the new task, would there be an
* imbalance between the current CPU and the idlest
* CPU?
*
* Use half of the balancing threshold - new-context is
* a good opportunity to balance.
*/
if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
return min_cpu;
return this_cpu;
}
/*
* wake_up_forked_thread - wake up a freshly forked thread.
*
* This function will do some initial scheduler statistics housekeeping
* that must be done for every newly created context, and it also does
* runqueue balancing.
*/
void fastcall wake_up_forked_thread(task_t * p)
{
unsigned long flags;
int this_cpu = get_cpu(), cpu;
struct sched_domain *tmp, *sd = NULL;
runqueue_t *this_rq = cpu_rq(this_cpu), *rq;
/*
* Find the largest domain that this CPU is part of that
* is willing to balance on clone:
*/
for_each_domain(this_cpu, tmp)
if (tmp->flags & SD_BALANCE_CLONE)
sd = tmp;
if (sd)
cpu = find_idlest_cpu(p, this_cpu, sd);
else
cpu = this_cpu;
local_irq_save(flags);
lock_again:
rq = cpu_rq(cpu);
double_rq_lock(this_rq, rq);
BUG_ON(p->state != TASK_RUNNING);
/*
* We did find_idlest_cpu() unlocked, so in theory
* the mask could have changed - just don't migrate
* in this case:
*/
if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
cpu = this_cpu;
double_rq_unlock(this_rq, rq);
goto lock_again;
}
/*
* We decrease the sleep average of forking parents
* and children as well, to keep max-interactive tasks
* from forking tasks that are max-interactive.
*/
current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
p->interactive_credit = 0;
p->prio = effective_prio(p);
set_task_cpu(p, cpu);
if (cpu == this_cpu) {
if (unlikely(!current->array))
__activate_task(p, rq);
else {
p->prio = current->prio;
list_add_tail(&p->run_list, &current->run_list);
p->array = current->array;
p->array->nr_active++;
rq->nr_running++;
}
} else {
__activate_task(p, rq);
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
double_rq_unlock(this_rq, rq);
local_irq_restore(flags);
put_cpu();
}
/*
* If dest_cpu is allowed for this process, migrate the task to it.
* This is accomplished by forcing the cpu_allowed mask to only
@@ -1197,34 +1323,6 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
task_rq_unlock(rq, &flags);
}
/*
* Find the least loaded CPU. Slightly favor the current CPU by
* setting its load as the minimum to start.
*/
static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
{
cpumask_t tmp;
int i, min_load, this_cpu, best_cpu;
best_cpu = this_cpu = task_cpu(p);
min_load = INT_MAX;
cpus_and(tmp, sd->span, cpu_online_map);
for_each_cpu_mask(i, tmp) {
unsigned long load;
if (i == this_cpu)
load = source_load(i);
else
load = target_load(i) + SCHED_LOAD_SCALE;
if (min_load > load) {
best_cpu = i;
min_load = load;
}
}
return best_cpu;
}
/*
* sched_balance_exec(): find the highest-level, exec-balance-capable
* domain and try to migrate the task to the least loaded CPU.
@@ -1234,19 +1332,19 @@ static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
*/
void sched_balance_exec(void)
{
struct sched_domain *sd, *best_sd = NULL;
struct sched_domain *tmp, *sd = NULL;
int new_cpu, this_cpu = get_cpu();
/* Prefer the current CPU if there's only this task running */
if (this_rq()->nr_running <= 1)
goto out;
for_each_domain(this_cpu, sd)
if (sd->flags & SD_BALANCE_EXEC)
best_sd = sd;
for_each_domain(this_cpu, tmp)
if (tmp->flags & SD_BALANCE_EXEC)
sd = tmp;
if (best_sd) {
new_cpu = sched_best_cpu(current, best_sd);
if (sd) {
new_cpu = find_idlest_cpu(current, this_cpu, sd);
if (new_cpu != this_cpu) {
put_cpu();
sched_migrate_task(current, new_cpu);
@@ -1256,7 +1354,6 @@ void sched_balance_exec(void)
out:
put_cpu();
}
#endif /* CONFIG_NUMA */
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
......
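
Both wake_up_forked_thread() and sched_balance_exec() above select their search scope the same way: walk the CPU's domain hierarchy and keep the last (widest) domain whose flags allow this kind of balancing, so the idlest-CPU search covers the largest span the architecture permits. A standalone sketch of that pattern with simplified, hypothetical structures:

#include <stdio.h>

#define SDF_BALANCE_CLONE 0x04  /* stand-in for SD_BALANCE_CLONE */

struct fake_domain {
        struct fake_domain *parent;   /* next wider domain, NULL at the top */
        unsigned int flags;
        const char *name;
};

/* walk outward and remember the widest domain that carries the flag */
static struct fake_domain *highest_domain_with(struct fake_domain *base,
                                               unsigned int flag)
{
        struct fake_domain *sd = NULL, *tmp;

        for (tmp = base; tmp; tmp = tmp->parent)
                if (tmp->flags & flag)
                        sd = tmp;
        return sd;
}

int main(void)
{
        struct fake_domain node = { NULL,  SDF_BALANCE_CLONE, "node" };
        struct fake_domain cpu  = { &node, SDF_BALANCE_CLONE, "cpu"  };

        struct fake_domain *sd = highest_domain_with(&cpu, SDF_BALANCE_CLONE);
        printf("search span: %s\n", sd ? sd->name : "current CPU only");
        return 0;
}
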