Commit f221af36 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] scheduler infrastructure

From: Ingo Molnar <mingo@elte.hu>

The attached scheduler patch (against test2-mm2) adds the scheduling
infrastructure items discussed on lkml. I got good feedback - and while I
don't expect it to solve all problems, it does solve a number of bad ones:

 - test_starve.c code from David Mosberger

 - thud.c making the system unusable due to unfairness

 - fair/accurate sleep average based on a fine-grained clock

 - audio skipping way too easily

other changes in sched-test2-mm2-A3:

 - ia64 sched_clock() code, from David Mosberger.

 - migration thread startup without relying on implicit scheduling
   behavior. While the current 2.6 code is correct (because the cpu-up code
   brings CPUs up one by one), it's also fragile - and it cannot be carried
   over into the 2.4 backports. The explicit startup handshake (sketched
   below) cleans up the startup and makes it easier to do 2.4 backports.
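
For illustration, here is a minimal userspace analogue of that handshake,
using POSIX threads: a mutex/condvar pair stands in for the kernel's
struct completion, and the migration_startup_t layout with its cpu/task
fields mirrors the patch. Everything else is simplified stand-in code,
not the patch itself.

/*
 * Userspace sketch of the explicit migration-thread startup handshake.
 * migration_startup_t and its cpu/task fields mirror the patch; the
 * pthread plumbing replaces the kernel completion API.  Build: cc -pthread
 */
#include <pthread.h>
#include <stdio.h>

typedef struct {
	int		cpu;		/* CPU this thread is meant for */
	pthread_mutex_t	lock;
	pthread_cond_t	done;		/* plays the role of startup_done */
	int		completed;
	pthread_t	task;		/* plays the role of startup->task */
} migration_startup_t;

static void *migration_thread(void *data)
{
	migration_startup_t *startup = data;

	/* publish our identity, then signal the creator */
	pthread_mutex_lock(&startup->lock);
	startup->task = pthread_self();
	startup->completed = 1;
	pthread_cond_signal(&startup->done);
	pthread_mutex_unlock(&startup->lock);

	/*
	 * In the kernel the thread now sleeps until the creator has bound
	 * it to its CPU and woken it up; here we just report and exit.
	 */
	printf("migration thread for cpu %d is up\n", startup->cpu);
	return NULL;
}

int main(void)
{
	migration_startup_t startup = {
		.cpu  = 0,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.done = PTHREAD_COND_INITIALIZER,
	};
	pthread_t tid;

	pthread_create(&tid, NULL, migration_thread, &startup);

	/* wait_for_completion(): don't touch startup.task until it is valid */
	pthread_mutex_lock(&startup.lock);
	while (!startup.completed)
		pthread_cond_wait(&startup.done, &startup.lock);
	pthread_mutex_unlock(&startup.lock);

	/* the kernel would now set the thread's CPU binding and wake it */
	pthread_join(tid, NULL);
	return 0;
}

In the kernel, the creator then pins the new thread to its CPU
(thread_info->cpu, cpus_allowed) and wakes it up, exactly as the
migration_call() hunk at the end of the diff does.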

and here's the original changelog for the scheduler changes:

 - cycle accuracy (nanosec resolution) timekeeping within the scheduler.
   This fixes a number of audio artifacts (skipping) I've reproduced. I
   don't think we can get away without going to cycle accuracy - reading
   the cycle counter adds some overhead, but it's acceptable. The first
   nanosec-accuracy patch was done by Mike Galbraith - this patch is
   different but similar in nature. I went further in also changing the
   sleep_avg to be of nanosec resolution. (A worked example of the
   resulting priority-bonus arithmetic follows this changelog, just before
   the diff.)

 - more fine-grained timeslices: there's now a timeslice 'sub unit' of 50
   usecs (TIMESLICE_GRANULARITY) - CPU hogs on the same priority level
   will round-robin with this unit. This change is intended to make gaming
   latencies shorter.

 - include scheduling latency in the sleep bonus calculation. This change
   extends the sleep-average calculation to the period of time a task
   spends on the runqueue without getting scheduled yet, right after
   wakeup. Note that tasks that were preempted (i.e. not woken up) and are
   still on the runqueue do not get this benefit. This change closes one
   of the last holes in the dynamic priority estimation; it should result
   in interactive tasks getting more priority under heavy load. This
   change also fixes the test-starve.c testcase from David Mosberger.
   (The weighting applied to the on-runqueue credit is sketched right
   below.)
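
The weighting is compact enough to show in isolation. The following
standalone sketch reproduces the scaling that schedule() applies in the
hunk further down; ON_RUNQUEUE_WEIGHT and the *128/100 fixed-point
scaling come from the patch, the wrapper program is illustration only.

#include <stdio.h>

#define ON_RUNQUEUE_WEIGHT 30

/*
 * activated == 2: woken from interrupt context, full credit
 * activated == 1: normal wakeup, credit weighted down to ~30%
 */
static unsigned long long runqueue_credit(int activated,
					  unsigned long long wait_ns)
{
	if (activated == 1)
		return wait_ns * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
	return wait_ns;
}

int main(void)
{
	unsigned long long wait = 10 * 1000000ULL;	/* 10 msec on the runqueue */

	printf("interrupt wakeup credit: %llu ns\n", runqueue_credit(2, wait));
	printf("normal wakeup credit:    %llu ns\n", runqueue_credit(1, wait));
	return 0;
}

With a 10 msec wait on the runqueue, an interrupt-driven wakeup is
credited the full 10 msec of 'sleep', while a normal wakeup is credited
roughly 3 msec.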


The TSC-based scheduler clock is disabled on ia32 NUMA platforms (i.e.
platforms that are known to have unsynchronized TSCs). Those platforms should
provide the proper code to rely on the TSC in a global way. (No such
infrastructure exists at the moment - the monotonic TSC-based clock doesn't
deal with TSC offsets either, as far as I can tell.)
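
To put the new nanosecond sleep_avg range into concrete numbers, here is
a small standalone program that reproduces the effective_prio()
arithmetic from the hunk below. MAX_SLEEP_AVG and PRIO_BONUS_RATIO are
the values this patch sets; MAX_USER_PRIO (40) and the nice-0 static
priority (120) are the stock 2.6 constants.

#include <stdio.h>

#define MAX_USER_PRIO	 40			/* stock 2.6 value */
#define PRIO_BONUS_RATIO 25			/* unchanged by the patch */
#define MAX_SLEEP_AVG	 (1 * 1000000000ULL)	/* 1 sec in nanosecs */

/* same integer arithmetic as the effective_prio() hunk below */
static int effective_prio(int static_prio, unsigned long long sleep_avg)
{
	int bonus;

	bonus = MAX_USER_PRIO * PRIO_BONUS_RATIO * (sleep_avg / 1024) /
		(MAX_SLEEP_AVG / 1024) / 100;
	bonus -= MAX_USER_PRIO * PRIO_BONUS_RATIO / 100 / 2;

	return static_prio - bonus;
}

int main(void)
{
	/* a nice-0 task has static priority 120 */
	printf("sleep_avg = 0         -> prio %d\n", effective_prio(120, 0));
	printf("sleep_avg = 500 msec  -> prio %d\n",
	       effective_prio(120, 500000000ULL));
	printf("sleep_avg = 1 sec max -> prio %d\n",
	       effective_prio(120, MAX_SLEEP_AVG));
	return 0;
}

This prints priorities 125, 120 and 115 for a nice-0 task, i.e. the
sleep-average bonus spans -5 to +5 around the static priority.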
parent 1dffaaf7
@@ -915,13 +915,13 @@ static void smp_tune_scheduling (void)
cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
}
cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000;
cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1;
printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
(long)cacheflush_time/(cpu_khz/1000),
((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
printk("task migration cache decay timeout: %ld msecs.\n",
(cache_decay_ticks + 1) * 1000 / HZ);
cache_decay_ticks);
}
/*
......
@@ -127,6 +127,30 @@ static unsigned long long monotonic_clock_tsc(void)
return base + cycles_2_ns(this_offset - last_offset);
}
/*
* Scheduler clock - returns current time in nanosec units.
*/
unsigned long long sched_clock(void)
{
unsigned long long this_offset;
/*
* In the NUMA case we don't use the TSC, as the TSCs are not
* synchronized across all CPUs.
*/
#ifndef CONFIG_NUMA
if (unlikely(!cpu_has_tsc))
#endif
return (unsigned long long)jiffies * (1000000000 / HZ);
/* Read the Time Stamp Counter */
rdtscll(this_offset);
/* return the value in ns */
return cycles_2_ns(this_offset);
}
static void mark_offset_tsc(void)
{
unsigned long lost,delay;
......
@@ -154,13 +154,16 @@ static inline char * task_state(struct task_struct *p, char *buffer)
read_lock(&tasklist_lock);
buffer += sprintf(buffer,
"State:\t%s\n"
"SleepAVG:\t%lu%%\n"
"Tgid:\t%d\n"
"Pid:\t%d\n"
"PPid:\t%d\n"
"TracerPid:\t%d\n"
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n",
get_task_state(p), p->tgid,
get_task_state(p),
(p->sleep_avg/1024)*100/(1000000000/1024),
p->tgid,
p->pid, p->pid ? p->real_parent->pid : 0,
p->pid && p->ptrace ? p->parent->pid : 0,
p->uid, p->euid, p->suid, p->fsuid,
......
@@ -342,7 +342,8 @@ struct task_struct {
prio_array_t *array;
unsigned long sleep_avg;
unsigned long last_run;
unsigned long long timestamp;
int activated;
unsigned long policy;
cpumask_t cpus_allowed;
@@ -506,6 +507,8 @@ static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask)
}
#endif
extern unsigned long long sched_clock(void);
#ifdef CONFIG_NUMA
extern void sched_balance_exec(void);
extern void node_nr_running_init(void);
......
@@ -925,7 +925,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
p->last_run = jiffies;
p->timestamp = sched_clock();
if (!current->time_slice) {
/*
* This case is rare, it happens when the parent has only
......
@@ -68,13 +68,15 @@
*/
#define MIN_TIMESLICE ( 10 * HZ / 1000)
#define MAX_TIMESLICE (200 * HZ / 1000)
#define CHILD_PENALTY 50
#define TIMESLICE_GRANULARITY (HZ/40 ?: 1)
#define ON_RUNQUEUE_WEIGHT 30
#define CHILD_PENALTY 95
#define PARENT_PENALTY 100
#define EXIT_WEIGHT 3
#define PRIO_BONUS_RATIO 25
#define INTERACTIVE_DELTA 2
#define MAX_SLEEP_AVG (10*HZ)
#define STARVATION_LIMIT (10*HZ)
#define MAX_SLEEP_AVG (1*1000000000)
#define STARVATION_LIMIT HZ
#define NODE_THRESHOLD 125
/*
@@ -115,6 +117,11 @@
#define TASK_INTERACTIVE(p) \
((p)->prio <= (p)->static_prio - DELTA(p))
#define TASK_PREEMPTS_CURR(p, rq) \
((p)->prio < (rq)->curr->prio || \
((p)->prio == (rq)->curr->prio && \
(p)->time_slice > (rq)->curr->time_slice * 2))
/*
* BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
* to time slice values.
@@ -318,8 +325,8 @@ static int effective_prio(task_t *p)
if (rt_task(p))
return p->prio;
bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*(p->sleep_avg/1024)/(MAX_SLEEP_AVG/1024)/100;
bonus -= MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
prio = p->static_prio - bonus;
if (prio < MAX_RT_PRIO)
@@ -338,24 +345,24 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
nr_running_inc(rq);
}
/*
* activate_task - move a task to the runqueue and do priority recalculation
*
* Update all the scheduling statistics stuff. (sleep average
* calculation, priority modifiers, etc.)
*/
static inline void activate_task(task_t *p, runqueue_t *rq)
static void recalc_task_prio(task_t *p, unsigned long long now)
{
long sleep_time = jiffies - p->last_run - 1;
unsigned long long __sleep_time = now - p->timestamp;
unsigned long sleep_time;
if (__sleep_time > MAX_SLEEP_AVG)
sleep_time = MAX_SLEEP_AVG;
else
sleep_time = (unsigned long)__sleep_time;
if (sleep_time > 0) {
int sleep_avg;
unsigned long long sleep_avg;
/*
* This code gives a bonus to interactive tasks.
*
* The boost works by updating the 'average sleep time'
* value here, based on ->last_run. The more time a task
* value here, based on ->timestamp. The more time a task
* spends sleeping, the higher the average gets - and the
* higher the priority boost gets as well.
*/
@@ -374,6 +381,37 @@ static inline void activate_task(task_t *p, runqueue_t *rq)
p->prio = effective_prio(p);
}
}
}
/*
* activate_task - move a task to the runqueue and do priority recalculation
*
* Update all the scheduling statistics stuff. (sleep average
* calculation, priority modifiers, etc.)
*/
static inline void activate_task(task_t *p, runqueue_t *rq)
{
unsigned long long now = sched_clock();
recalc_task_prio(p, now);
/*
* Tasks which were woken up by interrupts (ie. hw events)
* are most likely of interactive nature. So we give them
* the credit of extending their sleep time to the period
* of time they spend on the runqueue, waiting for execution
* on a CPU, first time around:
*/
if (in_interrupt())
p->activated = 2;
else
/*
* Normal first-time wakeups get a credit too for on-runqueue time,
* but it will be weighted down:
*/
p->activated = 1;
p->timestamp = now;
__activate_task(p, rq);
}
@@ -500,7 +538,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync, int kick)
__activate_task(p, rq);
else {
activate_task(p, rq);
if (p->prio < rq->curr->prio)
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
success = 1;
@@ -549,8 +587,8 @@ void wake_up_forked_process(task_t * p)
* and children as well, to keep max-interactive tasks
* from forking tasks that are max-interactive.
*/
current->sleep_avg = current->sleep_avg * PARENT_PENALTY / 100;
p->sleep_avg = p->sleep_avg * CHILD_PENALTY / 100;
current->sleep_avg = current->sleep_avg / 100 * PARENT_PENALTY;
p->sleep_avg = p->sleep_avg / 100 * CHILD_PENALTY;
p->prio = effective_prio(p);
set_task_cpu(p, smp_processor_id());
@@ -591,8 +629,7 @@ void sched_exit(task_t * p)
* the sleep_avg of the parent as well.
*/
if (p->sleep_avg < p->parent->sleep_avg)
p->parent->sleep_avg = (p->parent->sleep_avg * EXIT_WEIGHT +
p->sleep_avg) / (EXIT_WEIGHT + 1);
p->parent->sleep_avg = p->parent->sleep_avg / (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / (EXIT_WEIGHT + 1);
}
/**
@@ -994,13 +1031,8 @@ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
if (p->prio < this_rq->curr->prio)
if (TASK_PREEMPTS_CURR(p, this_rq))
set_need_resched();
else {
if (p->prio == this_rq->curr->prio &&
p->time_slice > this_rq->curr->time_slice)
set_need_resched();
}
}
/*
@@ -1017,12 +1049,14 @@ static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
runqueue_t *busiest;
prio_array_t *array;
struct list_head *head, *curr;
unsigned long long now;
task_t *tmp;
busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
if (!busiest)
goto out;
now = sched_clock();
/*
* We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to
@@ -1063,7 +1097,7 @@ static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
*/
#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
((idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \
((idle || (((now - (p)->timestamp)>>10) > cache_decay_ticks)) &&\
!task_running(rq, p) && \
cpu_isset(this_cpu, (p)->cpus_allowed))
@@ -1180,8 +1214,7 @@ EXPORT_PER_CPU_SYMBOL(kstat);
*/
#define EXPIRED_STARVING(rq) \
(STARVATION_LIMIT && ((rq)->expired_timestamp && \
(jiffies - (rq)->expired_timestamp >= \
STARVATION_LIMIT * ((rq)->nr_running) + 1)))
(jiffies - (rq)->expired_timestamp >= STARVATION_LIMIT)))
/*
* This function gets called by the timer code, with HZ frequency.
@@ -1231,14 +1264,11 @@ void scheduler_tick(int user_ticks, int sys_ticks)
spin_lock(&rq->lock);
/*
* The task was running during this tick - update the
* time slice counter and the sleep average. Note: we
* do not update a thread's priority until it either
* goes to sleep or uses up its timeslice. This makes
* it possible for interactive tasks to use up their
* timeslices at their highest priority levels.
* time slice counter. Note: we do not update a thread's
* priority until it either goes to sleep or uses up its
* timeslice. This makes it possible for interactive tasks
* to use up their timeslices at their highest priority levels.
*/
if (p->sleep_avg)
p->sleep_avg--;
if (unlikely(rt_task(p))) {
/*
* RR tasks need a special form of timeslice management.
@@ -1262,12 +1292,33 @@ void scheduler_tick(int user_ticks, int sys_ticks)
p->time_slice = task_timeslice(p);
p->first_time_slice = 0;
if (!rq->expired_timestamp)
rq->expired_timestamp = jiffies;
if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
if (!rq->expired_timestamp)
rq->expired_timestamp = jiffies;
enqueue_task(p, rq->expired);
} else
enqueue_task(p, rq->active);
} else {
/*
* Prevent a too long timeslice allowing a task to monopolize
* the CPU. We do this by splitting up the timeslice into
* smaller pieces.
*
* Note: this does not mean the task's timeslices expire or
* get lost in any way, they just might be preempted by
* another task of equal priority. (one with higher
* priority would have preempted this task already.) We
* requeue this task to the end of the list on this priority
* level, which is in essence a round-robin of tasks with
* equal priority.
*/
if (!((task_timeslice(p) - p->time_slice) % TIMESLICE_GRANULARITY) &&
(p->array == rq->active)) {
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
p->prio = effective_prio(p);
enqueue_task(p, rq->active);
}
}
out_unlock:
spin_unlock(&rq->lock);
@@ -1286,6 +1337,8 @@ asmlinkage void schedule(void)
runqueue_t *rq;
prio_array_t *array;
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
int idx;
/*
@@ -1306,7 +1359,11 @@ asmlinkage void schedule(void)
rq = this_rq();
release_kernel_lock(prev);
prev->last_run = jiffies;
now = sched_clock();
if (likely(now - prev->timestamp < MAX_SLEEP_AVG))
run_time = now - prev->timestamp;
else
run_time = MAX_SLEEP_AVG;
spin_lock_irq(&rq->lock);
/*
@@ -1356,12 +1413,30 @@ asmlinkage void schedule(void)
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);
if (next->activated) {
unsigned long long delta = now - next->timestamp;
if (next->activated == 1)
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
next->activated = 0;
array = next->array;
dequeue_task(next, array);
recalc_task_prio(next, next->timestamp + delta);
enqueue_task(next, array);
}
switch_tasks:
prefetch(next);
clear_tsk_need_resched(prev);
RCU_qsctr(task_cpu(prev))++;
prev->sleep_avg -= run_time;
if ((long)prev->sleep_avg < 0)
prev->sleep_avg = 0;
prev->timestamp = now;
if (likely(prev != next)) {
next->timestamp = now;
rq->nr_switches++;
rq->curr = next;
@@ -1601,6 +1676,7 @@ void set_user_nice(task_t *p, long nice)
unsigned long flags;
prio_array_t *array;
runqueue_t *rq;
int old_prio, new_prio, delta;
if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
@@ -1609,6 +1685,12 @@ void set_user_nice(task_t *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
/*
* The RT priorities are set via setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
* it won't have any effect on scheduling until the task
* becomes SCHED_NORMAL again:
*/
if (rt_task(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
@@ -1616,16 +1698,20 @@ void set_user_nice(task_t *p, long nice)
array = p->array;
if (array)
dequeue_task(p, array);
old_prio = p->prio;
new_prio = NICE_TO_PRIO(nice);
delta = new_prio - old_prio;
p->static_prio = NICE_TO_PRIO(nice);
p->prio = NICE_TO_PRIO(nice);
p->prio += delta;
if (array) {
enqueue_task(p, array);
/*
* If the task is running and lowered its priority,
* or increased its priority then reschedule its CPU:
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
*/
if ((NICE_TO_PRIO(nice) < p->static_prio) ||
task_running(rq, p))
if (delta < 0 || (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
out_unlock:
@@ -2382,6 +2468,12 @@ static void move_task_away(struct task_struct *p, int dest_cpu)
local_irq_restore(flags);
}
typedef struct {
int cpu;
struct completion startup_done;
task_t *task;
} migration_startup_t;
/*
* migration_thread - this is a highprio system thread that performs
* thread migration by bumping thread off CPU then 'pushing' onto
@@ -2391,20 +2483,21 @@ static int migration_thread(void * data)
{
/* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */
struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 };
int cpu = (long) data;
migration_startup_t *startup = data;
int cpu = startup->cpu;
runqueue_t *rq;
int ret;
startup->task = current;
complete(&startup->startup_done);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
BUG_ON(smp_processor_id() != cpu);
daemonize("migration/%d", cpu);
set_fs(KERNEL_DS);
/*
* Either we are running on the right CPU, or there's a
* migration thread on this CPU, guaranteed (we're started
* serially).
*/
set_cpus_allowed(current, cpumask_of_cpu(cpu));
ret = setscheduler(0, SCHED_FIFO, &param);
rq = this_rq();
@@ -2440,13 +2533,30 @@ static int migration_call(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
long cpu = (long) hcpu;
migration_startup_t startup;
switch (action) {
case CPU_ONLINE:
printk("Starting migration thread for cpu %li\n",
(long)hcpu);
kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
while (!cpu_rq((long)hcpu)->migration_thread)
printk("Starting migration thread for cpu %li\n", cpu);
startup.cpu = cpu;
startup.task = NULL;
init_completion(&startup.startup_done);
kernel_thread(migration_thread, &startup, CLONE_KERNEL);
wait_for_completion(&startup.startup_done);
wait_task_inactive(startup.task);
startup.task->thread_info->cpu = cpu;
startup.task->cpus_allowed = cpumask_of_cpu(cpu);
wake_up_process(startup.task);
while (!cpu_rq(cpu)->migration_thread)
yield();
break;
}
return NOTIFY_OK;
......