Commit 97db62cc authored by Ingo Molnar, committed by Linus Torvalds

[PATCH] scheduler fixes

 - introduce a new type of context-switch locking; this is a must-have for
   ia64 and sparc64. (A minimal sketch of the locking handoff follows the
   commit header below.)

 - load_balance() bug noticed by Scott Rhine and myself: scan the
   whole list to find the imbalance number of tasks, not just the tail
   of the list. (An illustrative sketch follows the diff.)

 - sched_yield() fix: use current->array, not rq->active.
parent 9e7cec88
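
To make the first point concrete, here is a minimal user-space sketch of the switch_lock handoff that the new prepare_arch_switch()/finish_arch_switch() pair performs on architectures such as sparc64 (the default in kernel/sched.c below keeps the old rq->lock-only convention). This is a hypothetical illustration: pthread spinlocks stand in for kernel spinlocks, and task_t/runqueue_t here are simplified stand-ins rather than the kernel structures.

/*
 * Hypothetical user-space sketch of the per-task switch_lock handoff
 * introduced by this patch.  Simplified stand-ins only; build with:
 *     cc sketch.c -lpthread
 */
#include <pthread.h>

typedef struct task {
	pthread_spinlock_t switch_lock;	/* held while the task is current */
} task_t;

typedef struct runqueue {
	pthread_spinlock_t lock;	/* protects the runqueue itself */
} runqueue_t;

/*
 * sparc64-style switch-in: take the next task's switch_lock, then drop
 * the runqueue lock so the low-level switch can run without it.
 */
static void prepare_arch_switch(runqueue_t *rq, task_t *next)
{
	pthread_spin_lock(&next->switch_lock);
	pthread_spin_unlock(&rq->lock);
}

/*
 * Switch-out completion: only now is the previous task's switch_lock
 * released, so another CPU spinning on it (e.g. to migrate the task)
 * knows the task is fully off this CPU.
 */
static void finish_arch_switch(runqueue_t *rq, task_t *prev)
{
	(void)rq;
	pthread_spin_unlock(&prev->switch_lock);
}

int main(void)
{
	runqueue_t rq;
	task_t prev, next;

	pthread_spin_init(&rq.lock, PTHREAD_PROCESS_PRIVATE);
	pthread_spin_init(&prev.switch_lock, PTHREAD_PROCESS_PRIVATE);
	pthread_spin_init(&next.switch_lock, PTHREAD_PROCESS_PRIVATE);

	/* prev's switch_lock was taken back when prev was switched in */
	pthread_spin_lock(&prev.switch_lock);

	pthread_spin_lock(&rq.lock);		/* schedule() holds rq->lock */
	prepare_arch_switch(&rq, &next);	/* hand over to next's lock  */
	/* ... the low-level register/stack switch would happen here ... */
	finish_arch_switch(&rq, &prev);		/* prev may now be migrated  */

	/* next.switch_lock stays held until next itself is switched out */
	return 0;
}

The net effect is that a task's switch_lock is held for as long as it is current on a CPU, which is what lets ia64/sparc64 drop the runqueue lock across the switch without racing with wait_task_inactive() and friends.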
......@@ -11,11 +11,6 @@
struct task_struct; /* one of the stranger aspects of C forward declarations.. */
extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
#define prepare_arch_schedule(prev) do { } while(0)
#define finish_arch_schedule(prev) do { } while(0)
#define prepare_arch_switch(rq) do { } while(0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define switch_to(prev,next,last) do { \
asm volatile("pushl %%esi\n\t" \
"pushl %%edi\n\t" \
......
......@@ -83,11 +83,6 @@ extern void cacheable_memzero(void *p, unsigned int nb);
struct device_node;
extern void note_scsi_host(struct device_node *, void *);
#define prepare_arch_schedule(prev) do { } while(0)
#define finish_arch_schedule(prev) do { } while(0)
#define prepare_arch_switch(rq) do { } while(0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
struct task_struct;
extern void __switch_to(struct task_struct *, struct task_struct *);
#define switch_to(prev, next, last) __switch_to((prev), (next))
......
......@@ -18,11 +18,6 @@
#endif
#include <linux/kernel.h>
#define prepare_arch_schedule(prev) do { } while (0)
#define finish_arch_schedule(prev) do { } while (0)
#define prepare_arch_switch(rq) do { } while (0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define switch_to(prev,next,last) do { \
if (prev == next) \
break; \
......
......@@ -18,11 +18,6 @@
#endif
#include <linux/kernel.h>
#define prepare_arch_schedule(prev) do { } while (0)
#define finish_arch_schedule(prev) do { } while (0)
#define prepare_arch_switch(rq) do { } while (0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define switch_to(prev,next,last) do { \
if (prev == next) \
break; \
......
......@@ -140,13 +140,17 @@ extern void __flushw_user(void);
#define flush_user_windows flushw_user
#define flush_register_windows flushw_all
#define prepare_arch_schedule(prev) task_lock(prev)
#define finish_arch_schedule(prev) task_unlock(prev)
#define prepare_arch_switch(rq) \
do { spin_unlock(&(rq)->lock); \
flushw_all(); \
#define prepare_arch_switch(rq, next) \
do { spin_lock(&(next)->switch_lock); \
spin_unlock(&(rq)->lock); \
flushw_all(); \
} while (0)
#define finish_arch_switch(rq) local_irq_enable()
#define finish_arch_switch(rq, prev) \
do { spin_unlock_irq(&(prev)->switch_lock); \
} while (0)
#ifndef CONFIG_DEBUG_SPINLOCK
#define CHECK_LOCKS(PREV) do { } while(0)
......
......@@ -13,11 +13,6 @@
#define LOCK_PREFIX ""
#endif
#define prepare_arch_schedule(prev) do { } while(0)
#define finish_arch_schedule(prev) do { } while(0)
#define prepare_arch_switch(rq) do { } while(0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define __STR(x) #x
#define STR(x) __STR(x)
......
......@@ -47,7 +47,7 @@
lock_depth: -1, \
prio: MAX_PRIO-20, \
static_prio: MAX_PRIO-20, \
policy: SCHED_OTHER, \
policy: SCHED_NORMAL, \
cpus_allowed: -1, \
mm: NULL, \
active_mm: &init_mm, \
......@@ -78,6 +78,7 @@
pending: { NULL, &tsk.pending.head, {{0}}}, \
blocked: {{0}}, \
alloc_lock: SPIN_LOCK_UNLOCKED, \
switch_lock: SPIN_LOCK_UNLOCKED, \
journal_info: NULL, \
}
......
......@@ -116,7 +116,7 @@ extern unsigned long nr_uninterruptible(void);
/*
* Scheduling policies
*/
#define SCHED_OTHER 0
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
......@@ -207,7 +207,7 @@ struct signal_struct {
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_OTHER tasks are
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are
* in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values
* are inverted: lower p->prio value means higher priority.
*
......@@ -264,7 +264,7 @@ struct task_struct {
unsigned long policy;
unsigned long cpus_allowed;
unsigned int time_slice;
unsigned int time_slice, first_time_slice;
struct list_head tasks;
......@@ -361,6 +361,8 @@ struct task_struct {
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty */
spinlock_t alloc_lock;
/* context-switch lock */
spinlock_t switch_lock;
/* journalling filesystem info */
void *journal_info;
......
......@@ -184,7 +184,7 @@ void reparent_to_init(void)
current->exit_signal = SIGCHLD;
current->ptrace = 0;
if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0))
if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
set_user_nice(current, 0);
/* cpus_allowed? */
/* rt_priority? */
......
......@@ -611,7 +611,6 @@ struct task_struct *do_fork(unsigned long clone_flags,
unsigned long stack_size)
{
int retval;
unsigned long flags;
struct task_struct *p = NULL;
struct completion vfork;
......@@ -675,6 +674,7 @@ struct task_struct *do_fork(unsigned long clone_flags,
init_completion(&vfork);
}
spin_lock_init(&p->alloc_lock);
spin_lock_init(&p->switch_lock);
clear_tsk_thread_flag(p,TIF_SIGPENDING);
init_sigpending(&p->pending);
......@@ -740,8 +740,13 @@ struct task_struct *do_fork(unsigned long clone_flags,
* total amount of pending timeslices in the system doesnt change,
* resulting in more scheduling fairness.
*/
local_irq_save(flags);
p->time_slice = (current->time_slice + 1) >> 1;
local_irq_disable();
p->time_slice = (current->time_slice + 1) >> 1;
/*
* The remainder of the first timeslice might be recovered by
* the parent if the child exits early enough.
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
p->sleep_timestamp = jiffies;
if (!current->time_slice) {
......@@ -753,11 +758,10 @@ struct task_struct *do_fork(unsigned long clone_flags,
current->time_slice = 1;
preempt_disable();
scheduler_tick(0, 0);
local_irq_restore(flags);
local_irq_enable();
preempt_enable();
} else
local_irq_restore(flags);
local_irq_enable();
/*
* Ok, add it to the run-queues and make it
* visible to the rest of the system.
......
......@@ -190,16 +190,19 @@ int request_module(const char * module_name)
pid_t pid;
int waitpid_result;
sigset_t tmpsig;
int i;
int i, ret;
static atomic_t kmod_concurrent = ATOMIC_INIT(0);
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
unsigned long saved_policy = current->policy;
current->policy = SCHED_NORMAL;
/* Don't allow request_module() before the root fs is mounted! */
if ( ! current->fs->root ) {
printk(KERN_ERR "request_module[%s]: Root fs not mounted\n",
module_name);
return -EPERM;
ret = -EPERM;
goto out;
}
/* If modprobe needs a service that is in a module, we get a recursive
......@@ -220,14 +223,16 @@ int request_module(const char * module_name)
printk(KERN_ERR
"kmod: runaway modprobe loop assumed and stopped\n");
atomic_dec(&kmod_concurrent);
return -ENOMEM;
ret = -ENOMEM;
goto out;
}
pid = kernel_thread(exec_modprobe, (void*) module_name, 0);
if (pid < 0) {
printk(KERN_ERR "request_module[%s]: fork failed, errno %d\n", module_name, -pid);
atomic_dec(&kmod_concurrent);
return pid;
ret = pid;
goto out;
}
/* Block everything but SIGKILL/SIGSTOP */
......@@ -250,7 +255,10 @@ int request_module(const char * module_name)
printk(KERN_ERR "request_module[%s]: waitpid(%d,...) failed, errno %d\n",
module_name, pid, -waitpid_result);
}
return 0;
ret = 0;
out:
current->policy = saved_policy;
return ret;
}
#endif /* CONFIG_KMOD */
......
......@@ -12,8 +12,8 @@
* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
* hybrid priority-list and round-robin design with
* an array-switch method of distributing timeslices
* and per-CPU runqueues. Additional code by Davide
* Libenzi, Robert Love, and Rusty Russell.
* and per-CPU runqueues. Cleanups and useful suggestions
* by Davide Libenzi, preemptible kernel bits by Robert Love.
*/
#include <linux/mm.h>
......@@ -102,16 +102,23 @@
((p)->prio <= (p)->static_prio - DELTA(p))
/*
* TASK_TIMESLICE scales user-nice values [ -20 ... 19 ]
* BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
* to time slice values.
*
* The higher a process's priority, the bigger timeslices
* it gets during one round of execution. But even the lowest
* priority process gets MIN_TIMESLICE worth of execution time.
*
* task_timeslice() is the interface that is used by the scheduler.
*/
#define TASK_TIMESLICE(p) (MIN_TIMESLICE + \
((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/39))
#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
static inline unsigned int task_timeslice(task_t *p)
{
return BASE_TIMESLICE(p);
}
/*
* These are the runqueue data structures:
......@@ -136,13 +143,15 @@ struct prio_array {
*/
struct runqueue {
spinlock_t lock;
unsigned long nr_running, nr_switches, expired_timestamp;
signed long nr_uninterruptible;
unsigned long nr_running, nr_switches, expired_timestamp,
nr_uninterruptible;
task_t *curr, *idle;
prio_array_t *active, *expired, arrays[2];
int prev_nr_running[NR_CPUS];
task_t *migration_thread;
list_t migration_queue;
} ____cacheline_aligned;
static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
......@@ -153,6 +162,15 @@ static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define rt_task(p) ((p)->prio < MAX_RT_PRIO)
/*
* Default context-switch locking:
*/
#ifndef prepare_arch_switch
# define prepare_arch_switch(rq, next) do { } while(0)
# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
# define task_running(rq, p) ((rq)->curr == (p))
#endif
/*
* task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without
......@@ -307,7 +325,7 @@ void wait_task_inactive(task_t * p)
repeat:
preempt_disable();
rq = task_rq(p);
if (unlikely(rq->curr == p)) {
if (unlikely(task_running(rq, p))) {
cpu_relax();
/*
* enable/disable preemption just to make this
......@@ -318,7 +336,7 @@ void wait_task_inactive(task_t * p)
goto repeat;
}
rq = task_rq_lock(p, &flags);
if (unlikely(rq->curr == p)) {
if (unlikely(task_running(rq, p))) {
task_rq_unlock(rq, &flags);
preempt_enable();
goto repeat;
......@@ -326,6 +344,7 @@ void wait_task_inactive(task_t * p)
task_rq_unlock(rq, &flags);
preempt_enable();
}
#endif
/*
* Kick the remote CPU if the task is running currently,
......@@ -338,10 +357,9 @@ void wait_task_inactive(task_t * p)
*/
void kick_if_running(task_t * p)
{
if (p == task_rq(p)->curr)
if ((task_running(task_rq(p), p)) && (p->thread_info->cpu != smp_processor_id()))
resched_task(p);
}
#endif
/*
* Wake up a process. Put it on the run-queue if it's not
......@@ -350,6 +368,8 @@ void kick_if_running(task_t * p)
* progress), and as such you're allowed to do the simpler
* "current->state = TASK_RUNNING" to mark yourself runnable
* without the overhead of this.
*
* returns failure only if the task is already active.
*/
static int try_to_wake_up(task_t * p, int sync)
{
......@@ -366,7 +386,7 @@ static int try_to_wake_up(task_t * p, int sync)
* Fast-migrate the task if it's not running or runnable
* currently. Do not violate hard affinity.
*/
if (unlikely(sync && (rq->curr != p) &&
if (unlikely(sync && !task_running(rq, p) &&
(task_cpu(p) != smp_processor_id()) &&
(p->cpus_allowed & (1UL << smp_processor_id())))) {
......@@ -377,9 +397,7 @@ static int try_to_wake_up(task_t * p, int sync)
if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
activate_task(p, rq);
/*
* If sync is set, a resched_task() is a NOOP
*/
if (p->prio < rq->curr->prio)
resched_task(rq->curr);
success = 1;
......@@ -428,9 +446,11 @@ void wake_up_forked_process(task_t * p)
void sched_exit(task_t * p)
{
local_irq_disable();
current->time_slice += p->time_slice;
if (unlikely(current->time_slice > MAX_TIMESLICE))
current->time_slice = MAX_TIMESLICE;
if (p->first_time_slice) {
current->time_slice += p->time_slice;
if (unlikely(current->time_slice > MAX_TIMESLICE))
current->time_slice = MAX_TIMESLICE;
}
local_irq_enable();
/*
* If the child was a (relative-) CPU hog then decrease
......@@ -444,8 +464,7 @@ void sched_exit(task_t * p)
#if CONFIG_SMP || CONFIG_PREEMPT
asmlinkage void schedule_tail(task_t *prev)
{
finish_arch_switch(this_rq());
finish_arch_schedule(prev);
finish_arch_switch(this_rq(), prev);
}
#endif
......@@ -502,7 +521,42 @@ unsigned long nr_context_switches(void)
return sum;
}
/*
* double_rq_lock - safely lock two runqueues
*
* Note this does not disable interrupts like task_rq_lock,
* you need to do so manually before calling.
*/
static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
{
if (rq1 == rq2)
spin_lock(&rq1->lock);
else {
if (rq1 < rq2) {
spin_lock(&rq1->lock);
spin_lock(&rq2->lock);
} else {
spin_lock(&rq2->lock);
spin_lock(&rq1->lock);
}
}
}
/*
* double_rq_unlock - safely unlock two runqueues
*
* Note this does not restore interrupts like task_rq_unlock,
* you need to do so manually after calling.
*/
static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
{
spin_unlock(&rq1->lock);
if (rq1 != rq2)
spin_unlock(&rq2->lock);
}
#if CONFIG_SMP
/*
* Lock the busiest runqueue as well, this_rq is locked already.
* Recalculate nr_running if we have to drop the runqueue lock.
......@@ -526,22 +580,10 @@ static inline unsigned int double_lock_balance(runqueue_t *this_rq,
return nr_running;
}
/*
* Current runqueue is empty, or rebalance tick: if there is an
* inbalance (current runqueue is too short) then pull from
* busiest runqueue(s).
*
* We call this with the current runqueue locked,
* irqs disabled.
*/
static void load_balance(runqueue_t *this_rq, int idle)
static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance)
{
int imbalance, nr_running, load, max_load,
idx, i, this_cpu = smp_processor_id();
task_t *next = this_rq->idle, *tmp;
int nr_running, load, max_load, i;
runqueue_t *busiest, *rq_src;
prio_array_t *array;
list_t *head, *curr;
/*
* We search all runqueues to find the most busy one.
......@@ -590,21 +632,67 @@ static void load_balance(runqueue_t *this_rq, int idle)
}
if (likely(!busiest))
return;
goto out;
imbalance = (max_load - nr_running) / 2;
*imbalance = (max_load - nr_running) / 2;
/* It needs an at least ~25% imbalance to trigger balancing. */
if (!idle && (imbalance < (max_load + 3)/4))
return;
if (!idle && (*imbalance < (max_load + 3)/4)) {
busiest = NULL;
goto out;
}
nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
/*
* Make sure nothing changed since we checked the
* runqueue length.
*/
if (busiest->nr_running <= nr_running + 1)
goto out_unlock;
if (busiest->nr_running <= nr_running + 1) {
spin_unlock(&busiest->lock);
busiest = NULL;
}
out:
return busiest;
}
/*
* Move a task from a remote runqueue to the local runqueue.
* Both runqueues must be locked.
*/
static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
{
dequeue_task(p, src_array);
src_rq->nr_running--;
set_task_cpu(p, this_cpu);
this_rq->nr_running++;
enqueue_task(p, this_rq->active);
/*
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
if (p->prio < this_rq->curr->prio)
set_need_resched();
}
/*
* Current runqueue is empty, or rebalance tick: if there is an
* inbalance (current runqueue is too short) then pull from
* busiest runqueue(s).
*
* We call this with the current runqueue locked,
* irqs disabled.
*/
static void load_balance(runqueue_t *this_rq, int idle)
{
int imbalance, idx, this_cpu = smp_processor_id();
runqueue_t *busiest;
prio_array_t *array;
list_t *head, *curr;
task_t *tmp;
busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance);
if (!busiest)
goto out;
/*
* We first consider expired tasks. Those will likely not be
......@@ -647,36 +735,28 @@ static void load_balance(runqueue_t *this_rq, int idle)
#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
((jiffies - (p)->sleep_timestamp > cache_decay_ticks) && \
((p) != (rq)->curr) && \
!task_running(rq, p) && \
((p)->cpus_allowed & (1UL << (this_cpu))))
curr = curr->prev;
if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) {
curr = curr->next;
if (curr != head)
goto skip_queue;
idx++;
goto skip_bitmap;
}
next = tmp;
/*
* take the task out of the other runqueue and
* put it into this one:
*/
dequeue_task(next, array);
busiest->nr_running--;
set_task_cpu(next, this_cpu);
this_rq->nr_running++;
enqueue_task(next, this_rq->active);
if (next->prio < current->prio)
set_need_resched();
pull_task(busiest, array, tmp, this_rq, this_cpu);
if (!idle && --imbalance) {
if (array == busiest->expired) {
array = busiest->active;
goto new_array;
}
if (curr != head)
goto skip_queue;
idx++;
goto skip_bitmap;
}
out_unlock:
spin_unlock(&busiest->lock);
out:
;
}
/*
......@@ -691,13 +771,13 @@ static void load_balance(runqueue_t *this_rq, int idle)
#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
static inline void idle_tick(void)
static inline void idle_tick(runqueue_t *rq)
{
if (jiffies % IDLE_REBALANCE_TICK)
return;
spin_lock(&this_rq()->lock);
load_balance(this_rq(), 1);
spin_unlock(&this_rq()->lock);
spin_lock(&rq->lock);
load_balance(rq, 1);
spin_unlock(&rq->lock);
}
#endif
......@@ -720,7 +800,7 @@ static inline void idle_tick(void)
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
void scheduler_tick(int user_tick, int system)
void scheduler_tick(int user_ticks, int sys_ticks)
{
int cpu = smp_processor_id();
runqueue_t *rq = this_rq();
......@@ -728,18 +808,18 @@ void scheduler_tick(int user_tick, int system)
if (p == rq->idle) {
/* note: this timer irq context must be accounted for as well */
if (irq_count() >= 2*HARDIRQ_OFFSET)
kstat.per_cpu_system[cpu] += system;
if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
kstat.per_cpu_system[cpu] += sys_ticks;
#if CONFIG_SMP
idle_tick();
idle_tick(rq);
#endif
return;
}
if (TASK_NICE(p) > 0)
kstat.per_cpu_nice[cpu] += user_tick;
kstat.per_cpu_nice[cpu] += user_ticks;
else
kstat.per_cpu_user[cpu] += user_tick;
kstat.per_cpu_system[cpu] += system;
kstat.per_cpu_user[cpu] += user_ticks;
kstat.per_cpu_system[cpu] += sys_ticks;
/* Task might have expired already, but not scheduled off yet */
if (p->array != rq->active) {
......@@ -753,7 +833,8 @@ void scheduler_tick(int user_tick, int system)
* FIFO tasks have no timeslices.
*/
if ((p->policy == SCHED_RR) && !--p->time_slice) {
p->time_slice = TASK_TIMESLICE(p);
p->time_slice = task_timeslice(p);
p->first_time_slice = 0;
set_tsk_need_resched(p);
/* put it at the end of the queue: */
......@@ -776,7 +857,8 @@ void scheduler_tick(int user_tick, int system)
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
p->prio = effective_prio(p);
p->time_slice = TASK_TIMESLICE(p);
p->time_slice = task_timeslice(p);
p->first_time_slice = 0;
if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
if (!rq->expired_timestamp)
......@@ -818,7 +900,6 @@ asmlinkage void schedule(void)
rq = this_rq();
release_kernel_lock(prev);
prepare_arch_schedule(prev);
prev->sleep_timestamp = jiffies;
spin_lock_irq(&rq->lock);
......@@ -875,14 +956,13 @@ asmlinkage void schedule(void)
rq->nr_switches++;
rq->curr = next;
prepare_arch_switch(rq);
prepare_arch_switch(rq, next);
prev = context_switch(prev, next);
barrier();
rq = this_rq();
finish_arch_switch(rq);
finish_arch_switch(rq, prev);
} else
spin_unlock_irq(&rq->lock);
finish_arch_schedule(prev);
reacquire_kernel_lock(current);
preempt_enable_no_resched();
......@@ -1114,7 +1194,8 @@ void set_user_nice(task_t *p, long nice)
* If the task is running and lowered its priority,
* or increased its priority then reschedule its CPU:
*/
if ((NICE_TO_PRIO(nice) < p->static_prio) || (p == rq->curr))
if ((NICE_TO_PRIO(nice) < p->static_prio) ||
task_running(rq, p))
resched_task(rq->curr);
}
out_unlock:
......@@ -1228,18 +1309,18 @@ static int setscheduler(pid_t pid, int policy, struct sched_param *param)
else {
retval = -EINVAL;
if (policy != SCHED_FIFO && policy != SCHED_RR &&
policy != SCHED_OTHER)
policy != SCHED_NORMAL)
goto out_unlock;
}
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_OTHER is 0.
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
*/
retval = -EINVAL;
if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
goto out_unlock;
if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
goto out_unlock;
retval = -EPERM;
......@@ -1260,7 +1341,7 @@ static int setscheduler(pid_t pid, int policy, struct sched_param *param)
retval = 0;
p->policy = policy;
p->rt_priority = lp.sched_priority;
if (policy != SCHED_OTHER)
if (policy != SCHED_NORMAL)
p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
else
p->prio = p->static_prio;
......@@ -1439,39 +1520,18 @@ asmlinkage long sys_sched_yield(void)
prio_array_t *array = current->array;
/*
* There are three levels of how a yielding task will give up
* the current CPU:
* We implement yielding by moving the task into the expired
* queue.
*
* #1 - it decreases its priority by one. This priority loss is
* temporary, it's recovered once the current timeslice
* expires.
*
* #2 - once it has reached the lowest priority level,
* it will give up timeslices one by one. (We do not
* want to give them up all at once, it's gradual,
* to protect the casual yield()er.)
*
* #3 - once all timeslices are gone we put the process into
* the expired array.
*
* (special rule: RT tasks do not lose any priority, they just
* roundrobin on their current priority level.)
* (special rule: RT tasks will just roundrobin in the active
* array.)
*/
if (likely(current->prio == MAX_PRIO-1)) {
if (current->time_slice <= 1) {
dequeue_task(current, rq->active);
enqueue_task(current, rq->expired);
} else
current->time_slice--;
} else if (unlikely(rt_task(current))) {
list_move_tail(&current->run_list, array->queue + current->prio);
if (likely(!rt_task(current))) {
dequeue_task(current, array);
enqueue_task(current, rq->expired);
} else {
list_del(&current->run_list);
if (list_empty(array->queue + current->prio))
__clear_bit(current->prio, array->bitmap);
current->prio++;
list_add_tail(&current->run_list, array->queue + current->prio);
__set_bit(current->prio, array->bitmap);
}
/*
* Since we are going to call schedule() anyway, there's
......@@ -1506,7 +1566,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
case SCHED_RR:
ret = MAX_USER_RT_PRIO-1;
break;
case SCHED_OTHER:
case SCHED_NORMAL:
ret = 0;
break;
}
......@@ -1522,7 +1582,7 @@ asmlinkage long sys_sched_get_priority_min(int policy)
case SCHED_RR:
ret = 1;
break;
case SCHED_OTHER:
case SCHED_NORMAL:
ret = 0;
}
return ret;
......@@ -1548,7 +1608,7 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
goto out_unlock;
jiffies_to_timespec(p->policy & SCHED_FIFO ?
0 : TASK_TIMESLICE(p), &t);
0 : task_timeslice(p), &t);
read_unlock(&tasklist_lock);
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
out_nounlock:
......@@ -1652,40 +1712,6 @@ void show_state(void)
read_unlock(&tasklist_lock);
}
/*
* double_rq_lock - safely lock two runqueues
*
* Note this does not disable interrupts like task_rq_lock,
* you need to do so manually before calling.
*/
static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
{
if (rq1 == rq2)
spin_lock(&rq1->lock);
else {
if (rq1 < rq2) {
spin_lock(&rq1->lock);
spin_lock(&rq2->lock);
} else {
spin_lock(&rq2->lock);
spin_lock(&rq1->lock);
}
}
}
/*
* double_rq_unlock - safely unlock two runqueues
*
* Note this does not restore interrupts like task_rq_unlock,
* you need to do so manually after calling.
*/
static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
{
spin_unlock(&rq1->lock);
if (rq1 != rq2)
spin_unlock(&rq2->lock);
}
void __init init_idle(task_t *idle, int cpu)
{
runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
......@@ -1712,57 +1738,6 @@ void __init init_idle(task_t *idle, int cpu)
#endif
}
extern void init_timervecs(void);
extern void timer_bh(void);
extern void tqueue_bh(void);
extern void immediate_bh(void);
void __init sched_init(void)
{
runqueue_t *rq;
int i, j, k;
for (i = 0; i < NR_CPUS; i++) {
prio_array_t *array;
rq = cpu_rq(i);
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
spin_lock_init(&rq->lock);
INIT_LIST_HEAD(&rq->migration_queue);
for (j = 0; j < 2; j++) {
array = rq->arrays + j;
for (k = 0; k < MAX_PRIO; k++) {
INIT_LIST_HEAD(array->queue + k);
__clear_bit(k, array->bitmap);
}
// delimiter for bitsearch
__set_bit(MAX_PRIO, array->bitmap);
}
}
/*
* We have to do a little magic to get the first
* process right in SMP mode.
*/
rq = this_rq();
rq->curr = current;
rq->idle = current;
set_task_cpu(current, smp_processor_id());
wake_up_process(current);
init_timervecs();
init_bh(TIMER_BH, timer_bh);
init_bh(TQUEUE_BH, tqueue_bh);
init_bh(IMMEDIATE_BH, immediate_bh);
/*
* The boot idle thread does lazy MMU switching as well:
*/
atomic_inc(&init_mm.mm_count);
enter_lazy_tlb(&init_mm, current, smp_processor_id());
}
#if CONFIG_SMP
/*
......@@ -1821,7 +1796,7 @@ void set_cpus_allowed(task_t *p, unsigned long new_mask)
* If the task is not on a runqueue (and not running), then
* it is sufficient to simply update the task's cpu field.
*/
if (!p->array && (p != rq->curr)) {
if (!p->array && !task_running(rq, p)) {
set_task_cpu(p, __ffs(p->cpus_allowed));
task_rq_unlock(rq, &flags);
goto out;
......@@ -1939,3 +1914,55 @@ void __init migration_init(void)
}
}
#endif
extern void init_timervecs(void);
extern void timer_bh(void);
extern void tqueue_bh(void);
extern void immediate_bh(void);
void __init sched_init(void)
{
runqueue_t *rq;
int i, j, k;
for (i = 0; i < NR_CPUS; i++) {
prio_array_t *array;
rq = cpu_rq(i);
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
spin_lock_init(&rq->lock);
INIT_LIST_HEAD(&rq->migration_queue);
for (j = 0; j < 2; j++) {
array = rq->arrays + j;
for (k = 0; k < MAX_PRIO; k++) {
INIT_LIST_HEAD(array->queue + k);
__clear_bit(k, array->bitmap);
}
// delimiter for bitsearch
__set_bit(MAX_PRIO, array->bitmap);
}
}
/*
* We have to do a little magic to get the first
* process right in SMP mode.
*/
rq = this_rq();
rq->curr = current;
rq->idle = current;
set_task_cpu(current, smp_processor_id());
wake_up_process(current);
init_timervecs();
init_bh(TIMER_BH, timer_bh);
init_bh(TQUEUE_BH, tqueue_bh);
init_bh(IMMEDIATE_BH, immediate_bh);
/*
* The boot idle thread does lazy MMU switching as well:
*/
atomic_inc(&init_mm.mm_count);
enter_lazy_tlb(&init_mm, current, smp_processor_id());
}
......@@ -500,7 +500,6 @@ inline void signal_wake_up(struct task_struct *t)
{
set_tsk_thread_flag(t,TIF_SIGPENDING);
#ifdef CONFIG_SMP
/*
* If the task is running on a different CPU
* force a reschedule on the other CPU to make
......@@ -511,9 +510,8 @@ inline void signal_wake_up(struct task_struct *t)
* process of changing - but no harm is done by that
* other than doing an extra (lightweight) IPI interrupt.
*/
if ((t->state == TASK_RUNNING) && (t->thread_info->cpu != smp_processor_id()))
if (t->state == TASK_RUNNING)
kick_if_running(t);
#endif
if (t->state & TASK_INTERRUPTIBLE) {
wake_up_process(t);
return;
......
......@@ -888,7 +888,7 @@ asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
current->policy != SCHED_OTHER)
current->policy != SCHED_NORMAL)
{
/*
* Short delay requests up to 2 ms will be handled with
......
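
As a stand-alone illustration of the load_balance() change above (walking the whole priority list rather than giving up at the tail), here is a hypothetical user-space sketch; the list handling and the names pull_tasks()/migratable are simplified stand-ins for the kernel's prio_array lists and the CAN_MIGRATE_TASK() test, not the actual implementation.

/*
 * Hypothetical sketch of the "scan the whole list" fix: walk a run list
 * from the tail towards the head and pull up to 'imbalance' migratable
 * tasks, instead of stopping as soon as the tail task cannot be moved.
 */
#include <stdio.h>

struct task {
	int migratable;			/* CAN_MIGRATE_TASK() stand-in */
	struct task *prev, *next;	/* doubly linked run list      */
};

struct queue {
	struct task head;		/* list sentinel               */
};

static int pull_tasks(struct queue *busiest, int imbalance,
		      void (*pull)(struct task *))
{
	int moved = 0;
	struct task *curr = busiest->head.prev;	/* start at the tail */

	while (curr != &busiest->head && moved < imbalance) {
		struct task *tmp = curr;

		curr = curr->prev;		/* advance before pulling */
		if (!tmp->migratable)
			continue;		/* keep scanning the list */
		pull(tmp);
		moved++;
	}
	return moved;
}

static void pull_noop(struct task *t)
{
	(void)t;			/* a real pull would requeue t */
}

int main(void)
{
	struct queue q;
	struct task a = { 1 }, b = { 0 }, c = { 1 };

	/* build head <-> a <-> b <-> c <-> head */
	q.head.next = &a; a.prev = &q.head;
	a.next = &b;	  b.prev = &a;
	b.next = &c;	  c.prev = &b;
	c.next = &q.head; q.head.prev = &c;

	/* b is not migratable, yet both a and c get pulled */
	printf("moved %d task(s)\n", pull_tasks(&q, 2, pull_noop));
	return 0;
}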