Commit 97db62cc authored by Ingo Molnar, committed by Linus Torvalds

[PATCH] scheduler fixes

 - introduce a new type of context-switch locking; this is a must-have for
   ia64 and sparc64 (see the sketch below).

 - load_balance() bug noticed by Scott Rhine and myself: scan the
   whole list to find an 'imbalance' number of tasks, not just the tail
   of the list (see the toy example further below).

 - sched_yield() fix: use current->array not rq->active.
parent 9e7cec88
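
The new context-switch locking works as the sparc64 hunk further down shows: prepare_arch_switch(rq, next) acquires the new per-task switch_lock of the task being switched to before the runqueue lock is dropped, and finish_arch_switch(rq, prev) releases the switch_lock of the task being switched away from once the switch has completed. A task's switch_lock is therefore held from the moment it is picked to run until the old CPU has fully switched away from it, so no other CPU can start executing a task whose stack and register state is still live. A minimal user-space sketch of that hand-off, with pthread spinlocks standing in for kernel spinlocks and all names chosen for illustration only (not the kernel's own code):

#include <pthread.h>
#include <stdio.h>

struct task {
	pthread_spinlock_t switch_lock;	/* held while this task's context is live on some CPU */
};

struct runqueue {
	pthread_spinlock_t lock;
};

/* analogue of the new prepare_arch_switch(rq, next) */
static void prepare_switch(struct runqueue *rq, struct task *next)
{
	pthread_spin_lock(&next->switch_lock);	/* pin next before anyone else may run it */
	pthread_spin_unlock(&rq->lock);		/* the runqueue lock need not be held across the switch */
}

/* analogue of the new finish_arch_switch(rq, prev), run after switch_to() */
static void finish_switch(struct runqueue *rq, struct task *prev)
{
	(void)rq;
	pthread_spin_unlock(&prev->switch_lock); /* prev's state is saved; another CPU may run it now */
}

int main(void)
{
	struct runqueue rq;
	struct task prev, next;

	pthread_spin_init(&rq.lock, PTHREAD_PROCESS_PRIVATE);
	pthread_spin_init(&prev.switch_lock, PTHREAD_PROCESS_PRIVATE);
	pthread_spin_init(&next.switch_lock, PTHREAD_PROCESS_PRIVATE);

	/* model the state at schedule() time: prev's switch_lock was taken
	   when prev itself was switched to, and the runqueue lock is held */
	pthread_spin_lock(&prev.switch_lock);
	pthread_spin_lock(&rq.lock);

	prepare_switch(&rq, &next);
	/* ... switch_to(prev, next, last) would run here ... */
	finish_switch(&rq, &prev);

	puts("switch done; prev may now be picked up elsewhere");
	return 0;
}

The commit message notes this is needed for ia64 and sparc64, which cannot keep the runqueue lock held across the low-level switch; the sparc64 hunk below drops rq->lock early for exactly that reason.
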
......@@ -11,11 +11,6 @@
struct task_struct; /* one of the stranger aspects of C forward declarations.. */
extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next));
#define prepare_arch_schedule(prev) do { } while(0)
#define finish_arch_schedule(prev) do { } while(0)
#define prepare_arch_switch(rq) do { } while(0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define switch_to(prev,next,last) do { \
asm volatile("pushl %%esi\n\t" \
"pushl %%edi\n\t" \
......
......@@ -83,11 +83,6 @@ extern void cacheable_memzero(void *p, unsigned int nb);
struct device_node;
extern void note_scsi_host(struct device_node *, void *);
#define prepare_arch_schedule(prev) do { } while(0)
#define finish_arch_schedule(prev) do { } while(0)
#define prepare_arch_switch(rq) do { } while(0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
struct task_struct;
extern void __switch_to(struct task_struct *, struct task_struct *);
#define switch_to(prev, next, last) __switch_to((prev), (next))
......
......@@ -18,11 +18,6 @@
#endif
#include <linux/kernel.h>
#define prepare_arch_schedule(prev) do { } while (0)
#define finish_arch_schedule(prev) do { } while (0)
#define prepare_arch_switch(rq) do { } while (0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define switch_to(prev,next,last) do { \
if (prev == next) \
break; \
......
......@@ -18,11 +18,6 @@
#endif
#include <linux/kernel.h>
#define prepare_arch_schedule(prev) do { } while (0)
#define finish_arch_schedule(prev) do { } while (0)
#define prepare_arch_switch(rq) do { } while (0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define switch_to(prev,next,last) do { \
if (prev == next) \
break; \
......
......@@ -140,13 +140,17 @@ extern void __flushw_user(void);
#define flush_user_windows flushw_user
#define flush_register_windows flushw_all
#define prepare_arch_schedule(prev) task_lock(prev)
#define finish_arch_schedule(prev) task_unlock(prev)
#define prepare_arch_switch(rq) \
do { spin_unlock(&(rq)->lock); \
flushw_all(); \
#define prepare_arch_switch(rq, next) \
do { spin_lock(&(next)->switch_lock); \
spin_unlock(&(rq)->lock); \
flushw_all(); \
} while (0)
#define finish_arch_switch(rq) local_irq_enable()
#define finish_arch_switch(rq, prev) \
do { spin_unlock_irq(&(prev)->switch_lock); \
} while (0)
#ifndef CONFIG_DEBUG_SPINLOCK
#define CHECK_LOCKS(PREV) do { } while(0)
......
......@@ -13,11 +13,6 @@
#define LOCK_PREFIX ""
#endif
#define prepare_arch_schedule(prev) do { } while(0)
#define finish_arch_schedule(prev) do { } while(0)
#define prepare_arch_switch(rq) do { } while(0)
#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define __STR(x) #x
#define STR(x) __STR(x)
......
......@@ -47,7 +47,7 @@
lock_depth: -1, \
prio: MAX_PRIO-20, \
static_prio: MAX_PRIO-20, \
policy: SCHED_OTHER, \
policy: SCHED_NORMAL, \
cpus_allowed: -1, \
mm: NULL, \
active_mm: &init_mm, \
......@@ -78,6 +78,7 @@
pending: { NULL, &tsk.pending.head, {{0}}}, \
blocked: {{0}}, \
alloc_lock: SPIN_LOCK_UNLOCKED, \
switch_lock: SPIN_LOCK_UNLOCKED, \
journal_info: NULL, \
}
......
......@@ -116,7 +116,7 @@ extern unsigned long nr_uninterruptible(void);
/*
* Scheduling policies
*/
#define SCHED_OTHER 0
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
......@@ -207,7 +207,7 @@ struct signal_struct {
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_OTHER tasks are
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are
* in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values
* are inverted: lower p->prio value means higher priority.
*
......@@ -264,7 +264,7 @@ struct task_struct {
unsigned long policy;
unsigned long cpus_allowed;
unsigned int time_slice;
unsigned int time_slice, first_time_slice;
struct list_head tasks;
......@@ -361,6 +361,8 @@ struct task_struct {
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty */
spinlock_t alloc_lock;
/* context-switch lock */
spinlock_t switch_lock;
/* journalling filesystem info */
void *journal_info;
......
......@@ -184,7 +184,7 @@ void reparent_to_init(void)
current->exit_signal = SIGCHLD;
current->ptrace = 0;
if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0))
if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0))
set_user_nice(current, 0);
/* cpus_allowed? */
/* rt_priority? */
......
......@@ -611,7 +611,6 @@ struct task_struct *do_fork(unsigned long clone_flags,
unsigned long stack_size)
{
int retval;
unsigned long flags;
struct task_struct *p = NULL;
struct completion vfork;
......@@ -675,6 +674,7 @@ struct task_struct *do_fork(unsigned long clone_flags,
init_completion(&vfork);
}
spin_lock_init(&p->alloc_lock);
spin_lock_init(&p->switch_lock);
clear_tsk_thread_flag(p,TIF_SIGPENDING);
init_sigpending(&p->pending);
......@@ -740,8 +740,13 @@ struct task_struct *do_fork(unsigned long clone_flags,
* total amount of pending timeslices in the system doesnt change,
* resulting in more scheduling fairness.
*/
local_irq_save(flags);
p->time_slice = (current->time_slice + 1) >> 1;
local_irq_disable();
p->time_slice = (current->time_slice + 1) >> 1;
/*
* The remainder of the first timeslice might be recovered by
* the parent if the child exits early enough.
*/
p->first_time_slice = 1;
current->time_slice >>= 1;
p->sleep_timestamp = jiffies;
if (!current->time_slice) {
......@@ -753,11 +758,10 @@ struct task_struct *do_fork(unsigned long clone_flags,
current->time_slice = 1;
preempt_disable();
scheduler_tick(0, 0);
local_irq_restore(flags);
local_irq_enable();
preempt_enable();
} else
local_irq_restore(flags);
local_irq_enable();
/*
* Ok, add it to the run-queues and make it
* visible to the rest of the system.
......
......@@ -190,16 +190,19 @@ int request_module(const char * module_name)
pid_t pid;
int waitpid_result;
sigset_t tmpsig;
int i;
int i, ret;
static atomic_t kmod_concurrent = ATOMIC_INIT(0);
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
unsigned long saved_policy = current->policy;
current->policy = SCHED_NORMAL;
/* Don't allow request_module() before the root fs is mounted! */
if ( ! current->fs->root ) {
printk(KERN_ERR "request_module[%s]: Root fs not mounted\n",
module_name);
return -EPERM;
ret = -EPERM;
goto out;
}
/* If modprobe needs a service that is in a module, we get a recursive
......@@ -220,14 +223,16 @@ int request_module(const char * module_name)
printk(KERN_ERR
"kmod: runaway modprobe loop assumed and stopped\n");
atomic_dec(&kmod_concurrent);
return -ENOMEM;
ret = -ENOMEM;
goto out;
}
pid = kernel_thread(exec_modprobe, (void*) module_name, 0);
if (pid < 0) {
printk(KERN_ERR "request_module[%s]: fork failed, errno %d\n", module_name, -pid);
atomic_dec(&kmod_concurrent);
return pid;
ret = pid;
goto out;
}
/* Block everything but SIGKILL/SIGSTOP */
......@@ -250,7 +255,10 @@ int request_module(const char * module_name)
printk(KERN_ERR "request_module[%s]: waitpid(%d,...) failed, errno %d\n",
module_name, pid, -waitpid_result);
}
return 0;
ret = 0;
out:
current->policy = saved_policy;
return ret;
}
#endif /* CONFIG_KMOD */
......
This diff is collapsed.
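
The collapsed diff is not reproduced here, but the load_balance() fix described in the commit message is simple to illustrate: instead of effectively considering only the tail of the busiest queue's list, the balancing pass should walk the whole list and keep pulling candidates until 'imbalance' tasks have been moved. A stand-alone toy version in plain C follows; the types and the migratable test are invented for illustration and are not the kernel's code:

#include <stdio.h>

struct task {
	int pid;
	int migratable;		/* stand-in for "can this task be moved to the idle CPU?" */
	struct task *next;
};

/* Walk the *whole* list, pulling up to `imbalance` migratable tasks;
 * looking only at the tail entry can miss movable tasks entirely. */
static int pull_tasks(struct task *busiest, int imbalance)
{
	int pulled = 0;
	struct task *p;

	for (p = busiest; p && pulled < imbalance; p = p->next) {
		if (!p->migratable)
			continue;	/* skip it and keep scanning */
		printf("pulling pid %d\n", p->pid);
		pulled++;
	}
	return pulled;
}

int main(void)
{
	struct task t3 = { 3, 0, NULL };	/* tail entry: cannot be moved */
	struct task t2 = { 2, 1, &t3 };
	struct task t1 = { 1, 1, &t2 };

	/* two candidates are still found even though the tail is pinned */
	printf("pulled %d task(s)\n", pull_tasks(&t1, 2));
	return 0;
}
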
......@@ -500,7 +500,6 @@ inline void signal_wake_up(struct task_struct *t)
{
set_tsk_thread_flag(t,TIF_SIGPENDING);
#ifdef CONFIG_SMP
/*
* If the task is running on a different CPU
* force a reschedule on the other CPU to make
......@@ -511,9 +510,8 @@ inline void signal_wake_up(struct task_struct *t)
* process of changing - but no harm is done by that
* other than doing an extra (lightweight) IPI interrupt.
*/
if ((t->state == TASK_RUNNING) && (t->thread_info->cpu != smp_processor_id()))
if (t->state == TASK_RUNNING)
kick_if_running(t);
#endif
if (t->state & TASK_INTERRUPTIBLE) {
wake_up_process(t);
return;
......
......@@ -888,7 +888,7 @@ asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
current->policy != SCHED_OTHER)
current->policy != SCHED_NORMAL)
{
/*
* Short delay requests up to 2 ms will be handled with
......