Commit b7cd5844 authored by Linus Torvalds

Import 2.1.133pre4

parent 9390bd47
......@@ -38,13 +38,6 @@ unsigned int local_irq_count[NR_CPUS];
unsigned int local_bh_count[NR_CPUS];
unsigned long hardirq_no[NR_CPUS];
#define RTC_IRQ 8
#ifdef CONFIG_RTC
#define TIMER_IRQ 0 /* timer is the pit */
#else
#define TIMER_IRQ RTC_IRQ /* the timer is, in fact, the rtc */
#endif
#if NR_IRQS > 64
# error Unable to handle more than 64 irq levels.
#endif
......
......@@ -21,3 +21,11 @@ extern void isa_device_interrupt(unsigned long vector, struct pt_regs * regs);
extern void srm_device_interrupt(unsigned long vector, struct pt_regs * regs);
extern void handle_irq(int irq, int ack, struct pt_regs * regs);
#define RTC_IRQ 8
#ifdef CONFIG_RTC
#define TIMER_IRQ 0 /* timer is the pit */
#else
#define TIMER_IRQ RTC_IRQ /* timer is the rtc */
#endif
......@@ -92,6 +92,12 @@ ruffian_device_interrupt(unsigned long vector, struct pt_regs *regs)
i = ffz(~pld);
pld &= pld - 1; /* clear least bit set */
if (i == 7) { /* if ISA int */
/* Ruffian does not have the RTC connected to
the CPU timer interrupt. Instead, it uses the
PIT connected to IRQ 0. So we must detect that
and route that specifically to where we expected
to find the timer interrupt come in. */
/* Copy this code from isa_device_interrupt because
we need to hook into int 0 for the timer. I
refuse to soil device_interrupt with ifdefs. */
......@@ -107,7 +113,7 @@ ruffian_device_interrupt(unsigned long vector, struct pt_regs *regs)
if (j == 7 && !(inb(0x20) & 0x80)) {
/* It's only a passive release... */
} else if (j == 0) {
handle_irq(8, -1, regs); /* fake it */
handle_irq(TIMER_IRQ, -1, regs);
ruffian_ack_irq(0);
} else {
handle_irq(j, j, regs);
......
......@@ -35,12 +35,7 @@
#include <linux/timex.h>
#include "proto.h"
#ifdef CONFIG_RTC
#define TIMER_IRQ 0 /* using pit for timer */
#else
#define TIMER_IRQ 8 /* using rtc for timer */
#endif
#include "irq.h"
static int set_rtc_mmss(unsigned long);
......
......@@ -153,10 +153,10 @@ ENTRY(lcall7)
ALIGN
.globl ret_from_fork
ret_from_fork:
GET_CURRENT(%ebx)
#ifdef __SMP__
lock ; btrl $0, SYMBOL_NAME(scheduler_lock)
call SYMBOL_NAME(schedule_tail)
#endif /* __SMP__ */
GET_CURRENT(%ebx)
jmp ret_from_sys_call
/*
......
......@@ -83,7 +83,6 @@ EXPORT_SYMBOL(__global_cli);
EXPORT_SYMBOL(__global_sti);
EXPORT_SYMBOL(__global_save_flags);
EXPORT_SYMBOL(__global_restore_flags);
EXPORT_SYMBOL(smp_message_pass);
EXPORT_SYMBOL(mtrr_hook);
#endif
......
......@@ -953,7 +953,7 @@ static inline void self_IPI(unsigned int irq)
if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
desc->status = status | IRQ_REPLAY;
send_IPI(APIC_DEST_SELF, IO_APIC_VECTOR(irq));
send_IPI_self(IO_APIC_VECTOR(irq));
}
}
......
......@@ -189,7 +189,7 @@ BUILD_IRQ(60) BUILD_IRQ(61) BUILD_IRQ(62) BUILD_IRQ(63)
/*
* The following vectors are part of the Linux architecture, there
* is no hardware IRQ pin equivalent for them, they are triggered
* through the ICC by us (IPIs), via smp_message_pass():
* through the ICC by us (IPIs)
*/
BUILD_SMP_INTERRUPT(reschedule_interrupt)
BUILD_SMP_INTERRUPT(invalidate_interrupt)
......@@ -297,7 +297,7 @@ int get_irq_list(char *buf)
}
p += sprintf(p, "NMI: %10u\n", atomic_read(&nmi_counter));
#ifdef __SMP__
p += sprintf(p, "IPI: %10lu\n", ipi_count);
p += sprintf(p, "ERR: %10lu\n", ipi_count);
#endif
return p - buf;
}
......@@ -989,22 +989,22 @@ __initfunc(void init_IRQ(void))
*/
/* IPI for rescheduling */
set_intr_gate(0x30, reschedule_interrupt);
set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
/* IPI for invalidation */
set_intr_gate(0x31, invalidate_interrupt);
set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
/* IPI for CPU halt */
set_intr_gate(0x40, stop_cpu_interrupt);
set_intr_gate(STOP_CPU_VECTOR, stop_cpu_interrupt);
/* self generated IPI for local APIC timer */
set_intr_gate(0x41, apic_timer_interrupt);
set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
/* IPI for MTRR control */
set_intr_gate(0x50, mtrr_interrupt);
set_intr_gate(MTRR_CHANGE_VECTOR, mtrr_interrupt);
/* IPI vector for APIC spurious interrupts */
set_intr_gate(0xff, spurious_interrupt);
set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
#endif
request_region(0x20,0x20,"pic1");
request_region(0xa0,0x20,"pic2");
......
......@@ -40,8 +40,29 @@ typedef struct {
unsigned int depth; /* Disable depth for nested irq disables */
} irq_desc_t;
/*
* Special IRQ vectors used by the SMP architecture:
*
* (some of the following vectors are 'rare', they might be merged
* into a single vector to save vector space. TLB, reschedule and
* local APIC vectors are performance-critical.)
*/
#define RESCHEDULE_VECTOR 0x30
#define INVALIDATE_TLB_VECTOR 0x31
#define STOP_CPU_VECTOR 0x40
#define LOCAL_TIMER_VECTOR 0x41
#define MTRR_CHANGE_VECTOR 0x50
/*
* First vector available to drivers: (vectors 0x51-0xfe)
*/
#define IRQ0_TRAP_VECTOR 0x51
/*
* This IRQ should never happen, but we print a message nevertheless.
*/
#define SPURIOUS_APIC_VECTOR 0xff
extern irq_desc_t irq_desc[NR_IRQS];
extern int irq_vector[NR_IRQS];
#define IO_APIC_VECTOR(irq) irq_vector[irq]
......@@ -56,17 +77,18 @@ extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
* Interrupt entry/exit code at both C and assembly level
*/
void mask_irq(unsigned int irq);
void unmask_irq(unsigned int irq);
void disable_8259A_irq(unsigned int irq);
int i8259A_irq_pending(unsigned int irq);
void ack_APIC_irq(void);
void setup_IO_APIC(void);
int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
void make_8259A_irq(unsigned int irq);
void send_IPI(int dest, int vector);
void init_pic_mode(void);
void print_IO_APIC(void);
extern void mask_irq(unsigned int irq);
extern void unmask_irq(unsigned int irq);
extern void disable_8259A_irq(unsigned int irq);
extern int i8259A_irq_pending(unsigned int irq);
extern void ack_APIC_irq(void);
extern void setup_IO_APIC(void);
extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
extern void make_8259A_irq(unsigned int irq);
extern void FASTCALL(send_IPI_self(int vector));
extern void smp_send_mtrr(void);
extern void init_pic_mode(void);
extern void print_IO_APIC(void);
extern unsigned long long io_apic_irqs;
......
......@@ -164,6 +164,9 @@
#include <asm/bitops.h>
#include <asm/atomic.h>
#include <asm/hardirq.h>
#include "irq.h"
#define MTRR_VERSION "1.26 (19981001)"
#define TRUE 1
......@@ -612,7 +615,7 @@ static void do_all_cpus (void (*handler) (struct set_mtrr_context *ctxt,
/* Send a message to all other CPUs and wait for them to enter the
barrier */
atomic_set (&undone_count, smp_num_cpus - 1);
smp_message_pass (MSG_ALL_BUT_SELF, MSG_MTRR_CHANGE, 0, 0);
smp_send_mtrr();
/* Wait for it to be done */
timeout = jiffies + JIFFIE_TIMEOUT;
while ( (atomic_read (&undone_count) > 0) &&
......
......@@ -140,11 +140,18 @@ int cpu_idle(void *unused)
current->priority = 0;
current->counter = -100;
while(1) {
if (current_cpu_data.hlt_works_ok && !hlt_counter && !current->need_resched)
if (current_cpu_data.hlt_works_ok && !hlt_counter &&
!current->need_resched)
__asm__("hlt");
/*
* although we are an idle CPU, we do not want to
* get into the scheduler unnecessarily.
*/
if (current->need_resched) {
schedule();
check_pgt_cache();
}
}
}
#endif
......
......@@ -3,12 +3,14 @@
* hosts.
*
* (c) 1995 Alan Cox, CymruNET Ltd <alan@cymru.net>
* (c) 1998 Ingo Molnar
*
* Supported by Caldera http://www.caldera.com.
* Much of the core SMP work is based on previous work by Thomas Radke, to
* whom a great many thanks are extended.
*
* Thanks to Intel for making available several different Pentium and
* Pentium Pro MP machines.
* Thanks to Intel for making available several different Pentium,
* Pentium Pro and Pentium-II/Xeon MP machines.
*
* This code is released under the GNU public license version 2 or
* later.
......@@ -26,6 +28,7 @@
* Ingo Molnar : Added APIC timers, based on code
* from Jose Renau
* Alan Cox : Added EBDA scanning
* Ingo Molnar : various cleanups and rewrites
*/
#include <linux/config.h>
......@@ -41,6 +44,7 @@
#include <asm/bitops.h>
#include <asm/pgtable.h>
#include <asm/io.h>
#include <linux/io_trace.h>
#ifdef CONFIG_MTRR
# include <asm/mtrr.h>
......@@ -112,6 +116,12 @@ extern __inline int max(int a,int b)
return b;
}
/*
* function prototypes:
*/
static void cache_APIC_registers (void);
static int smp_b_stepping = 0; /* Set if we find a B stepping CPU */
static int max_cpus = -1; /* Setup configured maximum number of CPUs to activate */
......@@ -131,19 +141,14 @@ unsigned long mp_ioapic_addr = 0xFEC00000; /* Address of the I/O apic (not yet
unsigned char boot_cpu_id = 0; /* Processor that is doing the boot up */
static int smp_activated = 0; /* Tripped once we need to start cross invalidating */
int apic_version[NR_CPUS]; /* APIC version number */
static volatile int smp_commenced=0; /* Tripped when we start scheduling */
volatile int smp_commenced=0; /* Tripped when we start scheduling */
unsigned long apic_retval; /* Just debugging the assembler.. */
static volatile unsigned char smp_cpu_in_msg[NR_CPUS]; /* True if this processor is sending an IPI */
volatile unsigned long kernel_counter=0; /* Number of times the processor holds the lock */
volatile unsigned long syscall_count=0; /* Number of times the processor holds the syscall lock */
volatile unsigned long ipi_count; /* Number of IPIs delivered */
volatile unsigned long smp_proc_in_lock[NR_CPUS] = {0,};/* for computing process time */
volatile int smp_process_available=0;
const char lk_lockmsg[] = "lock from interrupt context at %p\n";
int mp_bus_id_to_type [MAX_MP_BUSSES] = { -1, };
......@@ -245,7 +250,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4))
{
printk("Bad signature [%c%c%c%c].\n",
panic("SMP mptable: bad signature [%c%c%c%c]!\n",
mpc->mpc_signature[0],
mpc->mpc_signature[1],
mpc->mpc_signature[2],
......@@ -254,7 +259,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
}
if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length))
{
printk("Checksum error.\n");
panic("SMP mptable: checksum error!\n");
return 1;
}
if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04)
......@@ -760,11 +765,7 @@ void __init initialize_secondary(void)
/*
* We don't actually need to load the full TSS,
* basically just the stack pointer and the eip.
*
* Get the scheduler lock, because we're going
* to release it as part of the "reschedule" return.
*/
spin_lock(&scheduler_lock);
asm volatile(
"movl %0,%%esp\n\t"
......@@ -1165,6 +1166,7 @@ void __init smp_boot_cpus(void)
printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
SMP_PRINTK(("Boot done.\n"));
cache_APIC_registers();
/*
* Here we can be sure that there is an IO-APIC in the system. Let's
* go and set it up:
......@@ -1175,257 +1177,280 @@ void __init smp_boot_cpus(void)
smp_done:
}
void send_IPI(int dest, int vector)
{
unsigned long cfg;
unsigned long flags;
__save_flags(flags);
__cli();
/*
* prepare target chip field
/*
* the following functions deal with sending IPIs between CPUs.
*
* We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
*/
cfg = apic_read(APIC_ICR2) & 0x00FFFFFF;
apic_write(APIC_ICR2, cfg|SET_APIC_DEST_FIELD(dest));
cfg = apic_read(APIC_ICR);
cfg &= ~0xFDFFF;
cfg |= APIC_DEST_FIELD|APIC_DEST_DM_FIXED|vector;
cfg |= dest;
/*
* Send the IPI. The write to APIC_ICR fires this off.
/*
* Silly serialization to work around CPU bug in P5s.
* We can safely turn it off on a 686.
*/
#if defined(CONFIG_M686) & !defined(SMP_DEBUG)
# define FORCE_APIC_SERIALIZATION 0
#else
# define FORCE_APIC_SERIALIZATION 1
#endif
apic_write(APIC_ICR, cfg);
__restore_flags(flags);
}
static unsigned int cached_APIC_ICR;
static unsigned int cached_APIC_ICR2;
/*
* A non wait message cannot pass data or CPU source info. This current setup
* is only safe because the kernel lock owner is the only person who can send
* a message.
*
* Wrapping this whole block in a spinlock is not the safe answer either. A
* processor may get stuck with IRQs off waiting to send a message and thus
* not replying to the person spinning for a reply.
* Caches reserved bits, APIC reads are (mildly) expensive
* and force otherwise unnecessary CPU synchronization.
*
* In the end flush tlb ought to be the NMI and a very short function
* (to avoid the old IDE disk problems), and other messages sent with IRQs
* enabled in a civilised fashion. That will also boost performance.
* (We could cache other APIC registers too, but these are the
* main ones used in RL.)
*/
#define slow_ICR (apic_read(APIC_ICR) & ~0xFDFFF)
#define slow_ICR2 (apic_read(APIC_ICR2) & 0x00FFFFFF)
void smp_message_pass(int target, int msg, unsigned long data, int wait)
void cache_APIC_registers (void)
{
unsigned long cfg;
unsigned long dest = 0;
unsigned long target_map;
int p=smp_processor_id();
int irq;
int ct=0;
cached_APIC_ICR = slow_ICR;
cached_APIC_ICR2 = slow_ICR2;
mb();
}
static inline unsigned int __get_ICR (void)
{
#if FORCE_APIC_SERIALIZATION
/*
* During boot up send no messages
* Wait for the APIC to become ready - this should never occur. It's
* a debugging check really.
*/
int count = 0;
unsigned int cfg;
if (!smp_activated || !smp_commenced)
return;
IO_trace (IO_smp_wait_apic_start, 0, 0, 0, 0);
while (count < 1000)
{
cfg = slow_ICR;
if (!(cfg&(1<<12))) {
IO_trace (IO_smp_wait_apic_end, 0, 0, 0, 0);
if (count)
atomic_add(count, (atomic_t*)&ipi_count);
return cfg;
}
count++;
udelay(10);
}
printk("CPU #%d: previous IPI still not cleared after 10mS\n",
smp_processor_id());
return cfg;
#else
return cached_APIC_ICR;
#endif
}
static inline unsigned int __get_ICR2 (void)
{
#if FORCE_APIC_SERIALIZATION
return slow_ICR2;
#else
return cached_APIC_ICR2;
#endif
}
/*
* Skip the reschedule if we are waiting to clear a
* message at this time. The reschedule cannot wait
* but is not critical.
*/
static inline int __prepare_ICR (unsigned int shortcut, int vector)
{
unsigned int cfg;
switch (msg) {
case MSG_RESCHEDULE:
irq = 0x30;
if (smp_cpu_in_msg[p])
return;
break;
cfg = __get_ICR();
cfg |= APIC_DEST_FIELD|APIC_DEST_DM_FIXED|shortcut|vector;
case MSG_INVALIDATE_TLB:
/* make this a NMI some day */
irq = 0x31;
break;
return cfg;
}
case MSG_STOP_CPU:
irq = 0x40;
break;
static inline int __prepare_ICR2 (unsigned int dest)
{
unsigned int cfg;
case MSG_MTRR_CHANGE:
irq = 0x50;
break;
cfg = __get_ICR2();
cfg |= SET_APIC_DEST_FIELD(dest);
default:
printk("Unknown SMP message %d\n", msg);
return;
}
return cfg;
}
/*
* Sanity check we don't re-enter this across CPUs. Only the kernel
* lock holder may send messages. For a STOP_CPU we are bringing the
* entire box to the fastest halt we can. A reschedule carries
* no data and can occur during a flush. Guess what panic
* I got to notice this bug.
static inline void __send_IPI_shortcut(unsigned int shortcut, int vector)
{
unsigned int cfg;
/*
* Subtle. In the case of the 'never do double writes' workaround we
* have to lock out interrupts to be safe. Otherwise it's just one
* single atomic write to the APIC, no need for cli/sti.
*/
#if FORCE_APIC_SERIALIZATION
unsigned long flags;
__save_flags(flags);
__cli();
#endif
/*
* We are busy.
* No need to touch the target chip field
*/
smp_cpu_in_msg[p]++;
/* printk("SMP message pass #%d to %d of %d\n",
p, msg, target);*/
cfg = __prepare_ICR(shortcut, vector);
/*
* Wait for the APIC to become ready - this should never occur. It's
* a debugging check really.
* Send the IPI. The write to APIC_ICR fires this off.
*/
while (ct<1000)
{
cfg=apic_read(APIC_ICR);
if (!(cfg&(1<<12)))
break;
ct++;
udelay(10);
}
IO_trace (IO_smp_send_ipi, shortcut, vector, cfg, 0);
/*
* Just pray... there is nothing more we can do
*/
apic_write(APIC_ICR, cfg);
#if FORCE_APIC_SERIALIZATION
__restore_flags(flags);
#endif
}
if (ct==1000)
printk("CPU #%d: previous IPI still not cleared after 10mS\n", p);
static inline void send_IPI_allbutself(int vector)
{
__send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
}
/*
* Set the target requirement
*/
static inline void send_IPI_all(int vector)
{
__send_IPI_shortcut(APIC_DEST_ALLINC, vector);
}
if (target==MSG_ALL_BUT_SELF)
{
dest=APIC_DEST_ALLBUT;
target_map=cpu_present_map;
cpu_callin_map[0]=(1<<p);
}
else if (target==MSG_ALL)
{
dest=APIC_DEST_ALLINC;
target_map=cpu_present_map;
cpu_callin_map[0]=0;
}
else
{
dest=0;
target_map=(1<<target);
cpu_callin_map[0]=0;
}
void send_IPI_self(int vector)
{
__send_IPI_shortcut(APIC_DEST_SELF, vector);
}
static inline void send_IPI_single(int dest, int vector)
{
unsigned long cfg;
#if FORCE_APIC_SERIALIZATION
unsigned long flags;
__save_flags(flags);
__cli();
#endif
/*
* Program the APIC to deliver the IPI
* prepare target chip field
*/
send_IPI(dest,irq);
cfg = __prepare_ICR2(dest);
apic_write(APIC_ICR2, cfg);
/*
* Spin waiting for completion
* program the ICR
*/
switch(wait)
{
int stuck;
case 1:
stuck = 50000000;
while(cpu_callin_map[0]!=target_map) {
--stuck;
if (!stuck) {
printk("stuck on target_map IPI wait\n");
break;
}
}
break;
case 2:
stuck = 50000000;
/* Wait for invalidate map to clear */
while (smp_invalidate_needed) {
/* Take care of "crossing" invalidates */
if (test_bit(p, &smp_invalidate_needed))
clear_bit(p, &smp_invalidate_needed);
--stuck;
if (!stuck) {
printk("stuck on smp_invalidate_needed IPI wait (CPU#%d)\n",p);
break;
}
}
break;
}
cfg = __prepare_ICR(0, vector);
/*
* Record our completion
* Send the IPI. The write to APIC_ICR fires this off.
*/
smp_cpu_in_msg[p]--;
IO_trace (IO_smp_send_ipi, dest, vector, cfg, 0);
apic_write(APIC_ICR, cfg);
#if FORCE_APIC_SERIALIZATION
__restore_flags(flags);
#endif
}
/*
* This is fraught with deadlocks. Linus does a flush tlb at a whim
* even with IRQs off. We have to avoid a pair of crossing flushes
* or we are doomed. See the notes about smp_message_pass.
* This is fraught with deadlocks. Probably the situation is not that
* bad as in the early days of SMP, so we might ease some of the
* paranoia here.
*/
void smp_flush_tlb(void)
{
int cpu = smp_processor_id();
int stuck;
unsigned long flags;
/* printk("SMI-");*/
/*
* The assignment is safe because it's volatile so the compiler cannot reorder it,
* because the i586 has strict memory ordering and because only the kernel lock holder
* may issue a tlb flush. If you break any one of those three change this to an atomic
* bus locked or.
* The assignment is safe because it's volatile so the
* compiler cannot reorder it, because the i586 has
* strict memory ordering and because only the kernel
* lock holder may issue a tlb flush. If you break any
* one of those three change this to an atomic bus
* locked or.
*/
smp_invalidate_needed=cpu_present_map;
smp_invalidate_needed = cpu_present_map;
/*
* Processors spinning on the lock will see this IRQ late. The smp_invalidate_needed map will
* ensure they don't do a spurious flush tlb or miss one.
* Processors spinning on some lock with IRQs disabled
* will see this IRQ late. The smp_invalidate_needed
* map will ensure they don't do a spurious flush tlb
* or miss one.
*/
__save_flags(flags);
__cli();
smp_message_pass(MSG_ALL_BUT_SELF, MSG_INVALIDATE_TLB, 0L, 2);
IO_trace (IO_smp_message, 0, 0, 0, 0);
send_IPI_allbutself(INVALIDATE_TLB_VECTOR);
/*
* Flush the local TLB
* Spin waiting for completion
*/
stuck = 50000000;
while (smp_invalidate_needed) {
/*
* Take care of "crossing" invalidates
*/
if (test_bit(cpu, &smp_invalidate_needed))
clear_bit(cpu, &smp_invalidate_needed);
--stuck;
if (!stuck) {
printk("stuck on TLB IPI wait (CPU#%d)\n",cpu);
break;
}
}
/*
* Flush the local TLB
*/
local_flush_tlb();
__restore_flags(flags);
}
/*
* Completed.
/*
* this function sends a 'reschedule' IPI to another CPU.
* it goes straight through and wastes no time serializing
* anything. Worst case is that we lose a reschedule ...
*/
/* printk("SMID\n");*/
void smp_send_reschedule(int cpu)
{
send_IPI_single(cpu, RESCHEDULE_VECTOR);
}
/*
* this function sends a 'stop' IPI to all other CPUs in the system.
* it goes straight through.
*/
void smp_send_reschedule(int cpu)
void smp_send_stop(void)
{
unsigned long flags;
send_IPI_allbutself(STOP_CPU_VECTOR);
}
__save_flags(flags);
__cli();
smp_message_pass(cpu, MSG_RESCHEDULE, 0L, 0);
__restore_flags(flags);
/*
* this function sends an 'reload MTRR state' IPI to all other CPUs
* in the system. it goes straight through, completion processing
* is done on the mttr.c level.
*/
void smp_send_mtrr(void)
{
send_IPI_allbutself(MTRR_CHANGE_VECTOR);
}
/*
......@@ -1531,6 +1556,9 @@ void smp_apic_timer_interrupt(struct pt_regs * regs)
*/
asmlinkage void smp_reschedule_interrupt(void)
{
IO_trace (IO_smp_reschedule, current->need_resched,
current->priority, current->counter, 0);
ack_APIC_irq();
}
......@@ -1539,6 +1567,9 @@ asmlinkage void smp_reschedule_interrupt(void)
*/
asmlinkage void smp_invalidate_interrupt(void)
{
IO_trace (IO_smp_tlbflush,
atomic_read((atomic_t *)&smp_invalidate_needed), 0, 0, 0);
if (test_and_clear_bit(smp_processor_id(), &smp_invalidate_needed))
local_flush_tlb();
......@@ -1626,12 +1657,9 @@ void setup_APIC_timer(unsigned int clocks)
* Unfortunately the local APIC timer cannot be set up into NMI
* mode. With the IO APIC we can re-route the external timer
* interrupt and broadcast it as an NMI to all CPUs, so no pain.
*
* NOTE: this trap vector (0x41) and the gate in
* BUILD_SMP_TIMER_INTERRUPT should be the same ;)
*/
tmp_value = apic_read(APIC_LVTT);
lvtt1_value = APIC_LVT_TIMER_PERIODIC | 0x41;
lvtt1_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
apic_write(APIC_LVTT , lvtt1_value);
/*
......
......@@ -72,6 +72,8 @@ extern int setup_x86_irq(int, struct irqaction *);
unsigned long cpu_hz; /* Detected as we calibrate the TSC */
cycles_t cacheflush_time;
/* Number of usecs that the last interrupt was delayed */
static int delay_at_last_interrupt;
......@@ -96,7 +98,6 @@ static unsigned long do_fast_gettimeoffset(void)
:"=a" (eax), "=d" (edx));
/* .. relative to previous jiffy (32 bits is enough) */
edx = 0;
eax -= last_tsc_low; /* tsc_low delta */
/*
......@@ -110,11 +111,11 @@ static unsigned long do_fast_gettimeoffset(void)
__asm__("mull %2"
:"=a" (eax), "=d" (edx)
:"r" (fast_gettimeoffset_quotient),
"0" (eax), "1" (edx));
:"g" (fast_gettimeoffset_quotient),
"0" (eax));
/* our adjusted time offset in microseconds */
return edx + delay_at_last_interrupt;
return delay_at_last_interrupt + edx;
}
/* This function must be called with interrupts disabled
......@@ -240,17 +241,26 @@ void do_gettimeofday(struct timeval *tv)
{
extern volatile unsigned long lost_ticks;
unsigned long flags;
unsigned long usec, sec;
read_lock_irqsave(&xtime_lock, flags);
*tv = xtime;
tv->tv_usec += do_gettimeoffset();
if (lost_ticks)
tv->tv_usec += lost_ticks * (1000000/HZ);
usec = do_gettimeoffset();
{
unsigned long lost = lost_ticks;
if (lost)
usec += lost * (1000000 / HZ);
}
sec = xtime.tv_sec;
usec += xtime.tv_usec;
read_unlock_irqrestore(&xtime_lock, flags);
while (tv->tv_usec >= 1000000) {
tv->tv_usec -= 1000000;
tv->tv_sec++;
while (usec >= 1000000) {
usec -= 1000000;
sec++;
}
tv->tv_sec = sec;
tv->tv_usec = usec;
}
void do_settimeofday(struct timeval *tv)
......@@ -377,13 +387,6 @@ static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *reg
else
last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */
}
#if 0
/* As we return to user mode fire off the other CPU schedulers.. this is
basically because we don't yet share IRQ's around. This message is
rigged to be safe on the 386 - basically it's a hack, so don't look
closely for now.. */
smp_message_pass(MSG_ALL_BUT_SELF, MSG_RESCHEDULE, 0L, 0);
#endif
#ifdef CONFIG_MCA
if( MCA_bus ) {
......@@ -639,5 +642,13 @@ __initfunc(void time_init(void))
printk("Detected %ld Hz processor.\n", cpu_hz);
}
}
/*
* Rough estimation for SMP scheduling, this is the number of
* cycles it takes for a fully memory-limited process to flush
* the SMP-local cache.
*/
cacheflush_time = cpu_hz/10000;
setup_x86_irq(0, &irq0);
}
......@@ -45,9 +45,13 @@ SECTIONS
. = ALIGN(4096);
__init_end = .;
. = ALIGN(32);
.data.cacheline_aligned : { *(.data.cacheline_aligned) }
. = ALIGN(4096);
.data.page_aligned : { *(.data.idt) }
__bss_start = .; /* BSS */
.bss : {
*(.bss)
......
......@@ -15,7 +15,7 @@ typedef struct { volatile int counter; } atomic_t;
typedef struct { int counter; } atomic_t;
#endif
#define ATOMIC_INIT(i) { (i) }
#define ATOMIC_INIT(i) ( (atomic_t) { (i) } )
#define atomic_read(v) ((v)->counter)
#define atomic_set(v,i) ((v)->counter = (i))
......
......@@ -458,7 +458,7 @@ __EXTERN_INLINE unsigned int apecs_inb(unsigned long addr)
__EXTERN_INLINE void apecs_outb(unsigned char b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + APECS_IO + 0x00) = w;
......@@ -473,7 +473,7 @@ __EXTERN_INLINE unsigned int apecs_inw(unsigned long addr)
__EXTERN_INLINE void apecs_outw(unsigned short b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + APECS_IO + 0x08) = w;
......
......@@ -326,7 +326,7 @@ __EXTERN_INLINE unsigned int cia_inb(unsigned long addr)
__EXTERN_INLINE void cia_outb(unsigned char b, unsigned long addr)
{
unsigned int w = __kernel_insbl(b, addr & 3);
unsigned long w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + CIA_IO + 0x00) = w;
wmb();
}
......@@ -340,7 +340,7 @@ __EXTERN_INLINE unsigned int cia_inw(unsigned long addr)
__EXTERN_INLINE void cia_outw(unsigned short b, unsigned long addr)
{
unsigned int w = __kernel_inswl(b, addr & 3);
unsigned long w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + CIA_IO + 0x08) = w;
wmb();
}
......
......@@ -262,7 +262,7 @@ __EXTERN_INLINE unsigned int lca_inb(unsigned long addr)
__EXTERN_INLINE void lca_outb(unsigned char b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + LCA_IO + 0x00) = w;
......@@ -277,7 +277,7 @@ __EXTERN_INLINE unsigned int lca_inw(unsigned long addr)
__EXTERN_INLINE void lca_outw(unsigned short b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + LCA_IO + 0x08) = w;
......@@ -340,7 +340,7 @@ __EXTERN_INLINE unsigned long lca_readq(unsigned long addr)
__EXTERN_INLINE void lca_writeb(unsigned char b, unsigned long addr)
{
unsigned long msb;
unsigned int w;
unsigned long w;
if (addr >= (1UL << 24)) {
msb = addr & 0xf8000000;
......@@ -354,7 +354,7 @@ __EXTERN_INLINE void lca_writeb(unsigned char b, unsigned long addr)
__EXTERN_INLINE void lca_writew(unsigned short b, unsigned long addr)
{
unsigned long msb;
unsigned int w;
unsigned long w;
if (addr >= (1UL << 24)) {
msb = addr & 0xf8000000;
......
......@@ -264,7 +264,7 @@ __EXTERN_INLINE void mcpcia_outb(unsigned char b, unsigned long in_addr)
{
unsigned long addr = in_addr & 0xffffffffUL;
unsigned long hose = (in_addr >> 32) & 3;
unsigned int w;
unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + MCPCIA_IO(hose) + 0x00) = w;
......@@ -283,7 +283,7 @@ __EXTERN_INLINE void mcpcia_outw(unsigned short b, unsigned long in_addr)
{
unsigned long addr = in_addr & 0xffffffffUL;
unsigned long hose = (in_addr >> 32) & 3;
unsigned int w;
unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + MCPCIA_IO(hose) + 0x08) = w;
......
......@@ -326,7 +326,7 @@ __EXTERN_INLINE unsigned int pyxis_inb(unsigned long addr)
__EXTERN_INLINE void pyxis_outb(unsigned char b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + PYXIS_IO + 0x00) = w;
......@@ -341,7 +341,7 @@ __EXTERN_INLINE unsigned int pyxis_inw(unsigned long addr)
__EXTERN_INLINE void pyxis_outw(unsigned short b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + PYXIS_IO + 0x08) = w;
......
......@@ -378,7 +378,7 @@ __EXTERN_INLINE unsigned int t2_inw(unsigned long addr)
__EXTERN_INLINE void t2_outw(unsigned short b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + T2_IO + 0x08) = w;
......
......@@ -3,7 +3,6 @@
#include <linux/config.h>
#include <asm/system.h>
#include <asm/machvec.h>
/* We don't use IO slowdowns on the Alpha, but.. */
#define __SLOW_DOWN_IO do { } while (0)
......@@ -19,6 +18,7 @@
#endif
#ifdef __KERNEL__
#include <asm/machvec.h>
/*
* We try to avoid hae updates (thus the cache), but when we
......@@ -78,6 +78,7 @@ extern void _sethae (unsigned long addr); /* cached version */
* There are different chipsets to interface the Alpha CPUs to the world.
*/
#ifdef __KERNEL__
#ifdef CONFIG_ALPHA_GENERIC
/* In a generic kernel, we always go through the machine vector. */
......@@ -147,6 +148,7 @@ extern void _sethae (unsigned long addr); /* cached version */
#undef __WANT_IO_DEF
#endif /* GENERIC */
#endif /* __KERNEL__ */
/*
* The convention used for inb/outb etc. is that names starting with
......@@ -172,6 +174,7 @@ extern void _writew(unsigned short b, unsigned long addr);
extern void _writel(unsigned int b, unsigned long addr);
extern void _writeq(unsigned long b, unsigned long addr);
#ifdef __KERNEL__
/*
* The platform header files may define some of these macros to use
* the inlined versions where appropriate. These macros may also be
......@@ -216,6 +219,27 @@ extern void _writeq(unsigned long b, unsigned long addr);
# define outl_p outl
#endif
#else
/* Userspace declarations. */
extern unsigned int inb (unsigned long port);
extern unsigned int inw (unsigned long port);
extern unsigned int inl (unsigned long port);
extern void outb (unsigned char b,unsigned long port);
extern void outw (unsigned short w,unsigned long port);
extern void outl (unsigned int l,unsigned long port);
extern unsigned long readb(unsigned long addr);
extern unsigned long readw(unsigned long addr);
extern unsigned long readl(unsigned long addr);
extern void writeb(unsigned char b, unsigned long addr);
extern void writew(unsigned short b, unsigned long addr);
extern void writel(unsigned int b, unsigned long addr);
#endif /* __KERNEL__ */
#ifdef __KERNEL__
/*
* The "address" in IO memory space is not clearly either an integer or a
* pointer. We will accept both, thus the casts.
......@@ -257,8 +281,6 @@ static inline void iounmap(void *addr)
# define writeq(v,a) _writeq((v),(unsigned long)(a))
#endif
#ifdef __KERNEL__
/*
* String version of IO memory access ops:
*/
......
......@@ -117,6 +117,7 @@ extern inline void disable_bh(int nr)
{
bh_mask &= ~(1 << nr);
atomic_inc(&bh_mask_count[nr]);
synchronize_bh();
}
extern inline void enable_bh(int nr)
......
......@@ -11,4 +11,7 @@
#define __FINIT .previous
#define __INITDATA .section ".data.init",#alloc,#write
#define __cacheline_aligned __attribute__ \
((__section__ (".data.cacheline_aligned")))
#endif
......@@ -185,10 +185,6 @@ extern inline int cpu_logical_map(int cpu)
extern void smp_callin(void);
extern void smp_boot_cpus(void);
extern void smp_store_cpu_info(int id); /* Store per CPU info (like the initial udelay numbers */
extern void smp_message_pass(int target, int msg, unsigned long data, int wait);
extern volatile unsigned long smp_proc_in_lock[NR_CPUS]; /* for computing process time */
extern volatile int smp_process_available;
/*
* APIC handlers: Note according to the Intel specification update
......@@ -237,9 +233,7 @@ extern __inline int hard_smp_processor_id(void)
* processes are run.
*/
#define PROC_CHANGE_PENALTY 10 /* Schedule penalty */
#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
#define SMP_FROM_INT 1
#define SMP_FROM_SYSCALL 2
#endif
#endif
torvalds@penguin.transmeta.com
\ No newline at end of file
......@@ -12,4 +12,22 @@
(1000000/CLOCK_TICK_FACTOR) / (CLOCK_TICK_RATE/CLOCK_TICK_FACTOR)) \
<< (SHIFT_SCALE-SHIFT_HZ)) / HZ)
/*
 * Standard way to access the cycle counter on i586+ CPUs.
 * Currently only used on SMP.
 *
 * The counter is read with the RDTSC instruction, which returns the
 * 64-bit timestamp in the EDX:EAX register pair.
 */
typedef unsigned long long cycles_t;
/* Cost (in cycles) of flushing the cache; compared against a task's
   avg_slice by the scheduler's CPU-affinity heuristic. Defined
   elsewhere. */
extern cycles_t cacheflush_time;
static inline cycles_t get_cycles (void)
{
cycles_t value;
/* Assemble the 64-bit result by storing EAX into the low and EDX
   into the high 32-bit half of 'value' through int pointers.
   NOTE(review): relies on the little-endian layout of long long --
   fine on x86, the only target of this header. */
__asm__("rdtsc"
:"=a" (*(((int *)&value)+0)),
"=d" (*(((int *)&value)+1)));
return value;
}
#endif
......@@ -11,6 +11,7 @@ extern unsigned long event;
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/timex.h>
#include <asm/system.h>
#include <asm/semaphore.h>
......@@ -219,6 +220,7 @@ struct task_struct {
/* various fields */
long counter;
long priority;
cycles_t avg_slice;
/* SMP and runqueue state */
int has_cpu;
int processor;
......@@ -336,7 +338,7 @@ struct task_struct {
*/
#define INIT_TASK \
/* state etc */ { 0,0,0,KERNEL_DS,&default_exec_domain,0, \
/* counter */ DEF_PRIORITY,DEF_PRIORITY, \
/* counter */ DEF_PRIORITY,DEF_PRIORITY,0, \
/* SMP */ 0,0,0,-1, \
/* schedlink */ &init_task,&init_task, &init_task, &init_task, \
/* binfmt */ NULL, \
......
......@@ -11,11 +11,21 @@
#include <asm/smp.h>
/*
* main IPI interface, handles INIT, TLB flush, STOP, etc. (defined in asm header):
*
* extern void smp_message_pass(int target, int msg, unsigned long data, int wait);
* main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
* (defined in asm header):
*/
/*
* stops all CPUs but the current one:
*/
extern void smp_send_stop(void);
/*
* sends a 'reschedule' event to another CPU:
*/
extern void FASTCALL(smp_send_reschedule(int cpu));
/*
* Boot processor call to load the other CPU's
*/
......@@ -61,7 +71,6 @@ extern volatile int smp_msg_id;
#define smp_num_cpus 1
#define smp_processor_id() 0
#define hard_smp_processor_id() 0
#define smp_message_pass(t,m,d,w)
#define smp_threads_ready 1
#define kernel_lock()
#define cpu_logical_map(cpu) 0
......
......@@ -1177,6 +1177,7 @@ asmlinkage void __init start_kernel(void)
*/
smp_init();
kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
current->need_resched = 1;
cpu_idle(NULL);
}
......
......@@ -50,7 +50,7 @@ NORET_TYPE void panic(const char * fmt, ...)
unblank_console();
#ifdef __SMP__
smp_message_pass(MSG_ALL_BUT_SELF, MSG_STOP_CPU, 0, 0);
smp_send_stop();
#endif
if (panic_timeout > 0)
{
......
......@@ -12,6 +12,7 @@
* 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
* serialize accesses to xtime/lost_ticks).
* Copyright (C) 1998 Andrea Arcangeli
* 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
*/
/*
......@@ -96,47 +97,110 @@ struct kernel_stat kstat = { 0 };
void scheduling_functions_start_here(void) { }
static inline void reschedule_idle(struct task_struct * p)
#ifdef __SMP__
static void reschedule_idle_slow(struct task_struct * p)
{
/*
* (see reschedule_idle() for an explanation first ...)
*
* Pass #2
*
* We try to find another (idle) CPU for this woken-up process.
*
* On SMP, we mostly try to see if the CPU the task used
* to run on is idle.. but we will use another idle CPU too,
* at this point we already know that this CPU is not
* willing to reschedule in the near future.
*
* An idle CPU is definitely wasted, especially if this CPU is
* running long-timeslice processes. The following algorithm is
* pretty good at finding the best idle CPU to send this process
* to.
*
* [We can try to preempt low-priority processes on other CPUs in
* 2.3. Also we can try to use the avg_slice value to predict
* 'likely reschedule' events even on other CPUs.]
*/
int best_cpu = p->processor, this_cpu = smp_processor_id();
struct task_struct **idle = task, *tsk, *target_tsk;
int i = smp_num_cpus;
target_tsk = NULL;
do {
tsk = *idle;
idle++;
if (tsk->has_cpu) {
if (tsk->processor == this_cpu)
continue;
target_tsk = tsk;
if (tsk->processor == best_cpu) {
/*
* For SMP, we try to see if the CPU the task used
* to run on is idle..
* bingo, we couldn't get a better
* CPU, activate it.
*/
#if 0
goto send; /* this one helps GCC ... */
}
}
} while (--i > 0);
/*
* Disable this for now. Ingo has some interesting
* code that looks too complex, and I have some ideas,
* but in the meantime.. One problem is that "wakeup()"
* can be (and is) called before we've even initialized
* SMP completely, so..
* found any idle CPU?
*/
#ifdef __SMP__
int want_cpu = p->processor;
if (target_tsk) {
send:
target_tsk->need_resched = 1;
smp_send_reschedule(target_tsk->processor);
return;
}
}
#endif /* __SMP__ */
static inline void reschedule_idle(struct task_struct * p)
{
if (p->policy != SCHED_OTHER || p->counter > current->counter + 3) {
current->need_resched = 1;
return;
}
#ifdef __SMP__
/*
* Don't even try to find another CPU for us if the task
* ran on this one before..
* ("wakeup()" should not be called before we've initialized
* SMP completely. [Linus, is there any exception to this?]
* Basically a not-yet initialized SMP subsystem can be
* considered as a not-yet working scheduler, simply dont use
* it before it'd up and running ...)
*
* SMP rescheduling is done in 2 passes:
* - pass #1: faster: 'quick decisions'
* - pass #2: slower: 'lets try and find another CPU'
*/
if (want_cpu != smp_processor_id()) {
struct task_struct **idle = task;
int i = smp_num_cpus;
do {
struct task_struct *tsk = *idle;
idle++;
/* Something like this.. */
if (tsk->has_cpu && tsk->processor == want_cpu) {
tsk->need_resched = 1;
smp_send_reschedule(want_cpu);
/*
* Pass #1
*
* There are two metrics here:
*
* first, a 'cutoff' interval, currently ~250 usecs on
* x86 CPUs. If the current process has longer average
* timeslices than this, then we utilize the idle CPU.
*
* second, if the wakeup comes from a process context,
* then the two processes are 'related'. (they form a
* 'gang')
*
* An idle CPU is almost always a bad thing, thus we skip
* the idle-CPU utilization only if both these conditions
* are true. (ie. a 'process-gang' rescheduling with rather
* high frequency should stay on the same CPU).
*
* [We can switch to something more finegrained in 2.3.]
*/
if ((current->avg_slice < cacheflush_time) && !in_interrupt())
return;
}
} while (--i > 0);
}
#endif
#endif
if (p->policy != SCHED_OTHER || p->counter > current->counter + 3)
current->need_resched = 1;
reschedule_idle_slow(p);
#endif /* __SMP__ */
}
/*
......@@ -244,6 +308,8 @@ static void process_timeout(unsigned long __data)
wake_up_process(p);
}
/*
 * Penalty applied when a runnable task would have to change CPUs;
 * presumably consulted by the goodness/desirability calculation
 * below -- confirm against its users.  A plain global so it can be
 * tuned at runtime.
 * NOTE(review): a leading underscore followed by a capital letter is
 * reserved for the implementation by the C standard; the name can't
 * be changed here without touching its external users.
 */
int _PROC_CHANGE_PENALTY = 13;
/*
* This is the function that decides how desirable a process is..
* You can weigh different processes against each other depending
......@@ -488,6 +554,63 @@ signed long schedule_timeout(signed long timeout)
return timeout < 0 ? 0 : timeout;
}
/*
* This one aligns per-CPU data on cacheline boundaries.
*/
/*
 * Per-CPU scheduler bookkeeping, one slot per CPU, each padded out
 * to L1_CACHE_BYTES so that two CPUs never share a cache line
 * (avoids false sharing).  Indexed by smp_processor_id().
 */
static union {
struct schedule_data {
/* task we most recently switched away from on this CPU */
struct task_struct * prev;
/* prev->state sampled before the switch; __schedule_tail()
   compares it against the live state to detect a racing wakeup */
long prevstate;
/* cycle-counter timestamp of the last schedule() on this CPU,
   used to maintain each task's avg_slice */
cycles_t last_schedule;
} schedule_data;
char __pad [L1_CACHE_BYTES];
} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
/*
 * Finish a context switch on the NEW task's side: detect a wakeup
 * that raced with the switch, then release the previous task's
 * has_cpu flag so other CPUs may run it.  Compiles to a no-op on
 * uniprocessor builds.  The statement order (state check, mb(),
 * has_cpu clear) is load-bearing -- do not reorder.
 */
static inline void __schedule_tail (void)
{
#ifdef __SMP__
struct schedule_data * sched_data;
/*
 * We might have switched CPUs:
 */
sched_data = & aligned_data[smp_processor_id()].schedule_data;
/*
 * Subtle. In the rare event that we got a wakeup to 'prev' just
 * during the reschedule (this is possible, the scheduler is pretty
 * parallel), we should do another reschedule in the next task's
 * context. schedule() will do the right thing next time around.
 * this is equivalent to 'delaying' the wakeup until the reschedule
 * has finished.
 */
if (sched_data->prev->state != sched_data->prevstate)
current->need_resched = 1;
/*
 * Release the previous process ...
 *
 * We have dropped all locks, and we must make sure that we
 * only mark the previous process as no longer having a CPU
 * after all other state has been seen by other CPU's. Thus
 * the memory barrier!
 */
mb();
sched_data->prev->has_cpu = 0;
#endif /* __SMP__ */
}
/*
 * schedule_tail() -- called from the fork return path on the child's
 * side of its first context switch.  It delegates to the inline
 * __schedule_tail() so the post-switch cleanup is shared with
 * schedule() itself without adding cost to the common case.
 */
void schedule_tail (void)
{
	__schedule_tail();
}
/*
* 'schedule()' is the scheduler function. It's a very simple and nice
* scheduler: it's not perfect, but certainly works for most things.
......@@ -500,11 +623,18 @@ signed long schedule_timeout(signed long timeout)
*/
asmlinkage void schedule(void)
{
struct schedule_data * sched_data;
struct task_struct * prev, * next;
int this_cpu;
prev = current;
this_cpu = prev->processor;
/*
* 'sched_data' is protected by the fact that we can run
* only one process per CPU.
*/
sched_data = & aligned_data[this_cpu].schedule_data;
if (in_interrupt())
goto scheduling_in_interrupt;
release_kernel_lock(prev, this_cpu);
......@@ -519,6 +649,7 @@ asmlinkage void schedule(void)
/* move an exhausted RR process to be last.. */
prev->need_resched = 0;
if (!prev->counter && prev->policy == SCHED_RR) {
prev->counter = prev->priority;
move_last_runqueue(prev);
......@@ -534,6 +665,9 @@ asmlinkage void schedule(void)
del_from_runqueue(prev);
case TASK_RUNNING:
}
sched_data->prevstate = prev->state;
{
struct task_struct * p = init_task.next_run;
/*
......@@ -580,27 +714,49 @@ asmlinkage void schedule(void)
}
}
/*
* maintain the per-process 'average timeslice' value.
* (this has to be recalculated even if we reschedule to
* the same process) Currently this is only used on SMP:
*/
#ifdef __SMP__
next->has_cpu = 1;
#endif
{
cycles_t t, this_slice;
t = get_cycles();
this_slice = t - sched_data->last_schedule;
sched_data->last_schedule = t;
/*
* Simple, exponentially fading average calculation:
*/
prev->avg_slice = this_slice + prev->avg_slice;
prev->avg_slice >>= 1;
}
/*
* We drop the scheduler lock early (it's a global spinlock),
* thus we have to lock the previous process from getting
* rescheduled during switch_to().
*/
prev->has_cpu = 1;
next->has_cpu = 1;
next->processor = this_cpu;
spin_unlock(&scheduler_lock);
#endif /* __SMP__ */
if (prev != next) {
#ifdef __SMP__
next->processor = this_cpu;
sched_data->prev = prev;
#endif
kstat.context_swtch++;
get_mmu_context(next);
switch_to(prev,next);
}
spin_unlock(&scheduler_lock);
__schedule_tail();
}
/*
* At this point "prev" is "current", as we just
* switched into it (from an even more "previous"
* prev)
*/
reacquire_kernel_lock(prev);
reacquire_kernel_lock(current);
return;
scheduling_in_interrupt:
......@@ -608,7 +764,6 @@ asmlinkage void schedule(void)
*(int *)0 = 0;
}
rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment