Commit b7cd5844 authored by Linus Torvalds

Import 2.1.133pre4

parent 9390bd47
......@@ -38,13 +38,6 @@ unsigned int local_irq_count[NR_CPUS];
unsigned int local_bh_count[NR_CPUS];
unsigned long hardirq_no[NR_CPUS];
#define RTC_IRQ 8
#ifdef CONFIG_RTC
#define TIMER_IRQ 0 /* timer is the pit */
#else
#define TIMER_IRQ RTC_IRQ /* the timer is, in fact, the rtc */
#endif
#if NR_IRQS > 64
# error Unable to handle more than 64 irq levels.
#endif
......
......@@ -21,3 +21,11 @@ extern void isa_device_interrupt(unsigned long vector, struct pt_regs * regs);
extern void srm_device_interrupt(unsigned long vector, struct pt_regs * regs);
extern void handle_irq(int irq, int ack, struct pt_regs * regs);
#define RTC_IRQ 8
#ifdef CONFIG_RTC
#define TIMER_IRQ 0 /* timer is the pit */
#else
#define TIMER_IRQ RTC_IRQ /* timer is the rtc */
#endif
......@@ -92,6 +92,12 @@ ruffian_device_interrupt(unsigned long vector, struct pt_regs *regs)
i = ffz(~pld);
pld &= pld - 1; /* clear least bit set */
if (i == 7) { /* if ISA int */
/* Ruffian does not have the RTC connected to
the CPU timer interrupt. Instead, it uses the
PIT connected to IRQ 0. So we must detect that
and route that specifically to where we expected
to find the timer interrupt come in. */
/* Copy this code from isa_device_interrupt because
we need to hook into int 0 for the timer. I
refuse to soil device_interrupt with ifdefs. */
......@@ -107,7 +113,7 @@ ruffian_device_interrupt(unsigned long vector, struct pt_regs *regs)
if (j == 7 && !(inb(0x20) & 0x80)) {
/* It's only a passive release... */
} else if (j == 0) {
handle_irq(8, -1, regs); /* fake it */
handle_irq(TIMER_IRQ, -1, regs);
ruffian_ack_irq(0);
} else {
handle_irq(j, j, regs);
......
......@@ -35,12 +35,7 @@
#include <linux/timex.h>
#include "proto.h"
#ifdef CONFIG_RTC
#define TIMER_IRQ 0 /* using pit for timer */
#else
#define TIMER_IRQ 8 /* using rtc for timer */
#endif
#include "irq.h"
static int set_rtc_mmss(unsigned long);
......
......@@ -153,10 +153,10 @@ ENTRY(lcall7)
ALIGN
.globl ret_from_fork
ret_from_fork:
GET_CURRENT(%ebx)
#ifdef __SMP__
lock ; btrl $0, SYMBOL_NAME(scheduler_lock)
call SYMBOL_NAME(schedule_tail)
#endif /* __SMP__ */
GET_CURRENT(%ebx)
jmp ret_from_sys_call
/*
......
......@@ -83,7 +83,6 @@ EXPORT_SYMBOL(__global_cli);
EXPORT_SYMBOL(__global_sti);
EXPORT_SYMBOL(__global_save_flags);
EXPORT_SYMBOL(__global_restore_flags);
EXPORT_SYMBOL(smp_message_pass);
EXPORT_SYMBOL(mtrr_hook);
#endif
......
......@@ -953,7 +953,7 @@ static inline void self_IPI(unsigned int irq)
if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
desc->status = status | IRQ_REPLAY;
send_IPI(APIC_DEST_SELF, IO_APIC_VECTOR(irq));
send_IPI_self(IO_APIC_VECTOR(irq));
}
}
......
......@@ -189,7 +189,7 @@ BUILD_IRQ(60) BUILD_IRQ(61) BUILD_IRQ(62) BUILD_IRQ(63)
/*
* The following vectors are part of the Linux architecture, there
* is no hardware IRQ pin equivalent for them, they are triggered
* through the ICC by us (IPIs), via smp_message_pass():
* through the ICC by us (IPIs)
*/
BUILD_SMP_INTERRUPT(reschedule_interrupt)
BUILD_SMP_INTERRUPT(invalidate_interrupt)
......@@ -297,7 +297,7 @@ int get_irq_list(char *buf)
}
p += sprintf(p, "NMI: %10u\n", atomic_read(&nmi_counter));
#ifdef __SMP__
p += sprintf(p, "IPI: %10lu\n", ipi_count);
p += sprintf(p, "ERR: %10lu\n", ipi_count);
#endif
return p - buf;
}
......@@ -989,22 +989,22 @@ __initfunc(void init_IRQ(void))
*/
/* IPI for rescheduling */
set_intr_gate(0x30, reschedule_interrupt);
set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
/* IPI for invalidation */
set_intr_gate(0x31, invalidate_interrupt);
set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
/* IPI for CPU halt */
set_intr_gate(0x40, stop_cpu_interrupt);
set_intr_gate(STOP_CPU_VECTOR, stop_cpu_interrupt);
/* self generated IPI for local APIC timer */
set_intr_gate(0x41, apic_timer_interrupt);
set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
/* IPI for MTRR control */
set_intr_gate(0x50, mtrr_interrupt);
set_intr_gate(MTRR_CHANGE_VECTOR, mtrr_interrupt);
/* IPI vector for APIC spurious interrupts */
set_intr_gate(0xff, spurious_interrupt);
set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
#endif
request_region(0x20,0x20,"pic1");
request_region(0xa0,0x20,"pic2");
......
......@@ -40,8 +40,29 @@ typedef struct {
unsigned int depth; /* Disable depth for nested irq disables */
} irq_desc_t;
/*
* Special IRQ vectors used by the SMP architecture:
*
* (some of the following vectors are 'rare', they might be merged
* into a single vector to save vector space. TLB, reschedule and
* local APIC vectors are performance-critical.)
*/
#define RESCHEDULE_VECTOR 0x30
#define INVALIDATE_TLB_VECTOR 0x31
#define STOP_CPU_VECTOR 0x40
#define LOCAL_TIMER_VECTOR 0x41
#define MTRR_CHANGE_VECTOR 0x50
/*
* First vector available to drivers: (vectors 0x51-0xfe)
*/
#define IRQ0_TRAP_VECTOR 0x51
/*
* This IRQ should never happen, but we print a message nevertheless.
*/
#define SPURIOUS_APIC_VECTOR 0xff
extern irq_desc_t irq_desc[NR_IRQS];
extern int irq_vector[NR_IRQS];
#define IO_APIC_VECTOR(irq) irq_vector[irq]
......@@ -56,17 +77,18 @@ extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
* Interrupt entry/exit code at both C and assembly level
*/
void mask_irq(unsigned int irq);
void unmask_irq(unsigned int irq);
void disable_8259A_irq(unsigned int irq);
int i8259A_irq_pending(unsigned int irq);
void ack_APIC_irq(void);
void setup_IO_APIC(void);
int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
void make_8259A_irq(unsigned int irq);
void send_IPI(int dest, int vector);
void init_pic_mode(void);
void print_IO_APIC(void);
extern void mask_irq(unsigned int irq);
extern void unmask_irq(unsigned int irq);
extern void disable_8259A_irq(unsigned int irq);
extern int i8259A_irq_pending(unsigned int irq);
extern void ack_APIC_irq(void);
extern void setup_IO_APIC(void);
extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
extern void make_8259A_irq(unsigned int irq);
extern void FASTCALL(send_IPI_self(int vector));
extern void smp_send_mtrr(void);
extern void init_pic_mode(void);
extern void print_IO_APIC(void);
extern unsigned long long io_apic_irqs;
......
......@@ -164,6 +164,9 @@
#include <asm/bitops.h>
#include <asm/atomic.h>
#include <asm/hardirq.h>
#include "irq.h"
#define MTRR_VERSION "1.26 (19981001)"
#define TRUE 1
......@@ -612,7 +615,7 @@ static void do_all_cpus (void (*handler) (struct set_mtrr_context *ctxt,
/* Send a message to all other CPUs and wait for them to enter the
barrier */
atomic_set (&undone_count, smp_num_cpus - 1);
smp_message_pass (MSG_ALL_BUT_SELF, MSG_MTRR_CHANGE, 0, 0);
smp_send_mtrr();
/* Wait for it to be done */
timeout = jiffies + JIFFIE_TIMEOUT;
while ( (atomic_read (&undone_count) > 0) &&
......
......@@ -140,11 +140,18 @@ int cpu_idle(void *unused)
current->priority = 0;
current->counter = -100;
while(1) {
if (current_cpu_data.hlt_works_ok && !hlt_counter && !current->need_resched)
if (current_cpu_data.hlt_works_ok && !hlt_counter &&
!current->need_resched)
__asm__("hlt");
/*
* although we are an idle CPU, we do not want to
* get into the scheduler unnecessarily.
*/
if (current->need_resched) {
schedule();
check_pgt_cache();
}
}
}
#endif
......
This diff is collapsed.
......@@ -72,6 +72,8 @@ extern int setup_x86_irq(int, struct irqaction *);
unsigned long cpu_hz; /* Detected as we calibrate the TSC */
cycles_t cacheflush_time;
/* Number of usecs that the last interrupt was delayed */
static int delay_at_last_interrupt;
......@@ -96,7 +98,6 @@ static unsigned long do_fast_gettimeoffset(void)
:"=a" (eax), "=d" (edx));
/* .. relative to previous jiffy (32 bits is enough) */
edx = 0;
eax -= last_tsc_low; /* tsc_low delta */
/*
......@@ -110,11 +111,11 @@ static unsigned long do_fast_gettimeoffset(void)
__asm__("mull %2"
:"=a" (eax), "=d" (edx)
:"r" (fast_gettimeoffset_quotient),
"0" (eax), "1" (edx));
:"g" (fast_gettimeoffset_quotient),
"0" (eax));
/* our adjusted time offset in microseconds */
return edx + delay_at_last_interrupt;
return delay_at_last_interrupt + edx;
}
/* This function must be called with interrupts disabled
......@@ -240,17 +241,26 @@ void do_gettimeofday(struct timeval *tv)
{
extern volatile unsigned long lost_ticks;
unsigned long flags;
unsigned long usec, sec;
read_lock_irqsave(&xtime_lock, flags);
*tv = xtime;
tv->tv_usec += do_gettimeoffset();
if (lost_ticks)
tv->tv_usec += lost_ticks * (1000000/HZ);
usec = do_gettimeoffset();
{
unsigned long lost = lost_ticks;
if (lost)
usec += lost * (1000000 / HZ);
}
sec = xtime.tv_sec;
usec += xtime.tv_usec;
read_unlock_irqrestore(&xtime_lock, flags);
while (tv->tv_usec >= 1000000) {
tv->tv_usec -= 1000000;
tv->tv_sec++;
while (usec >= 1000000) {
usec -= 1000000;
sec++;
}
tv->tv_sec = sec;
tv->tv_usec = usec;
}
void do_settimeofday(struct timeval *tv)
......@@ -377,13 +387,6 @@ static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *reg
else
last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */
}
#if 0
/* As we return to user mode fire off the other CPU schedulers.. this is
basically because we don't yet share IRQ's around. This message is
rigged to be safe on the 386 - basically it's a hack, so don't look
closely for now.. */
smp_message_pass(MSG_ALL_BUT_SELF, MSG_RESCHEDULE, 0L, 0);
#endif
#ifdef CONFIG_MCA
if( MCA_bus ) {
......@@ -639,5 +642,13 @@ __initfunc(void time_init(void))
printk("Detected %ld Hz processor.\n", cpu_hz);
}
}
/*
* Rough estimation for SMP scheduling, this is the number of
* cycles it takes for a fully memory-limited process to flush
* the SMP-local cache.
*/
cacheflush_time = cpu_hz/10000;
setup_x86_irq(0, &irq0);
}
......@@ -45,9 +45,13 @@ SECTIONS
. = ALIGN(4096);
__init_end = .;
. = ALIGN(32);
.data.cacheline_aligned : { *(.data.cacheline_aligned) }
. = ALIGN(4096);
.data.page_aligned : { *(.data.idt) }
__bss_start = .; /* BSS */
.bss : {
*(.bss)
......
......@@ -15,7 +15,7 @@ typedef struct { volatile int counter; } atomic_t;
typedef struct { int counter; } atomic_t;
#endif
#define ATOMIC_INIT(i) { (i) }
#define ATOMIC_INIT(i) ( (atomic_t) { (i) } )
#define atomic_read(v) ((v)->counter)
#define atomic_set(v,i) ((v)->counter = (i))
......
......@@ -458,7 +458,7 @@ __EXTERN_INLINE unsigned int apecs_inb(unsigned long addr)
__EXTERN_INLINE void apecs_outb(unsigned char b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + APECS_IO + 0x00) = w;
......@@ -473,7 +473,7 @@ __EXTERN_INLINE unsigned int apecs_inw(unsigned long addr)
__EXTERN_INLINE void apecs_outw(unsigned short b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + APECS_IO + 0x08) = w;
......
......@@ -326,7 +326,7 @@ __EXTERN_INLINE unsigned int cia_inb(unsigned long addr)
__EXTERN_INLINE void cia_outb(unsigned char b, unsigned long addr)
{
unsigned int w = __kernel_insbl(b, addr & 3);
unsigned long w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + CIA_IO + 0x00) = w;
wmb();
}
......@@ -340,7 +340,7 @@ __EXTERN_INLINE unsigned int cia_inw(unsigned long addr)
__EXTERN_INLINE void cia_outw(unsigned short b, unsigned long addr)
{
unsigned int w = __kernel_inswl(b, addr & 3);
unsigned long w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + CIA_IO + 0x08) = w;
wmb();
}
......
......@@ -262,7 +262,7 @@ __EXTERN_INLINE unsigned int lca_inb(unsigned long addr)
__EXTERN_INLINE void lca_outb(unsigned char b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + LCA_IO + 0x00) = w;
......@@ -277,7 +277,7 @@ __EXTERN_INLINE unsigned int lca_inw(unsigned long addr)
__EXTERN_INLINE void lca_outw(unsigned short b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + LCA_IO + 0x08) = w;
......@@ -340,7 +340,7 @@ __EXTERN_INLINE unsigned long lca_readq(unsigned long addr)
__EXTERN_INLINE void lca_writeb(unsigned char b, unsigned long addr)
{
unsigned long msb;
unsigned int w;
unsigned long w;
if (addr >= (1UL << 24)) {
msb = addr & 0xf8000000;
......@@ -354,7 +354,7 @@ __EXTERN_INLINE void lca_writeb(unsigned char b, unsigned long addr)
__EXTERN_INLINE void lca_writew(unsigned short b, unsigned long addr)
{
unsigned long msb;
unsigned int w;
unsigned long w;
if (addr >= (1UL << 24)) {
msb = addr & 0xf8000000;
......
......@@ -264,7 +264,7 @@ __EXTERN_INLINE void mcpcia_outb(unsigned char b, unsigned long in_addr)
{
unsigned long addr = in_addr & 0xffffffffUL;
unsigned long hose = (in_addr >> 32) & 3;
unsigned int w;
unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + MCPCIA_IO(hose) + 0x00) = w;
......@@ -283,7 +283,7 @@ __EXTERN_INLINE void mcpcia_outw(unsigned short b, unsigned long in_addr)
{
unsigned long addr = in_addr & 0xffffffffUL;
unsigned long hose = (in_addr >> 32) & 3;
unsigned int w;
unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + MCPCIA_IO(hose) + 0x08) = w;
......
......@@ -326,7 +326,7 @@ __EXTERN_INLINE unsigned int pyxis_inb(unsigned long addr)
__EXTERN_INLINE void pyxis_outb(unsigned char b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_insbl(b, addr & 3);
*(vuip) ((addr << 5) + PYXIS_IO + 0x00) = w;
......@@ -341,7 +341,7 @@ __EXTERN_INLINE unsigned int pyxis_inw(unsigned long addr)
__EXTERN_INLINE void pyxis_outw(unsigned short b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + PYXIS_IO + 0x08) = w;
......
......@@ -378,7 +378,7 @@ __EXTERN_INLINE unsigned int t2_inw(unsigned long addr)
__EXTERN_INLINE void t2_outw(unsigned short b, unsigned long addr)
{
unsigned int w;
unsigned long w;
w = __kernel_inswl(b, addr & 3);
*(vuip) ((addr << 5) + T2_IO + 0x08) = w;
......
......@@ -3,7 +3,6 @@
#include <linux/config.h>
#include <asm/system.h>
#include <asm/machvec.h>
/* We don't use IO slowdowns on the Alpha, but.. */
#define __SLOW_DOWN_IO do { } while (0)
......@@ -19,6 +18,7 @@
#endif
#ifdef __KERNEL__
#include <asm/machvec.h>
/*
* We try to avoid hae updates (thus the cache), but when we
......@@ -78,6 +78,7 @@ extern void _sethae (unsigned long addr); /* cached version */
* There are different chipsets to interface the Alpha CPUs to the world.
*/
#ifdef __KERNEL__
#ifdef CONFIG_ALPHA_GENERIC
/* In a generic kernel, we always go through the machine vector. */
......@@ -147,6 +148,7 @@ extern void _sethae (unsigned long addr); /* cached version */
#undef __WANT_IO_DEF
#endif /* GENERIC */
#endif /* __KERNEL__ */
/*
* The convention used for inb/outb etc. is that names starting with
......@@ -172,6 +174,7 @@ extern void _writew(unsigned short b, unsigned long addr);
extern void _writel(unsigned int b, unsigned long addr);
extern void _writeq(unsigned long b, unsigned long addr);
#ifdef __KERNEL__
/*
* The platform header files may define some of these macros to use
* the inlined versions where appropriate. These macros may also be
......@@ -216,6 +219,27 @@ extern void _writeq(unsigned long b, unsigned long addr);
# define outl_p outl
#endif
#else
/* Userspace declarations. */
extern unsigned int inb (unsigned long port);
extern unsigned int inw (unsigned long port);
extern unsigned int inl (unsigned long port);
extern void outb (unsigned char b,unsigned long port);
extern void outw (unsigned short w,unsigned long port);
extern void outl (unsigned int l,unsigned long port);
extern unsigned long readb(unsigned long addr);
extern unsigned long readw(unsigned long addr);
extern unsigned long readl(unsigned long addr);
extern void writeb(unsigned char b, unsigned long addr);
extern void writew(unsigned short b, unsigned long addr);
extern void writel(unsigned int b, unsigned long addr);
#endif /* __KERNEL__ */
#ifdef __KERNEL__
/*
* The "address" in IO memory space is not clearly either an integer or a
* pointer. We will accept both, thus the casts.
......@@ -257,8 +281,6 @@ static inline void iounmap(void *addr)
# define writeq(v,a) _writeq((v),(unsigned long)(a))
#endif
#ifdef __KERNEL__
/*
* String version of IO memory access ops:
*/
......
......@@ -117,6 +117,7 @@ extern inline void disable_bh(int nr)
{
bh_mask &= ~(1 << nr);
atomic_inc(&bh_mask_count[nr]);
synchronize_bh();
}
extern inline void enable_bh(int nr)
......
......@@ -11,4 +11,7 @@
#define __FINIT .previous
#define __INITDATA .section ".data.init",#alloc,#write
#define __cacheline_aligned __attribute__ \
((__section__ (".data.cacheline_aligned")))
#endif
......@@ -185,10 +185,6 @@ extern inline int cpu_logical_map(int cpu)
extern void smp_callin(void);
extern void smp_boot_cpus(void);
extern void smp_store_cpu_info(int id); /* Store per CPU info (like the initial udelay numbers) */
extern void smp_message_pass(int target, int msg, unsigned long data, int wait);
extern volatile unsigned long smp_proc_in_lock[NR_CPUS]; /* for computing process time */
extern volatile int smp_process_available;
/*
* APIC handlers: Note according to the Intel specification update
......@@ -237,9 +233,7 @@ extern __inline int hard_smp_processor_id(void)
* processes are run.
*/
#define PROC_CHANGE_PENALTY 10 /* Schedule penalty */
#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */
#define SMP_FROM_INT 1
#define SMP_FROM_SYSCALL 2
#endif
#endif
torvalds@penguin.transmeta.com
\ No newline at end of file
......@@ -12,4 +12,22 @@
(1000000/CLOCK_TICK_FACTOR) / (CLOCK_TICK_RATE/CLOCK_TICK_FACTOR)) \
<< (SHIFT_SCALE-SHIFT_HZ)) / HZ)
/*
* Standard way to access the cycle counter on i586+ CPUs.
* Currently only used on SMP.
*/
typedef unsigned long long cycles_t;
extern cycles_t cacheflush_time;
static inline cycles_t get_cycles (void)
{
cycles_t value;
__asm__("rdtsc"
:"=a" (*(((int *)&value)+0)),
"=d" (*(((int *)&value)+1)));
return value;
}
#endif
......@@ -11,6 +11,7 @@ extern unsigned long event;
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/timex.h>
#include <asm/system.h>
#include <asm/semaphore.h>
......@@ -219,6 +220,7 @@ struct task_struct {
/* various fields */
long counter;
long priority;
cycles_t avg_slice;
/* SMP and runqueue state */
int has_cpu;
int processor;
......@@ -336,7 +338,7 @@ struct task_struct {
*/
#define INIT_TASK \
/* state etc */ { 0,0,0,KERNEL_DS,&default_exec_domain,0, \
/* counter */ DEF_PRIORITY,DEF_PRIORITY, \
/* counter */ DEF_PRIORITY,DEF_PRIORITY,0, \
/* SMP */ 0,0,0,-1, \
/* schedlink */ &init_task,&init_task, &init_task, &init_task, \
/* binfmt */ NULL, \
......
......@@ -11,11 +11,21 @@
#include <asm/smp.h>
/*
* main IPI interface, handles INIT, TLB flush, STOP, etc. (defined in asm header):
*
* extern void smp_message_pass(int target, int msg, unsigned long data, int wait);
* main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
* (defined in asm header):
*/
/*
* stops all CPUs but the current one:
*/
extern void smp_send_stop(void);
/*
* sends a 'reschedule' event to another CPU:
*/
extern void FASTCALL(smp_send_reschedule(int cpu));
/*
* Boot processor call to load the other CPU's
*/
......@@ -61,7 +71,6 @@ extern volatile int smp_msg_id;
#define smp_num_cpus 1
#define smp_processor_id() 0
#define hard_smp_processor_id() 0
#define smp_message_pass(t,m,d,w)
#define smp_threads_ready 1
#define kernel_lock()
#define cpu_logical_map(cpu) 0
......
......@@ -1177,6 +1177,7 @@ asmlinkage void __init start_kernel(void)
*/
smp_init();
kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
current->need_resched = 1;
cpu_idle(NULL);
}
......
......@@ -50,7 +50,7 @@ NORET_TYPE void panic(const char * fmt, ...)
unblank_console();
#ifdef __SMP__
smp_message_pass(MSG_ALL_BUT_SELF, MSG_STOP_CPU, 0, 0);
smp_send_stop();
#endif
if (panic_timeout > 0)
{
......
......@@ -12,6 +12,7 @@
* 1998-12-24 Fixed an xtime SMP race (we need the xtime_lock rw spinlock to
* serialize accesses to xtime/lost_ticks).
* Copyright (C) 1998 Andrea Arcangeli
* 1998-12-28 Implemented better SMP scheduling by Ingo Molnar
*/
/*
......@@ -96,47 +97,110 @@ struct kernel_stat kstat = { 0 };
void scheduling_functions_start_here(void) { }
static inline void reschedule_idle(struct task_struct * p)
#ifdef __SMP__
static void reschedule_idle_slow(struct task_struct * p)
{
/*
* (see reschedule_idle() for an explanation first ...)
*
* Pass #2
*
* We try to find another (idle) CPU for this woken-up process.
*
* On SMP, we mostly try to see if the CPU the task used
* to run on is idle.. but we will use another idle CPU too,
* at this point we already know that this CPU is not
* willing to reschedule in the near future.
*
* An idle CPU is definitely wasted, especially if this CPU is
* running long-timeslice processes. The following algorithm is
* pretty good at finding the best idle CPU to send this process
* to.
*
* [We can try to preempt low-priority processes on other CPUs in
* 2.3. Also we can try to use the avg_slice value to predict
* 'likely reschedule' events even on other CPUs.]
*/
int best_cpu = p->processor, this_cpu = smp_processor_id();
struct task_struct **idle = task, *tsk, *target_tsk;
int i = smp_num_cpus;
target_tsk = NULL;
do {
tsk = *idle;
idle++;
if (tsk->has_cpu) {
if (tsk->processor == this_cpu)
continue;
target_tsk = tsk;
if (tsk->processor == best_cpu) {
/*
* For SMP, we try to see if the CPU the task used
* to run on is idle..
* bingo, we couldn't get a better
* CPU, activate it.
*/
#if 0
goto send; /* this one helps GCC ... */
}
}
} while (--i > 0);
/*
* Disable this for now. Ingo has some interesting
* code that looks too complex, and I have some ideas,
* but in the meantime.. One problem is that "wakeup()"
* can be (and is) called before we've even initialized
* SMP completely, so..
* found any idle CPU?
*/
#ifdef __SMP__
int want_cpu = p->processor;
if (target_tsk) {
send:
target_tsk->need_resched = 1;
smp_send_reschedule(target_tsk->processor);
return;
}
}
#endif /* __SMP__ */
static inline void reschedule_idle(struct task_struct * p)
{
if (p->policy != SCHED_OTHER || p->counter > current->counter + 3) {
current->need_resched = 1;
return;
}
#ifdef __SMP__
/*
* Don't even try to find another CPU for us if the task
* ran on this one before..
* ("wakeup()" should not be called before we've initialized
* SMP completely. [Linus, is there any exception to this?]
* Basically a not-yet initialized SMP subsystem can be
* considered as a not-yet working scheduler, simply don't use
* it before it's up and running ...)
*
* SMP rescheduling is done in 2 passes:
* - pass #1: faster: 'quick decisions'
* - pass #2: slower: 'lets try and find another CPU'
*/
if (want_cpu != smp_processor_id()) {
struct task_struct **idle = task;
int i = smp_num_cpus;
do {
struct task_struct *tsk = *idle;
idle++;
/* Something like this.. */
if (tsk->has_cpu && tsk->processor == want_cpu) {
tsk->need_resched = 1;
smp_send_reschedule(want_cpu);
/*
* Pass #1
*
* There are two metrics here:
*
* first, a 'cutoff' interval, currently ~250 usecs on
* x86 CPUs. If the current process has longer average
* timeslices than this, then we utilize the idle CPU.
*
* second, if the wakeup comes from a process context,
* then the two processes are 'related'. (they form a
* 'gang')
*
* An idle CPU is almost always a bad thing, thus we skip
* the idle-CPU utilization only if both these conditions
* are true. (ie. a 'process-gang' rescheduling with rather
* high frequency should stay on the same CPU).
*
* [We can switch to something more finegrained in 2.3.]
*/
if ((current->avg_slice < cacheflush_time) && !in_interrupt())
return;
}
} while (--i > 0);
}
#endif
#endif
if (p->policy != SCHED_OTHER || p->counter > current->counter + 3)
current->need_resched = 1;
reschedule_idle_slow(p);
#endif /* __SMP__ */
}
/*
......@@ -244,6 +308,8 @@ static void process_timeout(unsigned long __data)
wake_up_process(p);
}
int _PROC_CHANGE_PENALTY = 13;
/*
* This is the function that decides how desirable a process is..
* You can weigh different processes against each other depending
......@@ -488,6 +554,63 @@ signed long schedule_timeout(signed long timeout)
return timeout < 0 ? 0 : timeout;
}
/*
* This one aligns per-CPU data on cacheline boundaries.
*/
static union {
struct schedule_data {
struct task_struct * prev;
long prevstate;
cycles_t last_schedule;
} schedule_data;
char __pad [L1_CACHE_BYTES];
} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
static inline void __schedule_tail (void)
{
#ifdef __SMP__
struct schedule_data * sched_data;
/*
* We might have switched CPUs:
*/
sched_data = & aligned_data[smp_processor_id()].schedule_data;
/*
* Subtle. In the rare event that we got a wakeup to 'prev' just
* during the reschedule (this is possible, the scheduler is pretty
* parallel), we should do another reschedule in the next task's
* context. schedule() will do the right thing next time around.
* this is equivalent to 'delaying' the wakeup until the reschedule
* has finished.
*/
if (sched_data->prev->state != sched_data->prevstate)
current->need_resched = 1;
/*
* Release the previous process ...
*
* We have dropped all locks, and we must make sure that we
* only mark the previous process as no longer having a CPU
* after all other state has been seen by other CPU's. Thus
* the memory barrier!
*/
mb();
sched_data->prev->has_cpu = 0;
#endif /* __SMP__ */
}
/*
* schedule_tail() is getting called from the fork return path. This
* cleans up all remaining scheduler things, without impacting the
* common case.
*/
void schedule_tail (void)
{
__schedule_tail();
}
/*
* 'schedule()' is the scheduler function. It's a very simple and nice
* scheduler: it's not perfect, but certainly works for most things.
......@@ -500,11 +623,18 @@ signed long schedule_timeout(signed long timeout)
*/
asmlinkage void schedule(void)
{
struct schedule_data * sched_data;
struct task_struct * prev, * next;
int this_cpu;
prev = current;
this_cpu = prev->processor;
/*
* 'sched_data' is protected by the fact that we can run
* only one process per CPU.
*/
sched_data = & aligned_data[this_cpu].schedule_data;
if (in_interrupt())
goto scheduling_in_interrupt;
release_kernel_lock(prev, this_cpu);
......@@ -519,6 +649,7 @@ asmlinkage void schedule(void)
/* move an exhausted RR process to be last.. */
prev->need_resched = 0;
if (!prev->counter && prev->policy == SCHED_RR) {
prev->counter = prev->priority;
move_last_runqueue(prev);
......@@ -534,6 +665,9 @@ asmlinkage void schedule(void)
del_from_runqueue(prev);
case TASK_RUNNING:
}
sched_data->prevstate = prev->state;
{
struct task_struct * p = init_task.next_run;
/*
......@@ -580,27 +714,49 @@ asmlinkage void schedule(void)
}
}
/*
* maintain the per-process 'average timeslice' value.
* (this has to be recalculated even if we reschedule to
* the same process) Currently this is only used on SMP:
*/
#ifdef __SMP__
next->has_cpu = 1;
#endif
{
cycles_t t, this_slice;
t = get_cycles();
this_slice = t - sched_data->last_schedule;
sched_data->last_schedule = t;
/*
* Simple, exponentially fading average calculation:
*/
prev->avg_slice = this_slice + prev->avg_slice;
prev->avg_slice >>= 1;
}
/*
* We drop the scheduler lock early (it's a global spinlock),
* thus we have to lock the previous process from getting
* rescheduled during switch_to().
*/
prev->has_cpu = 1;
next->has_cpu = 1;
next->processor = this_cpu;
spin_unlock(&scheduler_lock);
#endif /* __SMP__ */
if (prev != next) {
#ifdef __SMP__
next->processor = this_cpu;
sched_data->prev = prev;
#endif
kstat.context_swtch++;
get_mmu_context(next);
switch_to(prev,next);
}
spin_unlock(&scheduler_lock);
__schedule_tail();
}
/*
* At this point "prev" is "current", as we just
* switched into it (from an even more "previous"
* prev)
*/
reacquire_kernel_lock(prev);
reacquire_kernel_lock(current);
return;
scheduling_in_interrupt:
......@@ -608,7 +764,6 @@ asmlinkage void schedule(void)
*(int *)0 = 0;
}
rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment