Commit fb796d31 authored by Linus Torvalds

Merge http://linux-sound.bkbits.net/linux-sound

into home.transmeta.com:/home/torvalds/v2.5/linux
parents d9c28b28 3fddff46
......@@ -6,6 +6,7 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
#include "cpu.h"
......@@ -13,6 +14,11 @@ static int disable_x86_serial_nr __initdata = 1;
static int disable_P4_HT __initdata = 0;
extern int trap_init_f00f_bug(void);
#ifdef INTEL_MOVSL
struct movsl_mask movsl_mask; /* alignment at which movsl is preferred for
bulk memory copies */
#endif
/*
* Early probe support logic for ppro memory erratum #50
*
......@@ -348,6 +354,25 @@ static void __init init_intel(struct cpuinfo_x86 *c)
/* Work around errata */
Intel_errata_workarounds(c);
#ifdef INTEL_MOVSL
/*
* Set up the preferred alignment for movsl bulk memory moves
*/
switch (c->x86) {
case 4: /* 486: untested */
break;
case 5: /* Old Pentia: untested */
break;
case 6: /* PII/PIII only like movsl with 8-byte alignment */
movsl_mask.mask = 7;
break;
case 15: /* P4 is OK down to 8-byte alignment */
movsl_mask.mask = 7;
break;
}
#endif
}
......
......@@ -21,8 +21,13 @@ struct dmi_header
u16 handle;
};
#undef DMI_DEBUG
#ifdef DMI_DEBUG
#define dmi_printk(x) printk x
#else
#define dmi_printk(x)
//#define dmi_printk(x) printk x
#endif
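An illustrative call site for the macro above (base and num are hypothetical variables): the doubled parentheses pass an entire argument list through one macro parameter, so the whole printk call compiles away when DMI_DEBUG is undefined. This idiom predates C99 variadic macros.

/* Hypothetical call site, showing the double-parenthesis idiom */
dmi_printk(("DMI table at 0x%08x, %d structures\n", base, num));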
static char * __init dmi_string(struct dmi_header *dm, u8 s)
{
......@@ -832,7 +837,9 @@ static __init void dmi_check_blacklist(void)
static void __init dmi_decode(struct dmi_header *dm)
{
#ifdef DMI_DEBUG
u8 *data = (u8 *)dm;
#endif
switch(dm->type)
{
......
......@@ -116,8 +116,10 @@ EXPORT_SYMBOL(strncpy_from_user);
EXPORT_SYMBOL(__strncpy_from_user);
EXPORT_SYMBOL(clear_user);
EXPORT_SYMBOL(__clear_user);
EXPORT_SYMBOL(__generic_copy_from_user);
EXPORT_SYMBOL(__generic_copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(__copy_from_user);
EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(__copy_to_user);
EXPORT_SYMBOL(strnlen_user);
EXPORT_SYMBOL(pci_alloc_consistent);
......
......@@ -328,20 +328,21 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
irq_desc_t *desc = irq_desc + irq;
struct irqaction * action;
unsigned int status;
long esp;
irq_enter();
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/* Debugging check for stack overflow: is there less than 1KB free? */
__asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (8191));
if (unlikely(esp < (sizeof(struct task_struct) + 1024))) {
extern void show_stack(unsigned long *);
printk("do_IRQ: stack overflow: %ld\n",
esp - sizeof(struct task_struct));
__asm__ __volatile__("movl %%esp,%0" : "=r" (esp));
show_stack((void *)esp);
{
long esp;
__asm__ __volatile__("andl %%esp,%0" :
"=r" (esp) : "0" (8191));
if (unlikely(esp < (sizeof(struct task_struct) + 1024))) {
printk("do_IRQ: stack overflow: %ld\n",
esp - sizeof(struct task_struct));
dump_stack();
}
}
#endif
kstat.irqs[cpu][irq]++;
......
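The overflow check above relies on the i386 layout of this era: each task's 8 KiB kernel stack is aligned to its size, with the task_struct at the bottom, so masking %esp with 8191 yields the offset above the stack base. A minimal userspace sketch of the same arithmetic (TASK_STRUCT_SIZE is a made-up placeholder, not the real sizeof):

#include <stdio.h>

#define STACK_SIZE        8192	/* 8 KiB per-task kernel stack, size-aligned */
#define TASK_STRUCT_SIZE  1408	/* hypothetical sizeof(struct task_struct) */

static int stack_overflowing(unsigned long esp)
{
	/* same effect as "andl %%esp,%0" with %0 preloaded to 8191 */
	unsigned long offset = esp & (STACK_SIZE - 1);

	/* warn if less than 1 KiB remains above the task_struct */
	return offset < TASK_STRUCT_SIZE + 1024;
}

int main(void)
{
	printf("%d\n", stack_overflowing(0xc0341f80));	/* plenty of room: 0 */
	printf("%d\n", stack_overflowing(0xc0340400));	/* nearly gone: 1 */
	return 0;
}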
......@@ -9,58 +9,20 @@
#include <asm/uaccess.h>
#include <asm/mmx.h>
#ifdef CONFIG_X86_USE_3DNOW_AND_WORKS
unsigned long
__generic_copy_to_user(void *to, const void *from, unsigned long n)
#ifdef INTEL_MOVSL
static inline int movsl_is_ok(const void *a1, const void *a2, unsigned long n)
{
if (access_ok(VERIFY_WRITE, to, n))
{
if(n<512)
__copy_user(to,from,n);
else
mmx_copy_user(to,from,n);
}
return n;
}
unsigned long
__generic_copy_from_user(void *to, const void *from, unsigned long n)
{
if (access_ok(VERIFY_READ, from, n))
{
if(n<512)
__copy_user_zeroing(to,from,n);
else
mmx_copy_user_zeroing(to, from, n);
}
else
memset(to, 0, n);
return n;
if (n < 64)
return 1;
if ((((const long)a1 ^ (const long)a2) & movsl_mask.mask) == 0)
return 1;
return 0;
}
#else
unsigned long
__generic_copy_to_user(void *to, const void *from, unsigned long n)
static inline int movsl_is_ok(const void *a1, const void *a2, unsigned long n)
{
prefetch(from);
if (access_ok(VERIFY_WRITE, to, n))
__copy_user(to,from,n);
return n;
}
unsigned long
__generic_copy_from_user(void *to, const void *from, unsigned long n)
{
prefetchw(to);
if (access_ok(VERIFY_READ, from, n))
__copy_user_zeroing(to,from,n);
else
memset(to, 0, n);
return n;
return 1;
}
#endif
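A small userspace sketch of the test in movsl_is_ok() above: XORing the two addresses and masking keeps only the low bits in which they differ, so a zero result means source and destination share the same alignment modulo mask+1 and the plain rep;movsl path will perform well. The mask value 7 mirrors what init_intel() selects for PII/PIII/P4.

#include <stdio.h>

static int movsl_ok(unsigned long a1, unsigned long a2,
		    unsigned long n, unsigned long mask)
{
	if (n < 64)
		return 1;	/* short copy: alignment overhead dominates */
	return ((a1 ^ a2) & mask) == 0;
}

int main(void)
{
	printf("%d\n", movsl_ok(0x1008, 0x2010, 256, 7));  /* co-aligned: 1 */
	printf("%d\n", movsl_ok(0x1009, 0x2010, 256, 7));  /* mismatched: 0 */
	return 0;
}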
/*
......@@ -188,3 +150,313 @@ long strnlen_user(const char *s, long n)
:"cc");
return res & mask;
}
#ifdef INTEL_MOVSL
static unsigned long
__copy_user_intel(void *to, const void *from,unsigned long size)
{
int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
"0: movl 32(%4), %%eax\n"
" cmpl $67, %0\n"
" jbe 1f\n"
" movl 64(%4), %%eax\n"
" .align 2,0x90\n"
"1: movl 0(%4), %%eax\n"
" movl 4(%4), %%edx\n"
"2: movl %%eax, 0(%3)\n"
"21: movl %%edx, 4(%3)\n"
" movl 8(%4), %%eax\n"
" movl 12(%4),%%edx\n"
"3: movl %%eax, 8(%3)\n"
"31: movl %%edx, 12(%3)\n"
" movl 16(%4), %%eax\n"
" movl 20(%4), %%edx\n"
"4: movl %%eax, 16(%3)\n"
"41: movl %%edx, 20(%3)\n"
" movl 24(%4), %%eax\n"
" movl 28(%4), %%edx\n"
"10: movl %%eax, 24(%3)\n"
"51: movl %%edx, 28(%3)\n"
" movl 32(%4), %%eax\n"
" movl 36(%4), %%edx\n"
"11: movl %%eax, 32(%3)\n"
"61: movl %%edx, 36(%3)\n"
" movl 40(%4), %%eax\n"
" movl 44(%4), %%edx\n"
"12: movl %%eax, 40(%3)\n"
"71: movl %%edx, 44(%3)\n"
" movl 48(%4), %%eax\n"
" movl 52(%4), %%edx\n"
"13: movl %%eax, 48(%3)\n"
"81: movl %%edx, 52(%3)\n"
" movl 56(%4), %%eax\n"
" movl 60(%4), %%edx\n"
"14: movl %%eax, 56(%3)\n"
"91: movl %%edx, 60(%3)\n"
" addl $-64, %0\n"
" addl $64, %4\n"
" addl $64, %3\n"
" cmpl $63, %0\n"
" ja 0b\n"
"5: movl %0, %%eax\n"
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
"6: rep; movsl\n"
" movl %%eax, %0\n"
"7: rep; movsb\n"
"8:\n"
".section .fixup,\"ax\"\n"
"9: lea 0(%%eax,%0,4),%0\n"
" jmp 8b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 2b,8b\n"
" .long 21b,8b\n"
" .long 3b,8b\n"
" .long 31b,8b\n"
" .long 4b,8b\n"
" .long 41b,8b\n"
" .long 10b,8b\n"
" .long 51b,8b\n"
" .long 11b,8b\n"
" .long 61b,8b\n"
" .long 12b,8b\n"
" .long 71b,8b\n"
" .long 13b,8b\n"
" .long 81b,8b\n"
" .long 14b,8b\n"
" .long 91b,8b\n"
" .long 6b,9b\n"
" .long 7b,8b\n"
".previous"
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
return size;
}
static unsigned long
__copy_user_zeroing_intel(void *to, const void *from, unsigned long size)
{
int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
"0: movl 32(%4), %%eax\n"
" cmpl $67, %0\n"
" jbe 2f\n"
"1: movl 64(%4), %%eax\n"
" .align 2,0x90\n"
"2: movl 0(%4), %%eax\n"
"21: movl 4(%4), %%edx\n"
" movl %%eax, 0(%3)\n"
" movl %%edx, 4(%3)\n"
"3: movl 8(%4), %%eax\n"
"31: movl 12(%4),%%edx\n"
" movl %%eax, 8(%3)\n"
" movl %%edx, 12(%3)\n"
"4: movl 16(%4), %%eax\n"
"41: movl 20(%4), %%edx\n"
" movl %%eax, 16(%3)\n"
" movl %%edx, 20(%3)\n"
"10: movl 24(%4), %%eax\n"
"51: movl 28(%4), %%edx\n"
" movl %%eax, 24(%3)\n"
" movl %%edx, 28(%3)\n"
"11: movl 32(%4), %%eax\n"
"61: movl 36(%4), %%edx\n"
" movl %%eax, 32(%3)\n"
" movl %%edx, 36(%3)\n"
"12: movl 40(%4), %%eax\n"
"71: movl 44(%4), %%edx\n"
" movl %%eax, 40(%3)\n"
" movl %%edx, 44(%3)\n"
"13: movl 48(%4), %%eax\n"
"81: movl 52(%4), %%edx\n"
" movl %%eax, 48(%3)\n"
" movl %%edx, 52(%3)\n"
"14: movl 56(%4), %%eax\n"
"91: movl 60(%4), %%edx\n"
" movl %%eax, 56(%3)\n"
" movl %%edx, 60(%3)\n"
" addl $-64, %0\n"
" addl $64, %4\n"
" addl $64, %3\n"
" cmpl $63, %0\n"
" ja 0b\n"
"5: movl %0, %%eax\n"
" shrl $2, %0\n"
" andl $3, %%eax\n"
" cld\n"
"6: rep; movsl\n"
" movl %%eax,%0\n"
"7: rep; movsb\n"
"8:\n"
".section .fixup,\"ax\"\n"
"9: lea 0(%%eax,%0,4),%0\n"
"16: pushl %0\n"
" pushl %%eax\n"
" xorl %%eax,%%eax\n"
" rep; stosb\n"
" popl %%eax\n"
" popl %0\n"
" jmp 8b\n"
".previous\n"
".section __ex_table,\"a\"\n"
" .align 4\n"
" .long 0b,16b\n"
" .long 1b,16b\n"
" .long 2b,16b\n"
" .long 21b,16b\n"
" .long 3b,16b\n"
" .long 31b,16b\n"
" .long 4b,16b\n"
" .long 41b,16b\n"
" .long 10b,16b\n"
" .long 51b,16b\n"
" .long 11b,16b\n"
" .long 61b,16b\n"
" .long 12b,16b\n"
" .long 71b,16b\n"
" .long 13b,16b\n"
" .long 81b,16b\n"
" .long 14b,16b\n"
" .long 91b,16b\n"
" .long 6b,9b\n"
" .long 7b,16b\n"
".previous"
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
return size;
}
#else /* INTEL_MOVSL */
/*
* Leave these declared but undefined. There should be no references to
* them.
*/
unsigned long
__copy_user_zeroing_intel(void *to, const void *from, unsigned long size);
unsigned long
__copy_user_intel(void *to, const void *from,unsigned long size);
#endif /* INTEL_MOVSL */
/* Generic arbitrary sized copy. */
#define __copy_user(to,from,size) \
do { \
int __d0, __d1, __d2; \
__asm__ __volatile__( \
" cmp $7,%0\n" \
" jbe 1f\n" \
" movl %1,%0\n" \
" negl %0\n" \
" andl $7,%0\n" \
" subl %0,%3\n" \
"4: rep; movsb\n" \
" movl %3,%0\n" \
" shrl $2,%0\n" \
" andl $3,%3\n" \
" .align 2,0x90\n" \
"0: rep; movsl\n" \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"5: addl %3,%0\n" \
" jmp 2b\n" \
"3: lea 0(%3,%0,4),%0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 4b,5b\n" \
" .long 0b,3b\n" \
" .long 1b,2b\n" \
".previous" \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
: "memory"); \
} while (0)
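A sketch of the prologue arithmetic in __copy_user() above, in plain C: negating the destination address and masking with 7 gives the byte count needed to reach 8-byte alignment, after which the remainder splits into dwords for rep;movsl and tail bytes for rep;movsb.

#include <stdio.h>

int main(void)
{
	unsigned long to = 0x100d;	/* arbitrary misaligned destination */
	unsigned long size = 100;

	unsigned long head = (0UL - to) & 7;	/* "negl %0; andl $7,%0" */
	size -= head;				/* "subl %0,%3" */
	unsigned long dwords = size >> 2;	/* "shrl $2": rep movsl count */
	unsigned long tail = size & 3;		/* "andl $3": rep movsb count */

	/* head=3 dwords=24 tail=1: 3 + 24*4 + 1 == 100 bytes */
	printf("head=%lu dwords=%lu tail=%lu\n", head, dwords, tail);
	return 0;
}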
#define __copy_user_zeroing(to,from,size) \
do { \
int __d0, __d1, __d2; \
__asm__ __volatile__( \
" cmp $7,%0\n" \
" jbe 1f\n" \
" movl %1,%0\n" \
" negl %0\n" \
" andl $7,%0\n" \
" subl %0,%3\n" \
"4: rep; movsb\n" \
" movl %3,%0\n" \
" shrl $2,%0\n" \
" andl $3,%3\n" \
" .align 2,0x90\n" \
"0: rep; movsl\n" \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"5: addl %3,%0\n" \
" jmp 6f\n" \
"3: lea 0(%3,%0,4),%0\n" \
"6: pushl %0\n" \
" pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" rep; stosb\n" \
" popl %%eax\n" \
" popl %0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 4b,5b\n" \
" .long 0b,3b\n" \
" .long 1b,6b\n" \
".previous" \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
: "memory"); \
} while (0)
unsigned long __copy_to_user(void *to, const void *from, unsigned long n)
{
if (movsl_is_ok(to, from, n))
__copy_user(to, from, n);
else
n = __copy_user_intel(to, from, n);
return n;
}
unsigned long __copy_from_user(void *to, const void *from, unsigned long n)
{
if (movsl_is_ok(to, from, n))
__copy_user_zeroing(to, from, n);
else
n = __copy_user_zeroing_intel(to, from, n);
return n;
}
unsigned long copy_to_user(void *to, const void *from, unsigned long n)
{
prefetch(from);
if (access_ok(VERIFY_WRITE, to, n))
n = __copy_to_user(to, from, n);
return n;
}
unsigned long copy_from_user(void *to, const void *from, unsigned long n)
{
prefetchw(to);
if (access_ok(VERIFY_READ, from, n))
n = __copy_from_user(to, from, n);
return n;
}
......@@ -19,10 +19,12 @@ void kunmap(struct page *page)
}
/*
* The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
* gives a more generic (and caching) interface. But kmap_atomic can
* be used in IRQ contexts, so in some (very limited) cases we need
* it.
* kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
* no global lock is needed and because the kmap code must perform a global TLB
* invalidation when the kmap pool wraps.
*
* However when holding an atomic kmap it is not legal to sleep, so atomic
* kmaps are appropriate for short, tight code paths only.
*/
void *kmap_atomic(struct page *page, enum km_type type)
{
......
......@@ -118,7 +118,6 @@ deadline_find_hash(struct deadline_data *dd, sector_t offset)
while ((entry = next) != hash_list) {
next = entry->next;
prefetch(next);
drq = list_entry_hash(entry);
......@@ -193,8 +192,6 @@ deadline_merge(request_queue_t *q, struct list_head **insert, struct bio *bio)
while ((entry = entry->prev) != sort_list) {
__rq = list_entry_rq(entry);
prefetch(entry->prev);
BUG_ON(__rq->flags & REQ_STARTED);
if (!(__rq->flags & REQ_CMD))
......@@ -302,8 +299,6 @@ static void deadline_move_requests(struct deadline_data *dd, struct request *rq)
struct list_head *nxt = rq->queuelist.next;
int this_rq_cost;
prefetch(nxt);
/*
* take it off the sort and fifo list, move
* to dispatch queue
......
......@@ -37,6 +37,13 @@
#include <asm/uaccess.h>
/* Command group 3 is reserved and should never be used. */
const unsigned char scsi_command_size[8] =
{
6, 10, 10, 12,
16, 12, 10, 10
};
#define BLK_DEFAULT_TIMEOUT (60 * HZ)
int blk_do_rq(request_queue_t *q, struct block_device *bdev, struct request *rq)
......@@ -468,3 +475,4 @@ int scsi_cmd_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long ar
}
EXPORT_SYMBOL(scsi_cmd_ioctl);
EXPORT_SYMBOL(scsi_command_size);
......@@ -21,6 +21,7 @@
#include <linux/slab.h>
#include <linux/miscdevice.h>
#include <linux/spinlock.h>
#include <linux/version.h>
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,41)
#include <linux/tqueue.h>
#else
......
......@@ -1803,11 +1803,11 @@ static void vortex_tx_timeout(struct net_device *dev)
dev->name, inb(ioaddr + TxStatus),
inw(ioaddr + EL3_STATUS));
EL3WINDOW(4);
printk(KERN_ERR " diagnostics: net %04x media %04x dma %08lx fifo %04x\n",
inw(ioaddr + Wn4_NetDiag),
inw(ioaddr + Wn4_Media),
inl(ioaddr + PktStatus),
inw(ioaddr + Wn4_FIFODiag));
printk(KERN_ERR " diagnostics: net %04x media %04x dma %08x fifo %04x\n",
(unsigned)inw(ioaddr + Wn4_NetDiag),
(unsigned)inw(ioaddr + Wn4_Media),
(unsigned)inl(ioaddr + PktStatus),
(unsigned)inw(ioaddr + Wn4_FIFODiag));
/* Slight code bloat to be user friendly. */
if ((inb(ioaddr + TxStatus) & 0x88) == 0x88)
printk(KERN_ERR "%s: Transmitter encountered 16 collisions --"
......@@ -2643,8 +2643,8 @@ dump_tx_ring(struct net_device *dev)
vp->full_bus_master_tx,
vp->dirty_tx, vp->dirty_tx % TX_RING_SIZE,
vp->cur_tx, vp->cur_tx % TX_RING_SIZE);
printk(KERN_ERR " Transmit list %8.8lx vs. %p.\n",
inl(ioaddr + DownListPtr),
printk(KERN_ERR " Transmit list %8.8x vs. %p.\n",
(unsigned)inl(ioaddr + DownListPtr),
&vp->tx_ring[vp->dirty_tx % TX_RING_SIZE]);
issue_and_wait(dev, DownStall);
for (i = 0; i < TX_RING_SIZE; i++) {
......
......@@ -123,12 +123,6 @@ struct scsi_host_sg_pool scsi_sg_pools[SG_MEMPOOL_NR] = {
*/
unsigned long scsi_pid;
Scsi_Cmnd *last_cmnd;
/* Command group 3 is reserved and should never be used. */
const unsigned char scsi_command_size[8] =
{
6, 10, 10, 12,
16, 12, 10, 10
};
static unsigned long serial_number;
struct softscsi_data {
......
......@@ -751,15 +751,9 @@ static int scsi_init_io(Scsi_Cmnd *SCpnt)
int count, gfp_mask;
/*
* non-sg block request. FIXME: check bouncing for isa hosts!
* if this is a rq->data based REQ_BLOCK_PC, setup for a non-sg xfer
*/
if ((req->flags & REQ_BLOCK_PC) && !req->bio) {
/*
* FIXME: isa bouncing
*/
if (SCpnt->host->unchecked_isa_dma)
goto fail;
SCpnt->request_bufflen = req->data_len;
SCpnt->request_buffer = req->data;
req->buffer = req->data;
......@@ -816,7 +810,6 @@ static int scsi_init_io(Scsi_Cmnd *SCpnt)
/*
* kill it. there should be no leftover blocks in this request
*/
fail:
SCpnt = scsi_end_request(SCpnt, 0, req->nr_sectors);
BUG_ON(SCpnt);
out:
......
......@@ -39,7 +39,6 @@ EXPORT_SYMBOL(scsi_partsize);
EXPORT_SYMBOL(scsi_bios_ptable);
EXPORT_SYMBOL(scsi_allocate_device);
EXPORT_SYMBOL(scsi_do_cmd);
EXPORT_SYMBOL(scsi_command_size);
EXPORT_SYMBOL(scsi_ioctl);
EXPORT_SYMBOL(print_command);
EXPORT_SYMBOL(print_sense);
......
......@@ -608,7 +608,7 @@ void kick_iocb(struct kiocb *iocb)
}
if (!kiocbTryKick(iocb)) {
long flags;
unsigned long flags;
spin_lock_irqsave(&ctx->ctx_lock, flags);
list_add_tail(&iocb->ki_run_list, &ctx->run_list);
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
......
......@@ -120,7 +120,7 @@ blkdev_direct_IO(int rw, struct file *file, const struct iovec *iov,
{
struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
return generic_direct_IO(rw, inode, iov, offset,
return generic_direct_IO(rw, inode, inode->i_bdev, iov, offset,
nr_segs, blkdev_get_blocks);
}
......@@ -308,6 +308,7 @@ struct block_device *bdget(dev_t dev)
new_bdev->bd_dev = dev;
new_bdev->bd_contains = NULL;
new_bdev->bd_inode = inode;
new_bdev->bd_block_size = (1 << inode->i_blkbits);
new_bdev->bd_part_count = 0;
new_bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
......
......@@ -29,30 +29,67 @@
*/
#define DIO_PAGES 64
/*
* This code generally works in units of "dio_blocks". A dio_block is
* somewhere between the hard sector size and the filesystem block size. It
* is determined on a per-invocation basis. When talking to the filesystem
* we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
* down by dio->blkfactor. Similarly, fs-blocksize quantities are converted
* to dio_block quantities by shifting left by blkfactor.
*
* If blkfactor is zero then the user's request was aligned to the filesystem's
* blocksize.
*/
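A minimal sketch of the conversions just described, assuming 512-byte dio blocks inside a 4096-byte filesystem block (so blkfactor = 3):

#include <stdio.h>

int main(void)
{
	unsigned blkbits = 9;			/* dio blocks: 512 bytes */
	unsigned fs_blkbits = 12;		/* fs blocks: 4096 bytes */
	unsigned blkfactor = fs_blkbits - blkbits;	/* 3 */

	unsigned long long block_in_file = 21;	/* current offset, dio blocks */
	unsigned long long fs_block = block_in_file >> blkfactor;	/* 2 */
	unsigned long long first_dio = fs_block << blkfactor;		/* 16 */

	printf("fs_block=%llu first_dio_block=%llu sub_block_offset=%llu\n",
	       fs_block, first_dio,
	       block_in_file & ((1ULL << blkfactor) - 1));	/* 2 16 5 */
	return 0;
}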
struct dio {
/* BIO submission state */
struct bio *bio; /* bio under assembly */
struct inode *inode;
int rw;
unsigned blkbits; /* doesn't change */
sector_t block_in_file; /* changes */
unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft
blocksize, this specifies how much
finer. blkfactor=2 means 1/4-block
alignment. Does not change */
unsigned start_zero_done; /* flag: sub-blocksize zeroing has
been performed at the start of a
write */
int pages_in_io; /* approximate total IO pages */
sector_t block_in_file; /* Current offset into the underlying
file in dio_block units. */
unsigned blocks_available; /* At block_in_file. changes */
sector_t final_block_in_request;/* doesn't change */
unsigned first_block_in_page; /* doesn't change, Used only once */
int boundary; /* prev block is at a boundary */
int reap_counter; /* rate limit reaping */
get_blocks_t *get_blocks; /* block mapping function */
sector_t last_block_in_bio; /* current final block in bio */
sector_t next_block_in_bio; /* next block to be added to bio */
sector_t final_block_in_bio; /* current final block in bio + 1 */
sector_t next_block_for_io; /* next block to be put under IO,
in dio_blocks units */
struct buffer_head map_bh; /* last get_blocks() result */
/* Page fetching state */
/*
* Deferred addition of a page to the dio. These variables are
* private to dio_send_cur_page(), submit_page_section() and
* dio_bio_add_page().
*/
struct page *cur_page; /* The page */
unsigned cur_page_offset; /* Offset into it, in bytes */
unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
sector_t cur_page_block; /* Where it starts */
/*
* Page fetching state. These variables belong to dio_refill_pages().
*/
int curr_page; /* changes */
int total_pages; /* doesn't change */
int pages_left; /* approximate total IO pages */
unsigned long curr_user_address;/* changes */
/* Page queue */
/*
* Page queue. These variables belong to dio_refill_pages() and
* dio_get_page().
*/
struct page *pages[DIO_PAGES]; /* page buffer */
unsigned head; /* next page to process */
unsigned tail; /* last valid page + 1 */
......@@ -318,73 +355,40 @@ static int dio_bio_reap(struct dio *dio)
*
* In the case of filesystem holes: the fs may return an arbitrarily-large
* hole by returning an appropriate value in b_size and by clearing
* buffer_mapped(). This code _should_ handle that case correctly, but it has
* only been tested against single-block holes (b_size == blocksize).
* buffer_mapped(). However the direct-io code will only process holes one
* block at a time - it will repeatedly call get_blocks() as it walks the hole.
*/
static int get_more_blocks(struct dio *dio)
{
int ret;
struct buffer_head *map_bh = &dio->map_bh;
if (dio->blocks_available)
return 0;
sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
unsigned long fs_count; /* Number of filesystem-sized blocks */
unsigned long dio_count;/* Number of dio_block-sized blocks */
unsigned long blkmask;
/*
* If there was a memory error and we've overwritten all the
* mapped blocks then we can now return that memory error
*/
if (dio->page_errors) {
ret = dio->page_errors;
goto out;
}
map_bh->b_state = 0;
map_bh->b_size = 0;
BUG_ON(dio->block_in_file >= dio->final_block_in_request);
ret = (*dio->get_blocks)(dio->inode, dio->block_in_file,
dio->final_block_in_request - dio->block_in_file,
map_bh, dio->rw == WRITE);
if (ret)
goto out;
if (buffer_mapped(map_bh)) {
BUG_ON(map_bh->b_size == 0);
BUG_ON((map_bh->b_size & ((1 << dio->blkbits) - 1)) != 0);
dio->blocks_available = map_bh->b_size >> dio->blkbits;
/* blockdevs do not set buffer_new */
if (buffer_new(map_bh)) {
sector_t block = map_bh->b_blocknr;
unsigned i;
for (i = 0; i < dio->blocks_available; i++)
unmap_underlying_metadata(map_bh->b_bdev,
block++);
}
} else {
BUG_ON(dio->rw != READ);
if (dio->bio)
dio_bio_submit(dio);
ret = dio->page_errors;
if (ret == 0) {
map_bh->b_state = 0;
map_bh->b_size = 0;
BUG_ON(dio->block_in_file >= dio->final_block_in_request);
fs_startblk = dio->block_in_file >> dio->blkfactor;
dio_count = dio->final_block_in_request - dio->block_in_file;
fs_count = dio_count >> dio->blkfactor;
blkmask = (1 << dio->blkfactor) - 1;
if (dio_count & blkmask)
fs_count++;
ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
map_bh, dio->rw == WRITE);
}
dio->next_block_in_bio = map_bh->b_blocknr;
out:
return ret;
}
/*
* Check to see if we can continue to grow the BIO. If not, then send it.
*/
static void dio_prep_bio(struct dio *dio)
{
if (dio->bio == NULL)
return;
if (dio->boundary ||
dio->last_block_in_bio != dio->next_block_in_bio - 1)
dio_bio_submit(dio);
}
/*
* There is no bio. Make one now.
*/
......@@ -397,7 +401,7 @@ static int dio_new_bio(struct dio *dio, sector_t blkno)
if (ret)
goto out;
sector = blkno << (dio->blkbits - 9);
nr_pages = min(dio->pages_left, bio_get_nr_vecs(dio->map_bh.b_bdev));
nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
BUG_ON(nr_pages <= 0);
ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
dio->boundary = 0;
......@@ -405,37 +409,199 @@ static int dio_new_bio(struct dio *dio, sector_t blkno)
return ret;
}
/*
* Attempt to put the current chunk of 'cur_page' into the current BIO. If
* that was successful then update final_block_in_bio and take a ref against
* the just-added page.
*/
static int dio_bio_add_page(struct dio *dio)
{
int ret;
static int
dio_bio_add_page(struct dio *dio, struct page *page,
unsigned int bv_len, unsigned int bv_offset, sector_t blkno)
ret = bio_add_page(dio->bio, dio->cur_page,
dio->cur_page_len, dio->cur_page_offset);
if (ret == dio->cur_page_len) {
dio->pages_in_io--;
page_cache_get(dio->cur_page);
dio->final_block_in_bio = dio->cur_page_block +
(dio->cur_page_len >> dio->blkbits);
ret = 0;
}
return ret;
}
/*
* Put cur_page under IO. The section of cur_page which is described by
* cur_page_offset,cur_page_len is put into a BIO. The section of cur_page
* starts on-disk at cur_page_block.
*
* We take a ref against the page here (on behalf of its presence in the bio).
*
* The caller of this function is responsible for removing cur_page from the
* dio, and for dropping the refcount which came from that presence.
*/
static int dio_send_cur_page(struct dio *dio)
{
int ret = 0;
if (bv_len == 0)
goto out;
if (dio->bio) {
/*
* See whether this new request is contiguous with the old
*/
if (dio->final_block_in_bio != dio->cur_page_block)
dio_bio_submit(dio);
/*
* Submit now if the underlying fs is about to perform a
* metadata read
*/
if (dio->boundary)
dio_bio_submit(dio);
}
/* Take a ref against the page each time it is placed into a BIO */
page_cache_get(page);
if (bio_add_page(dio->bio, page, bv_len, bv_offset) < bv_len) {
if (dio->bio == NULL) {
ret = dio_new_bio(dio, dio->cur_page_block);
if (ret)
goto out;
}
if (dio_bio_add_page(dio) != 0) {
dio_bio_submit(dio);
ret = dio_new_bio(dio, blkno);
ret = dio_new_bio(dio, dio->cur_page_block);
if (ret == 0) {
ret = bio_add_page(dio->bio, page, bv_len, bv_offset);
BUG_ON(ret < bv_len);
} else {
/* The page didn't make it into a BIO */
page_cache_release(page);
ret = dio_bio_add_page(dio);
BUG_ON(ret != 0);
}
}
out:
return ret;
}
/*
* An autonomous function to put a chunk of a page under deferred IO.
*
* The caller doesn't actually know (or care) whether this piece of page is in
* a BIO, or is under IO or whatever. We just take care of all possible
* situations here. The separation between the logic of do_direct_IO() and
* that of submit_page_section() is important for clarity. Please don't break it.
*
* The chunk of page starts on-disk at blocknr.
*
* We perform deferred IO, by recording the last-submitted page inside our
* private part of the dio structure. If possible, we just expand the IO
* across that page here.
*
* If that doesn't work out then we put the old page into the bio and add this
* page to the dio instead.
*/
static int
submit_page_section(struct dio *dio, struct page *page,
unsigned offset, unsigned len, sector_t blocknr)
{
int ret = 0;
/*
* Can we just grow the current page's presence in the dio?
*/
if ( (dio->cur_page == page) &&
(dio->cur_page_offset + dio->cur_page_len == offset) &&
(dio->cur_page_block +
(dio->cur_page_len >> dio->blkbits) == blocknr)) {
dio->cur_page_len += len;
/*
* If dio->boundary then we want to schedule the IO now to
* avoid metadata seeks.
*/
if (dio->boundary) {
ret = dio_send_cur_page(dio);
page_cache_release(dio->cur_page);
dio->cur_page = NULL;
}
goto out;
}
/*
* If there's a deferred page already there then send it.
*/
if (dio->cur_page) {
ret = dio_send_cur_page(dio);
page_cache_release(dio->cur_page);
dio->cur_page = NULL;
if (ret)
goto out;
}
dio->pages_left--;
page_cache_get(page); /* It is in dio */
dio->cur_page = page;
dio->cur_page_offset = offset;
dio->cur_page_len = len;
dio->cur_page_block = blocknr;
out:
return ret;
}
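A compact sketch of the coalescing test at the top of submit_page_section(): a new (page, offset, len, block) chunk may extend the deferred one only when it is byte-contiguous within the page and block-contiguous on disk.

#include <assert.h>

struct cur {				/* mirrors the dio->cur_page_* fields */
	const void *page;
	unsigned offset, len;
	unsigned long long block;
};

static int can_grow(const struct cur *c, const void *page, unsigned offset,
		    unsigned long long block, unsigned blkbits)
{
	return c->page == page &&
	       c->offset + c->len == offset &&
	       c->block + (c->len >> blkbits) == block;
}

int main(void)
{
	struct cur c = { (void *)0x1000, 0, 1024, 80 };	/* two 512-byte blocks */
	assert(can_grow(&c, (void *)0x1000, 1024, 82, 9));   /* contiguous */
	assert(!can_grow(&c, (void *)0x1000, 1024, 90, 9));  /* gap on disk */
	return 0;
}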
/*
* Clean any dirty buffers in the blockdev mapping which alias newly-created
* file blocks. Only called for S_ISREG files - blockdevs do not set
* buffer_new
*/
static void clean_blockdev_aliases(struct dio *dio)
{
unsigned i;
for (i = 0; i < dio->blocks_available; i++) {
unmap_underlying_metadata(dio->map_bh.b_bdev,
dio->map_bh.b_blocknr + i);
}
}
/*
* If we are not writing the entire block and get_block() allocated
* the block for us, we need to fill in the unused portion of the
* block with zeros. This happens only if user-buffer, fileoffset or
* io length is not filesystem block-size multiple.
*
* `end' is zero if we're doing the start of the IO, 1 at the end of the
* IO.
*/
static void dio_zero_block(struct dio *dio, int end)
{
unsigned dio_blocks_per_fs_block;
unsigned this_chunk_blocks; /* In dio_blocks */
unsigned this_chunk_bytes;
struct page *page;
dio->start_zero_done = 1;
if (!dio->blkfactor || !buffer_new(&dio->map_bh))
return;
dio_blocks_per_fs_block = 1 << dio->blkfactor;
this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);
if (!this_chunk_blocks)
return;
/*
* We need to zero out part of an fs block. It is either at the
* beginning or the end of the fs block.
*/
if (end)
this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
this_chunk_bytes = this_chunk_blocks << dio->blkbits;
page = ZERO_PAGE(dio->cur_user_address);
if (submit_page_section(dio, page, 0, this_chunk_bytes,
dio->next_block_for_io))
return;
dio->next_block_for_io += this_chunk_blocks;
}
/*
* Walk the user pages, and the file, mapping blocks to disk and emitting BIOs.
* Walk the user pages, and the file, mapping blocks to disk and generating
* a sequence of (page,offset,len,block) mappings. These mappings are injected
* into submit_page_section(), which takes care of the next stage of submission
*
* Direct IO against a blockdev differs from direct IO against a file, because
* we can happily perform page-sized but 512-byte aligned IOs. It is important that
......@@ -448,73 +614,101 @@ dio_bio_add_page(struct dio *dio, struct page *page,
* it should set b_size to PAGE_SIZE or more inside get_blocks(). This gives
* fine alignment but still allows this function to work in PAGE_SIZE units.
*/
int do_direct_IO(struct dio *dio)
static int do_direct_IO(struct dio *dio)
{
const unsigned blkbits = dio->blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
struct page *page;
unsigned block_in_page;
struct buffer_head *map_bh = &dio->map_bh;
int ret = 0;
/* The I/O can start at any block offset within the first page */
block_in_page = dio->first_block_in_page;
while (dio->block_in_file < dio->final_block_in_request) {
int new_page; /* Need to insert this page into the BIO? */
unsigned int bv_offset;
unsigned int bv_len;
sector_t curr_blkno;
page = dio_get_page(dio);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
}
new_page = 1;
bv_offset = 0;
bv_len = 0;
curr_blkno = 0;
while (block_in_page < blocks_per_page) {
unsigned offset_in_page = block_in_page << blkbits;
unsigned this_chunk_bytes; /* # of bytes mapped */
unsigned this_chunk_blocks; /* # of blocks */
unsigned u;
ret = get_more_blocks(dio);
if (ret)
goto fail_release;
if (dio->blocks_available == 0) {
/*
* Need to go and map some more disk
*/
unsigned long blkmask;
unsigned long dio_remainder;
ret = get_more_blocks(dio);
if (ret) {
page_cache_release(page);
goto out;
}
if (!buffer_mapped(map_bh))
goto do_holes;
dio->blocks_available =
map_bh->b_size >> dio->blkbits;
dio->next_block_for_io =
map_bh->b_blocknr << dio->blkfactor;
if (buffer_new(map_bh))
clean_blockdev_aliases(dio);
if (!dio->blkfactor)
goto do_holes;
blkmask = (1 << dio->blkfactor) - 1;
dio_remainder = (dio->block_in_file & blkmask);
/*
* If we are at the start of IO and that IO
* starts partway into a fs-block,
* dio_remainder will be non-zero. If the IO
* is a read then we can simply advance the IO
* cursor to the first block which is to be
* read. But if the IO is a write and the
* block was newly allocated we cannot do that;
* the start of the fs block must be zeroed out
* on-disk
*/
if (!buffer_new(map_bh))
dio->next_block_for_io += dio_remainder;
dio->blocks_available -= dio_remainder;
}
do_holes:
/* Handle holes */
if (!buffer_mapped(&dio->map_bh)) {
if (!buffer_mapped(map_bh)) {
char *kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr + (block_in_page << blkbits),
0, 1 << blkbits);
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
dio->block_in_file++;
dio->next_block_in_bio++;
block_in_page++;
goto next_block;
}
dio_prep_bio(dio);
if (dio->bio == NULL) {
ret = dio_new_bio(dio, dio->next_block_in_bio);
if (ret)
goto fail_release;
new_page = 1;
}
if (new_page) {
bv_len = 0;
bv_offset = block_in_page << blkbits;
curr_blkno = dio->next_block_in_bio;
new_page = 0;
}
/* Work out how much disk we can add to this page */
/*
* If we're performing IO which has an alignment which
* is finer than the underlying fs, go check to see if
* we must zero out the start of this block.
*/
if (unlikely(dio->blkfactor && !dio->start_zero_done))
dio_zero_block(dio, 0);
/*
* Work out, in this_chunk_blocks, how much disk we
* can add to this page
*/
this_chunk_blocks = dio->blocks_available;
u = (PAGE_SIZE - (bv_len + bv_offset)) >> blkbits;
u = (PAGE_SIZE - offset_in_page) >> blkbits;
if (this_chunk_blocks > u)
this_chunk_blocks = u;
u = dio->final_block_in_request - dio->block_in_file;
......@@ -523,10 +717,15 @@ int do_direct_IO(struct dio *dio)
this_chunk_bytes = this_chunk_blocks << blkbits;
BUG_ON(this_chunk_bytes == 0);
bv_len += this_chunk_bytes;
dio->next_block_in_bio += this_chunk_blocks;
dio->last_block_in_bio = dio->next_block_in_bio - 1;
dio->boundary = buffer_boundary(&dio->map_bh);
dio->boundary = buffer_boundary(map_bh);
ret = submit_page_section(dio, page, offset_in_page,
this_chunk_bytes, dio->next_block_for_io);
if (ret) {
page_cache_release(page);
goto out;
}
dio->next_block_for_io += this_chunk_blocks;
dio->block_in_file += this_chunk_blocks;
block_in_page += this_chunk_blocks;
dio->blocks_available -= this_chunk_blocks;
......@@ -536,27 +735,20 @@ int do_direct_IO(struct dio *dio)
if (dio->block_in_file == dio->final_block_in_request)
break;
}
ret = dio_bio_add_page(dio, page, bv_len,
bv_offset, curr_blkno);
if (ret)
goto fail_release;
/* Drop the ref which was taken in get_user_pages() */
page_cache_release(page);
block_in_page = 0;
}
goto out;
fail_release:
page_cache_release(page);
out:
return ret;
}
int
static int
direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
loff_t offset, unsigned long nr_segs, unsigned blkbits,
get_blocks_t get_blocks)
{
const unsigned blkbits = inode->i_blkbits;
unsigned long user_addr;
int seg, ret2, ret = 0;
struct dio dio;
......@@ -566,14 +758,18 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
dio.inode = inode;
dio.rw = rw;
dio.blkbits = blkbits;
dio.blkfactor = inode->i_blkbits - blkbits;
dio.start_zero_done = 0;
dio.block_in_file = offset >> blkbits;
dio.blocks_available = 0;
dio.cur_page = NULL;
dio.boundary = 0;
dio.reap_counter = 0;
dio.get_blocks = get_blocks;
dio.last_block_in_bio = -1;
dio.next_block_in_bio = -1;
dio.final_block_in_bio = -1;
dio.next_block_for_io = -1;
dio.page_errors = 0;
......@@ -582,10 +778,10 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
spin_lock_init(&dio.bio_list_lock);
dio.bio_list = NULL;
dio.waiter = NULL;
dio.pages_left = 0;
dio.pages_in_io = 0;
for (seg = 0; seg < nr_segs; seg++)
dio.pages_left += (iov[seg].iov_len / PAGE_SIZE) + 2;
dio.pages_in_io += (iov[seg].iov_len >> blkbits) + 2;
for (seg = 0; seg < nr_segs; seg++) {
user_addr = (unsigned long)iov[seg].iov_base;
......@@ -619,6 +815,18 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
} /* end iovec loop */
/*
* There may be some unwritten disk at the end of a part-written
* fs-block-sized block. Go zero that now.
*/
dio_zero_block(&dio, 1);
if (dio.cur_page) {
ret2 = dio_send_cur_page(&dio);
page_cache_release(dio.cur_page);
if (ret == 0)
ret = ret2;
}
ret2 = dio_await_completion(&dio);
if (ret == 0)
ret = ret2;
......@@ -634,27 +842,44 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
* This is a library function for use by filesystem drivers.
*/
int
generic_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks)
generic_direct_IO(int rw, struct inode *inode, struct block_device *bdev,
const struct iovec *iov, loff_t offset, unsigned long nr_segs,
get_blocks_t get_blocks)
{
int seg;
size_t size;
unsigned long addr;
unsigned blocksize_mask = (1 << inode->i_blkbits) - 1;
unsigned blkbits = inode->i_blkbits;
unsigned bdev_blkbits = 0;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
if (offset & blocksize_mask)
goto out;
if (bdev)
bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
if (offset & blocksize_mask) {
if (bdev)
blkbits = bdev_blkbits;
blocksize_mask = (1 << blkbits) - 1;
if (offset & blocksize_mask)
goto out;
}
/* Check the memory alignment. Blocks cannot straddle pages */
for (seg = 0; seg < nr_segs; seg++) {
addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len;
if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out;
if ((addr & blocksize_mask) || (size & blocksize_mask)) {
if (bdev)
blkbits = bdev_blkbits;
blocksize_mask = (1 << blkbits) - 1;
if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out;
}
}
retval = direct_io_worker(rw, inode, iov, offset, nr_segs, get_blocks);
retval = direct_io_worker(rw, inode, iov, offset,
nr_segs, blkbits, get_blocks);
out:
return retval;
}
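A userspace sketch of the relaxed alignment policy above: an offset (or iovec address/length) that fails the filesystem-blocksize test gets a second chance against the device's hard sector size, so 512-byte-aligned IO to a blockdev still qualifies when the fs blocksize is 4096.

#include <stdio.h>

static int aligned_ok(unsigned long long offset, unsigned fs_blkbits,
		      int have_bdev, unsigned hardsect_bits)
{
	unsigned long long mask = (1ULL << fs_blkbits) - 1;

	if (offset & mask) {
		if (!have_bdev)
			return 0;
		mask = (1ULL << hardsect_bits) - 1;	/* retry vs. hard sector */
		if (offset & mask)
			return 0;
	}
	return 1;
}

int main(void)
{
	printf("%d\n", aligned_ok(512, 12, 1, 9));	/* blockdev: passes */
	printf("%d\n", aligned_ok(512, 12, 0, 9));	/* plain fs: rejected */
	return 0;
}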
......@@ -675,7 +900,7 @@ generic_file_direct_IO(int rw, struct file *file, const struct iovec *iov,
}
retval = mapping->a_ops->direct_IO(rw, file, iov, offset, nr_segs);
if (mapping->nrpages)
if (rw == WRITE && mapping->nrpages)
invalidate_inode_pages2(mapping);
out:
return retval;
......
......@@ -624,7 +624,7 @@ ext2_direct_IO(int rw, struct file *file, const struct iovec *iov,
{
struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
return generic_direct_IO(rw, inode, iov,
return generic_direct_IO(rw, inode, inode->i_sb->s_bdev, iov,
offset, nr_segs, ext2_get_blocks);
}
......
......@@ -1431,7 +1431,7 @@ static int ext3_direct_IO(int rw, struct file *file,
}
}
ret = generic_direct_IO(rw, inode, iov, offset,
ret = generic_direct_IO(rw, inode, inode->i_sb->s_bdev, iov, offset,
nr_segs, ext3_direct_io_get_blocks);
out_stop:
......
......@@ -986,6 +986,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto out_fail;
blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
if (!blocksize) {
printk(KERN_ERR "EXT3-fs: unable to set blocksize\n");
goto out_fail;
}
/*
* The ext3 superblock will not be buffer aligned for other than 1kB
......
......@@ -226,7 +226,7 @@ __writeback_single_inode(struct inode *inode, int sync,
* The inodes to be written are parked on sb->s_io. They are moved back onto
* sb->s_dirty as they are selected for writing. This way, none can be missed
* on the writer throttling path, and we get decent balancing between many
* throlttled threads: we don't want them all piling up on __wait_on_inode.
* throttled threads: we don't want them all piling up on __wait_on_inode.
*/
static void
sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
......
......@@ -315,7 +315,7 @@ static int jfs_direct_IO(int rw, struct file *file, const struct iovec *iov,
{
struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
return generic_direct_IO(rw, inode, iov,
return generic_direct_IO(rw, inode, inode->i_sb->s_bdev, iov,
offset, nr_segs, jfs_get_blocks);
}
......
......@@ -260,7 +260,8 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
return 0;
}
int simple_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode);
......@@ -277,3 +278,48 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode
}
return 0;
}
int simple_readpage(struct file *file, struct page *page)
{
void *kaddr;
if (PageUptodate(page))
goto out;
kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr, 0, PAGE_CACHE_SIZE);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(page);
SetPageUptodate(page);
out:
unlock_page(page);
return 0;
}
int simple_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
if (!PageUptodate(page)) {
if (to - from != PAGE_CACHE_SIZE) {
void *kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr, 0, from);
memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
}
SetPageUptodate(page);
}
return 0;
}
int simple_commit_write(struct file *file, struct page *page,
unsigned offset, unsigned to)
{
struct inode *inode = page->mapping->host;
loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
if (pos > inode->i_size)
inode->i_size = pos;
set_page_dirty(page);
return 0;
}
......@@ -643,7 +643,7 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
goto cleanup_file;
}
f->f_ra.ra_pages = inode->i_mapping->backing_dev_info->ra_pages;
file_ra_state_init(&f->f_ra, inode->i_mapping);
f->f_dentry = dentry;
f->f_vfsmnt = mnt;
f->f_pos = 0;
......
......@@ -39,6 +39,7 @@
#include <linux/seq_file.h>
#include <linux/times.h>
#include <linux/profile.h>
#include <linux/blkdev.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
......@@ -404,10 +405,14 @@ static int kstat_read_proc(char *page, char **start, off_t off,
len += sprintf(page + len,
"\nctxt %lu\n"
"btime %lu\n"
"processes %lu\n",
"processes %lu\n"
"procs_running %lu\n"
"procs_blocked %u\n",
nr_context_switches(),
xtime.tv_sec - jif / HZ,
total_forks);
total_forks,
nr_running(),
atomic_read(&nr_iowait_tasks));
return proc_calc_metrics(page, start, off, count, eof, len);
}
......
......@@ -47,48 +47,6 @@ static struct backing_dev_info ramfs_backing_dev_info = {
.memory_backed = 1, /* Does not contribute to dirty memory */
};
/*
* Read a page. Again trivial. If it didn't already exist
* in the page cache, it is zero-filled.
*/
static int ramfs_readpage(struct file *file, struct page * page)
{
if (!PageUptodate(page)) {
char *kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr, 0, PAGE_CACHE_SIZE);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(page);
SetPageUptodate(page);
}
unlock_page(page);
return 0;
}
static int ramfs_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
{
if (!PageUptodate(page)) {
char *kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr, 0, PAGE_CACHE_SIZE);
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
SetPageUptodate(page);
}
set_page_dirty(page);
return 0;
}
static int ramfs_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to)
{
struct inode *inode = page->mapping->host;
loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
if (pos > inode->i_size)
inode->i_size = pos;
return 0;
}
struct inode *ramfs_get_inode(struct super_block *sb, int mode, int dev)
{
struct inode * inode = new_inode(sb);
......@@ -175,10 +133,10 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
}
static struct address_space_operations ramfs_aops = {
.readpage = ramfs_readpage,
.readpage = simple_readpage,
.writepage = fail_writepage,
.prepare_write = ramfs_prepare_write,
.commit_write = ramfs_commit_write
.prepare_write = simple_prepare_write,
.commit_write = simple_commit_write
};
static struct file_operations ramfs_file_operations = {
......
......@@ -607,8 +607,8 @@ linvfs_direct_IO(
{
struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
return generic_direct_IO(rw, inode, iov, offset, nr_segs,
linvfs_get_blocks_direct);
return generic_direct_IO(rw, inode, NULL,
iov, offset, nr_segs, linvfs_get_blocks_direct);
}
......
......@@ -33,7 +33,21 @@
#define segment_eq(a,b) ((a).seg == (b).seg)
extern int __verify_write(const void *, unsigned long);
/*
* movsl can be slow when source and dest are not both 8-byte aligned
*/
#if defined(CONFIG_M586MMX) || defined(CONFIG_M686) || \
defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUM4)
#define INTEL_MOVSL
#endif
#ifdef INTEL_MOVSL
extern struct movsl_mask {
int mask;
} ____cacheline_aligned_in_smp movsl_mask;
#endif
int __verify_write(const void *, unsigned long);
#define __addr_ok(addr) ((unsigned long)(addr) < (current_thread_info()->addr_limit.seg))
......@@ -248,354 +262,10 @@ do { \
: "m"(__m(addr)), "i"(-EFAULT), "0"(err))
/*
* Copy To/From Userspace
*/
/* Generic arbitrary sized copy. */
#define __copy_user(to,from,size) \
do { \
int __d0, __d1; \
__asm__ __volatile__( \
"0: rep; movsl\n" \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: lea 0(%3,%0,4),%0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 0b,3b\n" \
" .long 1b,2b\n" \
".previous" \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1) \
: "r"(size & 3), "0"(size / 4), "1"(to), "2"(from) \
: "memory"); \
} while (0)
#define __copy_user_zeroing(to,from,size) \
do { \
int __d0, __d1; \
__asm__ __volatile__( \
"0: rep; movsl\n" \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: lea 0(%3,%0,4),%0\n" \
"4: pushl %0\n" \
" pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" rep; stosb\n" \
" popl %%eax\n" \
" popl %0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 0b,3b\n" \
" .long 1b,4b\n" \
".previous" \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1) \
: "r"(size & 3), "0"(size / 4), "1"(to), "2"(from) \
: "memory"); \
} while (0)
/* We let the __ versions of copy_from/to_user inline, because they're often
* used in fast paths and have only a small space overhead.
*/
static inline unsigned long
__generic_copy_from_user_nocheck(void *to, const void *from, unsigned long n)
{
__copy_user_zeroing(to,from,n);
return n;
}
static inline unsigned long
__generic_copy_to_user_nocheck(void *to, const void *from, unsigned long n)
{
__copy_user(to,from,n);
return n;
}
/* Optimize just a little bit when we know the size of the move. */
#define __constant_copy_user(to, from, size) \
do { \
int __d0, __d1; \
switch (size & 3) { \
default: \
__asm__ __volatile__( \
"0: rep; movsl\n" \
"1:\n" \
".section .fixup,\"ax\"\n" \
"2: shl $2,%0\n" \
" jmp 1b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 0b,2b\n" \
".previous" \
: "=c"(size), "=&S" (__d0), "=&D" (__d1)\
: "1"(from), "2"(to), "0"(size/4) \
: "memory"); \
break; \
case 1: \
__asm__ __volatile__( \
"0: rep; movsl\n" \
"1: movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: shl $2,%0\n" \
"4: incl %0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 0b,3b\n" \
" .long 1b,4b\n" \
".previous" \
: "=c"(size), "=&S" (__d0), "=&D" (__d1)\
: "1"(from), "2"(to), "0"(size/4) \
: "memory"); \
break; \
case 2: \
__asm__ __volatile__( \
"0: rep; movsl\n" \
"1: movsw\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: shl $2,%0\n" \
"4: addl $2,%0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 0b,3b\n" \
" .long 1b,4b\n" \
".previous" \
: "=c"(size), "=&S" (__d0), "=&D" (__d1)\
: "1"(from), "2"(to), "0"(size/4) \
: "memory"); \
break; \
case 3: \
__asm__ __volatile__( \
"0: rep; movsl\n" \
"1: movsw\n" \
"2: movsb\n" \
"3:\n" \
".section .fixup,\"ax\"\n" \
"4: shl $2,%0\n" \
"5: addl $2,%0\n" \
"6: incl %0\n" \
" jmp 3b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 0b,4b\n" \
" .long 1b,5b\n" \
" .long 2b,6b\n" \
".previous" \
: "=c"(size), "=&S" (__d0), "=&D" (__d1)\
: "1"(from), "2"(to), "0"(size/4) \
: "memory"); \
break; \
} \
} while (0)
/* Optimize just a little bit when we know the size of the move. */
#define __constant_copy_user_zeroing(to, from, size) \
do { \
int __d0, __d1; \
switch (size & 3) { \
default: \
__asm__ __volatile__( \
"0: rep; movsl\n" \
"1:\n" \
".section .fixup,\"ax\"\n" \
"2: pushl %0\n" \
" pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" rep; stosl\n" \
" popl %%eax\n" \
" popl %0\n" \
" shl $2,%0\n" \
" jmp 1b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 0b,2b\n" \
".previous" \
: "=c"(size), "=&S" (__d0), "=&D" (__d1)\
: "1"(from), "2"(to), "0"(size/4) \
: "memory"); \
break; \
case 1: \
__asm__ __volatile__( \
"0: rep; movsl\n" \
"1: movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: pushl %0\n" \
" pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" rep; stosl\n" \
" stosb\n" \
" popl %%eax\n" \
" popl %0\n" \
" shl $2,%0\n" \
" incl %0\n" \
" jmp 2b\n" \
"4: pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" stosb\n" \
" popl %%eax\n" \
" incl %0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 0b,3b\n" \
" .long 1b,4b\n" \
".previous" \
: "=c"(size), "=&S" (__d0), "=&D" (__d1)\
: "1"(from), "2"(to), "0"(size/4) \
: "memory"); \
break; \
case 2: \
__asm__ __volatile__( \
"0: rep; movsl\n" \
"1: movsw\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: pushl %0\n" \
" pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" rep; stosl\n" \
" stosw\n" \
" popl %%eax\n" \
" popl %0\n" \
" shl $2,%0\n" \
" addl $2,%0\n" \
" jmp 2b\n" \
"4: pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" stosw\n" \
" popl %%eax\n" \
" addl $2,%0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 0b,3b\n" \
" .long 1b,4b\n" \
".previous" \
: "=c"(size), "=&S" (__d0), "=&D" (__d1)\
: "1"(from), "2"(to), "0"(size/4) \
: "memory"); \
break; \
case 3: \
__asm__ __volatile__( \
"0: rep; movsl\n" \
"1: movsw\n" \
"2: movsb\n" \
"3:\n" \
".section .fixup,\"ax\"\n" \
"4: pushl %0\n" \
" pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" rep; stosl\n" \
" stosw\n" \
" stosb\n" \
" popl %%eax\n" \
" popl %0\n" \
" shl $2,%0\n" \
" addl $3,%0\n" \
" jmp 2b\n" \
"5: pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" stosw\n" \
" stosb\n" \
" popl %%eax\n" \
" addl $3,%0\n" \
" jmp 2b\n" \
"6: pushl %%eax\n" \
" xorl %%eax,%%eax\n" \
" stosb\n" \
" popl %%eax\n" \
" incl %0\n" \
" jmp 3b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 0b,4b\n" \
" .long 1b,5b\n" \
" .long 2b,6b\n" \
".previous" \
: "=c"(size), "=&S" (__d0), "=&D" (__d1)\
: "1"(from), "2"(to), "0"(size/4) \
: "memory"); \
break; \
} \
} while (0)
unsigned long __generic_copy_to_user(void *, const void *, unsigned long);
unsigned long __generic_copy_from_user(void *, const void *, unsigned long);
static inline unsigned long
__constant_copy_to_user(void *to, const void *from, unsigned long n)
{
prefetch(from);
if (access_ok(VERIFY_WRITE, to, n))
__constant_copy_user(to,from,n);
return n;
}
static inline unsigned long
__constant_copy_from_user(void *to, const void *from, unsigned long n)
{
if (access_ok(VERIFY_READ, from, n))
__constant_copy_user_zeroing(to,from,n);
else
memset(to, 0, n);
return n;
}
static inline unsigned long
__constant_copy_to_user_nocheck(void *to, const void *from, unsigned long n)
{
__constant_copy_user(to,from,n);
return n;
}
static inline unsigned long
__constant_copy_from_user_nocheck(void *to, const void *from, unsigned long n)
{
__constant_copy_user_zeroing(to,from,n);
return n;
}
#define copy_to_user(to,from,n) \
(__builtin_constant_p(n) ? \
__constant_copy_to_user((to),(from),(n)) : \
__generic_copy_to_user((to),(from),(n)))
#define copy_from_user(to,from,n) \
(__builtin_constant_p(n) ? \
__constant_copy_from_user((to),(from),(n)) : \
__generic_copy_from_user((to),(from),(n)))
#define __copy_to_user(to,from,n) \
(__builtin_constant_p(n) ? \
__constant_copy_to_user_nocheck((to),(from),(n)) : \
__generic_copy_to_user_nocheck((to),(from),(n)))
#define __copy_from_user(to,from,n) \
(__builtin_constant_p(n) ? \
__constant_copy_from_user_nocheck((to),(from),(n)) : \
__generic_copy_from_user_nocheck((to),(from),(n)))
unsigned long copy_to_user(void *to, const void *from, unsigned long n);
unsigned long copy_from_user(void *to, const void *from, unsigned long n);
unsigned long __copy_to_user(void *to, const void *from, unsigned long n);
unsigned long __copy_from_user(void *to, const void *from, unsigned long n);
long strncpy_from_user(char *dst, const char *src, long count);
long __strncpy_from_user(char *dst, const char *src, long count);
......
......@@ -258,7 +258,7 @@ static inline void clear_in_cr4 (unsigned long mask)
#define TASK_UNMAPPED_32 0x40000000
#define TASK_UNMAPPED_64 (TASK_SIZE/3)
#define TASK_UNMAPPED_BASE \
(test_thread_flags(TIF_IA32) ? TASK_UNMAPPED_32 : TASK_UNMAPPED_64)
(test_thread_flag(TIF_IA32) ? TASK_UNMAPPED_32 : TASK_UNMAPPED_64)
/*
* Size of io_bitmap in longwords: 32 is ports 0-0x3ff.
......
......@@ -22,30 +22,35 @@ struct request_list {
wait_queue_head_t wait;
};
/*
* try to put the fields that are referenced together in the same cacheline
*/
struct request {
struct list_head queuelist; /* looking for ->queue? you must _not_
* access it directly, use
* blkdev_dequeue_request! */
int ref_count;
void *elevator_private;
unsigned long flags; /* see REQ_ bits below */
unsigned char cmd[16];
kdev_t rq_dev;
sector_t sector;
unsigned long nr_sectors;
unsigned int current_nr_sectors;
unsigned long flags; /* see REQ_ bits below */
void *elevator_private;
int rq_status; /* should split this into a few status bits */
kdev_t rq_dev;
struct gendisk *rq_disk;
int errors;
sector_t sector;
unsigned long start_time;
unsigned long nr_sectors;
sector_t hard_sector; /* the hard_* are block layer
* internals, no driver should
* touch them
*/
unsigned long hard_nr_sectors;
unsigned int hard_cur_sectors;
struct bio *bio;
struct bio *biotail;
/* Number of scatter-gather DMA addr+len pairs after
* physical address coalescing is performed.
......@@ -59,13 +64,21 @@ struct request {
*/
unsigned short nr_hw_segments;
unsigned int current_nr_sectors;
unsigned int hard_cur_sectors;
int tag;
void *special;
char *buffer;
/* For packet commands */
int ref_count;
request_queue_t *q;
struct request_list *rl;
struct completion *waiting;
void *special;
/*
* when request is used as a packet command carrier
*/
unsigned char cmd[16];
unsigned int data_len;
void *data;
......@@ -73,10 +86,6 @@ struct request {
void *sense;
unsigned int timeout;
struct completion *waiting;
struct bio *bio, *biotail;
request_queue_t *q;
struct request_list *rl;
};
/*
......
......@@ -1250,10 +1250,12 @@ ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov,
extern ssize_t generic_file_sendfile(struct file *, struct file *, loff_t *, size_t);
extern void do_generic_mapping_read(struct address_space *, struct file_ra_state *, struct file *,
loff_t *, read_descriptor_t *, read_actor_t);
extern void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
extern ssize_t generic_file_direct_IO(int rw, struct file *file,
const struct iovec *iov, loff_t offset, unsigned long nr_segs);
extern int generic_direct_IO(int rw, struct inode *inode, const struct iovec
*iov, loff_t offset, unsigned long nr_segs, get_blocks_t *get_blocks);
extern int generic_direct_IO(int rw, struct inode *inode, struct block_device *bdev,
const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_blocks_t *get_blocks);
extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos);
ssize_t generic_file_writev(struct file *filp, const struct iovec *iov,
......@@ -1311,6 +1313,12 @@ extern int simple_rmdir(struct inode *, struct dentry *);
extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
extern int simple_sync_file(struct file *, struct dentry *, int);
extern int simple_empty(struct dentry *);
extern int simple_readpage(struct file *file, struct page *page);
extern int simple_prepare_write(struct file *file, struct page *page,
unsigned offset, unsigned to);
extern int simple_commit_write(struct file *file, struct page *page,
unsigned offset, unsigned to);
extern struct dentry *simple_lookup(struct inode *, struct dentry *);
extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *);
extern struct file_operations simple_dir_operations;
......
......@@ -863,6 +863,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
/*
* Check for pending SIGKILL! The new thread should not be allowed
* to slip out of an OOM kill (or a normal SIGKILL).
*/
if (sigismember(&current->pending.signal, SIGKILL)) {
write_unlock_irq(&tasklist_lock);
goto bad_fork_cleanup_namespace;
}
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & CLONE_PARENT)
......
......@@ -228,6 +228,7 @@ EXPORT_SYMBOL(generic_block_bmap);
EXPORT_SYMBOL(generic_file_read);
EXPORT_SYMBOL(generic_file_sendfile);
EXPORT_SYMBOL(do_generic_mapping_read);
EXPORT_SYMBOL(file_ra_state_init);
EXPORT_SYMBOL(generic_file_write);
EXPORT_SYMBOL(generic_file_write_nolock);
EXPORT_SYMBOL(generic_file_mmap);
......@@ -306,6 +307,9 @@ EXPORT_SYMBOL(simple_unlink);
EXPORT_SYMBOL(simple_rmdir);
EXPORT_SYMBOL(simple_rename);
EXPORT_SYMBOL(simple_sync_file);
EXPORT_SYMBOL(simple_readpage);
EXPORT_SYMBOL(simple_prepare_write);
EXPORT_SYMBOL(simple_commit_write);
EXPORT_SYMBOL(simple_empty);
EXPORT_SYMBOL(fd_install);
EXPORT_SYMBOL(put_unused_fd);
......
......@@ -282,17 +282,9 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
break;
nr_found = __lookup(root, results + ret, cur_index,
max_items - ret, &next_index, max_index);
if (nr_found == 0) {
if (!(cur_index & RADIX_TREE_MAP_MASK))
break;
/*
* It could be that there simply were no items to the
* right of `cur_index' in the leaf node. So we still
* need to search for additional nodes to the right of
* this one.
*/
}
ret += nr_found;
if (next_index == max_index)
break;
cur_index = next_index;
}
out:
......
......@@ -61,6 +61,10 @@
* ->mapping->page_lock
* ->inode_lock
* ->sb_lock (fs/fs-writeback.c)
* ->page_table_lock
* ->swap_device_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->page_lock (try_to_unmap_one)
*/
/*
......
......@@ -399,6 +399,10 @@ static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev,
return 0;
}
/*
* The caller must hold down_write(current->mm->mmap_sem).
*/
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff)
......
......@@ -175,9 +175,13 @@ static void oom_kill(void)
if (p == NULL)
panic("Out of memory and no killable processes...\n");
/* kill all processes that share the ->mm (i.e. all threads) */
oom_kill_task(p);
/*
* kill all processes that share the ->mm (i.e. all threads),
* but are in a different thread group
*/
do_each_thread(g, q)
if (q->mm == p->mm)
if (q->mm == p->mm && q->tgid != p->tgid)
oom_kill_task(q);
while_each_thread(g, q);
......
......@@ -59,6 +59,14 @@ static inline int bad_range(struct zone *zone, struct page *page)
return 0;
}
static void bad_page(const char *function, struct page *page)
{
printk("Bad page state at %s\n", function);
printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n",
page->flags, page->mapping,
page_mapped(page), page_count(page));
}
/*
* Freeing function for a buddy system allocator.
*
......@@ -91,16 +99,19 @@ void __free_pages_ok (struct page *page, unsigned int order)
mod_page_state(pgfree, 1<<order);
BUG_ON(PageLRU(page));
BUG_ON(PagePrivate(page));
BUG_ON(page->mapping != NULL);
BUG_ON(PageLocked(page));
BUG_ON(PageActive(page));
BUG_ON(PageWriteback(page));
BUG_ON(page->pte.direct != 0);
if ( page_mapped(page) ||
page->mapping != NULL ||
page_count(page) != 0 ||
(page->flags & (
1 << PG_lru |
1 << PG_private |
1 << PG_locked |
1 << PG_active |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);
if (PageDirty(page))
ClearPageDirty(page);
BUG_ON(page_count(page) != 0);
if (unlikely(current->flags & PF_FREE_PAGES)) {
if (!current->nr_local_pages && !in_interrupt()) {
......@@ -181,14 +192,17 @@ expand(struct zone *zone, struct page *page,
*/
static inline void prep_new_page(struct page *page)
{
BUG_ON(page->mapping);
BUG_ON(PagePrivate(page));
BUG_ON(PageLocked(page));
BUG_ON(PageLRU(page));
BUG_ON(PageActive(page));
BUG_ON(PageDirty(page));
BUG_ON(PageWriteback(page));
BUG_ON(page->pte.direct != 0);
if ( page->mapping ||
page_mapped(page) ||
(page->flags & (
1 << PG_private |
1 << PG_locked |
1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);
page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked);
......
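The page_alloc refactor above replaces a row of BUG_ON() calls with a single mask test; a sketch of the idiom, using hypothetical bit positions (the real PG_* values live in the kernel's page-flags header):

#include <stdio.h>

/* hypothetical bit positions for illustration only */
enum { PG_lru, PG_private, PG_locked, PG_active, PG_dirty, PG_writeback };

int main(void)
{
	unsigned long flags = (1UL << PG_locked) | (1UL << PG_dirty);
	unsigned long bad = 1UL << PG_lru | 1UL << PG_private |
			    1UL << PG_locked | 1UL << PG_active |
			    1UL << PG_dirty | 1UL << PG_writeback;

	if (flags & bad)	/* one AND instead of six BUG_ON()s */
		printf("bad page state, offending flags 0x%lx\n", flags & bad);
	return 0;
}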
......@@ -19,6 +19,16 @@ struct backing_dev_info default_backing_dev_info = {
.state = 0,
};
/*
* Initialise a struct file's readahead state
*/
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
memset(ra, 0, sizeof(*ra));
ra->ra_pages = mapping->backing_dev_info->ra_pages;
}
/*
* Return max readahead size for this inode in number-of-pages.
*/
......
......@@ -53,7 +53,34 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
clear_page_dirty(page);
ClearPageUptodate(page);
remove_from_page_cache(page);
page_cache_release(page);
page_cache_release(page); /* pagecache ref */
}
/*
* This is for invalidate_inode_pages(). That function can be called at
* any time, and is not supposed to throw away dirty pages. But pages can
* be marked dirty at any time too. So we re-check the dirtiness inside
* ->page_lock. That provides exclusion against the __set_page_dirty
* functions.
*/
static void
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
if (page->mapping != mapping)
return;
if (PagePrivate(page) && !try_to_release_page(page, 0))
return;
write_lock(&mapping->page_lock);
if (PageDirty(page)) {
write_unlock(&mapping->page_lock);
} else {
__remove_from_page_cache(page);
write_unlock(&mapping->page_lock);
ClearPageUptodate(page);
page_cache_release(page); /* pagecache ref */
}
}
/**
......@@ -172,11 +199,9 @@ void invalidate_inode_pages(struct address_space *mapping)
next++;
if (PageDirty(page) || PageWriteback(page))
goto unlock;
if (PagePrivate(page) && !try_to_release_page(page, 0))
goto unlock;
if (page_mapped(page))
goto unlock;
truncate_complete_page(mapping, page);
invalidate_complete_page(mapping, page);
unlock:
unlock_page(page);
}
......@@ -213,7 +238,7 @@ void invalidate_inode_pages2(struct address_space *mapping)
if (page_mapped(page))
clear_page_dirty(page);
else
truncate_complete_page(mapping, page);
invalidate_complete_page(mapping, page);
}
unlock_page(page);
}
......
......@@ -31,6 +31,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/topology.h>
#include <asm/div64.h>
#include <linux/swapops.h>
......@@ -85,7 +86,7 @@ struct shrinker {
shrinker_t shrinker;
struct list_head list;
int seeks; /* seeks to recreate an obj */
int nr; /* objs pending delete */
long nr; /* objs pending delete */
};
static LIST_HEAD(shrinker_list);
......@@ -121,7 +122,7 @@ void remove_shrinker(struct shrinker *shrinker)
kfree(shrinker);
}
#define SHRINK_BATCH 32
#define SHRINK_BATCH 128
/*
* Call the shrink functions to age shrinkable caches
*
......@@ -134,29 +135,27 @@ void remove_shrinker(struct shrinker *shrinker)
* slab to avoid swapping.
*
* FIXME: do not do for zone highmem
*
* We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
*/
static int shrink_slab(int scanned, unsigned int gfp_mask)
static int shrink_slab(long scanned, unsigned int gfp_mask)
{
struct list_head *lh;
int pages;
struct shrinker *shrinker;
long pages;
if (down_trylock(&shrinker_sem))
return 0;
pages = nr_used_zone_pages();
list_for_each(lh, &shrinker_list) {
struct shrinker *shrinker;
int entries;
unsigned long delta;
shrinker = list_entry(lh, struct shrinker, list);
entries = (*shrinker->shrinker)(0, gfp_mask);
if (!entries)
continue;
delta = scanned * shrinker->seeks * entries;
shrinker->nr += delta / (pages + 1);
list_for_each_entry(shrinker, &shrinker_list, list) {
long long delta;
delta = scanned * shrinker->seeks;
delta *= (*shrinker->shrinker)(0, gfp_mask);
do_div(delta, pages + 1);
shrinker->nr += delta;
if (shrinker->nr > SHRINK_BATCH) {
int nr = shrinker->nr;
long nr = shrinker->nr;
shrinker->nr = 0;
(*shrinker->shrinker)(nr, gfp_mask);
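A sketch of the overflow-safe bookkeeping above: the product scanned * seeks * entries can exceed 32 bits, so it is accumulated in a 64-bit delta and divided by the page count (do_div() in the kernel) before topping up the shrinker's pending count. Sample numbers:

#include <stdio.h>

int main(void)
{
	long scanned = 100000, seeks = 2, entries = 500000;
	long pages = 4000000;

	long long delta = (long long)scanned * seeks;	/* widen first... */
	delta *= entries;	/* ...since 1e11 would overflow 32 bits */
	delta /= pages + 1;	/* the kernel uses do_div() here */

	printf("delta=%lld objects to scan\n", delta);	/* ~25000 */
	return 0;
}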
......@@ -824,7 +823,7 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
int i;
for (priority = DEF_PRIORITY; priority; priority--) {
int success = 1;
int all_zones_ok = 1;
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
......@@ -832,20 +831,24 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
int max_scan;
int to_reclaim;
to_reclaim = zone->pages_high - zone->free_pages;
if (nr_pages && to_free > 0)
if (nr_pages && to_free > 0) { /* Software suspend */
to_reclaim = min(to_free, SWAP_CLUSTER_MAX*8);
if (to_reclaim <= 0)
continue;
success = 0;
} else { /* Zone balancing */
to_reclaim = zone->pages_high-zone->free_pages;
if (to_reclaim <= 0)
continue;
}
all_zones_ok = 0;
max_scan = zone->nr_inactive >> priority;
if (max_scan < to_reclaim * 2)
max_scan = to_reclaim * 2;
if (max_scan < SWAP_CLUSTER_MAX)
max_scan = SWAP_CLUSTER_MAX;
to_free -= shrink_zone(zone, max_scan, GFP_KSWAPD,
to_reclaim, &nr_mapped, ps, priority);
shrink_slab(max_scan + nr_mapped, GFP_KSWAPD);
}
if (success)
if (all_zones_ok)
break;
blk_congestion_wait(WRITE, HZ/4);
}
......