Commit 78f1f626 authored by Linus Torvalds

Merge home.transmeta.com:/home/torvalds/v2.5/akpm

into home.transmeta.com:/home/torvalds/v2.5/linux
parents 6fe152cf f1dfe022
......@@ -3,11 +3,13 @@ Changes since 2.5.0:
---
[recommended]
New helpers: sb_bread(), sb_getblk(), sb_get_hash_table(), set_bh(),
New helpers: sb_bread(), sb_getblk(), sb_find_get_block(), set_bh(),
sb_set_blocksize() and sb_min_blocksize().
Use them.
(sb_find_get_block() replaces 2.4's get_hash_table())
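A rough conversion sketch (not part of this patch; the 2.4-style calls are
quoted from memory and "block" is a placeholder):

	bh = bread(sb->s_dev, block, sb->s_blocksize);		/* 2.4 */
	bh = sb_bread(sb, block);				/* 2.5: blocksize taken from sb */

	bh = get_hash_table(sb->s_dev, block, sb->s_blocksize);	/* 2.4 */
	bh = sb_find_get_block(sb, block);			/* 2.5 */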
---
[recommended]
......
......@@ -56,12 +56,16 @@ int __verify_write(const void * addr, unsigned long size)
for (;;) {
survive:
{
int fault = handle_mm_fault(current->mm, vma, start, 1);
if (!fault)
switch (handle_mm_fault(current->mm, vma, start, 1)) {
case VM_FAULT_SIGBUS:
goto bad_area;
if (fault < 0)
case VM_FAULT_OOM:
goto out_of_memory;
case VM_FAULT_MINOR:
case VM_FAULT_MAJOR:
break;
default:
BUG();
}
if (!size)
break;
......@@ -239,16 +243,18 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
* the fault.
*/
switch (handle_mm_fault(mm, vma, address, write)) {
case 1:
tsk->min_flt++;
break;
case 2:
tsk->maj_flt++;
break;
case 0:
goto do_sigbus;
default:
goto out_of_memory;
case VM_FAULT_MINOR:
tsk->min_flt++;
break;
case VM_FAULT_MAJOR:
tsk->maj_flt++;
break;
case VM_FAULT_SIGBUS:
goto do_sigbus;
case VM_FAULT_OOM:
goto out_of_memory;
default:
BUG();
}
/*
......
......@@ -964,8 +964,7 @@ void blk_run_queues(void)
return;
}
list_splice(&blk_plug_list, &local_plug_list);
INIT_LIST_HEAD(&blk_plug_list);
list_splice_init(&blk_plug_list, &local_plug_list);
spin_unlock_irq(&blk_plug_lock);
while (!list_empty(&local_plug_list)) {
......
......@@ -740,8 +740,7 @@ void abort_requests(struct hpsb_host *host)
host->ops->devctl(host, CANCEL_REQUESTS, 0);
spin_lock_irqsave(&host->pending_pkt_lock, flags);
list_splice(&host->pending_packets, &llist);
INIT_LIST_HEAD(&host->pending_packets);
list_splice_init(&host->pending_packets, &llist);
spin_unlock_irqrestore(&host->pending_pkt_lock, flags);
list_for_each(lh, &llist) {
......
......@@ -174,6 +174,10 @@
- Add `global_options' as default for options[]. Ditto global_enable_wol,
global_full_duplex.
LK1.1.18 01Jul02 akpm
- Fix for undocumented transceiver power-up bit on some 3c566B's
(Donald Becker, Rahul Karnik)
- See http://www.zip.com.au/~akpm/linux/#3c59x-2.3 for more details.
- Also see Documentation/networking/vortex.txt
*/
......@@ -189,8 +193,8 @@
#define DRV_NAME "3c59x"
#define DRV_VERSION "LK1.1.17"
#define DRV_RELDATE "18 Dec 2001"
#define DRV_VERSION "LK1.1.18"
#define DRV_RELDATE "1 Jul 2002"
......@@ -414,7 +418,7 @@ enum { IS_VORTEX=1, IS_BOOMERANG=2, IS_CYCLONE=4, IS_TORNADO=8,
EEPROM_8BIT=0x10, /* AKPM: Uses 0x230 as the base bitmaps for EEPROM reads */
HAS_PWR_CTRL=0x20, HAS_MII=0x40, HAS_NWAY=0x80, HAS_CB_FNS=0x100,
INVERT_MII_PWR=0x200, INVERT_LED_PWR=0x400, MAX_COLLISION_RESET=0x800,
EEPROM_OFFSET=0x1000, HAS_HWCKSM=0x2000 };
EEPROM_OFFSET=0x1000, HAS_HWCKSM=0x2000, WNO_XCVR_PWR=0x4000 };
enum vortex_chips {
CH_3C590 = 0,
......@@ -522,7 +526,7 @@ static struct vortex_chip_info {
HAS_HWCKSM, 128, },
{"3c556B Laptop Hurricane",
PCI_USES_IO|PCI_USES_MASTER, IS_TORNADO|HAS_NWAY|EEPROM_OFFSET|HAS_CB_FNS|INVERT_MII_PWR|
HAS_HWCKSM, 128, },
WNO_XCVR_PWR|HAS_HWCKSM, 128, },
{"3c575 [Megahertz] 10/100 LAN CardBus",
PCI_USES_IO|PCI_USES_MASTER, IS_BOOMERANG|HAS_MII|EEPROM_8BIT, 128, },
......@@ -1222,6 +1226,10 @@ static int __devinit vortex_probe1(struct pci_dev *pdev,
if (vp->drv_flags & INVERT_MII_PWR)
n |= 0x4000;
outw(n, ioaddr + Wn2_ResetOptions);
if (vp->drv_flags & WNO_XCVR_PWR) {
EL3WINDOW(0);
outw(0x0800, ioaddr);
}
}
/* Extract our information from the EEPROM data. */
......
......@@ -1031,14 +1031,6 @@ extern unsigned char e100_selftest(struct e100_private *bdp, u32 *st_timeout,
extern unsigned char e100_get_link_state(struct e100_private *bdp);
extern unsigned char e100_wait_scb(struct e100_private *bdp);
#ifndef yield
#define yield() \
do { \
current->policy |= SCHED_YIELD; \
schedule(); \
} while (0)
#endif
extern void e100_deisolate_driver(struct e100_private *bdp,
u8 recover, u8 full_reset);
extern unsigned char e100_hw_reset_recover(struct e100_private *bdp,
......
......@@ -303,15 +303,15 @@ pci_pool_free (struct pci_pool *pool, void *vaddr, dma_addr_t dma)
#ifdef CONFIG_DEBUG_SLAB
if (((dma - page->dma) + (void *)page->vaddr) != vaddr) {
printk (KERN_ERR "pci_pool_free %s/%s, %p (bad vaddr)/%lx\n",
printk (KERN_ERR "pci_pool_free %s/%s, %p (bad vaddr)/%Lx\n",
pool->dev ? pool->dev->slot_name : NULL,
pool->name, vaddr, (unsigned long) dma);
pool->name, vaddr, (unsigned long long) dma);
return;
}
if (page->bitmap [map] & (1UL << block)) {
printk (KERN_ERR "pci_pool_free %s/%s, dma %x already free\n",
printk (KERN_ERR "pci_pool_free %s/%s, dma %Lx already free\n",
pool->dev ? pool->dev->slot_name : NULL,
pool->name, dma);
pool->name, (unsigned long long)dma);
return;
}
memset (vaddr, POOL_POISON_BYTE, pool->size);
......
......@@ -2467,7 +2467,9 @@ struct scatterlist *scsi_alloc_sgtable(Scsi_Cmnd *SCpnt, int gfp_mask)
sgp = scsi_sg_pools + SCpnt->sglist_len;
current->flags |= PF_NOWARN;
sgl = mempool_alloc(sgp->pool, gfp_mask);
current->flags &= ~PF_NOWARN;
if (sgl) {
memset(sgl, 0, sgp->size);
return sgl;
......
......@@ -74,8 +74,10 @@ int scsi_init_io(Scsi_Cmnd *SCpnt)
SCpnt->use_sg = count;
gfp_mask = GFP_NOIO;
if (in_interrupt())
if (in_interrupt()) {
gfp_mask &= ~__GFP_WAIT;
gfp_mask |= __GFP_HIGH;
}
/*
* if sg table allocation fails, requeue request later.
......
......@@ -135,21 +135,26 @@ inline void bio_init(struct bio *bio)
**/
struct bio *bio_alloc(int gfp_mask, int nr_iovecs)
{
struct bio *bio = mempool_alloc(bio_pool, gfp_mask);
struct bio *bio;
struct bio_vec *bvl = NULL;
current->flags |= PF_NOWARN;
bio = mempool_alloc(bio_pool, gfp_mask);
if (unlikely(!bio))
return NULL;
goto out;
if (!nr_iovecs || (bvl = bvec_alloc(gfp_mask,nr_iovecs,&bio->bi_max))) {
bio_init(bio);
bio->bi_destructor = bio_destructor;
bio->bi_io_vec = bvl;
return bio;
goto out;
}
mempool_free(bio, bio_pool);
return NULL;
bio = NULL;
out:
current->flags &= ~PF_NOWARN;
return bio;
}
/**
......
......@@ -23,8 +23,6 @@
#include <asm/uaccess.h>
#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
static unsigned long max_block(struct block_device *bdev)
{
unsigned int retval = ~0U;
......
......@@ -36,6 +36,8 @@
#include <linux/buffer_head.h>
#include <asm/bitops.h>
static void invalidate_bh_lrus(void);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
/*
......@@ -389,7 +391,7 @@ asmlinkage long sys_fdatasync(unsigned int fd)
* private_lock is contended then so is mapping->page_lock).
*/
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, int unused)
__find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
{
struct inode *bd_inode = bdev->bd_inode;
struct address_space *bd_mapping = bd_inode->i_mapping;
......@@ -459,12 +461,15 @@ __find_get_block(struct block_device *bdev, sector_t block, int unused)
pass does the actual I/O. */
void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
{
invalidate_bh_lrus();
/*
* FIXME: what about destroy_dirty_buffers?
* We really want to use invalidate_inode_pages2() for
* that, but not until that's cleaned up.
*/
current->flags |= PF_INVALIDATE;
invalidate_inode_pages(bdev->bd_inode);
current->flags &= ~PF_INVALIDATE;
}
void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
......@@ -489,7 +494,6 @@ static void free_more_memory(void)
wakeup_bdflush();
try_to_free_pages(zone, GFP_NOFS, 0);
blk_run_queues();
__set_current_state(TASK_RUNNING);
yield();
}
......@@ -961,7 +965,9 @@ create_buffers(struct page * page, unsigned long size, int retry)
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
current->flags |= PF_NOWARN;
bh = alloc_buffer_head();
current->flags &= ~PF_NOWARN;
if (!bh)
goto no_grow;
......@@ -1159,7 +1165,7 @@ grow_buffers(struct block_device *bdev, unsigned long block, int size)
* attempt is failing. FIXME, perhaps?
*/
struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, int size)
__getblk_slow(struct block_device *bdev, sector_t block, int size)
{
for (;;) {
struct buffer_head * bh;
......@@ -1259,7 +1265,8 @@ void __bforget(struct buffer_head *bh)
* Reads a specified block, and returns buffer head that contains it.
* It returns NULL if the block was unreadable.
*/
struct buffer_head * __bread(struct block_device *bdev, int block, int size)
struct buffer_head *
__bread_slow(struct block_device *bdev, sector_t block, int size)
{
struct buffer_head *bh = __getblk(bdev, block, size);
......@@ -1283,6 +1290,165 @@ struct buffer_head * __bread(struct block_device *bdev, int block, int size)
return NULL;
}
/*
* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
* refcount elevated by one when they're in an LRU. A buffer can only appear
* once in a particular CPU's LRU. A single buffer can be present in multiple
* CPU's LRUs at the same time.
*
* This is a transparent caching front-end to sb_bread(), sb_getblk() and
* sb_find_get_block().
*/
#define BH_LRU_SIZE 7
static struct bh_lru {
spinlock_t lock;
struct buffer_head *bhs[BH_LRU_SIZE];
} ____cacheline_aligned_in_smp bh_lrus[NR_CPUS];
/*
* The LRU management algorithm is dopey-but-simple. Sorry.
*/
static void bh_lru_install(struct buffer_head *bh)
{
struct buffer_head *evictee = NULL;
struct bh_lru *lru;
if (bh == NULL)
return;
lru = &bh_lrus[get_cpu()];
spin_lock(&lru->lock);
if (lru->bhs[0] != bh) {
struct buffer_head *bhs[BH_LRU_SIZE];
int in;
int out = 0;
get_bh(bh);
bhs[out++] = bh;
for (in = 0; in < BH_LRU_SIZE; in++) {
struct buffer_head *bh2 = lru->bhs[in];
if (bh2 == bh) {
__brelse(bh2);
} else {
if (out >= BH_LRU_SIZE) {
BUG_ON(evictee != NULL);
evictee = bh2;
} else {
bhs[out++] = bh2;
}
}
}
while (out < BH_LRU_SIZE)
bhs[out++] = NULL;
memcpy(lru->bhs, bhs, sizeof(bhs));
}
spin_unlock(&lru->lock);
put_cpu();
if (evictee) {
touch_buffer(evictee);
__brelse(evictee);
}
}
static inline struct buffer_head *
lookup_bh(struct block_device *bdev, sector_t block, int size)
{
struct buffer_head *ret = NULL;
struct bh_lru *lru;
int i;
lru = &bh_lrus[get_cpu()];
spin_lock(&lru->lock);
for (i = 0; i < BH_LRU_SIZE; i++) {
struct buffer_head *bh = lru->bhs[i];
if (bh && bh->b_bdev == bdev &&
bh->b_blocknr == block && bh->b_size == size) {
if (i) {
while (i) {
lru->bhs[i] = lru->bhs[i - 1];
i--;
}
lru->bhs[0] = bh;
}
get_bh(bh);
ret = bh;
break;
}
}
spin_unlock(&lru->lock);
put_cpu();
return ret;
}
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, int size)
{
struct buffer_head *bh = lookup_bh(bdev, block, size);
if (bh == NULL) {
bh = __find_get_block_slow(bdev, block, size);
bh_lru_install(bh);
}
return bh;
}
EXPORT_SYMBOL(__find_get_block);
struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, int size)
{
struct buffer_head *bh = __find_get_block(bdev, block, size);
if (bh == NULL) {
bh = __getblk_slow(bdev, block, size);
bh_lru_install(bh);
}
return bh;
}
EXPORT_SYMBOL(__getblk);
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, int size)
{
struct buffer_head *bh = __getblk(bdev, block, size);
if (bh) {
if (buffer_uptodate(bh))
return bh;
__brelse(bh);
}
bh = __bread_slow(bdev, block, size);
bh_lru_install(bh);
return bh;
}
EXPORT_SYMBOL(__bread);
/*
* This is called rarely - at unmount.
*/
static void invalidate_bh_lrus(void)
{
int cpu_idx;
for (cpu_idx = 0; cpu_idx < NR_CPUS; cpu_idx++)
spin_lock(&bh_lrus[cpu_idx].lock);
for (cpu_idx = 0; cpu_idx < NR_CPUS; cpu_idx++) {
int i;
for (i = 0; i < BH_LRU_SIZE; i++) {
brelse(bh_lrus[cpu_idx].bhs[i]);
bh_lrus[cpu_idx].bhs[i] = NULL;
}
}
for (cpu_idx = 0; cpu_idx < NR_CPUS; cpu_idx++)
spin_unlock(&bh_lrus[cpu_idx].lock);
}
void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset)
{
......@@ -2306,7 +2472,8 @@ static inline int buffer_busy(struct buffer_head *bh)
(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}
static /*inline*/ int drop_buffers(struct page *page)
static inline int
drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
{
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh;
......@@ -2330,9 +2497,9 @@ static /*inline*/ int drop_buffers(struct page *page)
if (!list_empty(&bh->b_assoc_buffers))
__remove_assoc_queue(bh);
free_buffer_head(bh);
bh = next;
} while (bh != head);
*buffers_to_free = head;
__clear_page_buffers(page);
return 1;
failed:
......@@ -2342,17 +2509,20 @@ static /*inline*/ int drop_buffers(struct page *page)
int try_to_free_buffers(struct page *page)
{
struct address_space * const mapping = page->mapping;
struct buffer_head *buffers_to_free = NULL;
int ret = 0;
BUG_ON(!PageLocked(page));
if (PageWriteback(page))
return 0;
if (mapping == NULL) /* swapped-in anon page */
return drop_buffers(page);
if (mapping == NULL) { /* swapped-in anon page */
ret = drop_buffers(page, &buffers_to_free);
goto out;
}
spin_lock(&mapping->private_lock);
ret = drop_buffers(page);
ret = drop_buffers(page, &buffers_to_free);
if (ret && !PageSwapCache(page)) {
/*
* If the filesystem writes its buffers by hand (eg ext3)
......@@ -2365,6 +2535,16 @@ int try_to_free_buffers(struct page *page)
ClearPageDirty(page);
}
spin_unlock(&mapping->private_lock);
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
do {
struct buffer_head *next = bh->b_this_page;
free_buffer_head(bh);
bh = next;
} while (bh != buffers_to_free);
}
return ret;
}
EXPORT_SYMBOL(try_to_free_buffers);
......@@ -2435,6 +2615,9 @@ void __init buffer_init(void)
{
int i;
for (i = 0; i < NR_CPUS; i++)
spin_lock_init(&bh_lrus[i].lock);
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head), 0,
SLAB_HWCACHE_ALIGN, init_buffer_head, NULL);
......
......@@ -30,8 +30,7 @@
*
* The file system contains group descriptors which are located after the
* super block. Each descriptor contains the number of the bitmap block and
* the free blocks count in the block. The descriptors are loaded in memory
* when a file system is mounted (see ext2_read_super).
* the free blocks count in the block.
*/
......@@ -41,8 +40,8 @@
*
* Return buffer_head of bitmap on success or NULL.
*/
static struct buffer_head *read_inode_bitmap (struct super_block * sb,
unsigned long block_group)
static struct buffer_head *
read_inode_bitmap(struct super_block * sb, unsigned long block_group)
{
struct ext2_group_desc *desc;
struct buffer_head *bh = NULL;
......@@ -53,7 +52,7 @@ static struct buffer_head *read_inode_bitmap (struct super_block * sb,
bh = sb_bread(sb, le32_to_cpu(desc->bg_inode_bitmap));
if (!bh)
ext2_error (sb, "read_inode_bitmap",
ext2_error(sb, "read_inode_bitmap",
"Cannot read inode bitmap - "
"block_group = %lu, inode_bitmap = %lu",
block_group, (unsigned long) desc->bg_inode_bitmap);
......@@ -61,75 +60,6 @@ static struct buffer_head *read_inode_bitmap (struct super_block * sb,
return bh;
}
/*
* load_inode_bitmap loads the inode bitmap for a blocks group
*
* It maintains a cache for the last bitmaps loaded. This cache is managed
* with a LRU algorithm.
*
* Notes:
* 1/ There is one cache per mounted file system.
* 2/ If the file system contains less than EXT2_MAX_GROUP_LOADED groups,
* this function reads the bitmap without maintaining a LRU cache.
*
* Return the buffer_head of the bitmap or the ERR_PTR(error)
*/
static struct buffer_head *load_inode_bitmap (struct super_block * sb,
unsigned int block_group)
{
int i, slot = 0;
struct ext2_sb_info *sbi = EXT2_SB(sb);
struct buffer_head *bh = sbi->s_inode_bitmap[0];
if (block_group >= sbi->s_groups_count)
ext2_panic (sb, "load_inode_bitmap",
"block_group >= groups_count - "
"block_group = %d, groups_count = %lu",
block_group, sbi->s_groups_count);
if (sbi->s_loaded_inode_bitmaps > 0 &&
sbi->s_inode_bitmap_number[0] == block_group && bh)
goto found;
if (sbi->s_groups_count <= EXT2_MAX_GROUP_LOADED) {
slot = block_group;
bh = sbi->s_inode_bitmap[slot];
if (!bh)
goto read_it;
if (sbi->s_inode_bitmap_number[slot] == slot)
goto found;
ext2_panic (sb, "load_inode_bitmap",
"block_group != inode_bitmap_number");
}
bh = NULL;
for (i = 0; i < sbi->s_loaded_inode_bitmaps &&
sbi->s_inode_bitmap_number[i] != block_group;
i++)
;
if (i < sbi->s_loaded_inode_bitmaps)
bh = sbi->s_inode_bitmap[i];
else if (sbi->s_loaded_inode_bitmaps < EXT2_MAX_GROUP_LOADED)
sbi->s_loaded_inode_bitmaps++;
else
brelse (sbi->s_inode_bitmap[--i]);
while (i--) {
sbi->s_inode_bitmap_number[i+1] = sbi->s_inode_bitmap_number[i];
sbi->s_inode_bitmap[i+1] = sbi->s_inode_bitmap[i];
}
read_it:
if (!bh)
bh = read_inode_bitmap (sb, block_group);
sbi->s_inode_bitmap_number[slot] = block_group;
sbi->s_inode_bitmap[slot] = bh;
if (!bh)
return ERR_PTR(-EIO);
found:
return bh;
}
/*
* NOTE! When we get the inode, we're the only people
* that have access to it, and as such there are no
......@@ -151,8 +81,8 @@ void ext2_free_inode (struct inode * inode)
struct super_block * sb = inode->i_sb;
int is_directory;
unsigned long ino;
struct buffer_head * bh;
struct buffer_head * bh2;
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *bh2;
unsigned long block_group;
unsigned long bit;
struct ext2_group_desc * desc;
......@@ -186,12 +116,13 @@ void ext2_free_inode (struct inode * inode)
}
block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb);
bh = load_inode_bitmap (sb, block_group);
if (IS_ERR(bh))
brelse(bitmap_bh);
bitmap_bh = read_inode_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
/* Ok, now we can actually update the inode bitmaps.. */
if (!ext2_clear_bit (bit, bh->b_data))
if (!ext2_clear_bit(bit, bitmap_bh->b_data))
ext2_error (sb, "ext2_free_inode",
"bit already cleared for inode %lu", ino);
else {
......@@ -208,13 +139,14 @@ void ext2_free_inode (struct inode * inode)
cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
}
mark_buffer_dirty(bh);
mark_buffer_dirty(bitmap_bh);
if (sb->s_flags & MS_SYNCHRONOUS) {
ll_rw_block (WRITE, 1, &bh);
wait_on_buffer (bh);
ll_rw_block(WRITE, 1, &bitmap_bh);
wait_on_buffer(bitmap_bh);
}
sb->s_dirt = 1;
error_return:
brelse(bitmap_bh);
unlock_super (sb);
}
......@@ -351,9 +283,9 @@ static int find_group_other(struct super_block *sb, int parent_group)
struct inode * ext2_new_inode(struct inode * dir, int mode)
{
struct super_block * sb;
struct buffer_head * bh;
struct buffer_head * bh2;
struct super_block *sb;
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *bh2;
int group, i;
ino_t ino;
struct inode * inode;
......@@ -361,6 +293,7 @@ struct inode * ext2_new_inode(struct inode * dir, int mode)
struct ext2_super_block * es;
struct ext2_inode_info *ei;
int err;
struct inode *ret;
sb = dir->i_sb;
inode = new_inode(sb);
......@@ -381,20 +314,21 @@ struct inode * ext2_new_inode(struct inode * dir, int mode)
goto fail;
err = -EIO;
bh = load_inode_bitmap (sb, group);
if (IS_ERR(bh))
brelse(bitmap_bh);
bitmap_bh = read_inode_bitmap(sb, group);
if (!bitmap_bh)
goto fail2;
i = ext2_find_first_zero_bit ((unsigned long *) bh->b_data,
i = ext2_find_first_zero_bit((unsigned long *)bitmap_bh->b_data,
EXT2_INODES_PER_GROUP(sb));
if (i >= EXT2_INODES_PER_GROUP(sb))
goto bad_count;
ext2_set_bit (i, bh->b_data);
ext2_set_bit(i, bitmap_bh->b_data);
mark_buffer_dirty(bh);
mark_buffer_dirty(bitmap_bh);
if (sb->s_flags & MS_SYNCHRONOUS) {
ll_rw_block (WRITE, 1, &bh);
wait_on_buffer (bh);
ll_rw_block(WRITE, 1, &bitmap_bh);
wait_on_buffer(bitmap_bh);
}
ino = group * EXT2_INODES_PER_GROUP(sb) + i + 1;
......@@ -452,17 +386,19 @@ struct inode * ext2_new_inode(struct inode * dir, int mode)
insert_inode_hash(inode);
mark_inode_dirty(inode);
unlock_super (sb);
unlock_super(sb);
ret = inode;
if(DQUOT_ALLOC_INODE(inode)) {
DQUOT_DROP(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
iput(inode);
return ERR_PTR(-EDQUOT);
ret = ERR_PTR(-EDQUOT);
} else {
ext2_debug("allocating inode %lu\n", inode->i_ino);
ext2_preread_inode(inode);
}
ext2_debug ("allocating inode %lu\n", inode->i_ino);
ext2_preread_inode(inode);
return inode;
goto out;
fail2:
desc = ext2_get_group_desc (sb, group, &bh2);
......@@ -476,7 +412,8 @@ struct inode * ext2_new_inode(struct inode * dir, int mode)
unlock_super(sb);
make_bad_inode(inode);
iput(inode);
return ERR_PTR(err);
ret = ERR_PTR(err);
goto out;
bad_count:
ext2_error (sb, "ext2_new_inode",
......@@ -491,6 +428,9 @@ struct inode * ext2_new_inode(struct inode * dir, int mode)
desc->bg_free_inodes_count = 0;
mark_buffer_dirty(bh2);
goto repeat;
out:
brelse(bitmap_bh);
return ret;
}
unsigned long ext2_count_free_inodes (struct super_block * sb)
......@@ -498,30 +438,33 @@ unsigned long ext2_count_free_inodes (struct super_block * sb)
#ifdef EXT2FS_DEBUG
struct ext2_super_block * es;
unsigned long desc_count = 0, bitmap_count = 0;
struct buffer_head *bitmap_bh = NULL;
int i;
lock_super (sb);
es = EXT2_SB(sb)->s_es;
for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
struct ext2_group_desc *desc = ext2_get_group_desc (sb, i, NULL);
struct buffer_head *bh;
struct ext2_group_desc *desc;
unsigned x;
desc = ext2_get_group_desc (sb, i, NULL);
if (!desc)
continue;
desc_count += le16_to_cpu(desc->bg_free_inodes_count);
bh = load_inode_bitmap (sb, i);
if (IS_ERR(bh))
brelse(bitmap_bh);
bitmap_bh = read_inode_bitmap(sb, i);
if (!bitmap_bh)
continue;
x = ext2_count_free (bh, EXT2_INODES_PER_GROUP(sb) / 8);
x = ext2_count_free(bitmap_bh, EXT2_INODES_PER_GROUP(sb) / 8);
printk ("group %d: stored = %d, counted = %lu\n",
i, le16_to_cpu(desc->bg_free_inodes_count), x);
bitmap_count += x;
}
brelse(bitmap_bh);
printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
unlock_super (sb);
unlock_super(sb);
return desc_count;
#else
return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_inodes_count);
......@@ -534,21 +477,23 @@ void ext2_check_inodes_bitmap (struct super_block * sb)
{
struct ext2_super_block * es = EXT2_SB(sb)->s_es;
unsigned long desc_count = 0, bitmap_count = 0;
struct buffer_head *bitmap_bh = NULL;
int i;
for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
struct ext2_group_desc *desc = ext2_get_group_desc(sb, i, NULL);
struct buffer_head *bh;
struct ext2_group_desc *desc;
unsigned x;
desc = ext2_get_group_desc(sb, i, NULL);
if (!desc)
continue;
desc_count += le16_to_cpu(desc->bg_free_inodes_count);
bh = load_inode_bitmap (sb, i);
if (IS_ERR(bh))
brelse(bitmap_bh);
bitmap_bh = read_inode_bitmap(sb, i);
if (!bitmap_bh)
continue;
x = ext2_count_free (bh, EXT2_INODES_PER_GROUP(sb) / 8);
x = ext2_count_free(bitmap_bh, EXT2_INODES_PER_GROUP(sb) / 8);
if (le16_to_cpu(desc->bg_free_inodes_count) != x)
ext2_error (sb, "ext2_check_inodes_bitmap",
"Wrong free inodes count in group %d, "
......@@ -556,8 +501,9 @@ void ext2_check_inodes_bitmap (struct super_block * sb)
le16_to_cpu(desc->bg_free_inodes_count), x);
bitmap_count += x;
}
brelse(bitmap_bh);
if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count)
ext2_error (sb, "ext2_check_inodes_bitmap",
ext2_error(sb, "ext2_check_inodes_bitmap",
"Wrong free inodes count in super block, "
"stored = %lu, counted = %lu",
(unsigned long)le32_to_cpu(es->s_free_inodes_count),
......
......@@ -142,12 +142,6 @@ static void ext2_put_super (struct super_block * sb)
if (sbi->s_group_desc[i])
brelse (sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++)
if (sbi->s_inode_bitmap[i])
brelse (sbi->s_inode_bitmap[i]);
for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++)
if (sbi->s_block_bitmap[i])
brelse (sbi->s_block_bitmap[i]);
brelse (sbi->s_sbh);
sb->u.generic_sbp = NULL;
kfree(sbi);
......@@ -686,14 +680,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
db_count = i;
goto failed_mount2;
}
for (i = 0; i < EXT2_MAX_GROUP_LOADED; i++) {
sbi->s_inode_bitmap_number[i] = 0;
sbi->s_inode_bitmap[i] = NULL;
sbi->s_block_bitmap_number[i] = 0;
sbi->s_block_bitmap[i] = NULL;
}
sbi->s_loaded_inode_bitmaps = 0;
sbi->s_loaded_block_bitmaps = 0;
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
/*
......
......@@ -1632,8 +1632,10 @@ ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
}
ext3_mark_inode_dirty(handle, inode);
ext3_journal_test_restart(handle, inode);
BUFFER_TRACE(bh, "get_write_access");
ext3_journal_get_write_access(handle, bh);
if (bh) {
BUFFER_TRACE(bh, "retaking write access");
ext3_journal_get_write_access(handle, bh);
}
}
/*
......
......@@ -417,10 +417,6 @@ void ext3_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
brelse(sbi->s_inode_bitmap[i]);
for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
brelse(sbi->s_block_bitmap[i]);
brelse(sbi->s_sbh);
/* Debugging code just in case the in-memory inode orphan list
......@@ -1150,14 +1146,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n");
goto failed_mount2;
}
for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) {
sbi->s_inode_bitmap_number[i] = 0;
sbi->s_inode_bitmap[i] = NULL;
sbi->s_block_bitmap_number[i] = 0;
sbi->s_block_bitmap[i] = NULL;
}
sbi->s_loaded_inode_bitmaps = 0;
sbi->s_loaded_block_bitmaps = 0;
sbi->s_gdb_count = db_count;
/*
* set up enough so that it can read an inode
......
......@@ -245,6 +245,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
}
if (arg & O_DIRECT) {
if (inode->i_mapping && inode->i_mapping->a_ops) {
if (!inode->i_mapping->a_ops->direct_IO)
return -EINVAL;
}
/*
* alloc_kiovec() can sleep and we are only serialized by
* the big kernel lock here, so abuse the i_sem to serialize
......
......@@ -220,8 +220,7 @@ static void sync_sb_inodes(struct super_block *sb, int sync_mode,
struct list_head *head;
const unsigned long start = jiffies; /* livelock avoidance */
list_splice(&sb->s_dirty, &sb->s_io);
INIT_LIST_HEAD(&sb->s_dirty);
list_splice_init(&sb->s_dirty, &sb->s_io);
head = &sb->s_io;
while ((tmp = head->prev) != head) {
struct inode *inode = list_entry(tmp, struct inode, i_list);
......@@ -262,13 +261,10 @@ static void sync_sb_inodes(struct super_block *sb, int sync_mode,
break;
}
out:
if (!list_empty(&sb->s_io)) {
/*
* Put the rest back, in the correct order.
*/
list_splice(&sb->s_io, sb->s_dirty.prev);
INIT_LIST_HEAD(&sb->s_io);
}
/*
* Put the rest back, in the correct order.
*/
list_splice_init(&sb->s_io, sb->s_dirty.prev);
return;
}
......@@ -287,8 +283,9 @@ static void sync_sb_inodes(struct super_block *sb, int sync_mode,
*
* This is a "memory cleansing" operation, not a "data integrity" operation.
*/
void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
unsigned long *older_than_this)
void writeback_unlocked_inodes(int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this)
{
struct super_block *sb;
......
......@@ -592,7 +592,8 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
J_ASSERT (transaction->t_log_list == NULL);
J_ASSERT (transaction->t_checkpoint_list == NULL);
J_ASSERT (transaction->t_updates == 0);
J_ASSERT (list_empty(&transaction->t_jcb));
J_ASSERT (transaction->t_journal->j_committing_transaction !=
transaction);
......
......@@ -471,7 +471,7 @@ void journal_commit_transaction(journal_t *journal)
transaction's t_log_list queue, and metadata buffers are on
the t_iobuf_list queue.
Wait for the transactions in reverse order. That way we are
Wait for the buffers in reverse order. That way we are
less likely to be woken up until all IOs have completed, and
so we incur less scheduling load.
*/
......@@ -563,8 +563,10 @@ void journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD: commit phase 6\n");
if (is_journal_aborted(journal))
if (is_journal_aborted(journal)) {
unlock_journal(journal);
goto skip_commit;
}
/* Done it all: now write the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
......@@ -574,9 +576,10 @@ void journal_commit_transaction(journal_t *journal)
descriptor = journal_get_descriptor_buffer(journal);
if (!descriptor) {
__journal_abort_hard(journal);
unlock_journal(journal);
goto skip_commit;
}
/* AKPM: buglet - add `i' to tmp! */
for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
journal_header_t *tmp =
......@@ -596,14 +599,32 @@ void journal_commit_transaction(journal_t *journal)
__brelse(bh); /* One for getblk() */
journal_unlock_journal_head(descriptor);
}
lock_journal(journal);
/* End of a transaction! Finally, we can do checkpoint
processing: any buffers committed as a result of this
transaction can be removed from any checkpoint list it was on
before. */
skip_commit:
skip_commit: /* The journal should be unlocked by now. */
/* Call any callbacks that had been registered for handles in this
* transaction. It is up to the callback to free any allocated
* memory.
*/
if (!list_empty(&commit_transaction->t_jcb)) {
struct list_head *p, *n;
int error = is_journal_aborted(journal);
list_for_each_safe(p, n, &commit_transaction->t_jcb) {
struct journal_callback *jcb;
jcb = list_entry(p, struct journal_callback, jcb_list);
list_del(p);
jcb->jcb_func(jcb, error);
}
}
lock_journal(journal);
jbd_debug(3, "JBD: commit phase 7\n");
......
......@@ -58,6 +58,7 @@ EXPORT_SYMBOL(journal_sync_buffer);
#endif
EXPORT_SYMBOL(journal_flush);
EXPORT_SYMBOL(journal_revoke);
EXPORT_SYMBOL(journal_callback_set);
EXPORT_SYMBOL(journal_init_dev);
EXPORT_SYMBOL(journal_init_inode);
......
......@@ -57,6 +57,7 @@ static transaction_t * get_transaction (journal_t * journal, int is_try)
transaction->t_state = T_RUNNING;
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
INIT_LIST_HEAD(&transaction->t_jcb);
/* Set up the commit timer for the new transaction. */
J_ASSERT (!journal->j_commit_timer_active);
......@@ -90,7 +91,14 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
transaction_t *transaction;
int needed;
int nblocks = handle->h_buffer_credits;
if (nblocks > journal->j_max_transaction_buffers) {
printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
current->comm, nblocks,
journal->j_max_transaction_buffers);
return -ENOSPC;
}
jbd_debug(3, "New handle %p going live.\n", handle);
repeat:
......@@ -200,6 +208,20 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
return 0;
}
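With the new credit check, a caller that asks for more buffer credits than a
single transaction can hold gets -ENOSPC back through ERR_PTR().  A minimal
caller sketch (illustrative only, not from this patch):
	handle_t *handle = journal_start(journal, nblocks);

	if (IS_ERR(handle))
		return PTR_ERR(handle);	/* -ENOSPC if nblocks is too big, -ENOMEM, ... */
	/* ... journalled metadata updates ... */
	return journal_stop(handle);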
/* Allocate a new handle. This should probably be in a slab... */
static handle_t *new_handle(int nblocks)
{
handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
if (!handle)
return NULL;
memset(handle, 0, sizeof (handle_t));
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
INIT_LIST_HEAD(&handle->h_jcb);
return handle;
}
/*
* Obtain a new handle.
*
......@@ -226,14 +248,11 @@ handle_t *journal_start(journal_t *journal, int nblocks)
handle->h_ref++;
return handle;
}
handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
memset (handle, 0, sizeof (handle_t));
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
current->journal_info = handle;
err = start_this_handle(journal, handle);
......@@ -332,14 +351,11 @@ handle_t *journal_try_start(journal_t *journal, int nblocks)
if (is_journal_aborted(journal))
return ERR_PTR(-EIO);
handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
memset (handle, 0, sizeof (handle_t));
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
current->journal_info = handle;
err = try_start_this_handle(journal, handle);
......@@ -1347,6 +1363,28 @@ void journal_sync_buffer(struct buffer_head *bh)
}
#endif
/*
* Register a callback function for this handle. The function will be
* called when the transaction that this handle is part of has been
* committed to disk with the original callback data struct and the
* error status of the journal as parameters. There is no guarantee of
* ordering between handles within a single transaction, nor between
* callbacks registered on the same handle.
*
* The caller is responsible for allocating the journal_callback struct.
* This is to allow the caller to add as much extra data to the callback
* as needed, but reduce the overhead of multiple allocations. The caller
* allocated struct must start with a struct journal_callback at offset 0,
* and has the caller-specific data afterwards.
*/
void journal_callback_set(handle_t *handle,
void (*func)(struct journal_callback *jcb, int error),
struct journal_callback *jcb)
{
list_add_tail(&jcb->jcb_list, &handle->h_jcb);
jcb->jcb_func = func;
}
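An illustrative caller (not part of this patch; the struct and function names
are made up), showing the "journal_callback at offset 0" layout which the
comment above requires:
	struct my_txn_cb {
		struct journal_callback jcb;	/* must be the first member */
		struct inode *inode;		/* caller-private data follows */
	};

	static void my_txn_done(struct journal_callback *jcb, int error)
	{
		struct my_txn_cb *cb = (struct my_txn_cb *)jcb;

		/* The transaction holding the handle has committed; error is
		 * the journal's abort status.  The caller owns the memory. */
		kfree(cb);
	}

	/* somewhere with an open handle: */
	struct my_txn_cb *cb = kmalloc(sizeof(*cb), GFP_NOFS);
	if (cb) {
		cb->inode = inode;
		journal_callback_set(handle, my_txn_done, &cb->jcb);
	}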
/*
* All done for a particular handle.
*
......@@ -1411,7 +1449,10 @@ int journal_stop(handle_t *handle)
wake_up(&journal->j_wait_transaction_locked);
}
/*
/* Move callbacks from the handle to the transaction. */
list_splice(&handle->h_jcb, &transaction->t_jcb);
/*
* If the handle is marked SYNC, we need to set another commit
* going! We also want to force a commit if the current
* transaction is occupying too much of the log, or if the
......
......@@ -2975,10 +2975,7 @@ int jfs_sync(void)
}
}
/* Add anon_list2 back to anon_list */
if (!list_empty(&TxAnchor.anon_list2)) {
list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list);
INIT_LIST_HEAD(&TxAnchor.anon_list2);
}
list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
add_wait_queue(&jfs_sync_thread_wait, &wq);
set_current_state(TASK_INTERRUPTIBLE);
TXN_UNLOCK();
......
......@@ -431,7 +431,7 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
unsigned nr_bvecs = MPAGE_BIO_MAX_SIZE / PAGE_CACHE_SIZE;
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
nr_bvecs, GFP_NOFS);
nr_bvecs, GFP_NOFS|__GFP_HIGH);
if (bio == NULL)
goto confused;
}
......@@ -475,9 +475,44 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
return bio;
}
/*
* This is a cut-n-paste of generic_writepages(). We _could_
* generalise that function. It'd get a bit messy. We'll see.
/**
* mpage_writepages - walk the list of dirty pages of the given
* address space and writepage() all of them.
*
* @mapping: address space structure to write
* @nr_to_write: subtract the number of written pages from *@nr_to_write
* @get_block: the filesystem's block mapper function.
* If this is NULL then use a_ops->writepage. Otherwise, go
* direct-to-BIO.
*
* This is a library function, which implements the writepages()
* address_space_operation.
*
* (The next two paragraphs refer to code which isn't here yet, but they
* explain the presence of address_space.io_pages)
*
* Pages can be moved from clean_pages or locked_pages onto dirty_pages
* at any time - it's not possible to lock against that. So pages which
* have already been added to a BIO may magically reappear on the dirty_pages
* list. And generic_writepages() will again try to lock those pages.
* But I/O has not yet been started against the page. Thus deadlock.
*
* To avoid this, the entire contents of the dirty_pages list are moved
* onto io_pages up-front. We then walk io_pages, locking the
* pages and submitting them for I/O, moving them to locked_pages.
*
* This has the added benefit of preventing a livelock which would otherwise
* occur if pages are being dirtied faster than we can write them out.
*
* If a page is already under I/O, generic_writepages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
* and msync() need to guarantee that all the data which was dirty at the time
* the call was made get new I/O started against them. The way to do this is
* to run filemap_fdatawait() before calling filemap_fdatawrite().
*
* It's fairly rare for PageWriteback pages to be on ->dirty_pages. It
* means that someone redirtied the page while it was under I/O.
*/
int
mpage_writepages(struct address_space *mapping,
......@@ -487,11 +522,15 @@ mpage_writepages(struct address_space *mapping,
sector_t last_block_in_bio = 0;
int ret = 0;
int done = 0;
int (*writepage)(struct page *);
writepage = NULL;
if (get_block == NULL)
writepage = mapping->a_ops->writepage;
write_lock(&mapping->page_lock);
list_splice(&mapping->dirty_pages, &mapping->io_pages);
INIT_LIST_HEAD(&mapping->dirty_pages);
list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
while (!list_empty(&mapping->io_pages) && !done) {
struct page *page = list_entry(mapping->io_pages.prev,
......@@ -516,8 +555,8 @@ mpage_writepages(struct address_space *mapping,
lock_page(page);
if (page->mapping && TestClearPageDirty(page) &&
!PageWriteback(page)) {
if (page->mapping && !PageWriteback(page) &&
TestClearPageDirty(page)) {
/* FIXME: batch this up */
if (!PageActive(page) && PageLRU(page)) {
spin_lock(&pagemap_lru_lock);
......@@ -527,8 +566,13 @@ mpage_writepages(struct address_space *mapping,
}
spin_unlock(&pagemap_lru_lock);
}
bio = mpage_writepage(bio, page, get_block,
&last_block_in_bio, &ret);
if (writepage) {
ret = (*writepage)(page);
} else {
bio = mpage_writepage(bio, page, get_block,
&last_block_in_bio, &ret);
}
if (ret || (nr_to_write && --(*nr_to_write) <= 0))
done = 1;
} else {
......@@ -538,13 +582,10 @@ mpage_writepages(struct address_space *mapping,
page_cache_release(page);
write_lock(&mapping->page_lock);
}
if (!list_empty(&mapping->io_pages)) {
/*
* Put the rest back, in the correct order.
*/
list_splice(&mapping->io_pages, mapping->dirty_pages.prev);
INIT_LIST_HEAD(&mapping->io_pages);
}
/*
* Put the rest back, in the correct order.
*/
list_splice_init(&mapping->io_pages, mapping->dirty_pages.prev);
write_unlock(&mapping->page_lock);
if (bio)
mpage_bio_submit(WRITE, bio);
......
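A sketch of how a filesystem wires the mpage_writepages() documented above
into its writepages() operation (illustrative; assumes the current
three-argument form and an ext2-style get_block):
	static int ext2_writepages(struct address_space *mapping, int *nr_to_write)
	{
		/* go direct-to-BIO via the filesystem's block mapper */
		return mpage_writepages(mapping, nr_to_write, ext2_get_block);
	}
This is then pointed to by the address_space_operations of the filesystem's
data mappings; passing a NULL get_block instead makes mpage_writepages() fall
back to a_ops->writepage, as the new code above shows.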
......@@ -1110,8 +1110,7 @@ nfs_commit_rpcsetup(struct list_head *head, struct nfs_write_data *data)
/* Set up the RPC argument and reply structs
* NB: take care not to mess about with data->commit et al. */
list_splice(head, &data->pages);
INIT_LIST_HEAD(head);
list_splice_init(head, &data->pages);
first = nfs_list_entry(data->pages.next);
last = nfs_list_entry(data->pages.prev);
inode = first->wb_inode;
......
......@@ -665,6 +665,14 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
}
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
/* NB: we're sure to have correct a_ops only after f_op->open */
if (f->f_flags & O_DIRECT) {
error = -EINVAL;
if (inode->i_mapping && inode->i_mapping->a_ops)
if (!inode->i_mapping->a_ops->direct_IO)
goto cleanup_all;
}
return f;
cleanup_all:
......
......@@ -164,7 +164,7 @@ struct buffer_head *__find_get_block(struct block_device *, sector_t, int);
struct buffer_head * __getblk(struct block_device *, sector_t, int);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
struct buffer_head * __bread(struct block_device *, int, int);
struct buffer_head *__bread(struct block_device *, sector_t block, int size);
void wakeup_bdflush(void);
struct buffer_head *alloc_buffer_head(void);
void free_buffer_head(struct buffer_head * bh);
......@@ -201,9 +201,9 @@ int generic_osync_inode(struct inode *, int);
* inline definitions
*/
static inline void get_bh(struct buffer_head * bh)
static inline void get_bh(struct buffer_head *bh)
{
atomic_inc(&(bh)->b_count);
atomic_inc(&bh->b_count);
}
static inline void put_bh(struct buffer_head *bh)
......@@ -212,68 +212,49 @@ static inline void put_bh(struct buffer_head *bh)
atomic_dec(&bh->b_count);
}
/*
* If an error happens during the make_request, this function
* has to be recalled. It marks the buffer as clean and not
uptodate, and it notifies the upper layer about the end
* of the I/O.
*/
static inline void buffer_IO_error(struct buffer_head * bh)
{
clear_buffer_dirty(bh);
/*
* b_end_io has to clear the BH_Uptodate bitflag in the read error
* case, however buffer contents are not necessarily bad if a
* write fails
*/
bh->b_end_io(bh, buffer_uptodate(bh));
}
static inline void brelse(struct buffer_head *buf)
static inline void brelse(struct buffer_head *bh)
{
if (buf)
__brelse(buf);
if (bh)
__brelse(bh);
}
static inline void bforget(struct buffer_head *buf)
static inline void bforget(struct buffer_head *bh)
{
if (buf)
__bforget(buf);
if (bh)
__bforget(bh);
}
static inline struct buffer_head * sb_bread(struct super_block *sb, int block)
static inline struct buffer_head *sb_bread(struct super_block *sb, sector_t block)
{
return __bread(sb->s_bdev, block, sb->s_blocksize);
}
static inline struct buffer_head * sb_getblk(struct super_block *sb, int block)
static inline struct buffer_head *sb_getblk(struct super_block *sb, sector_t block)
{
return __getblk(sb->s_bdev, block, sb->s_blocksize);
}
static inline struct buffer_head *
sb_find_get_block(struct super_block *sb, int block)
sb_find_get_block(struct super_block *sb, sector_t block)
{
return __find_get_block(sb->s_bdev, block, sb->s_blocksize);
}
static inline void
map_bh(struct buffer_head *bh, struct super_block *sb, int block)
map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
{
set_buffer_mapped(bh);
bh->b_bdev = sb->s_bdev;
bh->b_blocknr = block;
}
static inline void wait_on_buffer(struct buffer_head * bh)
static inline void wait_on_buffer(struct buffer_head *bh)
{
if (buffer_locked(bh))
__wait_on_buffer(bh);
}
static inline void lock_buffer(struct buffer_head * bh)
static inline void lock_buffer(struct buffer_head *bh)
{
while (test_set_buffer_locked(bh))
__wait_on_buffer(bh);
......
......@@ -16,14 +16,6 @@
#ifndef _LINUX_EXT2_FS_SB
#define _LINUX_EXT2_FS_SB
/*
* The following is not needed anymore since the descriptors buffer
* heads are now dynamically allocated
*/
/* #define EXT2_MAX_GROUP_DESC 8 */
#define EXT2_MAX_GROUP_LOADED 8
/*
* second extended-fs super-block data in memory
*/
......@@ -41,12 +33,6 @@ struct ext2_sb_info {
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext2_super_block * s_es; /* Pointer to the super block in the buffer */
struct buffer_head ** s_group_desc;
unsigned short s_loaded_inode_bitmaps;
unsigned short s_loaded_block_bitmaps;
unsigned long s_inode_bitmap_number[EXT2_MAX_GROUP_LOADED];
struct buffer_head * s_inode_bitmap[EXT2_MAX_GROUP_LOADED];
unsigned long s_block_bitmap_number[EXT2_MAX_GROUP_LOADED];
struct buffer_head * s_block_bitmap[EXT2_MAX_GROUP_LOADED];
unsigned long s_mount_opt;
uid_t s_resuid;
gid_t s_resgid;
......
......@@ -21,14 +21,6 @@
#include <linux/wait.h>
#endif
/*
* The following is not needed anymore since the descriptors buffer
* heads are now dynamically allocated
*/
/* #define EXT3_MAX_GROUP_DESC 8 */
#define EXT3_MAX_GROUP_LOADED 8
/*
* third extended-fs super-block data in memory
*/
......@@ -46,12 +38,6 @@ struct ext3_sb_info {
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */
struct buffer_head ** s_group_desc;
unsigned short s_loaded_inode_bitmaps;
unsigned short s_loaded_block_bitmaps;
unsigned long s_inode_bitmap_number[EXT3_MAX_GROUP_LOADED];
struct buffer_head * s_inode_bitmap[EXT3_MAX_GROUP_LOADED];
unsigned long s_block_bitmap_number[EXT3_MAX_GROUP_LOADED];
struct buffer_head * s_block_bitmap[EXT3_MAX_GROUP_LOADED];
unsigned long s_mount_opt;
uid_t s_resuid;
gid_t s_resgid;
......
......@@ -18,14 +18,14 @@
#define __GFP_HIGHIO 0x80 /* Can start high mem physical IO? */
#define __GFP_FS 0x100 /* Can call down to low-level FS? */
#define GFP_NOHIGHIO (__GFP_HIGH | __GFP_WAIT | __GFP_IO)
#define GFP_NOIO (__GFP_HIGH | __GFP_WAIT)
#define GFP_NOFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO)
#define GFP_NOHIGHIO ( __GFP_WAIT | __GFP_IO)
#define GFP_NOIO ( __GFP_WAIT)
#define GFP_NOFS ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO)
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_USER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
#define GFP_HIGHUSER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS | __GFP_HIGHMEM)
#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
#define GFP_KERNEL ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
#define GFP_NFS ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
#define GFP_KSWAPD ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
......
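GFP_NOIO, GFP_NOFS, GFP_NOHIGHIO and GFP_KERNEL no longer imply __GFP_HIGH, so
callers which really need the emergency pools must now say so explicitly.  The
scsi_lib.c and scsi_merge.c hunks earlier in this merge are the pattern,
pulled together here as an illustrative fragment (sgp/sgl as in those hunks):

	int gfp_mask = GFP_NOIO;		/* no longer includes __GFP_HIGH */

	if (in_interrupt()) {
		gfp_mask &= ~__GFP_WAIT;	/* cannot sleep here */
		gfp_mask |= __GFP_HIGH;		/* may dip into emergency reserves */
	}
	sgl = mempool_alloc(sgp->pool, gfp_mask);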
......@@ -250,6 +250,13 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh)
return bh->b_private;
}
#define HAVE_JOURNAL_CALLBACK_STATUS
struct journal_callback {
struct list_head jcb_list;
void (*jcb_func)(struct journal_callback *jcb, int error);
/* user data goes here */
};
struct jbd_revoke_table_s;
/* The handle_t type represents a single atomic update being performed
......@@ -280,6 +287,12 @@ struct handle_s
operations */
int h_err;
/* List of application registered callbacks for this handle.
* The function(s) will be called after the transaction that
* this handle is part of has been committed to disk.
*/
struct list_head h_jcb;
/* Flags */
unsigned int h_sync: 1; /* sync-on-close */
unsigned int h_jdata: 1; /* force data journaling */
......@@ -399,6 +412,10 @@ struct transaction_s
/* How many handles used this transaction? */
int t_handle_count;
/* List of registered callback functions for this transaction.
* Called when the transaction is committed. */
struct list_head t_jcb;
};
......@@ -647,6 +664,9 @@ extern int journal_invalidatepage(journal_t *,
extern int journal_try_to_free_buffers(journal_t *, struct page *, int);
extern int journal_stop(handle_t *);
extern int journal_flush (journal_t *);
extern void journal_callback_set(handle_t *handle,
void (*fn)(struct journal_callback *,int),
struct journal_callback *jcb);
extern void journal_lock_updates (journal_t *);
extern void journal_unlock_updates (journal_t *);
......
......@@ -136,6 +136,19 @@ static inline int list_empty(list_t *head)
return head->next == head;
}
static inline void __list_splice(list_t *list, list_t *head)
{
list_t *first = list->next;
list_t *last = list->prev;
list_t *at = head->next;
first->prev = head;
head->next = first;
last->next = at;
at->prev = last;
}
/**
* list_splice - join two lists
* @list: the new list to add.
......@@ -145,15 +158,22 @@ static inline void list_splice(list_t *list, list_t *head)
{
list_t *first = list->next;
if (first != list) {
list_t *last = list->prev;
list_t *at = head->next;
first->prev = head;
head->next = first;
if (first != list)
__list_splice(list, head);
}
last->next = at;
at->prev = last;
/**
* list_splice_init - join two lists and reinitialise the emptied list.
* @list: the new list to add.
* @head: the place to add it in the first list.
*
* The list at @list is reinitialised
*/
static inline void list_splice_init(list_t *list, list_t *head)
{
if (!list_empty(list)) {
__list_splice(list, head);
INIT_LIST_HEAD(list);
}
}
......
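The new helper replaces an open-coded idiom that is converted throughout this
merge (src and dst are placeholders):

	/* before */
	list_splice(&src, &dst);
	INIT_LIST_HEAD(&src);

	/* after: splice and leave src empty but still valid */
	list_splice_init(&src, &dst);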
......@@ -305,6 +305,16 @@ static inline void set_page_zone(struct page *page, unsigned long zone_num)
#define NOPAGE_SIGBUS (NULL)
#define NOPAGE_OOM ((struct page *) (-1))
/*
* Different kinds of faults, as returned by handle_mm_fault().
* Used to decide whether a process gets delivered SIGBUS or
* just gets major/minor fault counters bumped up.
*/
#define VM_FAULT_OOM (-1)
#define VM_FAULT_SIGBUS 0
#define VM_FAULT_MINOR 1
#define VM_FAULT_MAJOR 2
/* The array of struct pages */
extern struct page *mem_map;
......
......@@ -385,12 +385,12 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */
#define PF_FREE_PAGES 0x00002000 /* per process page freeing */
#define PF_FLUSHER 0x00004000 /* responsible for disk writeback */
#define PF_RADIX_TREE 0x00008000 /* debug: performing radix tree alloc */
#define PF_NOWARN 0x00008000 /* debug: don't warn if alloc fails */
#define PF_FREEZE 0x00010000 /* this task should be frozen for suspend */
#define PF_IOTHREAD 0x00020000 /* this thread is needed for doing I/O to swap */
#define PF_FROZEN 0x00040000 /* frozen for system suspend */
#define PF_INVALIDATE 0x00080000 /* debug: unmounting an fs. killme. */
/*
* Ptrace flags
*/
......@@ -417,8 +417,7 @@ extern int task_prio(task_t *p);
extern int task_nice(task_t *p);
extern int idle_cpu(int cpu);
asmlinkage long sys_sched_yield(void);
#define yield() sys_sched_yield()
void yield(void);
/*
* The default (Linux) execution domain.
......@@ -836,10 +835,11 @@ static inline int need_resched(void)
return unlikely(test_thread_flag(TIF_NEED_RESCHED));
}
extern void __cond_resched(void);
static inline void cond_resched(void)
{
if (need_resched())
schedule();
__cond_resched();
}
/* Reevaluate whether the task has signals pending delivery.
......
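cond_resched() now calls through __cond_resched(), which sets TASK_RUNNING
before scheduling, but it still only fires when a reschedule is pending, so it
stays cheap in hot loops.  An illustrative use (struct item, process_one() and
the array are made up):

	static void scan_items(struct item *items, int nr)
	{
		int i;

		for (i = 0; i < nr; i++) {
			process_one(&items[i]);	/* potentially long per-item work */
			cond_resched();		/* yield only if a resched is pending */
		}
	}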
......@@ -30,7 +30,10 @@
struct file;
#define CTL_MAXNAME 10
#define CTL_MAXNAME 10 /* how many path components do we allow in a
call to sysctl? In other words, what is
the largest acceptable value for the nlen
member of a struct __sysctl_args to have? */
struct __sysctl_args {
int *name;
......@@ -145,6 +148,7 @@ enum
VM_DIRTY_SYNC=13, /* dirty_sync_ratio */
VM_DIRTY_WB_CS=14, /* dirty_writeback_centisecs */
VM_DIRTY_EXPIRE_CS=15, /* dirty_expire_centisecs */
VM_NR_PDFLUSH_THREADS=16, /* nr_pdflush_threads */
};
......
......@@ -24,18 +24,22 @@ static inline int current_is_pdflush(void)
/*
* fs/fs-writeback.c
*/
#define WB_SYNC_NONE 0 /* Don't wait on anything */
#define WB_SYNC_LAST 1 /* Wait on the last-written mapping */
#define WB_SYNC_ALL 2 /* Wait on every mapping */
#define WB_SYNC_HOLD 3 /* Hold the inode on sb_dirty for sys_sync() */
enum writeback_sync_modes {
WB_SYNC_NONE = 0, /* Don't wait on anything */
WB_SYNC_LAST = 1, /* Wait on the last-written mapping */
WB_SYNC_ALL = 2, /* Wait on every mapping */
WB_SYNC_HOLD = 3, /* Hold the inode on sb_dirty for sys_sync() */
};
void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
unsigned long *older_than_this);
void writeback_unlocked_inodes(int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this);
void wake_up_inode(struct inode *inode);
void __wait_on_inode(struct inode * inode);
void sync_inodes_sb(struct super_block *, int wait);
void sync_inodes(int wait);
/* writeback.h requires fs.h; it, too, is not included from here. */
static inline void wait_on_inode(struct inode *inode)
{
if (inode->i_state & I_LOCK)
......@@ -45,15 +49,22 @@ static inline void wait_on_inode(struct inode *inode)
/*
* mm/page-writeback.c
*/
/* These 5 are exported to sysctl. */
extern int dirty_background_ratio;
extern int dirty_async_ratio;
extern int dirty_sync_ratio;
extern int dirty_writeback_centisecs;
extern int dirty_expire_centisecs;
void balance_dirty_pages(struct address_space *mapping);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
int do_writepages(struct address_space *mapping, int *nr_to_write);
/* pdflush.c */
extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
read-only. */
#endif /* WRITEBACK_H */
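With sync_mode now a proper enum, a memory-cleansing caller looks roughly like
this (illustrative; passing NULL for older_than_this is assumed to mean "no
age cutoff"):

	int nr_to_write = 32;

	writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
	/* nr_to_write is decremented as pages are written */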
......@@ -196,14 +196,12 @@ EXPORT_SYMBOL(notify_change);
EXPORT_SYMBOL(set_blocksize);
EXPORT_SYMBOL(sb_set_blocksize);
EXPORT_SYMBOL(sb_min_blocksize);
EXPORT_SYMBOL(__getblk);
EXPORT_SYMBOL(cdget);
EXPORT_SYMBOL(cdput);
EXPORT_SYMBOL(bdget);
EXPORT_SYMBOL(bdput);
EXPORT_SYMBOL(bd_claim);
EXPORT_SYMBOL(bd_release);
EXPORT_SYMBOL(__bread);
EXPORT_SYMBOL(__brelse);
EXPORT_SYMBOL(__bforget);
EXPORT_SYMBOL(ll_rw_block);
......@@ -475,7 +473,8 @@ EXPORT_SYMBOL(schedule);
EXPORT_SYMBOL(preempt_schedule);
#endif
EXPORT_SYMBOL(schedule_timeout);
EXPORT_SYMBOL(sys_sched_yield);
EXPORT_SYMBOL(yield);
EXPORT_SYMBOL(__cond_resched);
EXPORT_SYMBOL(set_user_nice);
EXPORT_SYMBOL(task_nice);
EXPORT_SYMBOL_GPL(idle_cpu);
......@@ -550,7 +549,6 @@ EXPORT_SYMBOL(file_fsync);
EXPORT_SYMBOL(fsync_buffers_list);
EXPORT_SYMBOL(clear_inode);
EXPORT_SYMBOL(init_special_inode);
EXPORT_SYMBOL(__find_get_block);
EXPORT_SYMBOL(new_inode);
EXPORT_SYMBOL(__insert_inode_hash);
EXPORT_SYMBOL(remove_inode_hash);
......
......@@ -1447,6 +1447,18 @@ asmlinkage long sys_sched_yield(void)
return 0;
}
void __cond_resched(void)
{
set_current_state(TASK_RUNNING);
schedule();
}
void yield(void)
{
set_current_state(TASK_RUNNING);
sys_sched_yield();
}
asmlinkage long sys_sched_get_priority_max(int policy)
{
int ret = -EINVAL;
......
......@@ -262,7 +262,6 @@ void tasklet_kill(struct tasklet_struct *t)
printk("Attempt to kill tasklet from interrupt\n");
while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
current->state = TASK_RUNNING;
do
yield();
while (test_bit(TASKLET_STATE_SCHED, &t->state));
......
......@@ -237,8 +237,7 @@ int freeze_processes(void)
todo++;
}
read_unlock(&tasklist_lock);
sys_sched_yield();
schedule();
yield();
if (time_after(jiffies, start_time + TIMEOUT)) {
PRINTK( "\n" );
printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
......
......@@ -258,6 +258,13 @@ static ctl_table kern_table[] = {
{0}
};
/* Constants for minimum and maximum testing in vm_table.
We use these as one-element integer vectors. */
static int zero = 0;
static int one = 1;
static int one_hundred = 100;
static ctl_table vm_table[] = {
{VM_OVERCOMMIT_MEMORY, "overcommit_memory", &sysctl_overcommit_memory,
sizeof(sysctl_overcommit_memory), 0644, NULL, &proc_dointvec},
......@@ -266,18 +273,37 @@ static ctl_table vm_table[] = {
{VM_PAGE_CLUSTER, "page-cluster",
&page_cluster, sizeof(int), 0644, NULL, &proc_dointvec},
{VM_DIRTY_BACKGROUND, "dirty_background_ratio",
&dirty_background_ratio, sizeof(dirty_background_ratio),
0644, NULL, &proc_dointvec},
&dirty_background_ratio, sizeof(dirty_background_ratio),
0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL,
&zero, &one_hundred },
{VM_DIRTY_ASYNC, "dirty_async_ratio", &dirty_async_ratio,
sizeof(dirty_async_ratio), 0644, NULL, &proc_dointvec},
sizeof(dirty_async_ratio), 0644, NULL, &proc_dointvec_minmax,
&sysctl_intvec, NULL, &zero, &one_hundred },
{VM_DIRTY_SYNC, "dirty_sync_ratio", &dirty_sync_ratio,
sizeof(dirty_sync_ratio), 0644, NULL, &proc_dointvec},
sizeof(dirty_sync_ratio), 0644, NULL, &proc_dointvec_minmax,
&sysctl_intvec, NULL, &zero, &one_hundred },
{VM_DIRTY_WB_CS, "dirty_writeback_centisecs",
&dirty_writeback_centisecs, sizeof(dirty_writeback_centisecs), 0644,
NULL, &proc_dointvec},
&dirty_writeback_centisecs, sizeof(dirty_writeback_centisecs), 0644,
NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL,
/* Here, we define the range of possible values for
dirty_writeback_centisecs.
The default value is 5 seconds (500 centisec). We will use 1
centisec, the smallest possible value that could make any sort of
sense. If we allowed the user to set the interval to 0 seconds
(which would presumably mean to chew up all of the CPU looking for
dirty pages and writing them out, without taking a break), the
interval would effectively become 1 second (100 centisecs), due to
some nicely documented throttling code in wb_kupdate().
There is no maximum legal value for dirty_writeback. */
&one , NULL},
{VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs",
&dirty_expire_centisecs, sizeof(dirty_expire_centisecs), 0644,
NULL, &proc_dointvec},
&dirty_expire_centisecs, sizeof(dirty_expire_centisecs), 0644,
NULL, &proc_dointvec},
{ VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads",
&nr_pdflush_threads, sizeof nr_pdflush_threads,
0444 /* read-only*/, NULL, &proc_dointvec},
{0}
};
......
......@@ -177,8 +177,13 @@ static inline void truncate_partial_page(struct page *page, unsigned partial)
static void truncate_complete_page(struct page *page)
{
/* Leave it on the LRU if it gets converted into anonymous buffers */
if (!PagePrivate(page) || do_invalidatepage(page, 0))
if (!PagePrivate(page) || do_invalidatepage(page, 0)) {
lru_cache_del(page);
} else {
if (current->flags & PF_INVALIDATE)
printk("%s: buffer heads were leaked\n",
current->comm);
}
ClearPageDirty(page);
ClearPageUptodate(page);
remove_inode_page(page);
......@@ -362,16 +367,18 @@ static int invalidate_list_pages2(struct address_space * mapping,
while (curr != head) {
page = list_entry(curr, struct page, list);
if (PageWriteback(page)) {
write_unlock(&mapping->page_lock);
wait_on_page_writeback(page);
unlocked = 1;
write_lock(&mapping->page_lock);
goto restart;
}
if (!TestSetPageLocked(page)) {
int __unlocked;
if (PageWriteback(page)) {
write_unlock(&mapping->page_lock);
wait_on_page_writeback(page);
unlocked = 1;
write_lock(&mapping->page_lock);
unlock_page(page);
goto restart;
}
__unlocked = invalidate_this_page2(mapping, page, curr, head);
unlock_page(page);
unlocked |= __unlocked;
......@@ -510,24 +517,32 @@ int filemap_fdatawait(struct address_space * mapping)
}
/*
* This adds a page to the page cache, starting out as locked,
* owned by us, but unreferenced, not uptodate and with no errors.
* The caller must hold a write_lock on the mapping->page_lock.
* This adds a page to the page cache, starting out as locked, unreferenced,
* not uptodate and with no errors.
*
* The caller must hold a write_lock on mapping->page_lock.
*
* This function is used for two things: adding newly allocated pagecache
* pages and for moving existing anon pages into swapcache.
*
* In the case of pagecache pages, the page is new, so we can just run
* SetPageLocked() against it. The other page state flags were set by
* rmqueue()
*
* In the case of swapcache, try_to_swap_out() has already locked the page, so
* SetPageLocked() is ugly-but-OK there too. The required page state has been
* set up by swap_out_add_to_swap_cache().
*/
static int __add_to_page_cache(struct page *page,
struct address_space *mapping, unsigned long offset)
{
page_cache_get(page);
if (radix_tree_insert(&mapping->page_tree, offset, page) < 0)
goto nomem;
page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
SetPageLocked(page);
ClearPageDirty(page);
___add_to_page_cache(page, mapping, offset);
return 0;
nomem:
page_cache_release(page);
if (radix_tree_insert(&mapping->page_tree, offset, page) == 0) {
SetPageLocked(page);
ClearPageDirty(page);
___add_to_page_cache(page, mapping, offset);
page_cache_get(page);
return 0;
}
return -ENOMEM;
}
......@@ -1116,8 +1131,6 @@ static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, si
retval = -EINVAL;
if ((offset & blocksize_mask) || (count & blocksize_mask))
goto out_free;
if (!mapping->a_ops->direct_IO)
goto out_free;
/*
* Flush to disk exclusively the _data_, metadata must remain
......
......@@ -503,18 +503,18 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long
while (!(map = follow_page(mm, start, write))) {
spin_unlock(&mm->page_table_lock);
switch (handle_mm_fault(mm, vma, start, write)) {
case 1:
case VM_FAULT_MINOR:
tsk->min_flt++;
break;
case 2:
case VM_FAULT_MAJOR:
tsk->maj_flt++;
break;
case 0:
if (i) return i;
return -EFAULT;
case VM_FAULT_SIGBUS:
return i ? i : -EFAULT;
case VM_FAULT_OOM:
return i ? i : -ENOMEM;
default:
if (i) return i;
return -ENOMEM;
BUG();
}
spin_lock(&mm->page_table_lock);
}
......@@ -612,7 +612,7 @@ void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes)
page = iobuf->maplist[index];
if (!PageReserved(page))
SetPageDirty(page);
set_page_dirty(page);
remaining -= (PAGE_SIZE - offset);
offset = 0;
......@@ -968,7 +968,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
return 1; /* Minor fault */
return VM_FAULT_MINOR;
}
}
pte_unmap(page_table);
......@@ -1002,16 +1002,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
spin_unlock(&mm->page_table_lock);
page_cache_release(new_page);
page_cache_release(old_page);
return 1; /* Minor fault */
return VM_FAULT_MINOR;
bad_wp_page:
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address);
return -1;
/*
* This should really halt the system so it can be debugged or
* at least the kernel stops what it's doing before it corrupts
* data, but for the moment just pretend this is OOM.
*/
return VM_FAULT_OOM;
no_mem:
page_cache_release(old_page);
return -1;
return VM_FAULT_OOM;
}
static void vmtruncate_list(list_t *head, unsigned long pgoff)
......@@ -1135,7 +1140,7 @@ static int do_swap_page(struct mm_struct * mm,
struct page *page;
swp_entry_t entry = pte_to_swp_entry(orig_pte);
pte_t pte;
int ret = 1;
int ret = VM_FAULT_MINOR;
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
......@@ -1148,17 +1153,19 @@ static int do_swap_page(struct mm_struct * mm,
* Back out if somebody else faulted in this pte while
* we released the page table lock.
*/
int retval;
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
retval = pte_same(*page_table, orig_pte) ? -1 : 1;
if (pte_same(*page_table, orig_pte))
ret = VM_FAULT_OOM;
else
ret = VM_FAULT_MINOR;
pte_unmap(page_table);
spin_unlock(&mm->page_table_lock);
return retval;
return ret;
}
/* Had to read the page from swap area: Major fault */
ret = 2;
ret = VM_FAULT_MAJOR;
}
lock_page(page);
......@@ -1174,7 +1181,7 @@ static int do_swap_page(struct mm_struct * mm,
spin_unlock(&mm->page_table_lock);
unlock_page(page);
page_cache_release(page);
return 1;
return VM_FAULT_MINOR;
}
/* The page isn't present yet, go ahead with the fault. */
......@@ -1232,7 +1239,7 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma,
pte_unmap(page_table);
page_cache_release(page);
spin_unlock(&mm->page_table_lock);
return 1;
return VM_FAULT_MINOR;
}
mm->rss++;
flush_page_to_ram(page);
......@@ -1246,10 +1253,10 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma,
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, addr, entry);
spin_unlock(&mm->page_table_lock);
return 1; /* Minor fault */
return VM_FAULT_MINOR;
no_mem:
return -1;
return VM_FAULT_OOM;
}
/*
......@@ -1277,10 +1284,11 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
if (new_page == NULL) /* no page was available -- SIGBUS */
return 0;
/* no page was available -- either SIGBUS or OOM */
if (new_page == NOPAGE_SIGBUS)
return VM_FAULT_SIGBUS;
if (new_page == NOPAGE_OOM)
return -1;
return VM_FAULT_OOM;
/*
* Should we do an early C-O-W break?
......@@ -1289,7 +1297,7 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
struct page * page = alloc_page(GFP_HIGHUSER);
if (!page) {
page_cache_release(new_page);
return -1;
return VM_FAULT_OOM;
}
copy_user_highpage(page, new_page, address);
page_cache_release(new_page);
......@@ -1325,13 +1333,13 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
pte_unmap(page_table);
page_cache_release(new_page);
spin_unlock(&mm->page_table_lock);
return 1;
return VM_FAULT_MINOR;
}
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
spin_unlock(&mm->page_table_lock);
return 2; /* Major fault */
return VM_FAULT_MAJOR;
}
/*
......@@ -1383,7 +1391,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
establish_pte(vma, address, pte, entry);
pte_unmap(pte);
spin_unlock(&mm->page_table_lock);
return 1;
return VM_FAULT_MINOR;
}
/*
......@@ -1411,7 +1419,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
}
spin_unlock(&mm->page_table_lock);
return -1;
return VM_FAULT_OOM;
}
/*
......
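[editor's note, not part of this diff] The mm/memory.c changes above replace handle_mm_fault()'s old numeric returns with symbolic codes one-to-one (1 -> VM_FAULT_MINOR, 2 -> VM_FAULT_MAJOR, 0 -> VM_FAULT_SIGBUS, -1 -> VM_FAULT_OOM). The header defining them is not shown in this hunk, but from those substitutions the definitions presumably look something like:

/* Assumed definitions, inferred from the substitutions in this diff;
 * the actual header (likely include/linux/mm.h) is not shown here. */
#define VM_FAULT_OOM	(-1)
#define VM_FAULT_SIGBUS	0
#define VM_FAULT_MINOR	1
#define VM_FAULT_MAJOR	2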
......@@ -19,8 +19,9 @@
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/sysrq.h>
//#include <linux/sysrq.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
/*
* The maximum number of pages to writeout in a single bdflush/kupdate
......@@ -47,6 +48,8 @@
#define SYNC_WRITEBACK_PAGES 1500
/* The following parameters are exported via /proc/sys/vm */
/*
* Dirty memory thresholds, in percentages
*/
......@@ -67,15 +70,18 @@ int dirty_async_ratio = 50;
int dirty_sync_ratio = 60;
/*
* The interval between `kupdate'-style writebacks.
* The interval between `kupdate'-style writebacks, in centiseconds
* (hundredths of a second)
*/
int dirty_writeback_centisecs = 5 * 100;
/*
* The largest amount of time for which data is allowed to remain dirty
* The longest amount of time for which data is allowed to remain dirty
*/
int dirty_expire_centisecs = 30 * 100;
/* End of sysctl-exported parameters */
static void background_writeout(unsigned long _min_pages);
......@@ -233,7 +239,8 @@ static void wb_kupdate(unsigned long arg)
static void wb_timer_fn(unsigned long unused)
{
if (pdflush_operation(wb_kupdate, 0) < 0)
mod_timer(&wb_timer, jiffies + HZ);
mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
}
static int __init wb_timer_init(void)
......@@ -307,108 +314,9 @@ int generic_vm_writeback(struct page *page, int *nr_to_write)
}
EXPORT_SYMBOL(generic_vm_writeback);
/**
* generic_writepages - walk the list of dirty pages of the given
* address space and writepage() all of them.
*
* @mapping: address space structure to write
* @nr_to_write: subtract the number of written pages from *@nr_to_write
*
* This is a library function, which implements the writepages()
* address_space_operation.
*
* (The next two paragraphs refer to code which isn't here yet, but they
* explain the presence of address_space.io_pages)
*
* Pages can be moved from clean_pages or locked_pages onto dirty_pages
* at any time - it's not possible to lock against that. So pages which
* have already been added to a BIO may magically reappear on the dirty_pages
* list. And generic_writepages() will again try to lock those pages.
* But I/O has not yet been started against the page. Thus deadlock.
*
* To avoid this, the entire contents of the dirty_pages list are moved
* onto io_pages up-front. We then walk io_pages, locking the
* pages and submitting them for I/O, moving them to locked_pages.
*
* This has the added benefit of preventing a livelock which would otherwise
* occur if pages are being dirtied faster than we can write them out.
*
* If a page is already under I/O, generic_writepages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
* and msync() need to guarantee that all the data which was dirty at the time

* the call was made get new I/O started against them. The way to do this is
* to run filemap_fdatawait() before calling filemap_fdatawrite().
*
* It's fairly rare for PageWriteback pages to be on ->dirty_pages. It
* means that someone redirtied the page while it was under I/O.
*/
int generic_writepages(struct address_space *mapping, int *nr_to_write)
{
int (*writepage)(struct page *) = mapping->a_ops->writepage;
int ret = 0;
int done = 0;
int err;
write_lock(&mapping->page_lock);
list_splice(&mapping->dirty_pages, &mapping->io_pages);
INIT_LIST_HEAD(&mapping->dirty_pages);
while (!list_empty(&mapping->io_pages) && !done) {
struct page *page = list_entry(mapping->io_pages.prev,
struct page, list);
list_del(&page->list);
if (PageWriteback(page)) {
if (PageDirty(page)) {
list_add(&page->list, &mapping->dirty_pages);
continue;
}
list_add(&page->list, &mapping->locked_pages);
continue;
}
if (!PageDirty(page)) {
list_add(&page->list, &mapping->clean_pages);
continue;
}
list_add(&page->list, &mapping->locked_pages);
page_cache_get(page);
write_unlock(&mapping->page_lock);
lock_page(page);
/* It may have been removed from swapcache: check ->mapping */
if (page->mapping && TestClearPageDirty(page) &&
!PageWriteback(page)) {
/* FIXME: batch this up */
if (!PageActive(page) && PageLRU(page)) {
spin_lock(&pagemap_lru_lock);
if (!PageActive(page) && PageLRU(page)) {
list_del(&page->lru);
list_add(&page->lru, &inactive_list);
}
spin_unlock(&pagemap_lru_lock);
}
err = writepage(page);
if (!ret)
ret = err;
if (nr_to_write && --(*nr_to_write) <= 0)
done = 1;
} else {
unlock_page(page);
}
page_cache_release(page);
write_lock(&mapping->page_lock);
}
if (!list_empty(&mapping->io_pages)) {
/*
* Put the rest back, in the correct order.
*/
list_splice(&mapping->io_pages, mapping->dirty_pages.prev);
INIT_LIST_HEAD(&mapping->io_pages);
}
write_unlock(&mapping->page_lock);
return ret;
return mpage_writepages(mapping, nr_to_write, NULL);
}
EXPORT_SYMBOL(generic_writepages);
......
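[editor's note, not part of this diff] As the removed comment says, generic_writepages() implements the writepages() address_space_operation, and after this patch it is just a thin wrapper around mpage_writepages(). A hypothetical wiring for a filesystem with no special writeback needs, using the old-style initializers seen elsewhere in this tree ("foofs" is made up):

/* Hypothetical filesystem glue, assuming foofs has no writeback quirks:
 * point ->writepages at the library helper. */
static struct address_space_operations foofs_aops = {
	writepages:	generic_writepages,	/* forwards to mpage_writepages() */
};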
......@@ -86,24 +86,24 @@ static void __free_pages_ok (struct page *page, unsigned int order)
struct page *base;
zone_t *zone;
if (PagePrivate(page))
BUG();
if (page->mapping)
BUG();
if (PageLocked(page))
BUG();
if (PageLRU(page))
BUG();
if (PageActive(page))
BUG();
if (PageWriteback(page))
BUG();
ClearPageDirty(page);
page->flags &= ~(1<<PG_referenced);
if (current->flags & PF_FREE_PAGES)
goto local_freelist;
back_local_freelist:
BUG_ON(PagePrivate(page));
BUG_ON(page->mapping != NULL);
BUG_ON(PageLocked(page));
BUG_ON(PageLRU(page));
BUG_ON(PageActive(page));
BUG_ON(PageWriteback(page));
if (PageDirty(page))
ClearPageDirty(page);
BUG_ON(page_count(page) != 0);
if (unlikely(current->flags & PF_FREE_PAGES)) {
if (!current->nr_local_pages && !in_interrupt()) {
list_add(&page->list, &current->local_pages);
page->index = order;
current->nr_local_pages++;
goto out;
}
}
zone = page_zone(page);
......@@ -113,18 +113,14 @@ static void __free_pages_ok (struct page *page, unsigned int order)
if (page_idx & ~mask)
BUG();
index = page_idx >> (1 + order);
area = zone->free_area + order;
spin_lock_irqsave(&zone->lock, flags);
zone->free_pages -= mask;
while (mask + (1 << (MAX_ORDER-1))) {
struct page *buddy1, *buddy2;
if (area >= zone->free_area + MAX_ORDER)
BUG();
BUG_ON(area >= zone->free_area + MAX_ORDER);
if (!__test_and_change_bit(index, area->map))
/*
* the buddy page is still allocated.
......@@ -137,11 +133,8 @@ static void __free_pages_ok (struct page *page, unsigned int order)
*/
buddy1 = base + (page_idx ^ -mask);
buddy2 = base + page_idx;
if (bad_range(zone, buddy1))
BUG();
if (bad_range(zone, buddy2))
BUG();
BUG_ON(bad_range(zone, buddy1));
BUG_ON(bad_range(zone, buddy2));
list_del(&buddy1->list);
mask <<= 1;
area++;
......@@ -149,19 +142,9 @@ static void __free_pages_ok (struct page *page, unsigned int order)
page_idx &= mask;
}
list_add(&(base + page_idx)->list, &area->free_list);
spin_unlock_irqrestore(&zone->lock, flags);
out:
return;
local_freelist:
if (current->nr_local_pages)
goto back_local_freelist;
if (in_interrupt())
goto back_local_freelist;
list_add(&page->list, &current->local_pages);
page->index = order;
current->nr_local_pages++;
}
#define MARK_USED(index, order, area) \
......@@ -173,8 +156,7 @@ static inline struct page * expand (zone_t *zone, struct page *page,
unsigned long size = 1 << high;
while (high > low) {
if (bad_range(zone, page))
BUG();
BUG_ON(bad_range(zone, page));
area--;
high--;
size >>= 1;
......@@ -183,11 +165,28 @@ static inline struct page * expand (zone_t *zone, struct page *page,
index += size;
page += size;
}
if (bad_range(zone, page))
BUG();
BUG_ON(bad_range(zone, page));
return page;
}
/*
* This page is about to be returned from the page allocator
*/
static inline void prep_new_page(struct page *page)
{
BUG_ON(page->mapping);
BUG_ON(PagePrivate(page));
BUG_ON(PageLocked(page));
BUG_ON(PageLRU(page));
BUG_ON(PageActive(page));
BUG_ON(PageDirty(page));
BUG_ON(PageWriteback(page));
page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked);
set_page_count(page, 1);
}
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
static struct page * rmqueue(zone_t *zone, unsigned int order)
{
......@@ -206,8 +205,7 @@ static struct page * rmqueue(zone_t *zone, unsigned int order)
unsigned int index;
page = list_entry(curr, struct page, list);
if (bad_range(zone, page))
BUG();
BUG_ON(bad_range(zone, page));
list_del(curr);
index = page - zone->zone_mem_map;
if (curr_order != MAX_ORDER-1)
......@@ -217,13 +215,9 @@ static struct page * rmqueue(zone_t *zone, unsigned int order)
page = expand(zone, page, index, order, curr_order, area);
spin_unlock_irqrestore(&zone->lock, flags);
set_page_count(page, 1);
if (bad_range(zone, page))
BUG();
if (PageLRU(page))
BUG();
if (PageActive(page))
BUG();
prep_new_page(page);
return page;
}
curr_order++;
......@@ -266,16 +260,14 @@ struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
}
#endif
static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
static /* inline */ struct page *
balance_classzone(zone_t * classzone, unsigned int gfp_mask,
unsigned int order, int * freed)
{
struct page * page = NULL;
int __freed = 0;
if (!(gfp_mask & __GFP_WAIT))
goto out;
if (in_interrupt())
BUG();
BUG_ON(in_interrupt());
current->allocation_order = order;
current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
......@@ -298,25 +290,9 @@ static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask
tmp = list_entry(entry, struct page, list);
if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
list_del(entry);
current->nr_local_pages--;
set_page_count(tmp, 1);
page = tmp;
if (PagePrivate(page))
BUG();
if (page->mapping)
BUG();
if (PageLocked(page))
BUG();
if (PageLRU(page))
BUG();
if (PageActive(page))
BUG();
if (PageDirty(page))
BUG();
if (PageWriteback(page))
BUG();
current->nr_local_pages--;
prep_new_page(page);
break;
}
} while ((entry = entry->next) != local_pages);
......@@ -333,7 +309,6 @@ static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask
}
current->nr_local_pages = 0;
}
out:
*freed = __freed;
return page;
}
......@@ -380,7 +355,7 @@ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_
break;
local_min = z->pages_min;
if (!(gfp_mask & __GFP_WAIT))
if (gfp_mask & __GFP_HIGH)
local_min >>= 2;
min += local_min;
if (z->free_pages > min) {
......@@ -405,7 +380,7 @@ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_
return page;
}
nopage:
if (!(current->flags & PF_RADIX_TREE)) {
if (!(current->flags & PF_NOWARN)) {
printk("%s: page allocation failure."
" order:%d, mode:0x%x\n",
current->comm, order, gfp_mask);
......@@ -441,7 +416,6 @@ struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_
goto nopage;
/* Yield for kswapd, and try again */
__set_current_state(TASK_RUNNING);
yield();
goto rebalance;
}
......@@ -800,8 +774,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
unsigned long totalpages, offset, realtotalpages;
const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
if (zone_start_paddr & ~PAGE_MASK)
BUG();
BUG_ON(zone_start_paddr & ~PAGE_MASK);
totalpages = 0;
for (i = 0; i < MAX_NR_ZONES; i++) {
......
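[editor's note, not part of this diff] Much of the page_alloc.c churn above is mechanical: open-coded sanity checks collapse to BUG_ON(), which behaves identically. For example:

	/* Before: */
	if (PageLocked(page))
		BUG();
	/* After: */
	BUG_ON(PageLocked(page));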
......@@ -15,37 +15,26 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <asm/pgtable.h>
#include <linux/swapops.h>
static int
swap_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
struct swap_info_struct *sis;
swp_entry_t entry;
entry.val = iblock;
sis = get_swap_info_struct(swp_type(entry));
bh_result->b_bdev = sis->bdev;
bh_result->b_blocknr = map_swap_page(sis, swp_offset(entry));
bh_result->b_size = PAGE_SIZE;
set_buffer_mapped(bh_result);
return 0;
}
#include <linux/buffer_head.h> /* for block_sync_page() */
#include <asm/pgtable.h>
static struct bio *
get_swap_bio(int gfp_flags, struct page *page, bio_end_io_t end_io)
{
struct bio *bio;
struct buffer_head bh;
bio = bio_alloc(gfp_flags, 1);
if (bio) {
swap_get_block(NULL, page->index, &bh, 1);
bio->bi_sector = bh.b_blocknr * (PAGE_SIZE >> 9);
bio->bi_bdev = bh.b_bdev;
struct swap_info_struct *sis;
swp_entry_t entry;
entry.val = page->index;
sis = get_swap_info_struct(swp_type(entry));
bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
(PAGE_SIZE >> 9);
bio->bi_bdev = sis->bdev;
bio->bi_io_vec[0].bv_page = page;
bio->bi_io_vec[0].bv_len = PAGE_SIZE;
bio->bi_io_vec[0].bv_offset = 0;
......@@ -98,6 +87,7 @@ int swap_writepage(struct page *page)
}
bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
ret = -ENOMEM;
goto out;
}
......@@ -129,7 +119,7 @@ int swap_readpage(struct file *file, struct page *page)
* swapper_space doesn't have a real inode, so it gets a special vm_writeback()
* so we don't need swap special cases in generic_vm_writeback().
*
* Swap pages are PageLocked and PageWriteback while under writeout so that
* Swap pages are !PageLocked and PageWriteback while under writeout so that
* memory allocators will throttle against them.
*/
static int swap_vm_writeback(struct page *page, int *nr_to_write)
......
......@@ -15,6 +15,9 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/sched.h> // Needed by writeback.h
#include <linux/fs.h> // Needed by writeback.h
#include <linux/writeback.h> // Prototypes pdflush_operation()
/*
......@@ -44,8 +47,11 @@ static spinlock_t pdflush_lock = SPIN_LOCK_UNLOCKED;
/*
* The count of currently-running pdflush threads. Protected
* by pdflush_lock.
*
* Readable by sysctl, but not writable. Published to userspace at
* /proc/sys/vm/nr_pdflush_threads.
*/
static int nr_pdflush_threads = 0;
int nr_pdflush_threads = 0;
/*
* The time at which the pdflush thread pool last went empty
......
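[editor's note, not part of this diff] With nr_pdflush_threads now global and published read-only (mode 0444) at /proc/sys/vm/nr_pdflush_threads via the vm_table entry earlier in this commit, the value can be inspected from userspace; a minimal sketch:

/* Userspace-side illustration only. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/nr_pdflush_threads", "r");
	int n;

	if (f && fscanf(f, "%d", &n) == 1)
		printf("pdflush threads currently running: %d\n", n);
	if (f)
		fclose(f);
	return 0;
}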
......@@ -426,22 +426,15 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
swap_free(entry);
ptr[offset] = (swp_entry_t) {0};
while (inode && (PageWriteback(page) ||
move_from_swap_cache(page, idx, inode->i_mapping))) {
while (inode && move_from_swap_cache(page, idx, inode->i_mapping)) {
/*
* Yield for kswapd, and try again - but we're still
* holding the page lock - ugh! fix this up later on.
* Beware of inode being unlinked or truncated: just
* leave try_to_unuse to delete_from_swap_cache if so.
*
* AKPM: We now wait on writeback too. Note that it's
* the page lock which prevents new writeback from starting.
*/
spin_unlock(&info->lock);
if (PageWriteback(page))
wait_on_page_writeback(page);
else
yield();
yield();
spin_lock(&info->lock);
ptr = shmem_swp_entry(info, idx, 0);
if (IS_ERR(ptr))
......@@ -607,6 +600,7 @@ static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct
spin_unlock(&info->lock);
wait_on_page_writeback(page);
unlock_page(page);
page_cache_release(page);
goto repeat;
}
error = move_from_swap_cache(page, idx, mapping);
......
......@@ -1153,12 +1153,12 @@ static int kmem_cache_grow (kmem_cache_t * cachep, int flags)
* in kmem_cache_alloc(). If a caller is seriously mis-behaving they
* will eventually be caught here (where it matters).
*/
if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC)
if (in_interrupt() && (flags & __GFP_WAIT))
BUG();
ctor_flags = SLAB_CTOR_CONSTRUCTOR;
local_flags = (flags & SLAB_LEVEL_MASK);
if (local_flags == SLAB_ATOMIC)
if (!(local_flags & __GFP_WAIT))
/*
* Not allowed to sleep. Need to tell a constructor about
* this - it might need to know...
......
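[editor's note, not part of this diff] The slab change above replaces the SLAB_ATOMIC special case with a direct test of __GFP_WAIT, so any sleeping allocation attempted from interrupt context is eventually caught in kmem_cache_grow() when the cache needs to grow. Illustrative calls, assuming an existing cachep:

	/* GFP_ATOMIC does not include __GFP_WAIT: legal from interrupt context. */
	obj = kmem_cache_alloc(cachep, GFP_ATOMIC);

	/* GFP_KERNEL includes __GFP_WAIT: with this patch, doing this while
	 * in_interrupt() eventually hits the BUG() in kmem_cache_grow(). */
	obj = kmem_cache_alloc(cachep, GFP_KERNEL);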
......@@ -21,16 +21,9 @@
/*
* swapper_inode doesn't do anything much. It is really only here to
* avoid some special-casing in other parts of the kernel.
*
* We set i_size to "infinity" to keep the page I/O functions happy. The swap
* block allocator makes sure that allocations are in-range. A strange
* number is chosen to prevent various arith overflows elsewhere. For example,
* `lblock' in block_read_full_page().
*/
static struct inode swapper_inode = {
i_mapping: &swapper_space,
i_size: PAGE_SIZE * 0xffffffffLL,
i_blkbits: PAGE_SHIFT,
};
extern struct address_space_operations swap_aops;
......@@ -160,9 +153,13 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
/* Add it to the swap cache */
*pslot = page;
page->flags &= ~(1 << PG_uptodate | 1 << PG_error
| 1 << PG_referenced | 1 << PG_arch_1
| 1 << PG_checked);
/*
* This code used to clear PG_uptodate, PG_error, PG_arch1,
* PG_referenced and PG_checked. What _should_ it clear?
*/
ClearPageUptodate(page);
ClearPageReferenced(page);
SetPageLocked(page);
ClearPageDirty(page);
___add_to_page_cache(page, &swapper_space, entry.val);
......@@ -205,9 +202,14 @@ int move_from_swap_cache(struct page *page, unsigned long index,
__delete_from_swap_cache(page);
*pslot = page;
page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked);
/*
* This code used to clear PG_uptodate, PG_error, PG_referenced,
* PG_arch_1 and PG_checked. It's not really clear why.
*/
ClearPageUptodate(page);
ClearPageReferenced(page);
/*
* ___add_to_page_cache puts the page on ->clean_pages,
* but it's dirty. If it's on ->clean_pages, it will basically
......
......@@ -687,11 +687,10 @@ static int try_to_unuse(unsigned int type)
if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
swap_writepage(page);
lock_page(page);
}
if (PageSwapCache(page)) {
wait_on_page_writeback(page);
delete_from_swap_cache(page);
}
if (PageSwapCache(page))
delete_from_swap_cache(page);
/*
* So we could skip searching mms once swap count went
......
......@@ -52,6 +52,9 @@ static inline int is_page_cache_freeable(struct page * page)
* So PF_MEMALLOC is dropped here. This causes the slab allocations to fail
* earlier, so radix-tree nodes will then be allocated from the mempool
* reserves.
*
* We're still using __GFP_HIGH for radix-tree node allocations, so some of
* the emergency pools are available - just not all of them.
*/
static inline int
swap_out_add_to_swap_cache(struct page *page, swp_entry_t entry)
......@@ -60,7 +63,9 @@ swap_out_add_to_swap_cache(struct page *page, swp_entry_t entry)
int ret;
current->flags &= ~PF_MEMALLOC;
current->flags |= PF_RADIX_TREE;
current->flags |= PF_NOWARN;
ClearPageUptodate(page); /* why? */
ClearPageReferenced(page); /* why? */
ret = add_to_swap_cache(page, entry);
current->flags = flags;
return ret;
......