Commit e7144e64 authored by Linus Torvalds

Merge master.kernel.org:/home/davem/BK/net-2.5

into home.transmeta.com:/home/torvalds/v2.5/linux
parents da29f6a8 407ee6c8
...@@ -46,13 +46,76 @@ static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; ...@@ -46,13 +46,76 @@ static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */ struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
/* /*
* How many reqeusts do we allocate per queue, * Number of requests per queue. This many for reads and for writes (twice
* and how many do we "batch" on freeing them? * this number, total).
*/ */
int queue_nr_requests, batch_requests; static int queue_nr_requests;
/*
* How many free requests must be available before we wake a process which
* is waiting for a request?
*/
static int batch_requests;
unsigned long blk_max_low_pfn, blk_max_pfn; unsigned long blk_max_low_pfn, blk_max_pfn;
int blk_nohighio = 0; int blk_nohighio = 0;
static struct congestion_state {
wait_queue_head_t wqh;
atomic_t nr_congested_queues;
} congestion_states[2];
/*
* Return the threshold (number of free requests) at which the queue is
 * considered to be congested. It includes a little hysteresis to keep the
* context switch rate down.
*/
static inline int queue_congestion_on_threshold(void)
{
int ret;
ret = queue_nr_requests / 4 - 1;
if (ret < 0)
ret = 1;
return ret;
}
/*
* The threshold at which a queue is considered to be uncongested
*/
static inline int queue_congestion_off_threshold(void)
{
int ret;
ret = queue_nr_requests / 4 + 1;
if (ret > queue_nr_requests)
ret = queue_nr_requests;
return ret;
}
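Editorial note, not part of the patch: the two thresholds above differ by two requests, which is the whole hysteresis. A small standalone C sketch of the same arithmetic (queue sizes are illustrative) shows the gap:

#include <stdio.h>

static int on_threshold(int queue_nr_requests)
{
	int ret = queue_nr_requests / 4 - 1;

	if (ret < 0)
		ret = 1;
	return ret;
}

static int off_threshold(int queue_nr_requests)
{
	int ret = queue_nr_requests / 4 + 1;

	if (ret > queue_nr_requests)
		ret = queue_nr_requests;
	return ret;
}

int main(void)
{
	int sizes[] = { 16, 64, 128 };
	int i;

	/* e.g. 128 requests: congested below 31 free, cleared again at 33 free */
	for (i = 0; i < 3; i++)
		printf("%3d requests: congest below %d free, clear at %d free\n",
			sizes[i], on_threshold(sizes[i]), off_threshold(sizes[i]));
	return 0;
}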
static void clear_queue_congested(request_queue_t *q, int rw)
{
enum bdi_state bit;
struct congestion_state *cs = &congestion_states[rw];
bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
if (test_and_clear_bit(bit, &q->backing_dev_info.state))
atomic_dec(&cs->nr_congested_queues);
if (waitqueue_active(&cs->wqh))
wake_up(&cs->wqh);
}
static void set_queue_congested(request_queue_t *q, int rw)
{
enum bdi_state bit;
bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
if (!test_and_set_bit(bit, &q->backing_dev_info.state))
atomic_inc(&congestion_states[rw].nr_congested_queues);
}
/** /**
* bdev_get_queue: - return the queue that matches the given device * bdev_get_queue: - return the queue that matches the given device
* @bdev: device * @bdev: device
...@@ -360,8 +423,8 @@ int blk_queue_init_tags(request_queue_t *q, int depth) ...@@ -360,8 +423,8 @@ int blk_queue_init_tags(request_queue_t *q, int depth)
struct blk_queue_tag *tags; struct blk_queue_tag *tags;
int bits, i; int bits, i;
if (depth > queue_nr_requests) { if (depth > (queue_nr_requests*2)) {
depth = queue_nr_requests; depth = (queue_nr_requests*2);
printk("blk_queue_init_tags: adjusted depth to %d\n", depth); printk("blk_queue_init_tags: adjusted depth to %d\n", depth);
} }
...@@ -1019,7 +1082,7 @@ static int __blk_cleanup_queue(struct request_list *list) ...@@ -1019,7 +1082,7 @@ static int __blk_cleanup_queue(struct request_list *list)
**/ **/
void blk_cleanup_queue(request_queue_t * q) void blk_cleanup_queue(request_queue_t * q)
{ {
int count = queue_nr_requests; int count = (queue_nr_requests*2);
count -= __blk_cleanup_queue(&q->rq[READ]); count -= __blk_cleanup_queue(&q->rq[READ]);
count -= __blk_cleanup_queue(&q->rq[WRITE]); count -= __blk_cleanup_queue(&q->rq[WRITE]);
...@@ -1050,7 +1113,7 @@ static int blk_init_free_list(request_queue_t *q) ...@@ -1050,7 +1113,7 @@ static int blk_init_free_list(request_queue_t *q)
* Divide requests in half between read and write * Divide requests in half between read and write
*/ */
rl = &q->rq[READ]; rl = &q->rq[READ];
for (i = 0; i < queue_nr_requests; i++) { for (i = 0; i < (queue_nr_requests*2); i++) {
rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL); rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL);
if (!rq) if (!rq)
goto nomem; goto nomem;
...@@ -1058,7 +1121,7 @@ static int blk_init_free_list(request_queue_t *q) ...@@ -1058,7 +1121,7 @@ static int blk_init_free_list(request_queue_t *q)
/* /*
* half way through, switch to WRITE list * half way through, switch to WRITE list
*/ */
if (i == queue_nr_requests / 2) if (i == queue_nr_requests)
rl = &q->rq[WRITE]; rl = &q->rq[WRITE];
memset(rq, 0, sizeof(struct request)); memset(rq, 0, sizeof(struct request));
...@@ -1144,7 +1207,7 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock) ...@@ -1144,7 +1207,7 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock)
* Get a free request. queue lock must be held and interrupts * Get a free request. queue lock must be held and interrupts
* disabled on the way in. * disabled on the way in.
*/ */
static inline struct request *get_request(request_queue_t *q, int rw) static struct request *get_request(request_queue_t *q, int rw)
{ {
struct request *rq = NULL; struct request *rq = NULL;
struct request_list *rl = q->rq + rw; struct request_list *rl = q->rq + rw;
...@@ -1153,6 +1216,8 @@ static inline struct request *get_request(request_queue_t *q, int rw) ...@@ -1153,6 +1216,8 @@ static inline struct request *get_request(request_queue_t *q, int rw)
rq = blkdev_free_rq(&rl->free); rq = blkdev_free_rq(&rl->free);
list_del(&rq->queuelist); list_del(&rq->queuelist);
rl->count--; rl->count--;
if (rl->count < queue_congestion_on_threshold())
set_queue_congested(q, rw);
rq->flags = 0; rq->flags = 0;
rq->rq_status = RQ_ACTIVE; rq->rq_status = RQ_ACTIVE;
rq->special = NULL; rq->special = NULL;
...@@ -1365,13 +1430,50 @@ void blk_put_request(struct request *req) ...@@ -1365,13 +1430,50 @@ void blk_put_request(struct request *req)
* it didn't come out of our reserved rq pools * it didn't come out of our reserved rq pools
*/ */
if (rl) { if (rl) {
int rw = 0;
list_add(&req->queuelist, &rl->free); list_add(&req->queuelist, &rl->free);
if (++rl->count >= batch_requests &&waitqueue_active(&rl->wait)) if (rl == &q->rq[WRITE])
rw = WRITE;
else if (rl == &q->rq[READ])
rw = READ;
else
BUG();
rl->count++;
if (rl->count >= queue_congestion_off_threshold())
clear_queue_congested(q, rw);
if (rl->count >= batch_requests && waitqueue_active(&rl->wait))
wake_up(&rl->wait); wake_up(&rl->wait);
} }
} }
/**
* blk_congestion_wait - wait for a queue to become uncongested
* @rw: READ or WRITE
* @timeout: timeout in jiffies
*
* Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
* If no queues are congested then just return, in the hope that the caller
* will submit some more IO.
*/
void blk_congestion_wait(int rw, long timeout)
{
DECLARE_WAITQUEUE(wait, current);
struct congestion_state *cs = &congestion_states[rw];
if (atomic_read(&cs->nr_congested_queues) == 0)
return;
blk_run_queues();
set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(&cs->wqh, &wait);
if (atomic_read(&cs->nr_congested_queues) != 0)
schedule_timeout(timeout);
set_current_state(TASK_RUNNING);
remove_wait_queue(&cs->wqh, &wait);
}
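Editorial note, not part of the patch: the intended calling pattern (used later in this patch by background_writeout(), wb_kupdate() and try_to_free_pages()) is to retry writeback and, if nothing was written because every target queue was congested, sleep in blk_congestion_wait() instead of spinning. A condensed sketch assuming kernel context; `pages_left' is a hypothetical local standing in for the callers' own counters:

	long pages_left = ps.nr_dirty;		/* hypothetical, as in wb_kupdate() */

	while (pages_left > 0) {
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		writeback_inodes(&wbc);
		if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
			/* Wrote nothing: either all clean, or every queue is full */
			if (wbc.encountered_congestion)
				blk_congestion_wait(WRITE, HZ/10);
			else
				break;
		}
		pages_left -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
	}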
/* /*
* Has to be called with the request spinlock acquired * Has to be called with the request spinlock acquired
*/ */
...@@ -1868,6 +1970,7 @@ void end_that_request_last(struct request *req) ...@@ -1868,6 +1970,7 @@ void end_that_request_last(struct request *req)
int __init blk_dev_init(void) int __init blk_dev_init(void)
{ {
int total_ram = nr_free_pages() << (PAGE_SHIFT - 10); int total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
int i;
request_cachep = kmem_cache_create("blkdev_requests", request_cachep = kmem_cache_create("blkdev_requests",
sizeof(struct request), 0, sizeof(struct request), 0,
...@@ -1876,26 +1979,33 @@ int __init blk_dev_init(void) ...@@ -1876,26 +1979,33 @@ int __init blk_dev_init(void)
panic("Can't create request pool slab cache\n"); panic("Can't create request pool slab cache\n");
/* /*
* Free request slots per queue. * Free request slots per queue. One per quarter-megabyte.
* (Half for reads, half for writes) * We use this many requests for reads, and this many for writes.
*/
queue_nr_requests = (total_ram >> 8) & ~15; /* One per quarter-megabyte */
if (queue_nr_requests < 32)
queue_nr_requests = 32;
if (queue_nr_requests > 256)
queue_nr_requests = 256;
/*
* Batch frees according to queue length
*/ */
if ((batch_requests = queue_nr_requests / 4) > 32) queue_nr_requests = (total_ram >> 9) & ~7;
batch_requests = 32; if (queue_nr_requests < 16)
printk("block: %d slots per queue, batch=%d\n", queue_nr_requests = 16;
queue_nr_requests, batch_requests); if (queue_nr_requests > 128)
queue_nr_requests = 128;
batch_requests = queue_nr_requests / 8;
if (batch_requests > 8)
batch_requests = 8;
printk("block request queues:\n");
printk(" %d requests per read queue\n", queue_nr_requests);
printk(" %d requests per write queue\n", queue_nr_requests);
printk(" %d requests per batch\n", batch_requests);
printk(" enter congestion at %d\n", queue_congestion_on_threshold());
printk(" exit congestion at %d\n", queue_congestion_off_threshold());
blk_max_low_pfn = max_low_pfn; blk_max_low_pfn = max_low_pfn;
blk_max_pfn = max_pfn; blk_max_pfn = max_pfn;
for (i = 0; i < ARRAY_SIZE(congestion_states); i++) {
init_waitqueue_head(&congestion_states[i].wqh);
atomic_set(&congestion_states[i].nr_congested_queues, 0);
}
return 0; return 0;
}; };
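Editorial note, not part of the patch: the new sizing works out to one request slot per quarter-megabyte of RAM (counting the read and write pools together), clamped to 16..128 per pool. A standalone C sketch of blk_dev_init()'s arithmetic, with illustrative RAM sizes:

#include <stdio.h>

int main(void)
{
	/* total_ram is in kilobytes, as nr_free_pages() << (PAGE_SHIFT - 10) yields */
	int samples_kb[] = { 16 * 1024, 128 * 1024, 1024 * 1024 };
	int i;

	for (i = 0; i < 3; i++) {
		int total_ram = samples_kb[i];
		/* one slot per 512 KB per direction, i.e. one per 256 KB overall */
		int queue_nr_requests = (total_ram >> 9) & ~7;
		int batch_requests;

		if (queue_nr_requests < 16)
			queue_nr_requests = 16;
		if (queue_nr_requests > 128)
			queue_nr_requests = 128;
		batch_requests = queue_nr_requests / 8;
		if (batch_requests > 8)
			batch_requests = 8;
		printf("%7d KB RAM: %3d requests per read/write queue, batch=%d\n",
			total_ram, queue_nr_requests, batch_requests);
	}
	return 0;
}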
......
...@@ -28,7 +28,8 @@ ...@@ -28,7 +28,8 @@
#include <linux/smp_lock.h> #include <linux/smp_lock.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/suspend.h> #include <linux/suspend.h>
#include <linux/buffer_head.h> /* for fsync_bdev()/wakeup_bdflush() */ #include <linux/writeback.h>
#include <linux/buffer_head.h> /* for fsync_bdev() */
#include <linux/spinlock.h> #include <linux/spinlock.h>
...@@ -227,7 +228,7 @@ static void sysrq_handle_sync(int key, struct pt_regs *pt_regs, ...@@ -227,7 +228,7 @@ static void sysrq_handle_sync(int key, struct pt_regs *pt_regs,
struct tty_struct *tty) struct tty_struct *tty)
{ {
emergency_sync_scheduled = EMERG_SYNC; emergency_sync_scheduled = EMERG_SYNC;
wakeup_bdflush(); wakeup_bdflush(0);
} }
static struct sysrq_key_op sysrq_sync_op = { static struct sysrq_key_op sysrq_sync_op = {
handler: sysrq_handle_sync, handler: sysrq_handle_sync,
...@@ -239,7 +240,7 @@ static void sysrq_handle_mountro(int key, struct pt_regs *pt_regs, ...@@ -239,7 +240,7 @@ static void sysrq_handle_mountro(int key, struct pt_regs *pt_regs,
struct tty_struct *tty) struct tty_struct *tty)
{ {
emergency_sync_scheduled = EMERG_REMOUNT; emergency_sync_scheduled = EMERG_REMOUNT;
wakeup_bdflush(); wakeup_bdflush(0);
} }
static struct sysrq_key_op sysrq_mountro_op = { static struct sysrq_key_op sysrq_mountro_op = {
handler: sysrq_handle_mountro, handler: sysrq_handle_mountro,
......
...@@ -458,19 +458,17 @@ void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers) ...@@ -458,19 +458,17 @@ void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
} }
/* /*
* FIXME: What is this function actually trying to do? Why "zones[0]"? * Kick pdflush then try to free up some ZONE_NORMAL memory.
* Is it still correct/needed if/when blockdev mappings use GFP_HIGHUSER?
*/ */
static void free_more_memory(void) static void free_more_memory(void)
{ {
struct zone *zone; struct zone *zone;
zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; zone = contig_page_data.node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones[0];
wakeup_bdflush(1024);
wakeup_bdflush();
try_to_free_pages(zone, GFP_NOFS, 0);
blk_run_queues(); blk_run_queues();
yield(); yield();
try_to_free_pages(zone, GFP_NOFS, 0);
} }
/* /*
......
...@@ -16,9 +16,9 @@ ...@@ -16,9 +16,9 @@
#include "ext2.h" #include "ext2.h"
#include <linux/quotaops.h> #include <linux/quotaops.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h> #include <linux/buffer_head.h>
/* /*
* ialloc.c contains the inodes allocation and deallocation routines * ialloc.c contains the inodes allocation and deallocation routines
*/ */
...@@ -169,6 +169,13 @@ static void ext2_preread_inode(struct inode *inode) ...@@ -169,6 +169,13 @@ static void ext2_preread_inode(struct inode *inode)
unsigned long block; unsigned long block;
struct buffer_head *bh; struct buffer_head *bh;
struct ext2_group_desc * gdp; struct ext2_group_desc * gdp;
struct backing_dev_info *bdi;
bdi = inode->i_mapping->backing_dev_info;
if (bdi_read_congested(bdi))
return;
if (bdi_write_congested(bdi))
return;
block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
gdp = ext2_get_group_desc(inode->i_sb, block_group, &bh); gdp = ext2_get_group_desc(inode->i_sb, block_group, &bh);
......
...@@ -1473,7 +1473,7 @@ struct address_space_operations ext3_aops = { ...@@ -1473,7 +1473,7 @@ struct address_space_operations ext3_aops = {
}; };
/* For writeback mode, we can use mpage_writepages() */ /* For writeback mode, we can use mpage_writepages() */
#if 0 /* Doesn't work for shared mappings */
static int static int
ext3_writepages(struct address_space *mapping, struct writeback_control *wbc) ext3_writepages(struct address_space *mapping, struct writeback_control *wbc)
{ {
...@@ -1486,12 +1486,12 @@ ext3_writepages(struct address_space *mapping, struct writeback_control *wbc) ...@@ -1486,12 +1486,12 @@ ext3_writepages(struct address_space *mapping, struct writeback_control *wbc)
ret = err; ret = err;
return ret; return ret;
} }
#endif
struct address_space_operations ext3_writeback_aops = { struct address_space_operations ext3_writeback_aops = {
.readpage = ext3_readpage, /* BKL not held. Don't need */ .readpage = ext3_readpage, /* BKL not held. Don't need */
.readpages = ext3_readpages, /* BKL not held. Don't need */ .readpages = ext3_readpages, /* BKL not held. Don't need */
.writepage = ext3_writepage, /* BKL not held. We take it */ .writepage = ext3_writepage, /* BKL not held. We take it */
.writepages = ext3_writepages, /* BKL not held. Don't need */
.sync_page = block_sync_page, .sync_page = block_sync_page,
.prepare_write = ext3_prepare_write, /* BKL not held. We take it */ .prepare_write = ext3_prepare_write, /* BKL not held. We take it */
.commit_write = ext3_commit_write, /* BKL not held. We take it */ .commit_write = ext3_commit_write, /* BKL not held. We take it */
......
...@@ -220,44 +220,52 @@ __writeback_single_inode(struct inode *inode, int sync, ...@@ -220,44 +220,52 @@ __writeback_single_inode(struct inode *inode, int sync,
* *
 * FIXME: this linear search could get expensive with many filesystems. But * how to fix? We need to go from an address_space to all inodes which share
* how to fix? We need to go from an address_space to all inodes which share * how to fix? We need to go from an address_space to all inodes which share
* a queue with that address_space. * a queue with that address_space. (Easy: have a global "dirty superblocks"
* list).
* *
* The inodes to be written are parked on sb->s_io. They are moved back onto * The inodes to be written are parked on sb->s_io. They are moved back onto
* sb->s_dirty as they are selected for writing. This way, none can be missed * sb->s_dirty as they are selected for writing. This way, none can be missed
* on the writer throttling path, and we get decent balancing between many * on the writer throttling path, and we get decent balancing between many
 * thrlttled threads: we don't want them all piling up on __wait_on_inode. * throttled threads: we don't want them all piling up on __wait_on_inode.
*/ */
static void static void
sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
{ {
struct list_head *tmp;
struct list_head *head;
const unsigned long start = jiffies; /* livelock avoidance */ const unsigned long start = jiffies; /* livelock avoidance */
list_splice_init(&sb->s_dirty, &sb->s_io); list_splice_init(&sb->s_dirty, &sb->s_io);
head = &sb->s_io; while (!list_empty(&sb->s_io)) {
while ((tmp = head->prev) != head) { struct inode *inode = list_entry(sb->s_io.prev,
struct inode *inode = list_entry(tmp, struct inode, i_list); struct inode, i_list);
struct address_space *mapping = inode->i_mapping; struct address_space *mapping = inode->i_mapping;
struct backing_dev_info *bdi; struct backing_dev_info *bdi = mapping->backing_dev_info;
int really_sync; int really_sync;
if (wbc->bdi && mapping->backing_dev_info != wbc->bdi) { if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
if (sb != blockdev_superblock) if (sb != blockdev_superblock)
break; /* inappropriate superblock */ break; /* Skip a congested fs */
list_move(&inode->i_list, &sb->s_dirty); list_move(&inode->i_list, &sb->s_dirty);
continue; /* not this blockdev */ continue; /* Skip a congested blockdev */
}
if (wbc->bdi && bdi != wbc->bdi) {
if (sb != blockdev_superblock)
break; /* fs has the wrong queue */
list_move(&inode->i_list, &sb->s_dirty);
continue; /* blockdev has wrong queue */
} }
/* Was this inode dirtied after sync_sb_inodes was called? */ /* Was this inode dirtied after sync_sb_inodes was called? */
if (time_after(mapping->dirtied_when, start)) if (time_after(mapping->dirtied_when, start))
break; break;
/* Was this inode dirtied too recently? */
if (wbc->older_than_this && time_after(mapping->dirtied_when, if (wbc->older_than_this && time_after(mapping->dirtied_when,
*wbc->older_than_this)) *wbc->older_than_this))
goto out; break;
bdi = mapping->backing_dev_info; /* Is another pdflush already flushing this queue? */
if (current_is_pdflush() && !writeback_acquire(bdi)) if (current_is_pdflush() && !writeback_acquire(bdi))
break; break;
...@@ -278,11 +286,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) ...@@ -278,11 +286,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
if (wbc->nr_to_write <= 0) if (wbc->nr_to_write <= 0)
break; break;
} }
out: return; /* Leave any unwritten inodes on s_io */
/*
* Leave any unwritten inodes on s_io.
*/
return;
} }
/* /*
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <linux/prefetch.h> #include <linux/prefetch.h>
#include <linux/mpage.h> #include <linux/mpage.h>
#include <linux/writeback.h> #include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h> #include <linux/pagevec.h>
/* /*
...@@ -522,6 +523,7 @@ int ...@@ -522,6 +523,7 @@ int
mpage_writepages(struct address_space *mapping, mpage_writepages(struct address_space *mapping,
struct writeback_control *wbc, get_block_t get_block) struct writeback_control *wbc, get_block_t get_block)
{ {
struct backing_dev_info *bdi = mapping->backing_dev_info;
struct bio *bio = NULL; struct bio *bio = NULL;
sector_t last_block_in_bio = 0; sector_t last_block_in_bio = 0;
int ret = 0; int ret = 0;
...@@ -530,6 +532,12 @@ mpage_writepages(struct address_space *mapping, ...@@ -530,6 +532,12 @@ mpage_writepages(struct address_space *mapping,
struct pagevec pvec; struct pagevec pvec;
int (*writepage)(struct page *); int (*writepage)(struct page *);
if (wbc->nonblocking && bdi_write_congested(bdi)) {
blk_run_queues();
wbc->encountered_congestion = 1;
return 0;
}
writepage = NULL; writepage = NULL;
if (get_block == NULL) if (get_block == NULL)
writepage = mapping->a_ops->writepage; writepage = mapping->a_ops->writepage;
...@@ -585,6 +593,11 @@ mpage_writepages(struct address_space *mapping, ...@@ -585,6 +593,11 @@ mpage_writepages(struct address_space *mapping,
} }
if (ret || (--(wbc->nr_to_write) <= 0)) if (ret || (--(wbc->nr_to_write) <= 0))
done = 1; done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
blk_run_queues();
wbc->encountered_congestion = 1;
done = 1;
}
} else { } else {
unlock_page(page); unlock_page(page);
} }
......
...@@ -8,11 +8,15 @@ ...@@ -8,11 +8,15 @@
#ifndef _LINUX_BACKING_DEV_H #ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H
#include <asm/atomic.h>
/* /*
* Bits in backing_dev_info.state * Bits in backing_dev_info.state
*/ */
enum bdi_state { enum bdi_state {
BDI_pdflush, /* A pdflush thread is working this device */ BDI_pdflush, /* A pdflush thread is working this device */
BDI_write_congested, /* The write queue is getting full */
BDI_read_congested, /* The read queue is getting full */
BDI_unused, /* Available bits start here */ BDI_unused, /* Available bits start here */
}; };
...@@ -28,4 +32,14 @@ int writeback_acquire(struct backing_dev_info *bdi); ...@@ -28,4 +32,14 @@ int writeback_acquire(struct backing_dev_info *bdi);
int writeback_in_progress(struct backing_dev_info *bdi); int writeback_in_progress(struct backing_dev_info *bdi);
void writeback_release(struct backing_dev_info *bdi); void writeback_release(struct backing_dev_info *bdi);
static inline int bdi_read_congested(struct backing_dev_info *bdi)
{
return test_bit(BDI_read_congested, &bdi->state);
}
static inline int bdi_write_congested(struct backing_dev_info *bdi)
{
return test_bit(BDI_write_congested, &bdi->state);
}
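Editorial note, not part of the patch: these helpers let submitters of optional I/O test a device's congestion state without taking any locks. A condensed sketch of the typical caller, mirroring the ext2_preread_inode() hunk in this patch (kernel context assumed; `inode' comes from the caller):

	struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;

	if (bdi_read_congested(bdi) || bdi_write_congested(bdi))
		return;		/* the preread is only an optimisation; skip it under load */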
#endif /* _LINUX_BACKING_DEV_H */ #endif /* _LINUX_BACKING_DEV_H */
...@@ -345,6 +345,7 @@ extern void blk_queue_end_tag(request_queue_t *, struct request *); ...@@ -345,6 +345,7 @@ extern void blk_queue_end_tag(request_queue_t *, struct request *);
extern int blk_queue_init_tags(request_queue_t *, int); extern int blk_queue_init_tags(request_queue_t *, int);
extern void blk_queue_free_tags(request_queue_t *); extern void blk_queue_free_tags(request_queue_t *);
extern void blk_queue_invalidate_tags(request_queue_t *); extern void blk_queue_invalidate_tags(request_queue_t *);
extern void blk_congestion_wait(int rw, long timeout);
#define MAX_PHYS_SEGMENTS 128 #define MAX_PHYS_SEGMENTS 128
#define MAX_HW_SEGMENTS 128 #define MAX_HW_SEGMENTS 128
......
...@@ -163,7 +163,6 @@ struct buffer_head * __getblk(struct block_device *, sector_t, int); ...@@ -163,7 +163,6 @@ struct buffer_head * __getblk(struct block_device *, sector_t, int);
void __brelse(struct buffer_head *); void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *); void __bforget(struct buffer_head *);
struct buffer_head *__bread(struct block_device *, sector_t block, int size); struct buffer_head *__bread(struct block_device *, sector_t block, int size);
void wakeup_bdflush(void);
struct buffer_head *alloc_buffer_head(void); struct buffer_head *alloc_buffer_head(void);
void free_buffer_head(struct buffer_head * bh); void free_buffer_head(struct buffer_head * bh);
void FASTCALL(unlock_buffer(struct buffer_head *bh)); void FASTCALL(unlock_buffer(struct buffer_head *bh));
......
...@@ -273,6 +273,7 @@ extern struct user_struct root_user; ...@@ -273,6 +273,7 @@ extern struct user_struct root_user;
#define INIT_USER (&root_user) #define INIT_USER (&root_user)
typedef struct prio_array prio_array_t; typedef struct prio_array prio_array_t;
struct backing_dev_info;
struct task_struct { struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
...@@ -398,6 +399,7 @@ struct task_struct { ...@@ -398,6 +399,7 @@ struct task_struct {
/* journalling filesystem info */ /* journalling filesystem info */
void *journal_info; void *journal_info;
struct dentry *proc_dentry; struct dentry *proc_dentry;
struct backing_dev_info *backing_dev_info;
}; };
extern void __put_task_struct(struct task_struct *tsk); extern void __put_task_struct(struct task_struct *tsk);
......
...@@ -43,6 +43,8 @@ struct writeback_control { ...@@ -43,6 +43,8 @@ struct writeback_control {
older than this */ older than this */
long nr_to_write; /* Write this many pages, and decrement long nr_to_write; /* Write this many pages, and decrement
this for each page written */ this for each page written */
int nonblocking; /* Don't get stuck on request queues */
int encountered_congestion; /* An output: a queue is full */
}; };
void writeback_inodes(struct writeback_control *wbc); void writeback_inodes(struct writeback_control *wbc);
...@@ -61,6 +63,8 @@ static inline void wait_on_inode(struct inode *inode) ...@@ -61,6 +63,8 @@ static inline void wait_on_inode(struct inode *inode)
/* /*
* mm/page-writeback.c * mm/page-writeback.c
*/ */
int wakeup_bdflush(long nr_pages);
/* These 5 are exported to sysctl. */ /* These 5 are exported to sysctl. */
extern int dirty_background_ratio; extern int dirty_background_ratio;
extern int dirty_async_ratio; extern int dirty_async_ratio;
......
...@@ -81,7 +81,6 @@ unsigned char software_suspend_enabled = 0; ...@@ -81,7 +81,6 @@ unsigned char software_suspend_enabled = 0;
#define TIMEOUT (6 * HZ) /* Timeout for stopping processes */ #define TIMEOUT (6 * HZ) /* Timeout for stopping processes */
#define ADDRESS(x) ((unsigned long) phys_to_virt(((x) << PAGE_SHIFT))) #define ADDRESS(x) ((unsigned long) phys_to_virt(((x) << PAGE_SHIFT)))
extern void wakeup_bdflush(void);
extern int C_A_D; extern int C_A_D;
/* References to section boundaries */ /* References to section boundaries */
......
...@@ -1755,6 +1755,9 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, ...@@ -1755,6 +1755,9 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
if (unlikely(pos < 0)) if (unlikely(pos < 0))
return -EINVAL; return -EINVAL;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
pagevec_init(&lru_pvec); pagevec_init(&lru_pvec);
if (unlikely(file->f_error)) { if (unlikely(file->f_error)) {
...@@ -1959,6 +1962,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, ...@@ -1959,6 +1962,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
err = written ? written : status; err = written ? written : status;
out: out:
pagevec_lru_add(&lru_pvec); pagevec_lru_add(&lru_pvec);
current->backing_dev_info = 0;
return err; return err;
} }
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/buffer_head.h> /* for wakeup_bdflush() */ #include <linux/writeback.h>
static void add_element(mempool_t *pool, void *element) static void add_element(mempool_t *pool, void *element)
{ {
...@@ -210,7 +210,7 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask) ...@@ -210,7 +210,7 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask)
/* /*
* Kick the VM at this point. * Kick the VM at this point.
*/ */
wakeup_bdflush(); wakeup_bdflush(0);
spin_lock_irqsave(&pool->lock, flags); spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) { if (likely(pool->curr_nr)) {
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <linux/init.h> #include <linux/init.h>
#include <linux/sysrq.h> #include <linux/sysrq.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/mpage.h> #include <linux/mpage.h>
#include <linux/notifier.h> #include <linux/notifier.h>
#include <linux/smp.h> #include <linux/smp.h>
...@@ -172,33 +173,47 @@ static void background_writeout(unsigned long _min_pages) ...@@ -172,33 +173,47 @@ static void background_writeout(unsigned long _min_pages)
.sync_mode = WB_SYNC_NONE, .sync_mode = WB_SYNC_NONE,
.older_than_this = NULL, .older_than_this = NULL,
.nr_to_write = 0, .nr_to_write = 0,
.nonblocking = 1,
}; };
CHECK_EMERGENCY_SYNC CHECK_EMERGENCY_SYNC
background_thresh = (dirty_background_ratio * total_pages) / 100; background_thresh = (dirty_background_ratio * total_pages) / 100;
for ( ; ; ) {
do {
struct page_state ps; struct page_state ps;
get_page_state(&ps); get_page_state(&ps);
if (ps.nr_dirty < background_thresh && min_pages <= 0) if (ps.nr_dirty < background_thresh && min_pages <= 0)
break; break;
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc); writeback_inodes(&wbc);
min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
} while (wbc.nr_to_write <= 0); if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
/* Wrote nothing */
if (wbc.encountered_congestion)
blk_congestion_wait(WRITE, HZ/10);
else
break;
}
}
blk_run_queues(); blk_run_queues();
} }
/* /*
* Start heavy writeback of everything. * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
* -1 if all pdflush threads were busy.
*/ */
void wakeup_bdflush(void) int wakeup_bdflush(long nr_pages)
{ {
if (nr_pages == 0) {
struct page_state ps; struct page_state ps;
get_page_state(&ps); get_page_state(&ps);
pdflush_operation(background_writeout, ps.nr_dirty); nr_pages = ps.nr_dirty;
}
return pdflush_operation(background_writeout, nr_pages);
} }
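Editorial note, not part of the patch: with the new signature, callers ask for an amount of writeback rather than an all-or-nothing flush. The two idioms used elsewhere in this patch are:

	wakeup_bdflush(0);	/* write back all dirty pages (sysrq, mempool_alloc) */
	wakeup_bdflush(1024);	/* write back roughly 1024 pages (free_more_memory) */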
static struct timer_list wb_timer; static struct timer_list wb_timer;
...@@ -223,25 +238,36 @@ static void wb_kupdate(unsigned long arg) ...@@ -223,25 +238,36 @@ static void wb_kupdate(unsigned long arg)
unsigned long oldest_jif; unsigned long oldest_jif;
unsigned long start_jif; unsigned long start_jif;
unsigned long next_jif; unsigned long next_jif;
long nr_to_write;
struct page_state ps; struct page_state ps;
struct writeback_control wbc = { struct writeback_control wbc = {
.bdi = NULL, .bdi = NULL,
.sync_mode = WB_SYNC_NONE, .sync_mode = WB_SYNC_NONE,
.older_than_this = &oldest_jif, .older_than_this = &oldest_jif,
.nr_to_write = 0, .nr_to_write = 0,
.nonblocking = 1,
}; };
sync_supers(); sync_supers();
get_page_state(&ps);
get_page_state(&ps);
oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
start_jif = jiffies; start_jif = jiffies;
next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
wbc.nr_to_write = ps.nr_dirty; nr_to_write = ps.nr_dirty;
while (nr_to_write > 0) {
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc); writeback_inodes(&wbc);
if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
if (wbc.encountered_congestion)
blk_congestion_wait(WRITE, HZ);
else
break; /* All the old data is written */
}
nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
}
blk_run_queues(); blk_run_queues();
yield();
if (time_before(next_jif, jiffies + HZ)) if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ; next_jif = jiffies + HZ;
mod_timer(&wb_timer, next_jif); mod_timer(&wb_timer, next_jif);
...@@ -493,7 +519,6 @@ int __set_page_dirty_buffers(struct page *page) ...@@ -493,7 +519,6 @@ int __set_page_dirty_buffers(struct page *page)
buffer_error(); buffer_error();
spin_lock(&mapping->private_lock); spin_lock(&mapping->private_lock);
if (page_has_buffers(page)) { if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page); struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head; struct buffer_head *bh = head;
...@@ -506,6 +531,7 @@ int __set_page_dirty_buffers(struct page *page) ...@@ -506,6 +531,7 @@ int __set_page_dirty_buffers(struct page *page)
bh = bh->b_this_page; bh = bh->b_this_page;
} while (bh != head); } while (bh != head);
} }
spin_unlock(&mapping->private_lock);
if (!TestSetPageDirty(page)) { if (!TestSetPageDirty(page)) {
write_lock(&mapping->page_lock); write_lock(&mapping->page_lock);
...@@ -519,7 +545,6 @@ int __set_page_dirty_buffers(struct page *page) ...@@ -519,7 +545,6 @@ int __set_page_dirty_buffers(struct page *page)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
} }
spin_unlock(&mapping->private_lock);
out: out:
return ret; return ret;
} }
......
...@@ -918,6 +918,26 @@ static int setup_swap_extents(struct swap_info_struct *sis) ...@@ -918,6 +918,26 @@ static int setup_swap_extents(struct swap_info_struct *sis)
return ret; return ret;
} }
#if 0 /* We don't need this yet */
#include <linux/backing-dev.h>
int page_queue_congested(struct page *page)
{
struct backing_dev_info *bdi;
BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
bdi = page->mapping->backing_dev_info;
if (PageSwapCache(page)) {
swp_entry_t entry = { .val = page->index };
struct swap_info_struct *sis;
sis = get_swap_info_struct(swp_type(entry));
bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
}
return bdi_write_congested(bdi);
}
#endif
asmlinkage long sys_swapoff(const char * specialfile) asmlinkage long sys_swapoff(const char * specialfile)
{ {
struct swap_info_struct * p = NULL; struct swap_info_struct * p = NULL;
......
...@@ -21,9 +21,11 @@ ...@@ -21,9 +21,11 @@
#include <linux/file.h> #include <linux/file.h>
#include <linux/writeback.h> #include <linux/writeback.h>
#include <linux/suspend.h> #include <linux/suspend.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for try_to_release_page() */ #include <linux/buffer_head.h> /* for try_to_release_page() */
#include <linux/mm_inline.h> #include <linux/mm_inline.h>
#include <linux/pagevec.h> #include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap-locking.h> #include <linux/rmap-locking.h>
#include <asm/pgalloc.h> #include <asm/pgalloc.h>
...@@ -32,11 +34,11 @@ ...@@ -32,11 +34,11 @@
/* /*
* The "priority" of VM scanning is how much of the queues we * The "priority" of VM scanning is how much of the queues we
* will scan in one go. A value of 6 for DEF_PRIORITY implies * will scan in one go. A value of 12 for DEF_PRIORITY implies
* that we'll scan 1/64th of the queues ("queue_length >> 6") * that we'll scan 1/4096th of the queues ("queue_length >> 12")
* during a normal aging round. * during a normal aging round.
*/ */
#define DEF_PRIORITY (6) #define DEF_PRIORITY 12
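Editorial note, not part of the patch: with max_scan derived from zone->nr_inactive >> priority and priority counting down from DEF_PRIORITY, each pass roughly doubles the scan window. A standalone C illustration with a hypothetical inactive-list length:

#include <stdio.h>

#define DEF_PRIORITY	12

int main(void)
{
	long nr_inactive = 100000;	/* example inactive-list length, not from the patch */
	int priority;

	for (priority = DEF_PRIORITY; priority > 0; priority--)
		printf("priority %2d: scan up to %ld pages\n",
			priority, nr_inactive >> priority);
	return 0;
}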
#ifdef ARCH_HAS_PREFETCH #ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \ #define prefetch_prev_lru_page(_page, _base, _field) \
...@@ -95,7 +97,7 @@ static inline int is_page_cache_freeable(struct page *page) ...@@ -95,7 +97,7 @@ static inline int is_page_cache_freeable(struct page *page)
static /* inline */ int static /* inline */ int
shrink_list(struct list_head *page_list, int nr_pages, shrink_list(struct list_head *page_list, int nr_pages,
unsigned int gfp_mask, int priority, int *max_scan) unsigned int gfp_mask, int *max_scan)
{ {
struct address_space *mapping; struct address_space *mapping;
LIST_HEAD(ret_pages); LIST_HEAD(ret_pages);
...@@ -117,10 +119,21 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -117,10 +119,21 @@ shrink_list(struct list_head *page_list, int nr_pages,
BUG_ON(PageActive(page)); BUG_ON(PageActive(page));
may_enter_fs = (gfp_mask & __GFP_FS) || may_enter_fs = (gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (gfp_mask & __GFP_IO)); (PageSwapCache(page) && (gfp_mask & __GFP_IO));
/*
* If the page is mapped into pagetables then wait on it, to
* throttle this allocator to the rate at which we can clear
* MAP_SHARED data. This will also throttle against swapcache
* writes.
*/
if (PageWriteback(page)) { if (PageWriteback(page)) {
if (may_enter_fs) if (may_enter_fs) {
wait_on_page_writeback(page); /* throttling */ if (page->pte.direct ||
else page->mapping->backing_dev_info ==
current->backing_dev_info) {
wait_on_page_writeback(page);
}
}
goto keep_locked; goto keep_locked;
} }
...@@ -172,15 +185,43 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -172,15 +185,43 @@ shrink_list(struct list_head *page_list, int nr_pages,
* will write it. So we're back to page-at-a-time writepage * will write it. So we're back to page-at-a-time writepage
* in LRU order. * in LRU order.
*/ */
if (PageDirty(page) && is_page_cache_freeable(page) && /*
mapping && may_enter_fs) { * If the page is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
* stalled by pagecache activity. But note that there may be
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
* If this process is currently in generic_file_write() against
* this page's queue, we can perform writeback even if that
* will block.
*
* If the page is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
* See swapfile.c:page_queue_congested().
*/
if (PageDirty(page)) {
int (*writeback)(struct page *, int (*writeback)(struct page *,
struct writeback_control *); struct writeback_control *);
struct backing_dev_info *bdi;
const int cluster_size = SWAP_CLUSTER_MAX; const int cluster_size = SWAP_CLUSTER_MAX;
struct writeback_control wbc = { struct writeback_control wbc = {
.nr_to_write = cluster_size, .nr_to_write = cluster_size,
}; };
if (!is_page_cache_freeable(page))
goto keep_locked;
if (!mapping)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
bdi = mapping->backing_dev_info;
if (bdi != current->backing_dev_info &&
bdi_write_congested(bdi))
goto keep_locked;
writeback = mapping->a_ops->vm_writeback; writeback = mapping->a_ops->vm_writeback;
if (writeback == NULL) if (writeback == NULL)
writeback = generic_vm_writeback; writeback = generic_vm_writeback;
...@@ -279,7 +320,7 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -279,7 +320,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
*/ */
static /* inline */ int static /* inline */ int
shrink_cache(int nr_pages, struct zone *zone, shrink_cache(int nr_pages, struct zone *zone,
unsigned int gfp_mask, int priority, int max_scan) unsigned int gfp_mask, int max_scan)
{ {
LIST_HEAD(page_list); LIST_HEAD(page_list);
struct pagevec pvec; struct pagevec pvec;
...@@ -298,9 +339,11 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -298,9 +339,11 @@ shrink_cache(int nr_pages, struct zone *zone,
spin_lock_irq(&zone->lru_lock); spin_lock_irq(&zone->lru_lock);
while (max_scan > 0 && nr_pages > 0) { while (max_scan > 0 && nr_pages > 0) {
struct page *page; struct page *page;
int n = 0; int nr_taken = 0;
int nr_scan = 0;
while (n < nr_to_process && !list_empty(&zone->inactive_list)) { while (nr_scan++ < nr_to_process &&
!list_empty(&zone->inactive_list)) {
page = list_entry(zone->inactive_list.prev, page = list_entry(zone->inactive_list.prev,
struct page, lru); struct page, lru);
...@@ -318,18 +361,17 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -318,18 +361,17 @@ shrink_cache(int nr_pages, struct zone *zone,
} }
list_add(&page->lru, &page_list); list_add(&page->lru, &page_list);
page_cache_get(page); page_cache_get(page);
n++; nr_taken++;
} }
zone->nr_inactive -= n; zone->nr_inactive -= nr_taken;
spin_unlock_irq(&zone->lru_lock); spin_unlock_irq(&zone->lru_lock);
if (list_empty(&page_list)) if (nr_taken == 0)
goto done; goto done;
max_scan -= n; max_scan -= nr_scan;
KERNEL_STAT_ADD(pgscan, n); KERNEL_STAT_ADD(pgscan, nr_scan);
nr_pages = shrink_list(&page_list, nr_pages, nr_pages = shrink_list(&page_list,nr_pages,gfp_mask,&max_scan);
gfp_mask, priority, &max_scan);
if (nr_pages <= 0 && list_empty(&page_list)) if (nr_pages <= 0 && list_empty(&page_list))
goto done; goto done;
...@@ -420,6 +462,15 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -420,6 +462,15 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
} }
pte_chain_unlock(page); pte_chain_unlock(page);
} }
/*
* FIXME: need to consider page_count(page) here if/when we
* reap orphaned pages via the LRU (Daniel's locking stuff)
*/
if (total_swap_pages == 0 && !page->mapping &&
!PagePrivate(page)) {
list_add(&page->lru, &l_active);
continue;
}
list_add(&page->lru, &l_inactive); list_add(&page->lru, &l_inactive);
pgdeactivate++; pgdeactivate++;
} }
...@@ -470,11 +521,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -470,11 +521,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
} }
static /* inline */ int static /* inline */ int
shrink_zone(struct zone *zone, int priority, shrink_zone(struct zone *zone, int max_scan,
unsigned int gfp_mask, int nr_pages) unsigned int gfp_mask, int nr_pages)
{ {
unsigned long ratio; unsigned long ratio;
int max_scan;
/* This is bogus for ZONE_HIGHMEM? */ /* This is bogus for ZONE_HIGHMEM? */
if (kmem_cache_reap(gfp_mask) >= nr_pages) if (kmem_cache_reap(gfp_mask) >= nr_pages)
...@@ -497,43 +547,50 @@ shrink_zone(struct zone *zone, int priority, ...@@ -497,43 +547,50 @@ shrink_zone(struct zone *zone, int priority,
atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter); atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter);
refill_inactive_zone(zone, SWAP_CLUSTER_MAX); refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
} }
nr_pages = shrink_cache(nr_pages, zone, gfp_mask, max_scan);
max_scan = zone->nr_inactive / priority;
nr_pages = shrink_cache(nr_pages, zone,
gfp_mask, priority, max_scan);
if (nr_pages <= 0)
return 0;
wakeup_bdflush();
shrink_dcache_memory(priority, gfp_mask);
/* After shrinking the dcache, get rid of unused inodes too .. */
shrink_icache_memory(1, gfp_mask);
#ifdef CONFIG_QUOTA
shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif
return nr_pages; return nr_pages;
} }
static int static int
shrink_caches(struct zone *classzone, int priority, shrink_caches(struct zone *classzone, int priority,
int gfp_mask, int nr_pages) int *total_scanned, int gfp_mask, int nr_pages)
{ {
struct zone *first_classzone; struct zone *first_classzone;
struct zone *zone; struct zone *zone;
first_classzone = classzone->zone_pgdat->node_zones; first_classzone = classzone->zone_pgdat->node_zones;
zone = classzone; for (zone = classzone; zone >= first_classzone; zone--) {
while (zone >= first_classzone && nr_pages > 0) { int max_scan;
if (zone->free_pages <= zone->pages_high) { int to_reclaim;
nr_pages = shrink_zone(zone, priority, int unreclaimed;
gfp_mask, nr_pages);
} to_reclaim = zone->pages_high - zone->free_pages;
zone--; if (to_reclaim < 0)
continue; /* zone has enough memory */
if (to_reclaim > SWAP_CLUSTER_MAX)
to_reclaim = SWAP_CLUSTER_MAX;
if (to_reclaim < nr_pages)
to_reclaim = nr_pages;
/*
* If we cannot reclaim `nr_pages' pages by scanning twice
* that many pages then fall back to the next zone.
*/
max_scan = zone->nr_inactive >> priority;
if (max_scan < to_reclaim * 2)
max_scan = to_reclaim * 2;
unreclaimed = shrink_zone(zone, max_scan, gfp_mask, to_reclaim);
nr_pages -= to_reclaim - unreclaimed;
*total_scanned += max_scan;
} }
shrink_dcache_memory(priority, gfp_mask);
shrink_icache_memory(1, gfp_mask);
#ifdef CONFIG_QUOTA
shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif
return nr_pages; return nr_pages;
} }
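Editorial note, not part of the patch: the per-zone targets above ("reclaim nr_pages by scanning at most twice that many, else move on to the next zone") reduce to a few clamps. A standalone C sketch with illustrative zone numbers:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

int main(void)
{
	/* example zone state, not from the patch */
	long pages_high = 1024, free_pages = 400, nr_inactive = 50000;
	long to_reclaim, max_scan;
	int nr_pages = 32, priority = 12;

	to_reclaim = pages_high - free_pages;
	if (to_reclaim < 0)
		return 0;			/* zone has enough memory */
	if (to_reclaim > SWAP_CLUSTER_MAX)
		to_reclaim = SWAP_CLUSTER_MAX;
	if (to_reclaim < nr_pages)
		to_reclaim = nr_pages;

	max_scan = nr_inactive >> priority;
	if (max_scan < to_reclaim * 2)
		max_scan = to_reclaim * 2;

	printf("reclaim %ld pages, scanning at most %ld\n", to_reclaim, max_scan);
	return 0;
}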
...@@ -564,12 +621,25 @@ try_to_free_pages(struct zone *classzone, ...@@ -564,12 +621,25 @@ try_to_free_pages(struct zone *classzone,
KERNEL_STAT_INC(pageoutrun); KERNEL_STAT_INC(pageoutrun);
for (priority = DEF_PRIORITY; priority; priority--) { for (priority = DEF_PRIORITY; priority; priority--) {
nr_pages = shrink_caches(classzone, priority, int total_scanned = 0;
nr_pages = shrink_caches(classzone, priority, &total_scanned,
gfp_mask, nr_pages); gfp_mask, nr_pages);
if (nr_pages <= 0) if (nr_pages <= 0)
return 1; return 1;
if (total_scanned == 0)
return 1; /* All zones had enough free memory */
if (!(gfp_mask & __GFP_FS)) if (!(gfp_mask & __GFP_FS))
break; break; /* Let the caller handle it */
/*
* Try to write back as many pages as we just scanned. Not
* sure if that makes sense, but it's an attempt to avoid
* creating IO storms unnecessarily
*/
wakeup_bdflush(total_scanned);
/* Take a nap, wait for some writeback to complete */
blk_congestion_wait(WRITE, HZ/4);
} }
if (gfp_mask & __GFP_FS) if (gfp_mask & __GFP_FS)
out_of_memory(); out_of_memory();
......