Merge penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/andrew

into penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/linux

Merge penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/andrew
into penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/linux
fda0b1ed · Linus Torvalds · 81ad17d7 · 5de3d3bd · fda0b1ed · fda0b1ed
Commit fda0b1ed authored Oct 02, 2002 by Linus Torvalds
39 changed files
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -31,7 +31,7 @@ al espa
 Eine deutsche Version dieser Datei finden Sie unter
 <http://www.stefan-winter.de/Changes-2.4.0.txt>.

-Last updated: January 22, 2002
+Last updated: October 1st, 2002

 Chris Ricker (kaboom@gatech.edu or chris.ricker@genetics.utah.edu).

@@ -60,7 +60,8 @@ o  xfsprogs               2.1.0                   # xfs_db -V
 o  pcmcia-cs              3.1.21                  # cardmgr -V
 o  PPP                    2.4.0                   # pppd --version
 o  isdn4k-utils           3.1pre1                 # isdnctrl 2>&1|grep version
-			  
+o  procps                 2.0.9                   # ps --version
+
 Kernel compilation
 ==================

@@ -80,9 +81,7 @@ almost certainly bugs (mainly, but not exclusively, in the kernel) that
 will need to be fixed in order to use these compilers. In any case, using
 pgcc instead of plain gcc is just asking for trouble.

-Note that gcc 2.7.2.3 and  gcc 2.91.66 (egcs-1.1.2) are no longer supported
-kernel compilers. The kernel no longer works around bugs in these versions,
-and, in fact, will refuse to be compiled with it.
+gcc 2.91.66 (egcs-1.1.2) continues to be supported for SPARC64 requirements.

 The Red Hat gcc 2.96 compiler subtree can also be used to build this tree.
 You should ensure you use gcc-2.96-74 or later. gcc-2.96-54 will not build

--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -191,6 +191,7 @@ Table 1-3: Kernel info in /proc
 ..............................................................................
 File        Content                                           
 apm         Advanced power management info                    
+ buddyinfo   Kernel memory allocator information (see text)	(2.5)
 bus         Directory containing bus specific information     
 cmdline     Kernel command line                               
 cpuinfo     Info about the CPU                                
@@ -275,7 +276,7 @@ output of a SMP machine):
  ERR:       2155

 NMI is incremented in this case because every timer interrupt generates a NMI
-(Non Maskable Interrupt) which is used by the NMI Watchdog to detect lookups.
+(Non Maskable Interrupt) which is used by the NMI Watchdog to detect lockups.

 LOC is the local interrupt counter of the internal APIC of every CPU.

@@ -326,6 +327,25 @@ Linux uses  slab  pools for memory management above page level in version 2.2.
 Commonly used  objects  have  their  own  slab  pool (such as network buffers,
 directory cache, and so on).

+..............................................................................
+
+> cat /proc/buddyinfo
+
+Node 0, zone      DMA      0      4      5      4      4      3 ...
+Node 0, zone   Normal      1      0      0      1    101      8 ...
+Node 0, zone  HighMem      2      0      0      1      1      0 ...
+
+Memory fragmentation is a problem under some workloads, and buddyinfo is a 
+useful tool for helping diagnose these problems.  Buddyinfo will give you a 
+clue as to how big an area you can safely allocate, or why a previous
+allocation failed.
+
+Each column represents the number of free chunks of a certain order (that is,
+of 2^order contiguous pages).  In this case, there are 0 chunks of
+2^0*PAGE_SIZE available in ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA,
+101 chunks of 2^4*PAGE_SIZE available in ZONE_NORMAL, etc.
+
+
 1.3 IDE devices in /proc/ide
 ----------------------------


--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -25,13 +25,13 @@ key: If a user application wants to share hugepages with other
      memory (mapped by hugeTLBs) in their address space.  When a process
      forks, then children share the same physical memory with their parent.

-      For the cases when an application wishes to keep the huge pages
-      private, the key value of 0 is defined.  In this case kernel allocates
-      hugetlb pages to the process that are not shareable across different
-      processes.  These segments are marked private for the process.  These
-      segments are not copied to children's address space on forks.
-
-AKPM: So what is present at that address within the child?
+      For the cases when an application wishes to keep the huge
+      pages private, the key value of 0 is defined.  In this case
+      kernel allocates hugetlb pages to the process that are not
+      shareable across different processes.  These segments are marked
+      private for the process.  These segments are not copied to
+      children's address space on forks - the child will have no
+      mapping for these virtual addresses.

      The key manangement (and assignment) part is left to user
      applications.

--- a/arch/i386/kernel/ioport.c
+++ b/arch/i386/kernel/ioport.c
@@ -56,6 +56,7 @@ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 {
 	struct thread_struct * t = &current->thread;
 	struct tss_struct * tss;
+	unsigned long *bitmap = NULL;
 	int ret = 0;

 	if ((from + num <= from) || (from + num > IO_BITMAP_SIZE*32))
@@ -63,15 +64,12 @@ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 	if (turn_on && !capable(CAP_SYS_RAWIO))
 		return -EPERM;

-	tss = init_tss + get_cpu();
-
 	/*
 	 * If it's the first ioperm() call in this thread's lifetime, set the
 	 * IO bitmap up. ioperm() is much less timing critical than clone(),
 	 * this is why we delay this operation until now:
 	 */
 	if (!t->ts_io_bitmap) {
-		unsigned long *bitmap;
 		bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!bitmap) {
 			ret = -ENOMEM;
@@ -83,20 +81,19 @@ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 		 */
 		memset(bitmap, 0xff, IO_BITMAP_BYTES);
 		t->ts_io_bitmap = bitmap;
-		/*
-		 * this activates it in the TSS
-		 */
-		tss->bitmap = IO_BITMAP_OFFSET;
 	}

+	tss = init_tss + get_cpu();
+	if (bitmap)
+		tss->bitmap = IO_BITMAP_OFFSET;	/* Activate it in the TSS */
+
 	/*
 	 * do it in the per-thread copy and in the TSS ...
 	 */
 	set_bitmap(t->ts_io_bitmap, from, num, !turn_on);
 	set_bitmap(tss->io_bitmap, from, num, !turn_on);
-
-out:
 	put_cpu();
+out:
 	return ret;
 }


--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -58,6 +58,7 @@ static int queue_nr_requests;
 static int batch_requests;

 unsigned long blk_max_low_pfn, blk_max_pfn;
+atomic_t nr_iowait_tasks = ATOMIC_INIT(0);
 int blk_nohighio = 0;

 static struct congestion_state {
@@ -116,6 +117,27 @@ static void set_queue_congested(request_queue_t *q, int rw)
 		atomic_inc(&congestion_states[rw].nr_congested_queues);
 }

+/*
+ * This task is about to go to sleep on IO.  Increment nr_iowait_tasks so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * NOTE: the count is bumped unconditionally.  Deliberate, throttling IO waits
+ * (tasks which have set a backing_dev_info to throttle against) are counted too.
+ */
+void io_schedule(void)
+{
+	atomic_inc(&nr_iowait_tasks);
+	schedule();
+	atomic_dec(&nr_iowait_tasks);
+}
+
+void io_schedule_timeout(long timeout)
+{
+	atomic_inc(&nr_iowait_tasks);
+	schedule_timeout(timeout);
+	atomic_dec(&nr_iowait_tasks);
+}
+
 /**
 * bdev_get_queue: - return the queue that matches the given device
 * @bdev:    device
@@ -1274,7 +1296,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw)
 		prepare_to_wait_exclusive(&rl->wait, &wait,
 					TASK_UNINTERRUPTIBLE);
 		if (!rl->count)
-			schedule();
+			io_schedule();
 		finish_wait(&rl->wait, &wait);
 		spin_lock_irq(q->queue_lock);
 		rq = get_request(q, rw);
@@ -1497,7 +1519,7 @@ void blk_congestion_wait(int rw, long timeout)
 	blk_run_queues();
 	prepare_to_wait(&cs->wqh, &wait, TASK_UNINTERRUPTIBLE);
 	if (atomic_read(&cs->nr_congested_queues) != 0)
-		schedule_timeout(timeout);
+		io_schedule_timeout(timeout);
 	finish_wait(&cs->wqh, &wait);
 }

@@ -1856,21 +1878,14 @@ int submit_bio(int rw, struct bio *bio)
 {
 	int count = bio_sectors(bio);

-	/*
-	 * do some validity checks...
-	 */
 	BUG_ON(!bio->bi_end_io);
-
 	BIO_BUG_ON(!bio->bi_size);
 	BIO_BUG_ON(!bio->bi_io_vec);
-
 	bio->bi_rw = rw;
-
 	if (rw & WRITE)
-		kstat.pgpgout += count;
+		mod_page_state(pgpgout, count);
 	else
-		kstat.pgpgin += count;
-
+		mod_page_state(pgpgin, count);
 	generic_make_request(bio);
 	return 1;
 }

--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -443,7 +443,7 @@ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 	 * We really want to use invalidate_inode_pages2() for
 	 * that, but not until that's cleaned up.
 	 */
-	invalidate_inode_pages(bdev->bd_inode);
+	invalidate_inode_pages(bdev->bd_inode->i_mapping);
 }

 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)

--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -18,16 +18,11 @@
 #include <linux/bio.h>
 #include <linux/wait.h>
 #include <linux/err.h>
+#include <linux/blkdev.h>
 #include <linux/buffer_head.h>
 #include <linux/rwsem.h>
 #include <asm/atomic.h>

-/*
- * The largest-sized BIO which this code will assemble, in bytes.  Set this
- * to PAGE_SIZE if your drivers are broken.
- */
-#define DIO_BIO_MAX_SIZE (16*1024)
-
 /*
 * How many user pages to map in one call to get_user_pages().  This determines
 * the size of a structure on the stack.
@@ -37,7 +32,6 @@
 struct dio {
 	/* BIO submission state */
 	struct bio *bio;		/* bio under assembly */
-	struct bio_vec *bvec;		/* current bvec in that bio */
 	struct inode *inode;
 	int rw;
 	unsigned blkbits;		/* doesn't change */
@@ -179,15 +173,10 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 		return -ENOMEM;

 	bio->bi_bdev = bdev;
-	bio->bi_vcnt = nr_vecs;
-	bio->bi_idx = 0;
-	bio->bi_size = 0;
 	bio->bi_sector = first_sector;
-	bio->bi_io_vec[0].bv_page = NULL;
 	bio->bi_end_io = dio_bio_end_io;

 	dio->bio = bio;
-	dio->bvec = NULL;		/* debug */
 	return 0;
 }

@@ -195,14 +184,11 @@ static void dio_bio_submit(struct dio *dio)
 {
 	struct bio *bio = dio->bio;

-	bio->bi_vcnt = bio->bi_idx;
-	bio->bi_idx = 0;
 	bio->bi_private = dio;
 	atomic_inc(&dio->bio_count);
 	submit_bio(dio->rw, bio);

 	dio->bio = NULL;
-	dio->bvec = NULL;
 	dio->boundary = 0;
 }

@@ -230,7 +216,7 @@ static struct bio *dio_await_one(struct dio *dio)
 			dio->waiter = current;
 			spin_unlock_irqrestore(&dio->bio_list_lock, flags);
 			blk_run_queues();
-			schedule();
+			io_schedule();
 			spin_lock_irqsave(&dio->bio_list_lock, flags);
 			dio->waiter = NULL;
 		}
@@ -393,8 +379,7 @@ static void dio_prep_bio(struct dio *dio)
 	if (dio->bio == NULL)
 		return;

-	if (dio->bio->bi_idx == dio->bio->bi_vcnt ||
-			dio->boundary ||
+	if (dio->boundary ||
 			dio->last_block_in_bio != dio->next_block_in_bio - 1)
 		dio_bio_submit(dio);
 }
@@ -405,19 +390,44 @@ static void dio_prep_bio(struct dio *dio)
 static int dio_new_bio(struct dio *dio)
 {
 	sector_t sector;
-	int ret;
+	int ret, nr_pages;

 	ret = dio_bio_reap(dio);
 	if (ret)
 		goto out;
 	sector = dio->next_block_in_bio << (dio->blkbits - 9);
-	ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector,
-				DIO_BIO_MAX_SIZE / PAGE_SIZE);
+	nr_pages = min(dio->total_pages, BIO_MAX_PAGES);
+	ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
 	dio->boundary = 0;
 out:
 	return ret;
 }

+
+static int
+dio_bio_add_page(struct dio *dio, struct page *page,
+		unsigned int bv_len, unsigned int bv_offset)
+{
+	int ret = 0;
+
+	if (bv_len == 0) 
+		goto out;
+
+	page_cache_get(page);
+	if (bio_add_page(dio->bio, page, bv_len, bv_offset)) {
+		dio_bio_submit(dio);
+		ret = dio_new_bio(dio);
+		if (ret == 0) {
+			ret = bio_add_page(dio->bio, page, bv_len, bv_offset);
+			BUG_ON(ret != 0);
+		}
+	}
+	page_cache_release(page);
+out:
+	return ret;
+}
+
+
 /*
 * Walk the user pages, and the file, mapping blocks to disk and emitting BIOs.
 *
@@ -438,13 +448,15 @@ int do_direct_IO(struct dio *dio)
 	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
 	struct page *page;
 	unsigned block_in_page;
-	int ret;
+	int ret = 0;

 	/* The I/O can start at any block offset within the first page */
 	block_in_page = dio->first_block_in_page;

 	while (dio->block_in_file < dio->final_block_in_request) {
 		int new_page;	/* Need to insert this page into the BIO? */
+		unsigned int bv_offset;
+		unsigned int bv_len;

 		page = dio_get_page(dio);
 		if (IS_ERR(page)) {
@@ -453,15 +465,16 @@ int do_direct_IO(struct dio *dio)
 		}

 		new_page = 1;
+		bv_offset = 0;
+		bv_len = 0;
 		while (block_in_page < blocks_per_page) {
-			struct bio *bio;
 			unsigned this_chunk_bytes;	/* # of bytes mapped */
 			unsigned this_chunk_blocks;	/* # of blocks */
 			unsigned u;

 			ret = get_more_blocks(dio);
 			if (ret)
-				goto fail_release;
+				goto out;

 			/* Handle holes */
 			if (!buffer_mapped(&dio->map_bh)) {
@@ -480,24 +493,19 @@ int do_direct_IO(struct dio *dio)
 			if (dio->bio == NULL) {
 				ret = dio_new_bio(dio);
 				if (ret)
-					goto fail_release;
+					goto out;
 				new_page = 1;
 			}

-			bio = dio->bio;
 			if (new_page) {
-				dio->bvec = &bio->bi_io_vec[bio->bi_idx];
-				page_cache_get(page);
-				dio->bvec->bv_page = page;
-				dio->bvec->bv_len = 0;
-				dio->bvec->bv_offset = block_in_page << blkbits;
-				bio->bi_idx++;
+				bv_len = 0;
+				bv_offset = block_in_page << blkbits;
 				new_page = 0;
 			}

 			/* Work out how much disk we can add to this page */
 			this_chunk_blocks = dio->blocks_available;
-			u = (PAGE_SIZE - (dio->bvec->bv_offset + dio->bvec->bv_len)) >> blkbits;
+			u = (PAGE_SIZE - (bv_len + bv_offset)) >> blkbits;
 			if (this_chunk_blocks > u)
 				this_chunk_blocks = u;
 			u = dio->final_block_in_request - dio->block_in_file;
@@ -506,8 +514,7 @@ int do_direct_IO(struct dio *dio)
 			this_chunk_bytes = this_chunk_blocks << blkbits;
 			BUG_ON(this_chunk_bytes == 0);

-			dio->bvec->bv_len += this_chunk_bytes;
-			bio->bi_size += this_chunk_bytes;
+			bv_len += this_chunk_bytes;
 			dio->next_block_in_bio += this_chunk_blocks;
 			dio->last_block_in_bio = dio->next_block_in_bio - 1;
 			dio->boundary = buffer_boundary(&dio->map_bh);
@@ -520,13 +527,11 @@ int do_direct_IO(struct dio *dio)
 			if (dio->block_in_file == dio->final_block_in_request)
 				break;
 		}
+		ret = dio_bio_add_page(dio, page, bv_len, bv_offset);
+		if (ret)
+			goto out;
 		block_in_page = 0;
-		page_cache_release(page);
 	}
-	ret = 0;
-	goto out;
-fail_release:
-	page_cache_release(page);
 out:
 	return ret;
 }
@@ -542,7 +547,6 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
 	size_t bytes, tot_bytes = 0;

 	dio.bio = NULL;
-	dio.bvec = NULL;
 	dio.inode = inode;
 	dio.rw = rw;
 	dio.blkbits = blkbits;

--- a/fs/inode.c
+++ b/fs/inode.c
@@ -147,10 +147,12 @@ static void destroy_inode(struct inode *inode)
 	if (inode_has_buffers(inode))
 		BUG();
 	security_ops->inode_free_security(inode);
-	if (inode->i_sb->s_op->destroy_inode)
+	if (inode->i_sb->s_op->destroy_inode) {
 		inode->i_sb->s_op->destroy_inode(inode);
-	else
+	} else {
+		BUG_ON(inode->i_data.page_tree.rnode != NULL);
 		kmem_cache_free(inode_cachep, (inode));
+	}
 }



--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -301,7 +301,7 @@ jffs_setattr(struct dentry *dentry, struct iattr *iattr)
 		inode->i_blocks = (inode->i_size + 511) >> 9;

 		if (len) {
-			invalidate_inode_pages(inode);
+			invalidate_inode_pages(inode->i_mapping);
 		}
 		inode->i_ctime = CURRENT_TIME;
 		inode->i_mtime = inode->i_ctime;
@@ -1520,7 +1520,7 @@ jffs_file_write(struct file *filp, const char *buf, size_t count,
 	}
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	mark_inode_dirty(inode);
-	invalidate_inode_pages(inode);
+	invalidate_inode_pages(inode->i_mapping);

 out_isem:
 	return err;

--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -125,14 +125,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 	 *	 throught inode->i_sem or some other mechanism.
 	 */
 	if (page->index == 0)
-		invalidate_inode_pages(inode);
+		invalidate_inode_pages(inode->i_mapping);
 	unlock_page(page);
 	return 0;
 error:
 	SetPageError(page);
 	kunmap(page);
 	unlock_page(page);
-	invalidate_inode_pages(inode);
+	invalidate_inode_pages(inode->i_mapping);
 	desc->error = error;
 	return -EIO;
 }

--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -564,7 +564,7 @@ nfs_zap_caches(struct inode *inode)
 	NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
 	NFS_ATTRTIMEO_UPDATE(inode) = jiffies;

-	invalidate_inode_pages(inode);
+	invalidate_inode_pages(inode->i_mapping);

 	memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
 	NFS_CACHEINV(inode);
@@ -1130,7 +1130,7 @@ __nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 	if (invalid) {
 		NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
 		NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
-		invalidate_inode_pages(inode);
+		invalidate_inode_pages(inode->i_mapping);
 		memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
 	} else if (time_after(jiffies, NFS_ATTRTIMEO_UPDATE(inode)+NFS_ATTRTIMEO(inode))) {
 		if ((NFS_ATTRTIMEO(inode) <<= 1) > NFS_MAXATTRTIMEO(inode))

--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -252,6 +252,18 @@ static struct file_operations proc_cpuinfo_operations = {
 	.release	= seq_release,
 };

+extern struct seq_operations vmstat_op;
+static int vmstat_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &vmstat_op);
+}
+static struct file_operations proc_vmstat_file_operations = {
+	open:		vmstat_open,
+	read:		seq_read,
+	llseek:		seq_lseek,
+	release:	seq_release,
+};
+
 #ifdef CONFIG_PROC_HARDWARE
 static int hardware_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
@@ -327,7 +339,7 @@ static int kstat_read_proc(char *page, char **start, off_t off,
 	int i, len;
 	extern unsigned long total_forks;
 	unsigned long jif = jiffies;
-	unsigned int sum = 0, user = 0, nice = 0, system = 0;
+	unsigned int sum = 0, user = 0, nice = 0, system = 0, idle = 0, iowait = 0;
 	int major, disk;

 	for (i = 0 ; i < NR_CPUS; i++) {
@@ -337,38 +349,32 @@ static int kstat_read_proc(char *page, char **start, off_t off,
 		user += kstat.per_cpu_user[i];
 		nice += kstat.per_cpu_nice[i];
 		system += kstat.per_cpu_system[i];
+		idle += kstat.per_cpu_idle[i];
+		iowait += kstat.per_cpu_iowait[i];
 #if !defined(CONFIG_ARCH_S390)
 		for (j = 0 ; j < NR_IRQS ; j++)
 			sum += kstat.irqs[i][j];
 #endif
 	}

-	len = sprintf(page, "cpu  %u %u %u %lu\n",
+	len = sprintf(page, "cpu  %u %u %u %u %u\n",
 		jiffies_to_clock_t(user),
 		jiffies_to_clock_t(nice),
 		jiffies_to_clock_t(system),
-		jiffies_to_clock_t(jif * num_online_cpus() - (user + nice + system)));
+		jiffies_to_clock_t(idle),
+		jiffies_to_clock_t(iowait));
 	for (i = 0 ; i < NR_CPUS; i++){
 		if (!cpu_online(i)) continue;
-		len += sprintf(page + len, "cpu%d %u %u %u %lu\n",
+		len += sprintf(page + len, "cpu%d %u %u %u %u %u\n",
 			i,
 			jiffies_to_clock_t(kstat.per_cpu_user[i]),
 			jiffies_to_clock_t(kstat.per_cpu_nice[i]),
 			jiffies_to_clock_t(kstat.per_cpu_system[i]),
-			jiffies_to_clock_t(jif - (  kstat.per_cpu_user[i] \
-				   + kstat.per_cpu_nice[i] \
-				   + kstat.per_cpu_system[i])));
+			jiffies_to_clock_t(kstat.per_cpu_idle[i]),
+			jiffies_to_clock_t(kstat.per_cpu_iowait[i]));
 	}
-	len += sprintf(page + len,
-		"page %u %u\n"
-		"swap %u %u\n"
-		"intr %u",
-			kstat.pgpgin >> 1,
-			kstat.pgpgout >> 1,
-			kstat.pswpin,
-			kstat.pswpout,
-			sum
-	);
+	len += sprintf(page + len, "intr %u", sum);
+
 #if !defined(CONFIG_ARCH_S390)
 	for (i = 0 ; i < NR_IRQS ; i++)
 		len += sprintf(page + len, " %u", kstat_irqs(i));
@@ -395,29 +401,9 @@ static int kstat_read_proc(char *page, char **start, off_t off,
 	}

 	len += sprintf(page + len,
-		"\npageallocs %u\n"
-		"pagefrees %u\n"
-		"pageactiv %u\n"
-		"pagedeact %u\n"
-		"pagefault %u\n"
-		"majorfault %u\n"
-		"pagescan %u\n"
-		"pagesteal %u\n"
-		"pageoutrun %u\n"
-		"allocstall %u\n"
-		"ctxt %lu\n"
+		"\nctxt %lu\n"
 		"btime %lu\n"
 		"processes %lu\n",
-		kstat.pgalloc,
-		kstat.pgfree,
-		kstat.pgactivate,
-		kstat.pgdeactivate,
-		kstat.pgfault,
-		kstat.pgmajfault,
-		kstat.pgscan,
-		kstat.pgsteal,
-		kstat.pageoutrun,
-		kstat.allocstall,
 		nr_context_switches(),
 		xtime.tv_sec - jif / HZ,
 		total_forks);
@@ -646,6 +632,7 @@ void __init proc_misc_init(void)
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
 	create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
+	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
 #ifdef CONFIG_MODULES
 	create_seq_entry("modules", 0, &proc_modules_operations);
 	create_seq_entry("ksyms", 0, &proc_ksyms_operations);

--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -210,7 +210,7 @@ smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
 			(long) last_sz, (long) inode->i_size);

 		if (!S_ISDIR(inode->i_mode))
-			invalidate_inode_pages(inode);
+			invalidate_inode_pages(inode->i_mapping);
 	}
 }

@@ -274,7 +274,7 @@ smb_refresh_inode(struct dentry *dentry)
 			 * But we do want to invalidate the caches ...
 			 */
 			if (!S_ISDIR(inode->i_mode))
-				invalidate_inode_pages(inode);
+				invalidate_inode_pages(inode->i_mapping);
 			else
 				smb_invalid_dir_cache(inode);
 			error = -EIO;

--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -400,4 +400,8 @@ static inline void put_dev_sector(Sector p)
 	page_cache_release(p.v);
 }

+extern atomic_t nr_iowait_tasks;
+void io_schedule(void);
+void io_schedule_timeout(long timeout);
+
 #endif
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1140,7 +1140,7 @@ extern int full_check_disk_change(struct block_device *);
 extern int __check_disk_change(dev_t);
 extern int invalidate_inodes(struct super_block *);
 extern int invalidate_device(kdev_t, int);
-extern void invalidate_inode_pages(struct inode *);
+extern void invalidate_inode_pages(struct address_space *mapping);
 extern void invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
 extern int filemap_fdatawrite(struct address_space *);

--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -18,19 +18,14 @@
 struct kernel_stat {
 	unsigned int per_cpu_user[NR_CPUS],
 	             per_cpu_nice[NR_CPUS],
-	             per_cpu_system[NR_CPUS];
+	             per_cpu_system[NR_CPUS],
+	             per_cpu_idle[NR_CPUS],
+	             per_cpu_iowait[NR_CPUS];
 	unsigned int dk_drive[DK_MAX_MAJOR][DK_MAX_DISK];
 	unsigned int dk_drive_rio[DK_MAX_MAJOR][DK_MAX_DISK];
 	unsigned int dk_drive_wio[DK_MAX_MAJOR][DK_MAX_DISK];
 	unsigned int dk_drive_rblk[DK_MAX_MAJOR][DK_MAX_DISK];
 	unsigned int dk_drive_wblk[DK_MAX_MAJOR][DK_MAX_DISK];
-	unsigned int pgpgin, pgpgout;
-	unsigned int pswpin, pswpout;
-	unsigned int pgalloc, pgfree;
-	unsigned int pgactivate, pgdeactivate;
-	unsigned int pgfault, pgmajfault;
-	unsigned int pgscan, pgsteal;
-	unsigned int pageoutrun, allocstall;
 #if !defined(CONFIG_ARCH_S390)
 	unsigned int irqs[NR_CPUS][NR_IRQS];
 #endif

--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -70,7 +70,8 @@
 #define PG_direct		16	/* ->pte_chain points directly at pte */

 /*
- * Global page accounting.  One instance per CPU.
+ * Global page accounting.  One instance per CPU.  Only unsigned longs are
+ * allowed.
 */
 extern struct page_state {
 	unsigned long nr_dirty;
@@ -80,9 +81,32 @@ extern struct page_state {
 	unsigned long nr_reverse_maps;
 	unsigned long nr_mapped;
 	unsigned long nr_slab;
+#define GET_PAGE_STATE_LAST nr_slab
+
+	/*
+	 * The below are zeroed by get_page_state().  Use get_full_page_state()
+	 * to add up all these.
+	 */
+	unsigned long pgpgin;
+	unsigned long pgpgout;
+	unsigned long pswpin;
+	unsigned long pswpout;
+	unsigned long pgalloc;
+	unsigned long pgfree;
+	unsigned long pgactivate;
+	unsigned long pgdeactivate;
+	unsigned long pgfault;
+	unsigned long pgmajfault;
+	unsigned long pgscan;
+	unsigned long pgrefill;
+	unsigned long pgsteal;
+	unsigned long kswapd_steal;
+	unsigned long pageoutrun;
+	unsigned long allocstall;
 } ____cacheline_aligned_in_smp page_states[NR_CPUS];

 extern void get_page_state(struct page_state *ret);
+extern void get_full_page_state(struct page_state *ret);

 #define mod_page_state(member, delta)					\
 	do {								\

--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -41,6 +41,9 @@ extern struct page * find_trylock_page(struct address_space *mapping,
 				unsigned long index);
 extern struct page * find_or_create_page(struct address_space *mapping,
 				unsigned long index, unsigned int gfp_mask);
+extern unsigned int find_get_pages(struct address_space *mapping,
+				pgoff_t start, unsigned int nr_pages,
+				struct page **pages);

 /*
 * Returns locked page at given index in given cache, creating it if needed.

--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -8,6 +8,7 @@
 #define PAGEVEC_SIZE	16

 struct page;
+struct address_space;

 struct pagevec {
 	unsigned nr;
@@ -21,6 +22,8 @@ void __pagevec_lru_add(struct pagevec *pvec);
 void lru_add_drain(void);
 void pagevec_deactivate_inactive(struct pagevec *pvec);
 void pagevec_strip(struct pagevec *pvec);
+unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned int nr_pages);

 static inline void pagevec_init(struct pagevec *pvec)
 {

--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -45,5 +45,8 @@ extern int radix_tree_reserve(struct radix_tree_root *, unsigned long, void ***)
 extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 extern int radix_tree_delete(struct radix_tree_root *, unsigned long);
+extern unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+			unsigned long first_index, unsigned int max_items);

 #endif /* _LINUX_RADIX_TREE_H */
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -430,6 +430,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
 #define PF_FROZEN	0x00040000	/* frozen for system suspend */
 #define PF_SYNC		0x00080000	/* performing fsync(), etc */
 #define PF_FSTRANS	0x00100000	/* inside a filesystem transaction */
+#define PF_KSWAPD	0x00200000	/* I am kswapd */

 /*
 * Ptrace flags

--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -11,11 +11,11 @@ extern atomic_t shmem_nrpages;

 struct shmem_inode_info {
 	spinlock_t		lock;
-	struct semaphore 	sem;
 	unsigned long		next_index;
 	swp_entry_t		i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */
-	void		      **i_indirect; /* indirect blocks */
-	unsigned long		swapped;
+	struct page	       *i_indirect; /* indirect blocks */
+	unsigned long		alloced;    /* data pages allocated to file */
+	unsigned long		swapped;    /* subtotal assigned to swap */
 	unsigned long		flags;
 	struct list_head	list;
 	struct inode		vfs_inode;

--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -218,7 +218,7 @@ extern spinlock_t swaplock;
 #define swap_device_lock(p)	spin_lock(&p->sdev_lock)
 #define swap_device_unlock(p)	spin_unlock(&p->sdev_lock)

-extern void shmem_unuse(swp_entry_t entry, struct page *page);
+extern int shmem_unuse(swp_entry_t entry, struct page *page);

 #endif /* __KERNEL__*/


--- a/init/main.c
+++ b/init/main.c
@@ -555,8 +555,6 @@ static int init(void * unused)
 	unlock_kernel();
 	system_running = 1;

-	kstat.pgfree = 0;
-
 	if (open("/dev/console", O_RDWR, 0) < 0)
 		printk("Warning: unable to open an initial console.\n");


--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -28,6 +28,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
+#include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/timer.h>

@@ -866,6 +867,10 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 		/* note: this timer irq context must be accounted for as well */
 		if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
 			kstat.per_cpu_system[cpu] += sys_ticks;
+		else if (atomic_read(&nr_iowait_tasks) > 0)
+			kstat.per_cpu_iowait[cpu] += sys_ticks;
+		else
+			kstat.per_cpu_idle[cpu] += sys_ticks;
 #if CONFIG_SMP
 		idle_tick(rq);
 #endif

--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -43,6 +43,7 @@ struct radix_tree_path {
 };

 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)

 /*
 * Radix tree node cache.
@@ -218,9 +219,113 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)

 	return (void *) *slot;
 }
-
 EXPORT_SYMBOL(radix_tree_lookup);

+/* One descent from @index: store up to @max_items items of the leaf reached in
+ * @results, set *@next_index to the resume index, return the number stored. */
+static /* inline */ unsigned int
+__lookup(struct radix_tree_root *root, void **results, unsigned long index,
+	unsigned int max_items, unsigned long *next_index,
+	unsigned long max_index)
+{
+	unsigned int nr_found = 0;
+	unsigned int shift;
+	unsigned int height = root->height;
+	struct radix_tree_node *slot;
+
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+	slot = root->rnode;
+
+	while (height > 0) {
+		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
+		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
+			if (slot->slots[i] != NULL)
+				break;
+			index &= ~((1UL << shift) - 1);	/* 1UL, not int 1: */
+			index += 1UL << shift;		/* shift may exceed 31 */
+		}
+		if (i == RADIX_TREE_MAP_SIZE)
+			goto out;
+		height--;
+		shift -= RADIX_TREE_MAP_SHIFT;
+		if (height == 0) {
+			/* Bottom level: grab some items */
+			unsigned long j;
+
+			BUG_ON((shift + RADIX_TREE_MAP_SHIFT) != 0);
+			j = index & RADIX_TREE_MAP_MASK;
+			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
+				index++;
+				if (slot->slots[j]) {
+					results[nr_found++] = slot->slots[j];
+					if (nr_found == max_items)
+						goto out;
+				}
+			}
+		}
+		slot = slot->slots[i];
+	}
+out:
+	*next_index = index;
+	return nr_found;
+}
+/**
+ *	radix_tree_gang_lookup - perform multiple lookups on a radix tree
+ *	@root:		radix tree root
+ *	@results:	where the results of the lookup are placed
+ *	@first_index:	start the lookup from this key
+ *	@max_items:	place up to this many items at *results
+ *
+ *	Performs an index-ascending scan of the tree for present items.  Places
+ *	them at *@results and returns the number of items which were placed at
+ *	*@results.
+ *
+ *	The implementation is naive.
+ */
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+			unsigned long first_index, unsigned int max_items)
+{
+	const unsigned long max_index = radix_tree_maxindex(root->height);
+	unsigned long cur_index = first_index;
+	unsigned int ret = 0;
+
+	if (root->rnode == NULL)
+		goto out;
+	if (max_index == 0) {			/* Bah.  Special case */
+		if (first_index == 0) {
+			if (max_items > 0) {
+				*results = root->rnode;
+				ret = 1;
+			}
+		}
+		goto out;
+	}
+	while (ret < max_items) {
+		unsigned int nr_found;
+		unsigned long next_index;	/* Index of next search */
+
+		if (cur_index > max_index)
+			break;
+		nr_found = __lookup(root, results + ret, cur_index,
+				max_items - ret, &next_index, max_index);
+		if (nr_found == 0) {
+			 if (!(cur_index & RADIX_TREE_MAP_MASK))
+				break;
+			/*
+			 * It could be that there simply were no items to the
+			 * right of `cur_index' in the leaf node.  So we still
+			 * need to search for additional nodes to the right of
+			 * this one.
+			 */
+		}
+		ret += nr_found;
+		cur_index = next_index;
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL(radix_tree_gang_lookup);

 /**
 *	radix_tree_delete    -    delete an item from a radix tree
@@ -231,7 +336,7 @@ EXPORT_SYMBOL(radix_tree_lookup);
 */
 int radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 {
-	struct radix_tree_path path[RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2], *pathp = path;
+	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
 	unsigned int height, shift;

 	height = root->height;

--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,6 +9,7 @@ obj-y	 := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
 	    vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
 	    page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
 	    shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
-	    pdflush.o page-writeback.o rmap.o madvise.o vcache.o
+	    pdflush.o page-writeback.o rmap.o madvise.o vcache.o \
+	    truncate.o

 include $(TOPDIR)/Rules.make
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -24,6 +24,7 @@
 #include <linux/hash.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
+#include <linux/blkdev.h>
 #include <linux/security.h>
 /*
 * This is needed for the following functions:
@@ -51,7 +52,6 @@
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

-
 /*
 * Lock ordering:
 *
@@ -73,10 +73,9 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;

-	if (unlikely(PageDirty(page)) && !PageSwapCache(page))
-		BUG();
+	BUG_ON(PageDirty(page) && !PageSwapCache(page));

-	radix_tree_delete(&page->mapping->page_tree, page->index);
+	radix_tree_delete(&mapping->page_tree, page->index);
 	list_del(&page->list);
 	page->mapping = NULL;

@@ -105,341 +104,6 @@ static inline int sync_page(struct page *page)
 	return 0;
 }

-/**
- * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
- * @inode: the inode which pages we want to invalidate
- *
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
- */
-
-void invalidate_inode_pages(struct inode * inode)
-{
-	struct list_head *head, *curr;
-	struct page * page;
-	struct address_space *mapping = inode->i_mapping;
-	struct pagevec pvec;
-
-	head = &mapping->clean_pages;
-	pagevec_init(&pvec);
-	write_lock(&mapping->page_lock);
-	curr = head->next;
-
-	while (curr != head) {
-		page = list_entry(curr, struct page, list);
-		curr = curr->next;
-
-		/* We cannot invalidate something in dirty.. */
-		if (PageDirty(page))
-			continue;
-
-		/* ..or locked */
-		if (TestSetPageLocked(page))
-			continue;
-
-		if (PagePrivate(page) && !try_to_release_page(page, 0))
-			goto unlock;
-
-		if (page_count(page) != 1)
-			goto unlock;
-
-		__remove_from_page_cache(page);
-		unlock_page(page);
-		if (!pagevec_add(&pvec, page))
-			__pagevec_release(&pvec);
-		continue;
-unlock:
-		unlock_page(page);
-		continue;
-	}
-
-	write_unlock(&mapping->page_lock);
-	pagevec_release(&pvec);
-}
-
-static int do_invalidatepage(struct page *page, unsigned long offset)
-{
-	int (*invalidatepage)(struct page *, unsigned long);
-	invalidatepage = page->mapping->a_ops->invalidatepage;
-	if (invalidatepage)
-		return (*invalidatepage)(page, offset);
-	return block_invalidatepage(page, offset);
-}
-
-static inline void truncate_partial_page(struct page *page, unsigned partial)
-{
-	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
-	if (PagePrivate(page))
-		do_invalidatepage(page, partial);
-}
-
-/*
- * If truncate cannot remove the fs-private metadata from the page, the page
- * becomes anonymous.  It will be left on the LRU and may even be mapped into
- * user pagetables if we're racing with filemap_nopage().
- */
-static void truncate_complete_page(struct page *page)
-{
-	if (PagePrivate(page))
-		do_invalidatepage(page, 0);
-
-	clear_page_dirty(page);
-	ClearPageUptodate(page);
-	remove_from_page_cache(page);
-	page_cache_release(page);
-}
-
-/*
- * Writeback walks the page list in ->prev order, which is low-to-high file
- * offsets in the common case where he file was written linearly. So truncate
- * walks the page list in the opposite (->next) direction, to avoid getting
- * into lockstep with writeback's cursor.  To prune as many pages as possible
- * before the truncate cursor collides with the writeback cursor.
- */
-static int truncate_list_pages(struct address_space *mapping,
-	struct list_head *head, unsigned long start, unsigned *partial)
-{
-	struct list_head *curr;
-	struct page * page;
-	int unlocked = 0;
-	struct pagevec release_pvec;
-
-	pagevec_init(&release_pvec);
-restart:
-	curr = head->next;
-	while (curr != head) {
-		unsigned long offset;
-
-		page = list_entry(curr, struct page, list);
-		offset = page->index;
-
-		/* Is one of the pages to truncate? */
-		if ((offset >= start) || (*partial && (offset + 1) == start)) {
-			int failed;
-
-			page_cache_get(page);
-			failed = TestSetPageLocked(page);
-			if (!failed && PageWriteback(page)) {
-				unlock_page(page);
-				list_del(head);
-				list_add_tail(head, curr);
-				write_unlock(&mapping->page_lock);
-				wait_on_page_writeback(page);
-				if (!pagevec_add(&release_pvec, page))
-					__pagevec_release(&release_pvec);
-				unlocked = 1;
-				write_lock(&mapping->page_lock);
-				goto restart;
-			}
-
-			list_del(head);
-			if (!failed)		/* Restart after this page */
-				list_add(head, curr);
-			else			/* Restart on this page */
-				list_add_tail(head, curr);
-
-			write_unlock(&mapping->page_lock);
-			unlocked = 1;
-
- 			if (!failed) {
-				if (*partial && (offset + 1) == start) {
-					truncate_partial_page(page, *partial);
-					*partial = 0;
-				} else {
-					truncate_complete_page(page);
-				}
-				unlock_page(page);
-			} else {
- 				wait_on_page_locked(page);
-			}
-			if (!pagevec_add(&release_pvec, page))
-				__pagevec_release(&release_pvec);
-			cond_resched();
-			write_lock(&mapping->page_lock);
-			goto restart;
-		}
-		curr = curr->next;
-	}
-	if (pagevec_count(&release_pvec)) {
-		write_unlock(&mapping->page_lock);
-		pagevec_release(&release_pvec);
-		write_lock(&mapping->page_lock);
-		unlocked = 1;
-	}
-	return unlocked;
-}
-
-/*
- * Unconditionally clean all pages outside `start'.  The mapping lock
- * must be held.
- */
-static void clean_list_pages(struct address_space *mapping,
-		struct list_head *head, unsigned long start)
-{
-	struct page *page;
-	struct list_head *curr;
-
-	for (curr = head->next; curr != head; curr = curr->next) {
-		page = list_entry(curr, struct page, list);
-		if (page->index > start)
-			clear_page_dirty(page);
-	}
-}
-
-/**
- * truncate_inode_pages - truncate *all* the pages from an offset
- * @mapping: mapping to truncate
- * @lstart: offset from with to truncate
- *
- * Truncate the page cache at a set offset, removing the pages
- * that are beyond that offset (and zeroing out partial pages).
- * If any page is locked we wait for it to become unlocked.
- */
-void truncate_inode_pages(struct address_space * mapping, loff_t lstart) 
-{
-	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
-	int unlocked;
-
-	write_lock(&mapping->page_lock);
-	clean_list_pages(mapping, &mapping->io_pages, start);
-	clean_list_pages(mapping, &mapping->dirty_pages, start);
-	do {
-		unlocked = truncate_list_pages(mapping,
-				&mapping->io_pages, start, &partial);
-		unlocked |= truncate_list_pages(mapping,
-				&mapping->dirty_pages, start, &partial);
-		unlocked |= truncate_list_pages(mapping,
-				&mapping->clean_pages, start, &partial);
-		unlocked |= truncate_list_pages(mapping,
-				&mapping->locked_pages, start, &partial);
-	} while (unlocked);
-	/* Traversed all three lists without dropping the lock */
-	write_unlock(&mapping->page_lock);
-}
-
-static inline int invalidate_this_page2(struct address_space * mapping,
-					struct page * page,
-					struct list_head * curr,
-					struct list_head * head)
-{
-	int unlocked = 1;
-
-	/*
-	 * The page is locked and we hold the mapping lock as well
-	 * so both page_count(page) and page_buffers stays constant here.
-	 * AKPM: fixme: No global lock any more.  Is this still OK?
-	 */
-	if (page_count(page) == 1 + !!page_has_buffers(page)) {
-		/* Restart after this page */
-		list_del(head);
-		list_add_tail(head, curr);
-
-		page_cache_get(page);
-		write_unlock(&mapping->page_lock);
-		truncate_complete_page(page);
-	} else {
-		if (page_has_buffers(page)) {
-			/* Restart after this page */
-			list_del(head);
-			list_add_tail(head, curr);
-
-			page_cache_get(page);
-			write_unlock(&mapping->page_lock);
-			do_invalidatepage(page, 0);
-		} else
-			unlocked = 0;
-
-		clear_page_dirty(page);
-		ClearPageUptodate(page);
-	}
-
-	return unlocked;
-}
-
-static int invalidate_list_pages2(struct address_space * mapping,
-				  struct list_head * head)
-{
-	struct list_head *curr;
-	struct page * page;
-	int unlocked = 0;
-	struct pagevec release_pvec;
-
-	pagevec_init(&release_pvec);
-restart:
-	curr = head->prev;
-	while (curr != head) {
-		page = list_entry(curr, struct page, list);
-
-		if (!TestSetPageLocked(page)) {
-			int __unlocked;
-
-			if (PageWriteback(page)) {
-				write_unlock(&mapping->page_lock);
-				wait_on_page_writeback(page);
-				unlocked = 1;
-				write_lock(&mapping->page_lock);
-				unlock_page(page);
-				goto restart;
-			}
-
-			__unlocked = invalidate_this_page2(mapping,
-						page, curr, head);
-			unlock_page(page);
-			unlocked |= __unlocked;
-			if (!__unlocked) {
-				curr = curr->prev;
-				continue;
-			}
-		} else {
-			/* Restart on this page */
-			list_del(head);
-			list_add(head, curr);
-
-			page_cache_get(page);
-			write_unlock(&mapping->page_lock);
-			unlocked = 1;
-			wait_on_page_locked(page);
-		}
-
-		if (!pagevec_add(&release_pvec, page))
-			__pagevec_release(&release_pvec);
-		cond_resched();
-		write_lock(&mapping->page_lock);
-		goto restart;
-	}
-	if (pagevec_count(&release_pvec)) {
-		write_unlock(&mapping->page_lock);
-		pagevec_release(&release_pvec);
-		write_lock(&mapping->page_lock);
-		unlocked = 1;
-	}
-	return unlocked;
-}
-
-/**
- * invalidate_inode_pages2 - Clear all the dirty bits around if it can't
- * free the pages because they're mapped.
- * @mapping: the address_space which pages we want to invalidate
- */
-void invalidate_inode_pages2(struct address_space *mapping)
-{
-	int unlocked;
-
-	write_lock(&mapping->page_lock);
-	do {
-		unlocked = invalidate_list_pages2(mapping,
-				&mapping->clean_pages);
-		unlocked |= invalidate_list_pages2(mapping,
-				&mapping->dirty_pages);
-		unlocked |= invalidate_list_pages2(mapping,
-				&mapping->io_pages);
-		unlocked |= invalidate_list_pages2(mapping,
-				&mapping->locked_pages);
-	} while (unlocked);
-	write_unlock(&mapping->page_lock);
-}
-
 /*
 * In-memory filesystems have to fail their
 * writepage function - and this has to be
@@ -564,7 +228,6 @@ int add_to_page_cache(struct page *page,
 	if (!error) {
 		SetPageLocked(page);
 		___add_to_page_cache(page, mapping, offset);
-		ClearPageDirty(page);
 	} else {
 		page_cache_release(page);
 	}
@@ -638,7 +301,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
 		prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE);
 		sync_page(page);
 		if (test_bit(bit_nr, &page->flags))
-			schedule();
+			io_schedule();
 	} while (test_bit(bit_nr, &page->flags));
 	finish_wait(waitqueue, &wait);
 }
@@ -702,7 +365,7 @@ void __lock_page(struct page *page)
 		prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 		sync_page(page);
 		if (PageLocked(page))
-			schedule();
+			io_schedule();
 	}
 	finish_wait(wqh, &wait);
 }
@@ -824,6 +487,37 @@ struct page *find_or_create_page(struct address_space *mapping,
 	return page;
 }

+/**
+ * find_get_pages - gang pagecache lookup
+ * @mapping:	The address_space to search
+ * @start:	The starting page index
+ * @nr_pages:	The maximum number of pages
+ * @pages:	Where the resulting pages are placed
+ *
+ * find_get_pages() will search for and return a group of up to
+ * @nr_pages pages in the mapping.  The pages are placed at @pages.
+ * find_get_pages() takes a reference against the returned pages.
+ *
+ * The search returns a group of mapping-contiguous pages with ascending
+ * indexes.  There may be holes in the indices due to not-present pages.
+ *
+ * find_get_pages() returns the number of pages which were found.
+ */
+unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
+			    unsigned int nr_pages, struct page **pages)
+{
+	unsigned int found, n;
+
+	/* Take the references before dropping the lock the lookup ran under. */
+	read_lock(&mapping->page_lock);
+	found = radix_tree_gang_lookup(&mapping->page_tree,
+				(void **)pages, start, nr_pages);
+	for (n = 0; n < found; n++)
+		page_cache_get(pages[n]);
+	read_unlock(&mapping->page_lock);
+	return found;
+}
+
 /*
 * Same as grab_cache_page, but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
@@ -1403,7 +1097,7 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address
 	return NULL;

 page_not_uptodate:
-	KERNEL_STAT_INC(pgmajfault);
+	inc_page_state(pgmajfault);
 	lock_page(page);

 	/* Did it get unhashed while we waited for it? */

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1216,7 +1216,7 @@ static int do_swap_page(struct mm_struct * mm,

 		/* Had to read the page from swap area: Major fault */
 		ret = VM_FAULT_MAJOR;
-		KERNEL_STAT_INC(pgmajfault);
+		inc_page_state(pgmajfault);
 	}

 	mark_page_accessed(page);
@@ -1461,7 +1461,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
 	current->state = TASK_RUNNING;
 	pgd = pgd_offset(mm, address);

-	KERNEL_STAT_INC(pgfault);
+	inc_page_state(pgfault);
 	/*
 	 * We need the page table lock to synchronize with kswapd
 	 * and the SMP-safe atomic PTE updates.

--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -186,8 +186,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 		/*
 		 * Try to merge with the previous vma.
 		 */
-		if (mprotect_attempt_merge(vma, *pprev, end, newflags))
+		if (mprotect_attempt_merge(vma, *pprev, end, newflags)) {
+			vma = *pprev;
 			goto success;
+		}
 	} else {
 		error = split_vma(mm, vma, start, 1);
 		if (error)

--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -13,7 +13,7 @@
 */

 #include <linux/config.h>
-#include <linux/kernel_stat.h>
+#include <linux/stddef.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
@@ -24,6 +24,7 @@
 #include <linux/suspend.h>
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
+#include <linux/slab.h>

 unsigned long totalram_pages;
 unsigned long totalhigh_pages;
@@ -86,7 +87,7 @@ void __free_pages_ok (struct page *page, unsigned int order)
 	struct page *base;
 	struct zone *zone;

-	KERNEL_STAT_ADD(pgfree, 1<<order);
+	mod_page_state(pgfree, 1<<order);

 	BUG_ON(PageLRU(page));
 	BUG_ON(PagePrivate(page));
@@ -324,7 +325,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	if (gfp_mask & __GFP_WAIT)
 		might_sleep();

-	KERNEL_STAT_ADD(pgalloc, 1<<order);
+	mod_page_state(pgalloc, 1<<order);

 	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 	classzone = zones[0]; 
@@ -397,7 +398,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	if (!(gfp_mask & __GFP_WAIT))
 		goto nopage;

-	KERNEL_STAT_INC(allocstall);
+	inc_page_state(allocstall);
 	page = balance_classzone(classzone, gfp_mask, order, &freed);
 	if (page)
 		return page;
@@ -555,28 +556,39 @@ unsigned int nr_free_highpages (void)
 struct page_state page_states[NR_CPUS] __cacheline_aligned;
 EXPORT_SYMBOL(page_states);

-void get_page_state(struct page_state *ret)
+void __get_page_state(struct page_state *ret, int nr)
 {
-	int pcpu;
+	int cpu;

 	memset(ret, 0, sizeof(*ret));
-	for (pcpu = 0; pcpu < NR_CPUS; pcpu++) {
-		struct page_state *ps;
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		unsigned long *in, *out, off;	/* structs walked as words */

-		if (!cpu_online(pcpu))
+		if (!cpu_online(cpu))
 			continue;

-		ps = &page_states[pcpu];
-		ret->nr_dirty += ps->nr_dirty;
-		ret->nr_writeback += ps->nr_writeback;
-		ret->nr_pagecache += ps->nr_pagecache;
-		ret->nr_page_table_pages += ps->nr_page_table_pages;
-		ret->nr_reverse_maps += ps->nr_reverse_maps;
-		ret->nr_mapped += ps->nr_mapped;
-		ret->nr_slab += ps->nr_slab;
+		in = (unsigned long *)(page_states + cpu);
+		out = (unsigned long *)ret;
+		for (off = 0; off < nr; off++)	/* first nr words only */
+			*out++ += *in++;
 	}
 }

+/* Sum only the leading counters, up to and including GET_PAGE_STATE_LAST. */
+void get_page_state(struct page_state *ret)
+{
+	const int last = offsetof(struct page_state, GET_PAGE_STATE_LAST) /
+				sizeof(unsigned long);
+
+	__get_page_state(ret, last + 1);
+}
+
+/* Sum every unsigned long sized word of struct page_state. */
+void get_full_page_state(struct page_state *ret)
+{
+	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
+}
+
 void get_zone_counts(unsigned long *active, unsigned long *inactive)
 {
 	struct zone *zone;
@@ -1048,4 +1060,76 @@ struct seq_operations fragmentation_op = {
 	.show	= frag_show,
 };

+static char *vmstat_text[] = {	/* indexed by page_state word offset */
+	"nr_dirty",
+	"nr_writeback",
+	"nr_pagecache",
+	"nr_page_table_pages",
+	"nr_reverse_maps",
+	"nr_mapped",
+	"nr_slab",
+
+	"pgpgin",
+	"pgpgout",
+	"pswpin",
+	"pswpout",
+	"pgalloc",
+	"pgfree",
+	"pgactivate",
+	"pgdeactivate",
+	"pgfault",
+	"pgmajfault",
+	"pgscan",
+	"pgrefill",
+	"pgsteal",
+	"kswapd_steal",
+	"pageoutrun",
+	"allocstall",
+};
+
+static void *vmstat_start(struct seq_file *m, loff_t *pos)
+{
+	struct page_state *snap;
+
+	/* Past the last named counter: nothing left to show. */
+	if (*pos >= ARRAY_SIZE(vmstat_text))
+		return NULL;
+	/* Store the result even on failure, so m->private becomes NULL. */
+	m->private = snap = kmalloc(sizeof(*snap), GFP_KERNEL);
+	if (snap == NULL)
+		return ERR_PTR(-ENOMEM);
+	get_full_page_state(snap);
+	return (unsigned long *)snap + *pos;
+}
+
+static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	unsigned long *counters = m->private;
+
+	/* Step to the next counter; NULL once the name table runs out. */
+	++*pos;
+	return *pos < ARRAY_SIZE(vmstat_text) ? counters + *pos : NULL;
+}
+
+static int vmstat_show(struct seq_file *m, void *arg)
+{
+	unsigned long *counter = arg, *base = m->private;
+
+	/* The word offset from m->private selects the counter's name. */
+	seq_printf(m, "%s %lu\n", vmstat_text[counter - base], *counter);
+	return 0;
+}
+
+static void vmstat_stop(struct seq_file *m, void *arg)
+{
+	kfree(m->private);	/* snapshot stored by vmstat_start(), or NULL */
+	m->private = NULL;	/* leave no stale pointer behind */
+}
+
+struct seq_operations vmstat_op = {	/* walks the vmstat counters */
+	.start	= vmstat_start,
+	.next	= vmstat_next,
+	.stop	= vmstat_stop,
+	.show	= vmstat_show,
+};
+
 #endif /* CONFIG_PROC_FS */
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -100,7 +100,7 @@ int swap_writepage(struct page *page)
 		ret = -ENOMEM;
 		goto out;
 	}
-	kstat.pswpout++;
+	inc_page_state(pswpout);
 	SetPageWriteback(page);
 	unlock_page(page);
 	submit_bio(WRITE, bio);
@@ -119,7 +119,7 @@ int swap_readpage(struct file *file, struct page *page)
 		ret = -ENOMEM;
 		goto out;
 	}
-	kstat.pswpin++;
+	inc_page_state(pswpin);
 	submit_bio(READ, bio);
 out:
 	return ret;

--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -274,11 +274,11 @@ void page_remove_rmap(struct page * page, pte_t * ptep)
 		BUG();
 	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 		return;
+	if (!page_mapped(page))
+		return;		/* remap_page_range() from a driver? */

 	pte_chain_lock(page);

-	BUG_ON(page->pte.direct == 0);
- 
 	if (PageDirect(page)) {
 		if (page->pte.direct == pte_paddr) {
 			page->pte.direct = 0;

--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -37,12 +37,64 @@
 #define TMPFS_MAGIC	0x01021994

 #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
+#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
 #define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)

-#define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE * (ENTRIES_PER_PAGE/2) * (ENTRIES_PER_PAGE+1))
+#define SHMEM_MAX_INDEX  (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
 #define SHMEM_MAX_BYTES  ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)

-#define VM_ACCT(size)    (((size) + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT)
+#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
+
+/* Pretend that each entry is of this size in directory's i_size */
+#define BOGO_DIRENT_SIZE 20
+
+/* Keep swapped page count in private field of indirect struct page */
+#define nr_swapped		private
+
+static inline struct page *shmem_dir_alloc(unsigned int gfp_mask)
+{
+	/*
+	 * Directory pages are allocated in PAGE_CACHE_SIZE units, as the
+	 * ENTRIES_PER_PAGE definition above and the BLOCKS_PER_PAGE use on
+	 * indirect pages assume: reconsider if that diverges from PAGE_SIZE.
+	 */
+	return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+}
+
+static inline void shmem_dir_free(struct page *page)
+{
+	__free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); /* order as allocated */
+}
+
+static struct page **shmem_dir_map(struct page *page)
+{
+	return (struct page **)kmap_atomic(page, KM_USER0); /* KM_USER0 slot */
+}
+
+static inline void shmem_dir_unmap(struct page **dir)
+{
+	kunmap_atomic(dir, KM_USER0);	/* slot used by shmem_dir_map() */
+}
+
+static swp_entry_t *shmem_swp_map(struct page *page)
+{
+	/*
+	 * Only highmem pages go through kmap_atomic(): its unconditional
+	 * inc_preempt_count() must be avoided for lowmem, because
+	 * shmem_swp_unmap() is also applied to the low memory addresses
+	 * within i_direct[].  PageHighMem and high_memory tests work on
+	 * all arches and configs, unlike highmem_start_page/FIXADDR_START.
+	 */
+	if (!PageHighMem(page))
+		return (swp_entry_t *)page_address(page);
+	return (swp_entry_t *)kmap_atomic(page, KM_USER1);
+}
+
+static inline void shmem_swp_unmap(swp_entry_t *entry)
+{
+	if (entry >= (swp_entry_t *)high_memory)	/* i.e. not lowmem */
+		kunmap_atomic(entry, KM_USER1);	/* slot used by shmem_swp_map() */
+}

 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
 {
@@ -65,40 +117,42 @@ LIST_HEAD (shmem_inodes);
 static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED;
 atomic_t shmem_nrpages = ATOMIC_INIT(0); /* Not used right now */

-static struct page *shmem_getpage_locked(struct shmem_inode_info *, struct inode *, unsigned long);
+static void shmem_free_block(struct inode *inode)
+{
+	struct shmem_sb_info *sb = SHMEM_SB(inode->i_sb);
+	spin_lock(&sb->stat_lock);	/* covers both counters below */
+	inode->i_blocks -= BLOCKS_PER_PAGE;
+	sb->free_blocks += 1;
+	spin_unlock(&sb->stat_lock);
+}

 /*
 * shmem_recalc_inode - recalculate the size of an inode
 *
 * @inode: inode to recalc
- * @swap:  additional swap pages freed externally
 *
- * We have to calculate the free blocks since the mm can drop pages
- * behind our back
+ * We have to calculate the free blocks since the mm can drop
+ * undirtied hole pages behind our back.  Later we should be
+ * able to use the releasepage method to handle this better.
 *
- * But we know that normally
- * inodes->i_blocks/BLOCKS_PER_PAGE == 
- * 			inode->i_mapping->nrpages + info->swapped
- *
- * So the mm freed 
- * inodes->i_blocks/BLOCKS_PER_PAGE - 
- * 			(inode->i_mapping->nrpages + info->swapped)
+ * But normally   info->alloced == inode->i_mapping->nrpages + info->swapped
+ * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
 *
 * It has to be called with the spinlock held.
 */
-
-static void shmem_recalc_inode(struct inode * inode)
+static void shmem_recalc_inode(struct inode *inode)
 {
-	unsigned long freed;
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	long freed;		/* signed, so "freed > 0" skips a negative result */

-	freed = (inode->i_blocks/BLOCKS_PER_PAGE) -
-		(inode->i_mapping->nrpages + SHMEM_I(inode)->swapped);
-	if (freed){
-		struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb);
-		inode->i_blocks -= freed*BLOCKS_PER_PAGE;
-		spin_lock (&sbinfo->stat_lock);
+	freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
+	if (freed > 0) {
+		struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+		info->alloced -= freed;
+		spin_lock(&sbinfo->stat_lock);	/* guards both updates */
 		sbinfo->free_blocks += freed;
-		spin_unlock (&sbinfo->stat_lock);
+		inode->i_blocks -= freed*BLOCKS_PER_PAGE;
+		spin_unlock(&sbinfo->stat_lock);
 	}
 }

@@ -110,11 +164,9 @@ static void shmem_recalc_inode(struct inode * inode)
 * @page:  optional page to add to the structure. Has to be preset to
 *         all zeros
 *
- * If there is no space allocated yet it will return -ENOMEM when
- * page == 0 else it will use the page for the needed block.
- *
- * returns -EFBIG if the index is too big.
- *
+ * If there is no space allocated yet it will return NULL when
+ * page is NULL, else it will use the page for the needed block,
+ * setting it to NULL on return to indicate that it has been used.
 *
 * The swap vector is organized the following way:
 *
@@ -142,233 +194,285 @@ static void shmem_recalc_inode(struct inode * inode)
 * 	      	       +-> 48-51
 * 	      	       +-> 52-55
 */
-static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index, unsigned long page) 
+static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
 {
 	unsigned long offset;
-	void **dir;
+	struct page **dir;
+	struct page *subdir;

+	if (index >= info->next_index)
+		return NULL;
 	if (index < SHMEM_NR_DIRECT)
 		return info->i_direct+index;
+	if (!info->i_indirect) {
+		if (page) {
+			info->i_indirect = *page;
+			*page = NULL;
+		}
+		return NULL;			/* need another page */
+	}

 	index -= SHMEM_NR_DIRECT;
 	offset = index % ENTRIES_PER_PAGE;
 	index /= ENTRIES_PER_PAGE;
+	dir = shmem_dir_map(info->i_indirect);

-	if (!info->i_indirect) {
-		info->i_indirect = (void *) page;
-		return ERR_PTR(-ENOMEM);
-	}
-
-	dir = info->i_indirect + index;
 	if (index >= ENTRIES_PER_PAGE/2) {
 		index -= ENTRIES_PER_PAGE/2;
-		dir = info->i_indirect + ENTRIES_PER_PAGE/2 
-			+ index/ENTRIES_PER_PAGE;
+		dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
 		index %= ENTRIES_PER_PAGE;
-
-		if(!*dir) {
-			*dir = (void *) page;
-			/* We return since we will need another page
-                           in the next step */
-			return ERR_PTR(-ENOMEM);
+		subdir = *dir;
+		if (!subdir) {
+			if (page) {
+				*dir = *page;
+				*page = NULL;
+			}
+			shmem_dir_unmap(dir);
+			return NULL;		/* need another page */
 		}
-		dir = ((void **)*dir) + index;
-	}
-	if (!*dir) {
-		if (!page)
-			return ERR_PTR(-ENOMEM);
-		*dir = (void *)page;
+		shmem_dir_unmap(dir);
+		dir = shmem_dir_map(subdir);
 	}
-	return ((swp_entry_t *)*dir) + offset;
-}
-
-/*
- * shmem_alloc_entry - get the position of the swap entry for the
- *                     page. If it does not exist allocate the entry
- *
- * @info:	info structure for the inode
- * @index:	index of the page to find
- */
-static inline swp_entry_t * shmem_alloc_entry (struct shmem_inode_info *info, unsigned long index)
-{
-	unsigned long page = 0;
-	swp_entry_t * res;
-
-	if (index >= SHMEM_MAX_INDEX)
-		return ERR_PTR(-EFBIG);
-
-	if (info->next_index <= index)
-		info->next_index = index + 1;

-	while ((res = shmem_swp_entry(info,index,page)) == ERR_PTR(-ENOMEM)) {
-		page = get_zeroed_page(GFP_USER);
-		if (!page)
-			break;
+	dir += index;
+	subdir = *dir;
+	if (!subdir) {
+		if (!page || !(subdir = *page)) {
+			shmem_dir_unmap(dir);
+			return NULL;		/* need a page */
+		}
+		*dir = subdir;
+		*page = NULL;
 	}
-	return res;
+	shmem_dir_unmap(dir);
+
+	/*
+	 * With apologies... caller shmem_swp_alloc passes non-NULL
+	 * page (though perhaps NULL *page); and now we know that this
+	 * indirect page has been allocated, we can shortcut the final
+	 * kmap if we know it contains no swap entries, as is commonly
+	 * the case: return pointer to a 0 which doesn't need kmapping.
+	 */
+	return (page && !subdir->nr_swapped)?
+		(swp_entry_t *)&subdir->nr_swapped:
+		shmem_swp_map(subdir) + offset;
 }

-/*
- * shmem_free_swp - free some swap entries in a directory
- *
- * @dir:   pointer to the directory
- * @count: number of entries to scan
- */
-static int shmem_free_swp(swp_entry_t *dir, unsigned int count)
+static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
 {
-	swp_entry_t *ptr, entry;
-	int freed = 0;
+	long incdec = value? 1: -1;

-	for (ptr = dir; ptr < dir + count; ptr++) {
-		if (!ptr->val)
-			continue;
-		entry = *ptr;
-		*ptr = (swp_entry_t){0};
-		freed++;
-		free_swap_and_cache(entry);
-	}
-	return freed;
+	entry->val = value;
+	info->swapped += incdec;
+	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
+		kmap_atomic_to_page(entry)->nr_swapped += incdec;
 }

 /*
- * shmem_truncate_direct - free the swap entries of a whole doubly
- *                         indirect block
+ * shmem_swp_alloc - get the position of the swap entry for the page.
+ *                   If it does not exist allocate the entry.
 *
- * @dir:	pointer to the pointer to the block
- * @start:	offset to start from (in pages)
- * @len:	how many pages are stored in this block
- *
- * Returns the number of freed swap entries.
+ * @info:	info structure for the inode
+ * @index:	index of the page to find
 */
+static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index)
+{
+	struct inode *inode = &info->vfs_inode;
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+	struct page *page = NULL;
+	swp_entry_t *entry;

-static inline unsigned long 
-shmem_truncate_direct(swp_entry_t *** dir, unsigned long start, unsigned long len) {
-	swp_entry_t **last, **ptr;
-	unsigned long off, freed = 0;
- 
-	if (!*dir)
-		return 0;
+	while (!(entry = shmem_swp_entry(info, index, &page))) {
+		if (index >= info->next_index) {
+			entry = ERR_PTR(-EFAULT);
+			break;
+		}

-	last = *dir + (len + ENTRIES_PER_PAGE-1) / ENTRIES_PER_PAGE;
-	off = start % ENTRIES_PER_PAGE;
+		/*
+		 * Test free_blocks against 1 not 0, since we have 1 data
+		 * page (and perhaps indirect index pages) yet to allocate:
+		 * a waste to allocate index if we cannot allocate data.
+		 */
+		spin_lock(&sbinfo->stat_lock);
+		if (sbinfo->free_blocks <= 1) {
+			spin_unlock(&sbinfo->stat_lock);
+			return ERR_PTR(-ENOSPC);
+		}
+		sbinfo->free_blocks--;
+		inode->i_blocks += BLOCKS_PER_PAGE;
+		spin_unlock(&sbinfo->stat_lock);

-	for (ptr = *dir + start/ENTRIES_PER_PAGE; ptr < last; ptr++) {
-		if (!*ptr) {
-			off = 0;
-			continue;
+		spin_unlock(&info->lock);
+		page = shmem_dir_alloc(inode->i_mapping->gfp_mask);
+		if (page) {
+			clear_highpage(page);
+			page->nr_swapped = 0;
 		}
+		spin_lock(&info->lock);

-		if (!off) {
-			freed += shmem_free_swp(*ptr, ENTRIES_PER_PAGE);
-			free_page ((unsigned long) *ptr);
-			*ptr = 0;
-		} else {
-			freed += shmem_free_swp(*ptr+off,ENTRIES_PER_PAGE-off);
-			off = 0;
+		if (!page) {
+			shmem_free_block(inode);
+			return ERR_PTR(-ENOMEM);
 		}
 	}
-	
-	if (!start) {
-		free_page((unsigned long) *dir);
-		*dir = 0;
+	if (page) {
+		/* another task gave its page, or truncated the file */
+		shmem_free_block(inode);
+		shmem_dir_free(page);
 	}
-	return freed;
+	return entry;
 }

 /*
- * shmem_truncate_indirect - truncate an inode
- *
- * @info:  the info structure of the inode
- * @index: the index to truncate
+ * shmem_free_swp - free some swap entries in a directory
 *
- * This function locates the last doubly indirect block and calls
- * then shmem_truncate_direct to do the real work
+ * @dir:   pointer to the directory
+ * @edir:  pointer after last entry of the directory
 */
-static inline unsigned long
-shmem_truncate_indirect(struct shmem_inode_info *info, unsigned long index)
+static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
 {
-	swp_entry_t ***base;
-	unsigned long baseidx, len, start;
-	unsigned long max = info->next_index-1;
-
-	if (max < SHMEM_NR_DIRECT) {
-		info->next_index = index;
-		return shmem_free_swp(info->i_direct + index,
-				      SHMEM_NR_DIRECT - index);
-	}
-
-	if (max < ENTRIES_PER_PAGE * ENTRIES_PER_PAGE/2 + SHMEM_NR_DIRECT) {
-		max -= SHMEM_NR_DIRECT;
-		base = (swp_entry_t ***) &info->i_indirect;
-		baseidx = SHMEM_NR_DIRECT;
-		len = max+1;
-	} else {
-		max -= ENTRIES_PER_PAGE*ENTRIES_PER_PAGE/2+SHMEM_NR_DIRECT;
-		if (max >= ENTRIES_PER_PAGE*ENTRIES_PER_PAGE*ENTRIES_PER_PAGE/2)
-			BUG();
-
-		baseidx = max & ~(ENTRIES_PER_PAGE*ENTRIES_PER_PAGE-1);
-		base = (swp_entry_t ***) info->i_indirect + ENTRIES_PER_PAGE/2 + baseidx/ENTRIES_PER_PAGE/ENTRIES_PER_PAGE ;
-		len = max - baseidx + 1;
-		baseidx += ENTRIES_PER_PAGE*ENTRIES_PER_PAGE/2+SHMEM_NR_DIRECT;
-	}
+	swp_entry_t *ptr;
+	int freed = 0;

-	if (index > baseidx) {
-		info->next_index = index;
-		start = index - baseidx;
-	} else {
-		info->next_index = baseidx;
-		start = 0;
+	for (ptr = dir; ptr < edir; ptr++) {
+		if (ptr->val) {
+			free_swap_and_cache(*ptr);
+			*ptr = (swp_entry_t){0};
+			freed++;
+		}
 	}
-	return shmem_truncate_direct(base, start, len);
+	return freed;
 }

-static void shmem_truncate (struct inode * inode)
+static void shmem_truncate(struct inode *inode)
 {
-	unsigned long index;
-	unsigned long partial;
-	unsigned long freed = 0;
-	struct shmem_inode_info * info = SHMEM_I(inode);
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	unsigned long idx;
+	unsigned long size;
+	unsigned long limit;
+	unsigned long stage;
+	struct page **dir;
+	struct page *subdir;
+	struct page *empty;
+	swp_entry_t *ptr;
+	int offset;
+	int freed;

-	down(&info->sem);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-	spin_lock (&info->lock);
-	index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	partial = inode->i_size & ~PAGE_CACHE_MASK;
+	idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (idx >= info->next_index)
+		return;

-	if (partial) {
-		swp_entry_t *entry = shmem_swp_entry(info, index-1, 0);
-		struct page *page;
-		/*
-		 * This check is racy: it's faintly possible that page
-		 * was assigned to swap during truncate_inode_pages,
-		 * and now assigned to file; but better than nothing.
-		 */
-		if (!IS_ERR(entry) && entry->val) {
-			spin_unlock(&info->lock);
-			page = shmem_getpage_locked(info, inode, index-1);
-			if (!IS_ERR(page)) {
-				memclear_highpage_flush(page, partial,
-					PAGE_CACHE_SIZE - partial);
-				unlock_page(page);
-				page_cache_release(page);
+	spin_lock(&info->lock);
+	limit = info->next_index;
+	info->next_index = idx;
+	if (info->swapped && idx < SHMEM_NR_DIRECT) {
+		ptr = info->i_direct;
+		size = limit;
+		if (size > SHMEM_NR_DIRECT)
+			size = SHMEM_NR_DIRECT;
+		info->swapped -= shmem_free_swp(ptr+idx, ptr+size);
+	}
+	if (!info->i_indirect)
+		goto done2;
+
+	BUG_ON(limit <= SHMEM_NR_DIRECT);
+	limit -= SHMEM_NR_DIRECT;
+	idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
+	offset = idx % ENTRIES_PER_PAGE;
+	idx -= offset;
+
+	empty = NULL;
+	dir = shmem_dir_map(info->i_indirect);
+	stage = ENTRIES_PER_PAGEPAGE/2;
+	if (idx < ENTRIES_PER_PAGEPAGE/2)
+		dir += idx/ENTRIES_PER_PAGE;
+	else {
+		dir += ENTRIES_PER_PAGE/2;
+		dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
+		while (stage <= idx)
+			stage += ENTRIES_PER_PAGEPAGE;
+		if (*dir) {
+			subdir = *dir;
+			size = ((idx - ENTRIES_PER_PAGEPAGE/2) %
+				ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
+			if (!size && !offset) {
+				empty = subdir;
+				*dir = NULL;
 			}
-			spin_lock(&info->lock);
+			shmem_dir_unmap(dir);
+			dir = shmem_dir_map(subdir) + size;
+		} else {
+			offset = 0;
+			idx = stage;
 		}
 	}

-	while (index < info->next_index) 
-		freed += shmem_truncate_indirect(info, index);
-
-	info->swapped -= freed;
+	for (; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
+		if (unlikely(idx == stage)) {
+			shmem_dir_unmap(dir-1);
+			dir = shmem_dir_map(info->i_indirect) +
+			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
+			while (!*dir) {
+				dir++;
+				idx += ENTRIES_PER_PAGEPAGE;
+				if (idx >= limit)
+					goto done1;
+			}
+			stage = idx + ENTRIES_PER_PAGEPAGE;
+			subdir = *dir;
+			*dir = NULL;
+			shmem_dir_unmap(dir);
+			if (empty) {
+				shmem_dir_free(empty);
+				info->alloced++;
+			}
+			empty = subdir;
+			cond_resched_lock(&info->lock);
+			dir = shmem_dir_map(subdir);
+		}
+		subdir = *dir;
+		if (subdir && subdir->nr_swapped) {
+			ptr = shmem_swp_map(subdir);
+			size = limit - idx;
+			if (size > ENTRIES_PER_PAGE)
+				size = ENTRIES_PER_PAGE;
+			freed = shmem_free_swp(ptr+offset, ptr+size);
+			shmem_swp_unmap(ptr);
+			info->swapped -= freed;
+			subdir->nr_swapped -= freed;
+			BUG_ON(subdir->nr_swapped > offset);
+		}
+		if (offset)
+			offset = 0;
+		else if (subdir) {
+			*dir = NULL;
+			shmem_dir_free(subdir);
+			info->alloced++;
+		}
+	}
+done1:
+	shmem_dir_unmap(dir-1);
+	if (empty) {
+		shmem_dir_free(empty);
+		info->alloced++;
+	}
+	if (info->next_index <= SHMEM_NR_DIRECT) {
+		shmem_dir_free(info->i_indirect);
+		info->i_indirect = NULL;
+		info->alloced++;
+	}
+done2:
+	BUG_ON(info->swapped > info->next_index);
 	shmem_recalc_inode(inode);
-	spin_unlock (&info->lock);
-	up(&info->sem);
+	spin_unlock(&info->lock);
 }

-static int shmem_notify_change(struct dentry * dentry, struct iattr *attr)
+static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 {
+	static struct page *shmem_holdpage(struct inode *, unsigned long);
 	struct inode *inode = dentry->d_inode;
+	struct page *page = NULL;
 	long change = 0;
 	int error;

@@ -381,20 +485,33 @@ static int shmem_notify_change(struct dentry * dentry, struct iattr *attr)
 		if (change > 0) {
 			if (!vm_enough_memory(change))
 				return -ENOMEM;
-		} else
+		} else if (attr->ia_size < inode->i_size) {
 			vm_unacct_memory(-change);
+			/*
+			 * If truncating down to a partial page, then
+			 * if that page is already allocated, hold it
+			 * in memory until the truncation is over, so
+			 * truncate_partial_page cannot miss it were
+			 * it assigned to swap.
+			 */
+			if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
+				page = shmem_holdpage(inode,
+					attr->ia_size >> PAGE_CACHE_SHIFT);
+			}
+		}
 	}

 	error = inode_change_ok(inode, attr);
 	if (!error)
 		error = inode_setattr(inode, attr);
+	if (page)
+		page_cache_release(page);
 	if (error)
 		vm_unacct_memory(change);
 	return error;
 }

-
-static void shmem_delete_inode(struct inode * inode)
+static void shmem_delete_inode(struct inode *inode)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -406,128 +523,142 @@ static void shmem_delete_inode(struct inode * inode)
 		if (info->flags & VM_ACCOUNT)
 			vm_unacct_memory(VM_ACCT(inode->i_size));
 		inode->i_size = 0;
-		shmem_truncate (inode);
+		shmem_truncate(inode);
 	}
-	spin_lock (&sbinfo->stat_lock);
+	BUG_ON(inode->i_blocks);
+	spin_lock(&sbinfo->stat_lock);
 	sbinfo->free_inodes++;
-	spin_unlock (&sbinfo->stat_lock);
+	spin_unlock(&sbinfo->stat_lock);
 	clear_inode(inode);
 }

-static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *ptr, swp_entry_t *eptr)
+static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
 {
-	swp_entry_t *test;
+	swp_entry_t *ptr;

-	for (test = ptr; test < eptr; test++) {
-		if (test->val == entry.val)
-			return test - ptr;
+	for (ptr = dir; ptr < edir; ptr++) {
+		if (ptr->val == entry.val)
+			return ptr - dir;
 	}
 	return -1;
 }

 static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
 {
-	swp_entry_t *ptr;
 	unsigned long idx;
+	unsigned long size;
+	unsigned long limit;
+	unsigned long stage;
+	struct page **dir;
+	struct page *subdir;
+	swp_entry_t *ptr;
 	int offset;
-	struct inode *inode;

 	idx = 0;
 	ptr = info->i_direct;
-	spin_lock (&info->lock);
-	offset = info->next_index;
-	if (offset > SHMEM_NR_DIRECT)
-		offset = SHMEM_NR_DIRECT;
-	offset = shmem_find_swp(entry, ptr, ptr + offset);
+	spin_lock(&info->lock);
+	limit = info->next_index;
+	size = limit;
+	if (size > SHMEM_NR_DIRECT)
+		size = SHMEM_NR_DIRECT;
+	offset = shmem_find_swp(entry, ptr, ptr+size);
 	if (offset >= 0)
 		goto found;
-
-	for (idx = SHMEM_NR_DIRECT; idx < info->next_index; 
-	     idx += ENTRIES_PER_PAGE) {
-		ptr = shmem_swp_entry(info, idx, 0);
-		if (IS_ERR(ptr))
-			continue;
-		offset = info->next_index - idx;
-		if (offset > ENTRIES_PER_PAGE)
-			offset = ENTRIES_PER_PAGE;
-		offset = shmem_find_swp(entry, ptr, ptr + offset);
-		if (offset >= 0)
-			goto found;
+	if (!info->i_indirect)
+		goto lost2;
+	/* we might be racing with shmem_truncate */
+	if (limit <= SHMEM_NR_DIRECT)
+		goto lost2;
+
+	dir = shmem_dir_map(info->i_indirect);
+	stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
+
+	for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
+		if (unlikely(idx == stage)) {
+			shmem_dir_unmap(dir-1);
+			dir = shmem_dir_map(info->i_indirect) +
+			    ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
+			while (!*dir) {
+				dir++;
+				idx += ENTRIES_PER_PAGEPAGE;
+				if (idx >= limit)
+					goto lost1;
+			}
+			stage = idx + ENTRIES_PER_PAGEPAGE;
+			subdir = *dir;
+			shmem_dir_unmap(dir);
+			dir = shmem_dir_map(subdir);
+		}
+		subdir = *dir;
+		if (subdir && subdir->nr_swapped) {
+			ptr = shmem_swp_map(subdir);
+			size = limit - idx;
+			if (size > ENTRIES_PER_PAGE)
+				size = ENTRIES_PER_PAGE;
+			offset = shmem_find_swp(entry, ptr, ptr+size);
+			if (offset >= 0) {
+				shmem_dir_unmap(dir);
+				goto found;
+			}
+			shmem_swp_unmap(ptr);
+		}
 	}
-	spin_unlock (&info->lock);
+lost1:
+	shmem_dir_unmap(dir-1);
+lost2:
+	spin_unlock(&info->lock);
 	return 0;
 found:
-	idx += offset;
-	inode = igrab(&info->vfs_inode);
-	/* move head to start search for next from here */
-	list_move_tail(&shmem_inodes, &info->list);
-	spin_unlock(&shmem_ilock);
-	swap_free(entry);
-	ptr[offset] = (swp_entry_t) {0};
-
-	while (inode && move_from_swap_cache(page, idx, inode->i_mapping)) {
-		/*
-		 * Yield for kswapd, and try again - but we're still
-		 * holding the page lock - ugh! fix this up later on.
-		 * Beware of inode being unlinked or truncated: just
-		 * leave try_to_unuse to delete_from_swap_cache if so.
-		 */
-		spin_unlock(&info->lock);
-		yield();
-		spin_lock(&info->lock);
-		ptr = shmem_swp_entry(info, idx, 0);
-		if (IS_ERR(ptr))
-			break;
-	}
-
-	info->swapped--;
-	SetPageUptodate(page);
+	if (move_from_swap_cache(page, idx + offset,
+			info->vfs_inode.i_mapping) == 0)
+		shmem_swp_set(info, ptr + offset, 0);
+	shmem_swp_unmap(ptr);
 	spin_unlock(&info->lock);
-	if (inode)
-		iput(inode);
+	/*
+	 * Decrement swap count even when the entry is left behind:
+	 * try_to_unuse will skip over mms, then reincrement count.
+	 */
+	swap_free(entry);
 	return 1;
 }

 /*
 * shmem_unuse() search for an eventually swapped out shmem page.
- * Note shmem_unuse_inode drops shmem_ilock itself if successful.
 */
-void shmem_unuse(swp_entry_t entry, struct page *page)
+int shmem_unuse(swp_entry_t entry, struct page *page)
 {
 	struct list_head *p;
-	struct shmem_inode_info * info;
+	struct shmem_inode_info *info;
+	int found = 0;

-	spin_lock (&shmem_ilock);
+	spin_lock(&shmem_ilock);
 	list_for_each(p, &shmem_inodes) {
 		info = list_entry(p, struct shmem_inode_info, list);

-		if (info->swapped && shmem_unuse_inode(info, entry, page))
-			return;
+		if (info->swapped && shmem_unuse_inode(info, entry, page)) {
+			/* move head to start search for next from here */
+			list_move_tail(&shmem_inodes, &info->list);
+			found = 1;
+			break;
+		}
 	}
-	spin_unlock (&shmem_ilock);
+	spin_unlock(&shmem_ilock);
+	return found;
 }

 /*
 * Move the page from the page cache to the swap cache.
- *
- * The page lock prevents multiple occurences of shmem_writepage at
- * once.  We still need to guard against racing with
- * shmem_getpage_locked().  
 */
-static int shmem_writepage(struct page * page)
+static int shmem_writepage(struct page *page)
 {
-	int err;
 	struct shmem_inode_info *info;
 	swp_entry_t *entry, swap;
 	struct address_space *mapping;
 	unsigned long index;
 	struct inode *inode;

-	if (!PageLocked(page))
-		BUG();
-
-	if (!(current->flags & PF_MEMALLOC))
-		return fail_writepage(page);
+	BUG_ON(!PageLocked(page));
+	BUG_ON(page_mapped(page));

 	mapping = page->mapping;
 	index = page->index;
@@ -540,215 +671,247 @@ static int shmem_writepage(struct page * page)
 		return fail_writepage(page);

 	spin_lock(&info->lock);
-	entry = shmem_swp_entry(info, index, 0);
-	if (IS_ERR(entry))	/* this had been allocated on page allocation */
-		BUG();
 	shmem_recalc_inode(inode);
-	if (entry->val)
-		BUG();
+	entry = shmem_swp_entry(info, index, NULL);
+	BUG_ON(!entry);
+	BUG_ON(entry->val);

-	err = move_to_swap_cache(page, swap);
-	if (!err) {
-		*entry = swap;
-		info->swapped++;
+	if (move_to_swap_cache(page, swap) == 0) {
+		shmem_swp_set(info, entry, swap.val);
+		shmem_swp_unmap(entry);
 		spin_unlock(&info->lock);
-		SetPageUptodate(page);
-		set_page_dirty(page);
 		unlock_page(page);
 		return 0;
 	}

+	shmem_swp_unmap(entry);
 	spin_unlock(&info->lock);
 	swap_free(swap);
 	return fail_writepage(page);
 }

+static int shmem_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	return 0;
+}
+
+static int shmem_vm_writeback(struct page *page, struct writeback_control *wbc)
+{
+	clear_page_dirty(page);
+	if (shmem_writepage(page) < 0)
+		set_page_dirty(page);
+	return 0;
+}
+
 /*
- * shmem_getpage_locked - either get the page from swap or allocate a new one
+ * shmem_getpage - either get the page from swap or allocate a new one
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache
- *
- * Called with the inode locked, so it cannot race with itself, but we
- * still need to guard against racing with shm_writepage(), which might
- * be trying to move the page to the swap cache as we run.
 */
-static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode * inode, unsigned long idx)
+static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep)
 {
 	struct address_space *mapping = inode->i_mapping;
+	struct shmem_inode_info *info = SHMEM_I(inode);
 	struct shmem_sb_info *sbinfo;
 	struct page *page;
 	swp_entry_t *entry;
-	int error;
-
-repeat:
-	page = find_lock_page(mapping, idx);
-	if (page)
-		return page;
+	swp_entry_t swap;
+	int error = 0;

-	entry = shmem_alloc_entry (info, idx);
-	if (IS_ERR(entry))
-		return (void *)entry;
+	if (idx >= SHMEM_MAX_INDEX)
+		return -EFBIG;

-	spin_lock (&info->lock);
-	
-	/* The shmem_alloc_entry() call may have blocked, and
-	 * shmem_writepage may have been moving a page between the page
-	 * cache and swap cache.  We need to recheck the page cache
-	 * under the protection of the info->lock spinlock. */
+	/*
+	 * When writing, i_sem is held against truncation and other
+	 * writing, so next_index will remain as set here; but when
+	 * reading, idx must always be checked against next_index
+	 * after sleeping, lest truncation occurred meanwhile.
+	 */
+	spin_lock(&info->lock);
+	if (info->next_index <= idx)
+		info->next_index = idx + 1;
+	spin_unlock(&info->lock);

-	page = find_get_page(mapping, idx);
+repeat:
+	page = find_lock_page(mapping, idx);
 	if (page) {
-		if (TestSetPageLocked(page))
-			goto wait_retry;
-		spin_unlock (&info->lock);
-		return page;
+		*pagep = page;
+		return 0;
 	}
-	
+
+	spin_lock(&info->lock);
 	shmem_recalc_inode(inode);
-	if (entry->val) {
+	entry = shmem_swp_alloc(info, idx);
+	if (IS_ERR(entry)) {
+		spin_unlock(&info->lock);
+		return PTR_ERR(entry);
+	}
+	swap = *entry;
+
+	if (swap.val) {
 		/* Look it up and read it in.. */
-		page = find_get_page(&swapper_space, entry->val);
+		page = lookup_swap_cache(swap);
 		if (!page) {
-			swp_entry_t swap = *entry;
-			spin_unlock (&info->lock);
-			swapin_readahead(*entry);
-			page = read_swap_cache_async(*entry);
+			shmem_swp_unmap(entry);
+			spin_unlock(&info->lock);
+			swapin_readahead(swap);
+			page = read_swap_cache_async(swap);
 			if (!page) {
-				if (entry->val != swap.val)
-					goto repeat;
-				return ERR_PTR(-ENOMEM);
+				spin_lock(&info->lock);
+				entry = shmem_swp_alloc(info, idx);
+				if (IS_ERR(entry))
+					error = PTR_ERR(entry);
+				else {
+					if (entry->val == swap.val)
+						error = -ENOMEM;
+					shmem_swp_unmap(entry);
+				}
+				spin_unlock(&info->lock);
+				if (error)
+					return error;
+				goto repeat;
 			}
 			wait_on_page_locked(page);
-			if (!PageUptodate(page) && entry->val == swap.val) {
-				page_cache_release(page);
-				return ERR_PTR(-EIO);
-			}
-			
-			/* Too bad we can't trust this page, because we
-			 * dropped the info->lock spinlock */
 			page_cache_release(page);
 			goto repeat;
 		}

 		/* We have to do this with page locked to prevent races */
-		if (TestSetPageLocked(page))
-			goto wait_retry;
+		if (TestSetPageLocked(page)) {
+			shmem_swp_unmap(entry);
+			spin_unlock(&info->lock);
+			wait_on_page_locked(page);
+			page_cache_release(page);
+			goto repeat;
+		}
 		if (PageWriteback(page)) {
+			shmem_swp_unmap(entry);
 			spin_unlock(&info->lock);
 			wait_on_page_writeback(page);
 			unlock_page(page);
 			page_cache_release(page);
 			goto repeat;
 		}
-		error = move_from_swap_cache(page, idx, mapping);
-		if (error < 0) {
+
+		error = PageUptodate(page)?
+			move_from_swap_cache(page, idx, mapping): -EIO;
+		if (error) {
+			shmem_swp_unmap(entry);
 			spin_unlock(&info->lock);
 			unlock_page(page);
 			page_cache_release(page);
-			return ERR_PTR(error);
+			return error;
 		}

-		swap_free(*entry);
-		*entry = (swp_entry_t) {0};
-		info->swapped--;
-		spin_unlock (&info->lock);
+		shmem_swp_set(info, entry, 0);
+		shmem_swp_unmap(entry);
+		spin_unlock(&info->lock);
+		swap_free(swap);
 	} else {
+		shmem_swp_unmap(entry);
+		spin_unlock(&info->lock);
 		sbinfo = SHMEM_SB(inode->i_sb);
-		spin_unlock (&info->lock);
-		spin_lock (&sbinfo->stat_lock);
-		if (sbinfo->free_blocks == 0)
-			goto no_space;
+		spin_lock(&sbinfo->stat_lock);
+		if (sbinfo->free_blocks == 0) {
+			spin_unlock(&sbinfo->stat_lock);
+			return -ENOSPC;
+		}
 		sbinfo->free_blocks--;
-		spin_unlock (&sbinfo->stat_lock);
-
-		/* Ok, get a new page.  We don't have to worry about the
-		 * info->lock spinlock here: we cannot race against
-		 * shm_writepage because we have already verified that
-		 * there is no page present either in memory or in the
-		 * swap cache, so we are guaranteed to be populating a
-		 * new shm entry.  The inode semaphore we already hold
-		 * is enough to make this atomic. */
+		inode->i_blocks += BLOCKS_PER_PAGE;
+		spin_unlock(&sbinfo->stat_lock);
+
 		page = page_cache_alloc(mapping);
-		if (!page)
-			goto no_mem;
-		error = add_to_page_cache_lru(page, mapping, idx);
-		if (error < 0) {
+		if (!page) {
+			shmem_free_block(inode);
+			return -ENOMEM;
+		}
+
+		spin_lock(&info->lock);
+		entry = shmem_swp_alloc(info, idx);
+		if (IS_ERR(entry))
+			error = PTR_ERR(entry);
+		else {
+			swap = *entry;
+			shmem_swp_unmap(entry);
+		}
+		if (error || swap.val ||
+		    add_to_page_cache_lru(page, mapping, idx) < 0) {
+			spin_unlock(&info->lock);
 			page_cache_release(page);
-			goto no_mem;
+			shmem_free_block(inode);
+			if (error)
+				return error;
+			goto repeat;
 		}
+		info->alloced++;
+		spin_unlock(&info->lock);
 		clear_highpage(page);
-		inode->i_blocks += BLOCKS_PER_PAGE;
+		SetPageUptodate(page);
 	}

 	/* We have the page */
-	SetPageUptodate(page);
-	return page;
-
-no_mem:
-	spin_lock(&sbinfo->stat_lock);
-	sbinfo->free_blocks++;
-	spin_unlock(&sbinfo->stat_lock);
-	return ERR_PTR(-ENOMEM);
-
-no_space:
-	spin_unlock (&sbinfo->stat_lock);
-	return ERR_PTR(-ENOSPC);
-
-wait_retry:
-	spin_unlock(&info->lock);
-	wait_on_page_locked(page);
-	page_cache_release(page);
-	goto repeat;
+	*pagep = page;
+	return 0;
 }

-static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr)
+static struct page *shmem_holdpage(struct inode *inode, unsigned long idx)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	int error;
-
-	down (&info->sem);
-	*ptr = ERR_PTR(-EFAULT);
-	if (inode->i_size <= (loff_t) idx * PAGE_CACHE_SIZE)
-		goto failed;
-
-	*ptr = shmem_getpage_locked(info, inode, idx);
-	if (IS_ERR (*ptr))
-		goto failed;
+	struct page *page;
+	swp_entry_t *entry;
+	swp_entry_t swap = {0};

-	unlock_page(*ptr);
-	up (&info->sem);
-	return 0;
-failed:
-	up (&info->sem);
-	error = PTR_ERR(*ptr);
-	*ptr = NOPAGE_SIGBUS;
-	if (error == -ENOMEM)
-		*ptr = NOPAGE_OOM;
-	return error;
+	/*
+	 * Somehow, it feels wrong for truncation down to cause any
+	 * allocation: so instead of a blind shmem_getpage, check that
+	 * the page has actually been instantiated before holding it.
+	 */
+	spin_lock(&info->lock);
+	page = find_get_page(inode->i_mapping, idx);
+	if (!page) {
+		entry = shmem_swp_entry(info, idx, NULL);
+		if (entry) {
+			swap = *entry;
+			shmem_swp_unmap(entry);
+		}
+	}
+	spin_unlock(&info->lock);
+	if (swap.val) {
+		if (shmem_getpage(inode, idx, &page) == 0)
+			unlock_page(page);
+	}
+	return page;
 }

-struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int unused)
+struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
 {
-	struct page * page;
-	unsigned int idx;
-	struct inode * inode = vma->vm_file->f_dentry->d_inode;
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct page *page;
+	unsigned long idx;
+	int error;

-	idx = (address - vma->vm_start) >> PAGE_CACHE_SHIFT;
+	idx = (address - vma->vm_start) >> PAGE_SHIFT;
 	idx += vma->vm_pgoff;
+	idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;

-	if (shmem_getpage(inode, idx, &page))
-		return page;
+	if (((loff_t) idx << PAGE_CACHE_SHIFT) >= inode->i_size)
+		return NOPAGE_SIGBUS;

+	error = shmem_getpage(inode, idx, &page);
+	if (error)
+		return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
+
+	unlock_page(page);
 	flush_page_to_ram(page);
-	return(page);
+	return page;
 }

-void shmem_lock(struct file * file, int lock)
+void shmem_lock(struct file *file, int lock)
 {
-	struct inode * inode = file->f_dentry->d_inode;
-	struct shmem_inode_info * info = SHMEM_I(inode);
+	struct inode *inode = file->f_dentry->d_inode;
+	struct shmem_inode_info *info = SHMEM_I(inode);

 	spin_lock(&info->lock);
 	if (lock)
@@ -758,9 +921,9 @@ void shmem_lock(struct file * file, int lock)
 	spin_unlock(&info->lock);
 }

-static int shmem_mmap(struct file * file, struct vm_area_struct * vma)
+static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct vm_operations_struct * ops;
+	struct vm_operations_struct *ops;
 	struct inode *inode = file->f_dentry->d_inode;

 	ops = &shmem_vm_ops;
@@ -773,17 +936,17 @@ static int shmem_mmap(struct file * file, struct vm_area_struct * vma)

 struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev)
 {
-	struct inode * inode;
+	struct inode *inode;
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

-	spin_lock (&sbinfo->stat_lock);
+	spin_lock(&sbinfo->stat_lock);
 	if (!sbinfo->free_inodes) {
-		spin_unlock (&sbinfo->stat_lock);
+		spin_unlock(&sbinfo->stat_lock);
 		return NULL;
 	}
 	sbinfo->free_inodes--;
-	spin_unlock (&sbinfo->stat_lock);
+	spin_unlock(&sbinfo->stat_lock);

 	inode = new_inode(sb);
 	if (inode) {
@@ -797,12 +960,8 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev)
 		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		info = SHMEM_I(inode);
-		spin_lock_init (&info->lock);
-		sema_init (&info->sem, 1);
-		info->next_index = 0;
-		memset (info->i_direct, 0, sizeof(info->i_direct));
-		info->i_indirect = NULL;
-		info->swapped = 0;
+		memset(info, 0, (char *)inode - (char *)info);
+		spin_lock_init(&info->lock);
 		info->flags = VM_ACCOUNT;
 		switch (mode & S_IFMT) {
 		default:
@@ -811,12 +970,14 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev)
 		case S_IFREG:
 			inode->i_op = &shmem_inode_operations;
 			inode->i_fop = &shmem_file_operations;
-			spin_lock (&shmem_ilock);
-			list_add_tail(&SHMEM_I(inode)->list, &shmem_inodes);
-			spin_unlock (&shmem_ilock);
+			spin_lock(&shmem_ilock);
+			list_add_tail(&info->list, &shmem_inodes);
+			spin_unlock(&shmem_ilock);
 			break;
 		case S_IFDIR:
 			inode->i_nlink++;
+			/* Some things misbehave if size == 0 on a directory */
+			inode->i_size = 2 * BOGO_DIRENT_SIZE;
 			inode->i_op = &shmem_dir_inode_operations;
 			inode->i_fop = &simple_dir_operations;
 			break;
@@ -857,10 +1018,9 @@ static struct inode_operations shmem_symlink_inode_operations;
 static struct inode_operations shmem_symlink_inline_operations;

 static ssize_t
-shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
+shmem_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
 {
-	struct inode	*inode = file->f_dentry->d_inode; 
-	struct shmem_inode_info *info;
+	struct inode	*inode = file->f_dentry->d_inode;
 	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
 	loff_t		pos;
 	struct page	*page;
@@ -951,22 +1111,12 @@ shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
 			__get_user(dummy, buf+bytes-1);
 		}

-		info = SHMEM_I(inode);
-		down (&info->sem);
-		page = shmem_getpage_locked(info, inode, index);
-		up (&info->sem);
-
-		status = PTR_ERR(page);
-		if (IS_ERR(page))
+		status = shmem_getpage(inode, index, &page);
+		if (status)
 			break;

-		/* We have exclusive IO access to the page.. */
-		if (!PageLocked(page)) {
-			PAGE_BUG(page);
-		}
-
 		kaddr = kmap(page);
-		status = copy_from_user(kaddr+offset, buf, bytes);
+		status = __copy_from_user(kaddr+offset, buf, bytes);
 		kunmap(page);
 		if (status)
 			goto fail_write;
@@ -978,7 +1128,7 @@ shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
 			count -= bytes;
 			pos += bytes;
 			buf += bytes;
-			if (pos > inode->i_size) 
+			if (pos > inode->i_size)
 				inode->i_size = pos;
 		}
 unlock:
@@ -1005,7 +1155,7 @@ shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
 	goto unlock;
 }

-static void do_shmem_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc)
+static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct address_space *mapping = inode->i_mapping;
@@ -1022,17 +1172,35 @@ static void do_shmem_file_read(struct file * filp, loff_t *ppos, read_descriptor
 		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 		if (index > end_index)
 			break;
-		nr = PAGE_CACHE_SIZE;
 		if (index == end_index) {
 			nr = inode->i_size & ~PAGE_CACHE_MASK;
 			if (nr <= offset)
 				break;
 		}

-		nr = nr - offset;
-
-		if ((desc->error = shmem_getpage(inode, index, &page)))
+		desc->error = shmem_getpage(inode, index, &page);
+		if (desc->error) {
+			if (desc->error == -EFAULT)
+				desc->error = 0;
 			break;
+		}
+
+		/*
+		 * We must evaluate after, since reads (unlike writes)
+		 * are called without i_sem protection against truncate
+		 */
+		nr = PAGE_CACHE_SIZE;
+		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+		if (index == end_index) {
+			nr = inode->i_size & ~PAGE_CACHE_MASK;
+			if (nr <= offset) {
+				unlock_page(page);
+				page_cache_release(page);
+				break;
+			}
+		}
+		unlock_page(page);
+		nr -= offset;

 		if (!list_empty(&mapping->i_mmap_shared))
 			flush_dcache_page(page);
@@ -1051,7 +1219,7 @@ static void do_shmem_file_read(struct file * filp, loff_t *ppos, read_descriptor
 		offset += nr;
 		index += offset >> PAGE_CACHE_SHIFT;
 		offset &= ~PAGE_CACHE_MASK;
-	
+
 		page_cache_release(page);
 	}

@@ -1059,7 +1227,7 @@ static void do_shmem_file_read(struct file * filp, loff_t *ppos, read_descriptor
 	UPDATE_ATIME(inode);
 }

-static ssize_t shmem_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+static ssize_t shmem_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
 {
 	ssize_t retval;

@@ -1090,13 +1258,13 @@ static int shmem_statfs(struct super_block *sb, struct statfs *buf)

 	buf->f_type = TMPFS_MAGIC;
 	buf->f_bsize = PAGE_CACHE_SIZE;
-	spin_lock (&sbinfo->stat_lock);
+	spin_lock(&sbinfo->stat_lock);
 	buf->f_blocks = sbinfo->max_blocks;
 	buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
 	buf->f_files = sbinfo->max_inodes;
 	buf->f_ffree = sbinfo->free_inodes;
-	spin_unlock (&sbinfo->stat_lock);
-	buf->f_namelen = 255;
+	spin_unlock(&sbinfo->stat_lock);
+	buf->f_namelen = NAME_MAX;
 	return 0;
 }

@@ -1105,10 +1273,11 @@ static int shmem_statfs(struct super_block *sb, struct statfs *buf)
 */
 static int shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, int dev)
 {
-	struct inode * inode = shmem_get_inode(dir->i_sb, mode, dev);
+	struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
 	int error = -ENOSPC;

 	if (inode) {
+		dir->i_size += BOGO_DIRENT_SIZE;
 		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 		d_instantiate(dentry, inode);
 		dget(dentry); /* Extra count - pin the dentry in core */
@@ -1117,7 +1286,7 @@ static int shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, int d
 	return error;
 }

-static int shmem_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	int error;

@@ -1135,10 +1304,11 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, int mode)
 /*
 * Link a file..
 */
-static int shmem_link(struct dentry *old_dentry, struct inode * dir, struct dentry * dentry)
+static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = old_dentry->d_inode;

+	dir->i_size += BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	inode->i_nlink++;
 	atomic_inc(&inode->i_count);	/* New dentry reference */
@@ -1180,16 +1350,18 @@ static int shmem_empty(struct dentry *dentry)
 	return 1;
 }

-static int shmem_unlink(struct inode * dir, struct dentry *dentry)
+static int shmem_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
+
+	dir->i_size -= BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	inode->i_nlink--;
 	dput(dentry);	/* Undo the count from "create" - this does all the work */
 	return 0;
 }

-static int shmem_rmdir(struct inode * dir, struct dentry *dentry)
+static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	if (!shmem_empty(dentry))
 		return -ENOTEMPTY;
@@ -1204,36 +1376,39 @@ static int shmem_rmdir(struct inode * dir, struct dentry *dentry)
 * it exists so that the VFS layer correctly free's it when it
 * gets overwritten.
 */
-static int shmem_rename(struct inode * old_dir, struct dentry *old_dentry, struct inode * new_dir,struct dentry *new_dentry)
+static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
 {
-	struct inode *inode;
+	struct inode *inode = old_dentry->d_inode;
+	int they_are_dirs = S_ISDIR(inode->i_mode);

-	if (!shmem_empty(new_dentry)) 
+	if (!shmem_empty(new_dentry))
 		return -ENOTEMPTY;

-	inode = new_dentry->d_inode;
-	if (inode) {
-		inode->i_ctime = CURRENT_TIME;
-		inode->i_nlink--;
-		dput(new_dentry);
-	}
-	inode = old_dentry->d_inode;
-	if (S_ISDIR(inode->i_mode)) {
+	if (new_dentry->d_inode) {
+		(void) shmem_unlink(new_dir, new_dentry);
+		if (they_are_dirs)
+			old_dir->i_nlink--;
+	} else if (they_are_dirs) {
 		old_dir->i_nlink--;
 		new_dir->i_nlink++;
 	}

-	inode->i_ctime = old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+	old_dir->i_size -= BOGO_DIRENT_SIZE;
+	new_dir->i_size += BOGO_DIRENT_SIZE;
+	old_dir->i_ctime = old_dir->i_mtime =
+	new_dir->i_ctime = new_dir->i_mtime =
+	inode->i_ctime = CURRENT_TIME;
 	return 0;
 }

-static int shmem_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
+static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
+	int error;
 	int len;
 	struct inode *inode;
 	struct page *page;
 	char *kaddr;
-	struct shmem_inode_info * info;
+	struct shmem_inode_info *info;

 	len = strlen(symname) + 1;
 	if (len > PAGE_CACHE_SIZE)
@@ -1254,26 +1429,24 @@ static int shmem_symlink(struct inode * dir, struct dentry *dentry, const char *
 			iput(inode);
 			return -ENOMEM;
 		}
-		down(&info->sem);
-		page = shmem_getpage_locked(info, inode, 0);
-		if (IS_ERR(page)) {
-			up(&info->sem);
+		error = shmem_getpage(inode, 0, &page);
+		if (error) {
 			vm_unacct_memory(VM_ACCT(1));
 			iput(inode);
-			return PTR_ERR(page);
+			return error;
 		}
 		inode->i_op = &shmem_symlink_inode_operations;
-		spin_lock (&shmem_ilock);
+		spin_lock(&shmem_ilock);
 		list_add_tail(&info->list, &shmem_inodes);
-		spin_unlock (&shmem_ilock);
+		spin_unlock(&shmem_ilock);
 		kaddr = kmap(page);
 		memcpy(kaddr, symname, len);
 		kunmap(page);
 		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
-		up(&info->sem);
 	}
+	dir->i_size += BOGO_DIRENT_SIZE;
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	d_instantiate(dentry, inode);
 	dget(dentry);
@@ -1282,7 +1455,7 @@ static int shmem_symlink(struct inode * dir, struct dentry *dentry, const char *

 static int shmem_readlink_inline(struct dentry *dentry, char *buffer, int buflen)
 {
-	return vfs_readlink(dentry,buffer,buflen, (const char *)SHMEM_I(dentry->d_inode));
+	return vfs_readlink(dentry, buffer, buflen, (const char *)SHMEM_I(dentry->d_inode));
 }

 static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
@@ -1292,27 +1465,26 @@ static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)

 static int shmem_readlink(struct dentry *dentry, char *buffer, int buflen)
 {
-	struct page * page;
+	struct page *page;
 	int res = shmem_getpage(dentry->d_inode, 0, &page);
-
 	if (res)
 		return res;
-
-	res = vfs_readlink(dentry,buffer,buflen, kmap(page));
+	res = vfs_readlink(dentry, buffer, buflen, kmap(page));
 	kunmap(page);
+	unlock_page(page);
 	page_cache_release(page);
 	return res;
 }

 static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-	struct page * page;
+	struct page *page;
 	int res = shmem_getpage(dentry->d_inode, 0, &page);
 	if (res)
 		return res;
-
 	res = vfs_follow_link(nd, kmap(page));
 	kunmap(page);
+	unlock_page(page);
 	page_cache_release(page);
 	return res;
 }
@@ -1328,7 +1500,7 @@ static struct inode_operations shmem_symlink_inode_operations = {
 	.follow_link	= shmem_follow_link,
 };

-static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long * blocks, unsigned long *inodes)
+static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
 {
 	char *this_char, *value, *rest;

@@ -1338,8 +1510,8 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid,
 		if ((value = strchr(this_char,'=')) != NULL) {
 			*value++ = 0;
 		} else {
-			printk(KERN_ERR 
-			    "tmpfs: No value for mount option '%s'\n", 
+			printk(KERN_ERR
+			    "tmpfs: No value for mount option '%s'\n",
 			    this_char);
 			return 1;
 		}
@@ -1385,13 +1557,13 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid,
 	return 0;

 bad_val:
-	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 
+	printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
 	       value, this_char);
 	return 1;

 }

-static int shmem_remount_fs (struct super_block *sb, int *flags, char *data)
+static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 	unsigned long max_blocks = sbinfo->max_blocks;
@@ -1402,16 +1574,16 @@ static int shmem_remount_fs (struct super_block *sb, int *flags, char *data)
 	return shmem_set_size(sbinfo, max_blocks, max_inodes);
 }

-int shmem_sync_file(struct file * file, struct dentry *dentry, int datasync)
+int shmem_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	return 0;
 }
 #endif

-static int shmem_fill_super(struct super_block * sb, void * data, int silent)
+static int shmem_fill_super(struct super_block *sb, void *data, int silent)
 {
-	struct inode * inode;
-	struct dentry * root;
+	struct inode *inode;
+	struct dentry *root;
 	unsigned long blocks, inodes;
 	int mode   = S_IRWXUGO | S_ISVTX;
 	uid_t uid = current->fsuid;
@@ -1442,7 +1614,7 @@ static int shmem_fill_super(struct super_block * sb, void * data, int silent)
 	sb->s_flags |= MS_NOUSER;
 #endif

-	spin_lock_init (&sbinfo->stat_lock);
+	spin_lock_init(&sbinfo->stat_lock);
 	sbinfo->max_blocks = blocks;
 	sbinfo->free_blocks = blocks;
 	sbinfo->max_inodes = inodes;
@@ -1482,7 +1654,7 @@ static void shmem_put_super(struct super_block *sb)
 	sb->u.generic_sbp = NULL;
 }

-static kmem_cache_t * shmem_inode_cachep;
+static kmem_cache_t *shmem_inode_cachep;

 static struct inode *shmem_alloc_inode(struct super_block *sb)
 {
@@ -1498,7 +1670,7 @@ static void shmem_destroy_inode(struct inode *inode)
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }

-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
 {
 	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;

@@ -1507,7 +1679,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 		inode_init_once(&p->vfs_inode);
 	}
 }
- 
+
 static int init_inodecache(void)
 {
 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
@@ -1527,6 +1699,8 @@ static void destroy_inodecache(void)

 static struct address_space_operations shmem_aops = {
 	.writepage	= shmem_writepage,
+	.writepages	= shmem_writepages,
+	.vm_writeback	= shmem_vm_writeback,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 };

@@ -1600,7 +1774,7 @@ static struct vfsmount *shm_mnt;
 static int __init init_shmem_fs(void)
 {
 	int error;
-	struct vfsmount * res;
+	struct vfsmount *res;

 	error = init_inodecache();
 	if (error)
@@ -1608,21 +1782,21 @@ static int __init init_shmem_fs(void)

 	error = register_filesystem(&tmpfs_fs_type);
 	if (error) {
-		printk (KERN_ERR "Could not register tmpfs\n");
+		printk(KERN_ERR "Could not register tmpfs\n");
 		goto out2;
 	}
 #ifdef CONFIG_TMPFS
 	error = register_filesystem(&shmem_fs_type);
 	if (error) {
-		printk (KERN_ERR "Could not register shm fs\n");
+		printk(KERN_ERR "Could not register shm fs\n");
 		goto out1;
 	}
-	devfs_mk_dir (NULL, "shm", NULL);
+	devfs_mk_dir(NULL, "shm", NULL);
 #endif
 	res = kern_mount(&tmpfs_fs_type);
 	if (IS_ERR (res)) {
 		error = PTR_ERR(res);
-		printk (KERN_ERR "could not kern_mount tmpfs\n");
+		printk(KERN_ERR "could not kern_mount tmpfs\n");
 		goto out;
 	}
 	shm_mnt = res;
@@ -1663,11 +1837,11 @@ module_exit(exit_shmem_fs)
 * @size: size to be set for the file
 *
 */
-struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags)
+struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 {
 	int error;
 	struct file *file;
-	struct inode * inode;
+	struct inode *inode;
 	struct dentry *dentry, *root;
 	struct qstr this;

@@ -1693,7 +1867,7 @@ struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags)

 	error = -ENOSPC;
 	inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
-	if (!inode) 
+	if (!inode)
 		goto close_file;

 	SHMEM_I(inode)->flags &= flags;
@@ -1709,11 +1883,11 @@ struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags)
 close_file:
 	put_filp(file);
 put_dentry:
-	dput (dentry);
+	dput(dentry);
 put_memory:
 	if (flags & VM_ACCOUNT)
 		vm_unacct_memory(VM_ACCT(size));
-	return ERR_PTR(error);	
+	return ERR_PTR(error);
 }

 /*
@@ -1725,13 +1899,13 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 {
 	struct file *file;
 	loff_t size = vma->vm_end - vma->vm_start;
-	
+
 	file = shmem_file_setup("dev/zero", size, vma->vm_flags);
 	if (IS_ERR(file))
 		return PTR_ERR(file);

 	if (vma->vm_file)
-		fput (vma->vm_file);
+		fput(vma->vm_file);
 	vma->vm_file = file;
 	vma->vm_ops = &shmem_vm_ops;
 	return 0;

--- a/mm/swap.c
+++ b/mm/swap.c
@@ -38,7 +38,7 @@ void activate_page(struct page *page)
 		del_page_from_inactive_list(zone, page);
 		SetPageActive(page);
 		add_page_to_active_list(zone, page);
-		KERNEL_STAT_INC(pgactivate);
+		inc_page_state(pgactivate);
 	}
 	spin_unlock_irq(&zone->lru_lock);
 }
@@ -238,6 +238,29 @@ void pagevec_strip(struct pagevec *pvec)
 	}
 }

+/**
+ * pagevec_lookup - gang pagecache lookup
+ * @pvec:	Where the resulting pages are placed
+ * @mapping:	The address_space to search
+ * @start:	The starting page index
+ * @nr_pages:	The maximum number of pages
+ *
+ * pagevec_lookup() will search for and return a group of up to @nr_pages pages
+ * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
+ * reference against the pages in @pvec.
+ *
+ * The pages come back in ascending index order (the lookup itself is done by
+ * find_get_pages()); not-present pages may leave holes in the indices.
+ *
+ * pagevec_lookup() returns the number of pages which were found.
+ */
+unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned int nr_pages)
+{
+	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
 /*
 * Perform any setup for the swap system
 */

--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -105,7 +105,6 @@ void __delete_from_swap_cache(struct page *page)
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!PageSwapCache(page));
 	BUG_ON(PageWriteback(page));
-	ClearPageDirty(page);
 	__remove_from_page_cache(page);
 	INC_CACHE_INFO(del_total);
 }
@@ -146,30 +145,30 @@ int add_to_swap(struct page * page)
 		pf_flags = current->flags;
 		current->flags &= ~PF_MEMALLOC;
 		current->flags |= PF_NOWARN;
-		ClearPageUptodate(page);		/* why? */

 		/*
 		 * Add it to the swap cache and mark it dirty
-		 * (adding to the page cache will clear the dirty
-		 * and uptodate bits, so we need to do it again)
 		 */
-		switch (add_to_swap_cache(page, entry)) {
+		switch (add_to_page_cache(page, &swapper_space, entry.val)) {
 		case 0:				/* Success */
 			current->flags = pf_flags;
 			SetPageUptodate(page);
+			ClearPageDirty(page);
 			set_page_dirty(page);
-			swap_free(entry);
+			INC_CACHE_INFO(add_total);
 			return 1;
-		case -ENOMEM:			/* radix-tree allocation */
+		case -EEXIST:
+			/* Raced with "speculative" read_swap_cache_async */
+			current->flags = pf_flags;
+			INC_CACHE_INFO(exist_race);
+			swap_free(entry);
+			continue;
+		default:
+			/* -ENOMEM radix-tree allocation failure */
 			current->flags = pf_flags;
 			swap_free(entry);
 			return 0;
-		default:			/* ENOENT: raced */
-			break;
 		}
-		/* Raced with "speculative" read_swap_cache_async */
-		current->flags = pf_flags;
-		swap_free(entry);
 	}
 }

@@ -203,33 +202,13 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
 	void **pslot;
 	int err;

-	if (!mapping)
-		BUG();
-
-	if (!swap_duplicate(entry)) {
-		INC_CACHE_INFO(noent_race);
-		return -ENOENT;
-	}
-
 	write_lock(&swapper_space.page_lock);
 	write_lock(&mapping->page_lock);

 	err = radix_tree_reserve(&swapper_space.page_tree, entry.val, &pslot);
 	if (!err) {
-		/* Remove it from the page cache */
 		__remove_from_page_cache(page);
-
-		/* Add it to the swap cache */
 		*pslot = page;
-		/*
-		 * This code used to clear PG_uptodate, PG_error, PG_arch1,
-		 * PG_referenced and PG_checked.  What _should_ it clear?
-		 */
-		ClearPageUptodate(page);
-		ClearPageReferenced(page);
-
-		SetPageLocked(page);
-		ClearPageDirty(page);
 		___add_to_page_cache(page, &swapper_space, entry.val);
 	}

@@ -237,21 +216,21 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
 	write_unlock(&swapper_space.page_lock);

 	if (!err) {
+		if (!swap_duplicate(entry))
+			BUG();
+		/* shift page from clean_pages to dirty_pages list */
+		BUG_ON(PageDirty(page));
+		set_page_dirty(page);
 		INC_CACHE_INFO(add_total);
-		return 0;
-	}
-
-	swap_free(entry);
-
-	if (err == -EEXIST)
+	} else if (err == -EEXIST)
 		INC_CACHE_INFO(exist_race);
-
 	return err;
 }

 int move_from_swap_cache(struct page *page, unsigned long index,
 		struct address_space *mapping)
 {
+	swp_entry_t entry;
 	void **pslot;
 	int err;

@@ -259,44 +238,27 @@ int move_from_swap_cache(struct page *page, unsigned long index,
 	BUG_ON(PageWriteback(page));
 	BUG_ON(page_has_buffers(page));

+	entry.val = page->index;
+
 	write_lock(&swapper_space.page_lock);
 	write_lock(&mapping->page_lock);

 	err = radix_tree_reserve(&mapping->page_tree, index, &pslot);
 	if (!err) {
-		swp_entry_t entry;
-
-		entry.val = page->index;
 		__delete_from_swap_cache(page);
-
 		*pslot = page;
-
-		/*
-		 * This code used to clear PG_uptodate, PG_error, PG_referenced,
-		 * PG_arch_1 and PG_checked.  It's not really clear why.
-		 */
-		ClearPageUptodate(page);
-		ClearPageReferenced(page);
-
-		/*
-		 * ___add_to_page_cache puts the page on ->clean_pages,
-		 * but it's dirty.  If it's on ->clean_pages, it will basically
-		 * never get written out.
-		 */
-		SetPageDirty(page);
 		___add_to_page_cache(page, mapping, index);
-		/* fix that up */
-		list_move(&page->list, &mapping->dirty_pages);
-		write_unlock(&mapping->page_lock);
-		write_unlock(&swapper_space.page_lock);
-
-		/* Do this outside ->page_lock */
-		swap_free(entry);
-		return 0;
 	}

 	write_unlock(&mapping->page_lock);
 	write_unlock(&swapper_space.page_lock);
+
+	if (!err) {
+		swap_free(entry);
+		/* shift page from clean_pages to dirty_pages list */
+		ClearPageDirty(page);
+		set_page_dirty(page);
+	}
 	return err;
 }


--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -531,6 +531,7 @@ static int try_to_unuse(unsigned int type)
 	int i = 0;
 	int retval = 0;
 	int reset_overflow = 0;
+	int shmem;

 	/*
 	 * When searching mms for an entry, a good strategy is to
@@ -611,11 +612,12 @@ static int try_to_unuse(unsigned int type)
 		 * Whenever we reach init_mm, there's no address space
 		 * to search, but use it as a reminder to search shmem.
 		 */
+		shmem = 0;
 		swcount = *swap_map;
 		if (swcount > 1) {
 			flush_page_to_ram(page);
 			if (start_mm == &init_mm)
-				shmem_unuse(entry, page);
+				shmem = shmem_unuse(entry, page);
 			else
 				unuse_process(start_mm, entry, page);
 		}
@@ -632,7 +634,9 @@ static int try_to_unuse(unsigned int type)
 				swcount = *swap_map;
 				if (mm == &init_mm) {
 					set_start_mm = 1;
-					shmem_unuse(entry, page);
+					spin_unlock(&mmlist_lock);
+					shmem = shmem_unuse(entry, page);
+					spin_lock(&mmlist_lock);
 				} else
 					unuse_process(mm, entry, page);
 				if (set_start_mm && *swap_map < swcount) {
@@ -681,15 +685,24 @@ static int try_to_unuse(unsigned int type)
 		 * read from disk into another page.  Splitting into two
 		 * pages would be incorrect if swap supported "shared
 		 * private" pages, but they are handled by tmpfs files.
-		 * Note shmem_unuse already deleted its from swap cache.
+		 *
+		 * Note shmem_unuse already deleted a swappage from
+		 * the swap cache, unless the move to filepage failed:
+		 * in which case it left swappage in cache, lowered its
+		 * swap count to pass quickly through the loops above,
+		 * and now we must reincrement count to try again later.
 		 */
 		if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
 			swap_writepage(page);
 			lock_page(page);
 			wait_on_page_writeback(page);
 		}
-		if (PageSwapCache(page))
-			delete_from_swap_cache(page);
+		if (PageSwapCache(page)) {
+			if (shmem)
+				swap_duplicate(entry);
+			else
+				delete_from_swap_cache(page);
+		}

 		/*
 		 * So we could skip searching mms once swap count went

--- a/mm/truncate.c
+++ b/mm/truncate.c
+/*
+ * mm/truncate.c - code for taking down pages from address_spaces
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 10Sep2002	akpm@zip.com.au
+ *		Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/buffer_head.h>	/* grr. try_to_release_page,
+				   block_invalidatepage */
+
+
+/*
+ * Run the invalidatepage operation for @page, passing @offset through.
+ * A mapping whose a_ops has no ->invalidatepage gets block_invalidatepage.
+ * Returns whatever the chosen operation returns.
+ */
+static int do_invalidatepage(struct page *page, unsigned long offset)
+{
+	int (*op)(struct page *, unsigned long);
+
+	op = page->mapping->a_ops->invalidatepage;
+	return op ? op(page, offset) : block_invalidatepage(page, offset);
+}
+
+/*
+ * Deal with the page that straddles the truncation point.  @partial and
+ * PAGE_CACHE_SIZE - @partial (presumably offset and length of the tail to be
+ * cleared) are handed to memclear_highpage_flush(); a PagePrivate page then
+ * also gets the invalidatepage operation, starting at @partial.
+ */
+static inline void truncate_partial_page(struct page *page, unsigned partial)
+{
+	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE - partial);
+	if (!PagePrivate(page))
+		return;
+	do_invalidatepage(page, partial);
+}
+
+/*
+ * Take a whole page out of the pagecache: run the invalidatepage operation
+ * (from offset 0) if the page is PagePrivate, clear its dirty and uptodate
+ * state, remove it from the page cache and call page_cache_release() on it
+ * (presumably dropping the pagecache's own reference - confirm).  The return
+ * value of the invalidatepage operation is ignored.
+ *
+ * If truncate cannot remove the fs-private metadata from the page, the page
+ * becomes anonymous.  It will be left on the LRU and may even be mapped into
+ * user pagetables if we're racing with filemap_nopage().
+ *
+ * Every caller in this file holds the page lock across the call.
+ */
+static void truncate_complete_page(struct page *page)
+{
+	if (PagePrivate(page))
+		do_invalidatepage(page, 0);
+
+	clear_page_dirty(page);
+	ClearPageUptodate(page);
+	remove_from_page_cache(page);
+	page_cache_release(page);
+}
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Truncate the page cache at a set offset, removing the pages that are beyond
+ * that offset (and zeroing out partial pages).
+ *
+ * Truncate takes two passes - the first pass is nonblocking.  It will not
+ * block on page locks and it will not block on writeback.  The second pass
+ * will wait.  This is to prevent as much IO as possible in the affected region.
+ * The first pass will remove most pages, so the search cost of the second pass
+ * is low.
+ *
+ * Whole pages are removed from index "start" (@lstart rounded up to a page
+ * boundary) onwards.  If @lstart is not page-aligned, the page at index
+ * start - 1 is handled between the two passes: once found and locked,
+ * writeback is waited for and the page goes through truncate_partial_page().
+ *
+ * The second pass rescans from "start" each time a lookup comes back empty,
+ * and only finishes when a lookup beginning at "start" itself finds nothing.
+ * A message is printed if @lstart is 0 and the mapping still reports pages
+ * afterwards.
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
+	struct pagevec pvec;
+	pgoff_t next;
+	int i;
+
+	pagevec_init(&pvec);
+	next = start;
+	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			next = page->index + 1;
+			if (TestSetPageLocked(page))
+				continue;
+			if (PageWriteback(page)) {
+				unlock_page(page);
+				continue;
+			}
+			truncate_complete_page(page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	if (partial) {
+		struct page *page = find_lock_page(mapping, start - 1);
+		if (page) {
+			wait_on_page_writeback(page);
+			truncate_partial_page(page, partial);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	}
+
+	next = start;
+	for ( ; ; ) {
+		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+			if (next == start)
+				break;
+			next = start;
+			continue;
+		}
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			lock_page(page);
+			wait_on_page_writeback(page);
+			next = page->index + 1;
+			truncate_complete_page(page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+	}
+	if (lstart == 0 && mapping->nrpages)
+		printk("%s: I goofed!\n", __FUNCTION__);
+}
+
+/**
+ * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space whose pages we want to invalidate
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ *
+ * invalidate_inode_pages() will not block on IO activity. It will not
+ * invalidate pages which are dirty, locked, under writeback or mapped into
+ * pagetables.  A PagePrivate page is also left alone when
+ * try_to_release_page() fails on it.
+ *
+ * A page which is found already locked is skipped, and the search index is
+ * only advanced by one for it rather than to that page's index + 1.
+ */
+void invalidate_inode_pages(struct address_space *mapping)
+{
+	struct pagevec pvec;
+	pgoff_t next = 0;
+	int i;
+
+	pagevec_init(&pvec);
+	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			if (TestSetPageLocked(page)) {
+				next++;
+				continue;
+			}
+			next = page->index + 1;
+			if (PageDirty(page) || PageWriteback(page))
+				goto unlock;
+			if (PagePrivate(page) && !try_to_release_page(page, 0))
+				goto unlock;
+			if (page_mapped(page))
+				goto unlock;
+			truncate_complete_page(page);
+unlock:
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+/**
+ * invalidate_inode_pages2 - remove all unmapped pages from an address_space
+ * @mapping: the address_space
+ *
+ * invalidate_inode_pages2() is like truncate_inode_pages(), except for the case
+ * where the page is seen to be mapped into process pagetables.  In that case,
+ * the page is marked clean but is left attached to its address_space.
+ *
+ * Unlike invalidate_inode_pages(), this blocks: it waits for each page's lock
+ * and for writeback to finish.  A page whose ->mapping has gone by the time
+ * its lock is obtained (a truncate race) is left alone.
+ *
+ * FIXME: invalidate_inode_pages2() is probably trivially livelockable.
+ */
+void invalidate_inode_pages2(struct address_space *mapping)
+{
+	struct pagevec pvec;
+	pgoff_t next = 0;
+	int i;
+
+	pagevec_init(&pvec);
+	/*
+	 * pagevec_lookup() returns the number of pages found: keep going while
+	 * it finds pages, and stop once a lookup comes back empty.
+	 */
+	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			lock_page(page);
+			if (page->mapping) {	/* truncate race? */
+				wait_on_page_writeback(page);
+				next = page->index + 1;
+				if (page_mapped(page))
+					clear_page_dirty(page);
+				else
+					truncate_complete_page(page);
+			}
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -312,8 +312,10 @@ shrink_list(struct list_head *page_list, int nr_pages,
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
-	KERNEL_STAT_ADD(pgsteal, nr_pages_in - nr_pages);
-	KERNEL_STAT_ADD(pgactivate, pgactivate);
+	mod_page_state(pgsteal, nr_pages_in - nr_pages);
+	if (current->flags & PF_KSWAPD)
+		mod_page_state(kswapd_steal, nr_pages_in - nr_pages);
+	mod_page_state(pgactivate, pgactivate);
 	return nr_pages;
 }

@@ -380,7 +382,7 @@ shrink_cache(int nr_pages, struct zone *zone,
 			goto done;

 		max_scan -= nr_scan;
-		KERNEL_STAT_ADD(pgscan, nr_scan);
+		mod_page_state(pgscan, nr_scan);
 		nr_pages = shrink_list(&page_list, nr_pages,
 				gfp_mask, &max_scan, nr_mapped);

@@ -527,8 +529,8 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
 	spin_unlock_irq(&zone->lru_lock);
 	pagevec_release(&pvec);

-	KERNEL_STAT_ADD(pgscan, nr_pages_in - nr_pages);
-	KERNEL_STAT_ADD(pgdeactivate, pgdeactivate);
+	mod_page_state(pgrefill, nr_pages_in - nr_pages);
+	mod_page_state(pgdeactivate, pgdeactivate);
 }

 static /* inline */ int
@@ -641,7 +643,7 @@ try_to_free_pages(struct zone *classzone,
 	int priority = DEF_PRIORITY;
 	int nr_pages = SWAP_CLUSTER_MAX;

-	KERNEL_STAT_INC(pageoutrun);
+	inc_page_state(pageoutrun);

 	for (priority = DEF_PRIORITY; priority; priority--) {
 		int total_scanned = 0;
@@ -757,7 +759,7 @@ int kswapd(void *p)
 	 * us from recursively trying to free more memory as we're
 	 * trying to free the first piece of memory in the first place).
 	 */
-	tsk->flags |= PF_MEMALLOC;
+	tsk->flags |= PF_MEMALLOC|PF_KSWAPD;

 	/*
 	 * Kswapd main loop.