Commit fda0b1ed authored by Linus Torvalds

Merge penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/andrew

into penguin.transmeta.com:/home/penguin/torvalds/repositories/kernel/linux
parents 81ad17d7 5de3d3bd
@@ -31,7 +31,7 @@ al espa
 Eine deutsche Version dieser Datei finden Sie unter
 <http://www.stefan-winter.de/Changes-2.4.0.txt>.
-Last updated: January 22, 2002
+Last updated: October 1st, 2002
 Chris Ricker (kaboom@gatech.edu or chris.ricker@genetics.utah.edu).
@@ -60,6 +60,7 @@ o xfsprogs             2.1.0                  # xfs_db -V
 o pcmcia-cs            3.1.21                 # cardmgr -V
 o PPP                  2.4.0                  # pppd --version
 o isdn4k-utils         3.1pre1                # isdnctrl 2>&1|grep version
+o procps               2.0.9                  # ps --version
 Kernel compilation
 ==================
@@ -80,9 +81,7 @@ almost certainly bugs (mainly, but not exclusively, in the kernel) that
 will need to be fixed in order to use these compilers.  In any case, using
 pgcc instead of plain gcc is just asking for trouble.
-Note that gcc 2.7.2.3 and gcc 2.91.66 (egcs-1.1.2) are no longer supported
-kernel compilers.  The kernel no longer works around bugs in these versions,
-and, in fact, will refuse to be compiled with it.
+gcc 2.91.66 (egcs-1.1.2) continues to be supported for SPARC64 requirements.
 The Red Hat gcc 2.96 compiler subtree can also be used to build this tree.
 You should ensure you use gcc-2.96-74 or later.  gcc-2.96-54 will not build
......
@@ -191,6 +191,7 @@ Table 1-3: Kernel info in /proc
 ..............................................................................
 File        Content
 apm         Advanced power management info
+buddyinfo   Kernel memory allocator information (see text)      (2.5)
 bus         Directory containing bus specific information
 cmdline     Kernel command line
 cpuinfo     Info about the CPU
@@ -275,7 +276,7 @@ output of a SMP machine):
 ERR: 2155
 NMI is incremented in this case because every timer interrupt generates a NMI
-(Non Maskable Interrupt) which is used by the NMI Watchdog to detect lookups.
+(Non Maskable Interrupt) which is used by the NMI Watchdog to detect lockups.
 LOC is the local interrupt counter of the internal APIC of every CPU.
@@ -326,6 +327,25 @@ Linux uses slab pools for memory management above page level in version 2.2.
 Commonly used objects have their own slab pool (such as network buffers,
 directory cache, and so on).
+..............................................................................
+
+> cat /proc/buddyinfo
+
+Node 0, zone      DMA      0      4      5      4      4      3 ...
+Node 0, zone   Normal      1      0      0      1    101      8 ...
+Node 0, zone  HighMem      2      0      0      1      1      0 ...
+
+Memory fragmentation is a problem under some workloads, and buddyinfo is a
+useful tool for helping diagnose these problems.  Buddyinfo will give you a
+clue as to how big an area you can safely allocate, or why a previous
+allocation failed.
+
+Each column represents the number of pages of a certain order which are
+available.  In this case, there are 0 chunks of 2^0*PAGE_SIZE available in
+ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE
+available in ZONE_NORMAL, etc...
 1.3 IDE devices in /proc/ide
 ----------------------------
......
@@ -25,13 +25,13 @@ key: If a user application wants to share hugepages with other
 memory (mapped by hugeTLBs) in their address space.  When a process
 forks, then children share the same physical memory with their parent.
-For the cases when an application wishes to keep the huge pages
-private, the key value of 0 is defined.  In this case kernel allocates
-hugetlb pages to the process that are not shareable across different
-processes.  These segments are marked private for the process.  These
-segments are not copied to children's address space on forks.
-
-AKPM: So what is present at that address within the child?
+For the cases when an application wishes to keep the huge
+pages private, the key value of 0 is defined.  In this case
+kernel allocates hugetlb pages to the process that are not
+shareable across different processes.  These segments are marked
+private for the process.  These segments are not copied to
+children's address space on forks - the child will have no
+mapping for these virtual addresses.
 The key management (and assignment) part is left to user
 applications.
......
@@ -56,6 +56,7 @@ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 {
        struct thread_struct * t = &current->thread;
        struct tss_struct * tss;
+       unsigned long *bitmap = NULL;
        int ret = 0;
        if ((from + num <= from) || (from + num > IO_BITMAP_SIZE*32))
@@ -63,15 +64,12 @@ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on)
        if (turn_on && !capable(CAP_SYS_RAWIO))
                return -EPERM;
-       tss = init_tss + get_cpu();
        /*
         * If it's the first ioperm() call in this thread's lifetime, set the
         * IO bitmap up.  ioperm() is much less timing critical than clone(),
         * this is why we delay this operation until now:
         */
        if (!t->ts_io_bitmap) {
-               unsigned long *bitmap;
                bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
                if (!bitmap) {
                        ret = -ENOMEM;
@@ -83,20 +81,19 @@ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on)
                 */
                memset(bitmap, 0xff, IO_BITMAP_BYTES);
                t->ts_io_bitmap = bitmap;
-               /*
-                * this activates it in the TSS
-                */
-               tss->bitmap = IO_BITMAP_OFFSET;
        }
+       tss = init_tss + get_cpu();
+       if (bitmap)
+               tss->bitmap = IO_BITMAP_OFFSET; /* Activate it in the TSS */
        /*
         * do it in the per-thread copy and in the TSS ...
         */
        set_bitmap(t->ts_io_bitmap, from, num, !turn_on);
        set_bitmap(tss->io_bitmap, from, num, !turn_on);
-out:
        put_cpu();
+out:
        return ret;
 }
......
@@ -58,6 +58,7 @@ static int queue_nr_requests;
 static int batch_requests;
 unsigned long blk_max_low_pfn, blk_max_pfn;
+atomic_t nr_iowait_tasks = ATOMIC_INIT(0);
 int blk_nohighio = 0;
 static struct congestion_state {
@@ -116,6 +117,27 @@ static void set_queue_congested(request_queue_t *q, int rw)
        atomic_inc(&congestion_states[rw].nr_congested_queues);
 }
+/*
+ * This task is about to go to sleep on IO.  Increment nr_iowait_tasks so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+void io_schedule(void)
+{
+       atomic_inc(&nr_iowait_tasks);
+       schedule();
+       atomic_dec(&nr_iowait_tasks);
+}
+
+void io_schedule_timeout(long timeout)
+{
+       atomic_inc(&nr_iowait_tasks);
+       schedule_timeout(timeout);
+       atomic_dec(&nr_iowait_tasks);
+}
 /**
  * bdev_get_queue: - return the queue that matches the given device
  * @bdev: device
@@ -1274,7 +1296,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw)
                prepare_to_wait_exclusive(&rl->wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (!rl->count)
-                       schedule();
+                       io_schedule();
                finish_wait(&rl->wait, &wait);
                spin_lock_irq(q->queue_lock);
                rq = get_request(q, rw);
@@ -1497,7 +1519,7 @@ void blk_congestion_wait(int rw, long timeout)
        blk_run_queues();
        prepare_to_wait(&cs->wqh, &wait, TASK_UNINTERRUPTIBLE);
        if (atomic_read(&cs->nr_congested_queues) != 0)
-               schedule_timeout(timeout);
+               io_schedule_timeout(timeout);
        finish_wait(&cs->wqh, &wait);
 }
@@ -1856,21 +1878,14 @@ int submit_bio(int rw, struct bio *bio)
 {
        int count = bio_sectors(bio);
-       /*
-        * do some validity checks...
-        */
        BUG_ON(!bio->bi_end_io);
        BIO_BUG_ON(!bio->bi_size);
        BIO_BUG_ON(!bio->bi_io_vec);
        bio->bi_rw = rw;
        if (rw & WRITE)
-               kstat.pgpgout += count;
+               mod_page_state(pgpgout, count);
        else
-               kstat.pgpgin += count;
+               mod_page_state(pgpgin, count);
        generic_make_request(bio);
        return 1;
 }
......
@@ -443,7 +443,7 @@ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
         * We really want to use invalidate_inode_pages2() for
         * that, but not until that's cleaned up.
         */
-       invalidate_inode_pages(bdev->bd_inode);
+       invalidate_inode_pages(bdev->bd_inode->i_mapping);
 }
 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
......
@@ -18,16 +18,11 @@
 #include <linux/bio.h>
 #include <linux/wait.h>
 #include <linux/err.h>
+#include <linux/blkdev.h>
 #include <linux/buffer_head.h>
 #include <linux/rwsem.h>
 #include <asm/atomic.h>
-/*
- * The largest-sized BIO which this code will assemble, in bytes.  Set this
- * to PAGE_SIZE if your drivers are broken.
- */
-#define DIO_BIO_MAX_SIZE (16*1024)
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
  * the size of a structure on the stack.
@@ -37,7 +32,6 @@
 struct dio {
        /* BIO submission state */
        struct bio *bio;                /* bio under assembly */
-       struct bio_vec *bvec;           /* current bvec in that bio */
        struct inode *inode;
        int rw;
        unsigned blkbits;               /* doesn't change */
@@ -179,15 +173,10 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
                return -ENOMEM;
        bio->bi_bdev = bdev;
-       bio->bi_vcnt = nr_vecs;
-       bio->bi_idx = 0;
-       bio->bi_size = 0;
        bio->bi_sector = first_sector;
-       bio->bi_io_vec[0].bv_page = NULL;
        bio->bi_end_io = dio_bio_end_io;
        dio->bio = bio;
-       dio->bvec = NULL;               /* debug */
        return 0;
 }
@@ -195,14 +184,11 @@ static void dio_bio_submit(struct dio *dio)
 {
        struct bio *bio = dio->bio;
-       bio->bi_vcnt = bio->bi_idx;
-       bio->bi_idx = 0;
        bio->bi_private = dio;
        atomic_inc(&dio->bio_count);
        submit_bio(dio->rw, bio);
        dio->bio = NULL;
-       dio->bvec = NULL;
        dio->boundary = 0;
 }
@@ -230,7 +216,7 @@ static struct bio *dio_await_one(struct dio *dio)
                dio->waiter = current;
                spin_unlock_irqrestore(&dio->bio_list_lock, flags);
                blk_run_queues();
-               schedule();
+               io_schedule();
                spin_lock_irqsave(&dio->bio_list_lock, flags);
                dio->waiter = NULL;
        }
@@ -393,8 +379,7 @@ static void dio_prep_bio(struct dio *dio)
        if (dio->bio == NULL)
                return;
-       if (dio->bio->bi_idx == dio->bio->bi_vcnt ||
-           dio->boundary ||
+       if (dio->boundary ||
            dio->last_block_in_bio != dio->next_block_in_bio - 1)
                dio_bio_submit(dio);
 }
@@ -405,19 +390,44 @@ static void dio_prep_bio(struct dio *dio)
 static int dio_new_bio(struct dio *dio)
 {
        sector_t sector;
-       int ret;
+       int ret, nr_pages;
        ret = dio_bio_reap(dio);
        if (ret)
                goto out;
        sector = dio->next_block_in_bio << (dio->blkbits - 9);
-       ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector,
-                               DIO_BIO_MAX_SIZE / PAGE_SIZE);
+       nr_pages = min(dio->total_pages, BIO_MAX_PAGES);
+       ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
        dio->boundary = 0;
 out:
        return ret;
 }
+static int
+dio_bio_add_page(struct dio *dio, struct page *page,
+               unsigned int bv_len, unsigned int bv_offset)
+{
+       int ret = 0;
+
+       if (bv_len == 0)
+               goto out;
+
+       page_cache_get(page);
+       if (bio_add_page(dio->bio, page, bv_len, bv_offset)) {
+               dio_bio_submit(dio);
+               ret = dio_new_bio(dio);
+               if (ret == 0) {
+                       ret = bio_add_page(dio->bio, page, bv_len, bv_offset);
+                       BUG_ON(ret != 0);
+               }
+       }
+       page_cache_release(page);
+out:
+       return ret;
+}
 /*
  * Walk the user pages, and the file, mapping blocks to disk and emitting BIOs.
  *
@@ -438,13 +448,15 @@ int do_direct_IO(struct dio *dio)
        const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
        struct page *page;
        unsigned block_in_page;
-       int ret;
+       int ret = 0;
        /* The I/O can start at any block offset within the first page */
        block_in_page = dio->first_block_in_page;
        while (dio->block_in_file < dio->final_block_in_request) {
                int new_page;   /* Need to insert this page into the BIO? */
+               unsigned int bv_offset;
+               unsigned int bv_len;
                page = dio_get_page(dio);
                if (IS_ERR(page)) {
@@ -453,15 +465,16 @@ int do_direct_IO(struct dio *dio)
                }
                new_page = 1;
+               bv_offset = 0;
+               bv_len = 0;
                while (block_in_page < blocks_per_page) {
-                       struct bio *bio;
                        unsigned this_chunk_bytes;      /* # of bytes mapped */
                        unsigned this_chunk_blocks;     /* # of blocks */
                        unsigned u;
                        ret = get_more_blocks(dio);
                        if (ret)
-                               goto fail_release;
+                               goto out;
                        /* Handle holes */
                        if (!buffer_mapped(&dio->map_bh)) {
@@ -480,24 +493,19 @@ int do_direct_IO(struct dio *dio)
                        if (dio->bio == NULL) {
                                ret = dio_new_bio(dio);
                                if (ret)
-                                       goto fail_release;
+                                       goto out;
                                new_page = 1;
                        }
-                       bio = dio->bio;
                        if (new_page) {
-                               dio->bvec = &bio->bi_io_vec[bio->bi_idx];
-                               page_cache_get(page);
-                               dio->bvec->bv_page = page;
-                               dio->bvec->bv_len = 0;
-                               dio->bvec->bv_offset = block_in_page << blkbits;
-                               bio->bi_idx++;
+                               bv_len = 0;
+                               bv_offset = block_in_page << blkbits;
                                new_page = 0;
                        }
                        /* Work out how much disk we can add to this page */
                        this_chunk_blocks = dio->blocks_available;
-                       u = (PAGE_SIZE - (dio->bvec->bv_offset + dio->bvec->bv_len)) >> blkbits;
+                       u = (PAGE_SIZE - (bv_len + bv_offset)) >> blkbits;
                        if (this_chunk_blocks > u)
                                this_chunk_blocks = u;
                        u = dio->final_block_in_request - dio->block_in_file;
@@ -506,8 +514,7 @@ int do_direct_IO(struct dio *dio)
                        this_chunk_bytes = this_chunk_blocks << blkbits;
                        BUG_ON(this_chunk_bytes == 0);
-                       dio->bvec->bv_len += this_chunk_bytes;
-                       bio->bi_size += this_chunk_bytes;
+                       bv_len += this_chunk_bytes;
                        dio->next_block_in_bio += this_chunk_blocks;
                        dio->last_block_in_bio = dio->next_block_in_bio - 1;
                        dio->boundary = buffer_boundary(&dio->map_bh);
@@ -520,13 +527,11 @@ int do_direct_IO(struct dio *dio)
                        if (dio->block_in_file == dio->final_block_in_request)
                                break;
                }
+               ret = dio_bio_add_page(dio, page, bv_len, bv_offset);
+               if (ret)
+                       goto out;
                block_in_page = 0;
-               page_cache_release(page);
        }
-       ret = 0;
-       goto out;
-fail_release:
-       page_cache_release(page);
 out:
        return ret;
 }
@@ -542,7 +547,6 @@ direct_io_worker(int rw, struct inode *inode, const struct iovec *iov,
        size_t bytes, tot_bytes = 0;
        dio.bio = NULL;
-       dio.bvec = NULL;
        dio.inode = inode;
        dio.rw = rw;
        dio.blkbits = blkbits;
......
@@ -147,10 +147,12 @@ static void destroy_inode(struct inode *inode)
        if (inode_has_buffers(inode))
                BUG();
        security_ops->inode_free_security(inode);
-       if (inode->i_sb->s_op->destroy_inode)
+       if (inode->i_sb->s_op->destroy_inode) {
                inode->i_sb->s_op->destroy_inode(inode);
-       else
+       } else {
+               BUG_ON(inode->i_data.page_tree.rnode != NULL);
                kmem_cache_free(inode_cachep, (inode));
+       }
 }
......
@@ -301,7 +301,7 @@ jffs_setattr(struct dentry *dentry, struct iattr *iattr)
                inode->i_blocks = (inode->i_size + 511) >> 9;
                if (len) {
-                       invalidate_inode_pages(inode);
+                       invalidate_inode_pages(inode->i_mapping);
                }
                inode->i_ctime = CURRENT_TIME;
                inode->i_mtime = inode->i_ctime;
@@ -1520,7 +1520,7 @@ jffs_file_write(struct file *filp, const char *buf, size_t count,
        }
        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
        mark_inode_dirty(inode);
-       invalidate_inode_pages(inode);
+       invalidate_inode_pages(inode->i_mapping);
 out_isem:
        return err;
......
@@ -125,14 +125,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
         * through inode->i_sem or some other mechanism.
         */
        if (page->index == 0)
-               invalidate_inode_pages(inode);
+               invalidate_inode_pages(inode->i_mapping);
        unlock_page(page);
        return 0;
 error:
        SetPageError(page);
        kunmap(page);
        unlock_page(page);
-       invalidate_inode_pages(inode);
+       invalidate_inode_pages(inode->i_mapping);
        desc->error = error;
        return -EIO;
 }
......
@@ -564,7 +564,7 @@ nfs_zap_caches(struct inode *inode)
        NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
        NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
-       invalidate_inode_pages(inode);
+       invalidate_inode_pages(inode->i_mapping);
        memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
        NFS_CACHEINV(inode);
@@ -1130,7 +1130,7 @@ __nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
        if (invalid) {
                NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
                NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
-               invalidate_inode_pages(inode);
+               invalidate_inode_pages(inode->i_mapping);
                memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
        } else if (time_after(jiffies, NFS_ATTRTIMEO_UPDATE(inode)+NFS_ATTRTIMEO(inode))) {
                if ((NFS_ATTRTIMEO(inode) <<= 1) > NFS_MAXATTRTIMEO(inode))
......
@@ -252,6 +252,18 @@ static struct file_operations proc_cpuinfo_operations = {
        .release        = seq_release,
 };
+
+extern struct seq_operations vmstat_op;
+static int vmstat_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &vmstat_op);
+}
+
+static struct file_operations proc_vmstat_file_operations = {
+       open:           vmstat_open,
+       read:           seq_read,
+       llseek:         seq_lseek,
+       release:        seq_release,
+};
 #ifdef CONFIG_PROC_HARDWARE
 static int hardware_read_proc(char *page, char **start, off_t off,
                                int count, int *eof, void *data)
@@ -327,7 +339,7 @@ static int kstat_read_proc(char *page, char **start, off_t off,
        int i, len;
        extern unsigned long total_forks;
        unsigned long jif = jiffies;
-       unsigned int sum = 0, user = 0, nice = 0, system = 0;
+       unsigned int sum = 0, user = 0, nice = 0, system = 0, idle = 0, iowait = 0;
        int major, disk;
        for (i = 0 ; i < NR_CPUS; i++) {
@@ -337,38 +349,32 @@ static int kstat_read_proc(char *page, char **start, off_t off,
                user += kstat.per_cpu_user[i];
                nice += kstat.per_cpu_nice[i];
                system += kstat.per_cpu_system[i];
+               idle += kstat.per_cpu_idle[i];
+               iowait += kstat.per_cpu_iowait[i];
 #if !defined(CONFIG_ARCH_S390)
                for (j = 0 ; j < NR_IRQS ; j++)
                        sum += kstat.irqs[i][j];
 #endif
        }
-       len = sprintf(page, "cpu %u %u %u %lu\n",
+       len = sprintf(page, "cpu %u %u %u %u %u\n",
                jiffies_to_clock_t(user),
                jiffies_to_clock_t(nice),
                jiffies_to_clock_t(system),
-               jiffies_to_clock_t(jif * num_online_cpus() - (user + nice + system)));
+               jiffies_to_clock_t(idle),
+               jiffies_to_clock_t(iowait));
        for (i = 0 ; i < NR_CPUS; i++){
                if (!cpu_online(i)) continue;
-               len += sprintf(page + len, "cpu%d %u %u %u %lu\n",
+               len += sprintf(page + len, "cpu%d %u %u %u %u %u\n",
                        i,
                        jiffies_to_clock_t(kstat.per_cpu_user[i]),
                        jiffies_to_clock_t(kstat.per_cpu_nice[i]),
                        jiffies_to_clock_t(kstat.per_cpu_system[i]),
-                       jiffies_to_clock_t(jif - (kstat.per_cpu_user[i] \
-                                  + kstat.per_cpu_nice[i] \
-                                  + kstat.per_cpu_system[i])));
+                       jiffies_to_clock_t(kstat.per_cpu_idle[i]),
+                       jiffies_to_clock_t(kstat.per_cpu_iowait[i]));
        }
-       len += sprintf(page + len,
-               "page %u %u\n"
-               "swap %u %u\n"
-               "intr %u",
-               kstat.pgpgin >> 1,
-               kstat.pgpgout >> 1,
-               kstat.pswpin,
-               kstat.pswpout,
-               sum
-       );
+       len += sprintf(page + len, "intr %u", sum);
 #if !defined(CONFIG_ARCH_S390)
        for (i = 0 ; i < NR_IRQS ; i++)
                len += sprintf(page + len, " %u", kstat_irqs(i));
@@ -395,29 +401,9 @@ static int kstat_read_proc(char *page, char **start, off_t off,
        }
        len += sprintf(page + len,
-               "\npageallocs %u\n"
-               "pagefrees %u\n"
-               "pageactiv %u\n"
-               "pagedeact %u\n"
-               "pagefault %u\n"
-               "majorfault %u\n"
-               "pagescan %u\n"
-               "pagesteal %u\n"
-               "pageoutrun %u\n"
-               "allocstall %u\n"
-               "ctxt %lu\n"
+               "\nctxt %lu\n"
                "btime %lu\n"
                "processes %lu\n",
-               kstat.pgalloc,
-               kstat.pgfree,
-               kstat.pgactivate,
-               kstat.pgdeactivate,
-               kstat.pgfault,
-               kstat.pgmajfault,
-               kstat.pgscan,
-               kstat.pgsteal,
-               kstat.pageoutrun,
-               kstat.allocstall,
                nr_context_switches(),
                xtime.tv_sec - jif / HZ,
                total_forks);
@@ -646,6 +632,7 @@ void __init proc_misc_init(void)
        create_seq_entry("interrupts", 0, &proc_interrupts_operations);
        create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
        create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
+       create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
 #ifdef CONFIG_MODULES
        create_seq_entry("modules", 0, &proc_modules_operations);
        create_seq_entry("ksyms", 0, &proc_ksyms_operations);
......
@@ -210,7 +210,7 @@ smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
                       (long) last_sz, (long) inode->i_size);
                if (!S_ISDIR(inode->i_mode))
-                       invalidate_inode_pages(inode);
+                       invalidate_inode_pages(inode->i_mapping);
        }
 }
@@ -274,7 +274,7 @@ smb_refresh_inode(struct dentry *dentry)
         * But we do want to invalidate the caches ...
         */
        if (!S_ISDIR(inode->i_mode))
-               invalidate_inode_pages(inode);
+               invalidate_inode_pages(inode->i_mapping);
        else
                smb_invalid_dir_cache(inode);
        error = -EIO;
......
@@ -400,4 +400,8 @@ static inline void put_dev_sector(Sector p)
        page_cache_release(p.v);
 }
+extern atomic_t nr_iowait_tasks;
+void io_schedule(void);
+void io_schedule_timeout(long timeout);
+
 #endif
@@ -1140,7 +1140,7 @@ extern int full_check_disk_change(struct block_device *);
 extern int __check_disk_change(dev_t);
 extern int invalidate_inodes(struct super_block *);
 extern int invalidate_device(kdev_t, int);
-extern void invalidate_inode_pages(struct inode *);
+extern void invalidate_inode_pages(struct address_space *mapping);
 extern void invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
 extern int filemap_fdatawrite(struct address_space *);
......
@@ -18,19 +18,14 @@
 struct kernel_stat {
        unsigned int per_cpu_user[NR_CPUS],
                     per_cpu_nice[NR_CPUS],
-                    per_cpu_system[NR_CPUS];
+                    per_cpu_system[NR_CPUS],
+                    per_cpu_idle[NR_CPUS],
+                    per_cpu_iowait[NR_CPUS];
        unsigned int dk_drive[DK_MAX_MAJOR][DK_MAX_DISK];
        unsigned int dk_drive_rio[DK_MAX_MAJOR][DK_MAX_DISK];
        unsigned int dk_drive_wio[DK_MAX_MAJOR][DK_MAX_DISK];
        unsigned int dk_drive_rblk[DK_MAX_MAJOR][DK_MAX_DISK];
        unsigned int dk_drive_wblk[DK_MAX_MAJOR][DK_MAX_DISK];
-       unsigned int pgpgin, pgpgout;
-       unsigned int pswpin, pswpout;
-       unsigned int pgalloc, pgfree;
-       unsigned int pgactivate, pgdeactivate;
-       unsigned int pgfault, pgmajfault;
-       unsigned int pgscan, pgsteal;
-       unsigned int pageoutrun, allocstall;
 #if !defined(CONFIG_ARCH_S390)
        unsigned int irqs[NR_CPUS][NR_IRQS];
 #endif
......
...@@ -70,7 +70,8 @@ ...@@ -70,7 +70,8 @@
#define PG_direct 16 /* ->pte_chain points directly at pte */ #define PG_direct 16 /* ->pte_chain points directly at pte */
/* /*
* Global page accounting. One instance per CPU. * Global page accounting. One instance per CPU. Only unsigned longs are
* allowed.
*/ */
extern struct page_state { extern struct page_state {
unsigned long nr_dirty; unsigned long nr_dirty;
...@@ -80,9 +81,32 @@ extern struct page_state { ...@@ -80,9 +81,32 @@ extern struct page_state {
unsigned long nr_reverse_maps; unsigned long nr_reverse_maps;
unsigned long nr_mapped; unsigned long nr_mapped;
unsigned long nr_slab; unsigned long nr_slab;
#define GET_PAGE_STATE_LAST nr_slab
/*
* The below are zeroed by get_page_state(). Use get_full_page_state()
* to add up all these.
*/
unsigned long pgpgin;
unsigned long pgpgout;
unsigned long pswpin;
unsigned long pswpout;
unsigned long pgalloc;
unsigned long pgfree;
unsigned long pgactivate;
unsigned long pgdeactivate;
unsigned long pgfault;
unsigned long pgmajfault;
unsigned long pgscan;
unsigned long pgrefill;
unsigned long pgsteal;
unsigned long kswapd_steal;
unsigned long pageoutrun;
unsigned long allocstall;
} ____cacheline_aligned_in_smp page_states[NR_CPUS]; } ____cacheline_aligned_in_smp page_states[NR_CPUS];
extern void get_page_state(struct page_state *ret); extern void get_page_state(struct page_state *ret);
extern void get_full_page_state(struct page_state *ret);
#define mod_page_state(member, delta) \ #define mod_page_state(member, delta) \
do { \ do { \
......
...@@ -41,6 +41,9 @@ extern struct page * find_trylock_page(struct address_space *mapping, ...@@ -41,6 +41,9 @@ extern struct page * find_trylock_page(struct address_space *mapping,
unsigned long index); unsigned long index);
extern struct page * find_or_create_page(struct address_space *mapping, extern struct page * find_or_create_page(struct address_space *mapping,
unsigned long index, unsigned int gfp_mask); unsigned long index, unsigned int gfp_mask);
extern unsigned int find_get_pages(struct address_space *mapping,
pgoff_t start, unsigned int nr_pages,
struct page **pages);
/* /*
* Returns locked page at given index in given cache, creating it if needed. * Returns locked page at given index in given cache, creating it if needed.
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#define PAGEVEC_SIZE 16 #define PAGEVEC_SIZE 16
struct page; struct page;
struct address_space;
struct pagevec { struct pagevec {
unsigned nr; unsigned nr;
...@@ -21,6 +22,8 @@ void __pagevec_lru_add(struct pagevec *pvec); ...@@ -21,6 +22,8 @@ void __pagevec_lru_add(struct pagevec *pvec);
void lru_add_drain(void); void lru_add_drain(void);
void pagevec_deactivate_inactive(struct pagevec *pvec); void pagevec_deactivate_inactive(struct pagevec *pvec);
void pagevec_strip(struct pagevec *pvec); void pagevec_strip(struct pagevec *pvec);
unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned int nr_pages);
static inline void pagevec_init(struct pagevec *pvec) static inline void pagevec_init(struct pagevec *pvec)
{ {
......
...@@ -45,5 +45,8 @@ extern int radix_tree_reserve(struct radix_tree_root *, unsigned long, void ***) ...@@ -45,5 +45,8 @@ extern int radix_tree_reserve(struct radix_tree_root *, unsigned long, void ***)
extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long); extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
extern int radix_tree_delete(struct radix_tree_root *, unsigned long); extern int radix_tree_delete(struct radix_tree_root *, unsigned long);
extern unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items);
#endif /* _LINUX_RADIX_TREE_H */ #endif /* _LINUX_RADIX_TREE_H */
...@@ -430,6 +430,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) ...@@ -430,6 +430,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
#define PF_FROZEN 0x00040000 /* frozen for system suspend */ #define PF_FROZEN 0x00040000 /* frozen for system suspend */
#define PF_SYNC 0x00080000 /* performing fsync(), etc */ #define PF_SYNC 0x00080000 /* performing fsync(), etc */
#define PF_FSTRANS 0x00100000 /* inside a filesystem transaction */ #define PF_FSTRANS 0x00100000 /* inside a filesystem transaction */
#define PF_KSWAPD 0x00200000 /* I am kswapd */
/* /*
* Ptrace flags * Ptrace flags
......
...@@ -11,11 +11,11 @@ extern atomic_t shmem_nrpages; ...@@ -11,11 +11,11 @@ extern atomic_t shmem_nrpages;
struct shmem_inode_info { struct shmem_inode_info {
spinlock_t lock; spinlock_t lock;
struct semaphore sem;
unsigned long next_index; unsigned long next_index;
swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */
void **i_indirect; /* indirect blocks */ struct page *i_indirect; /* indirect blocks */
unsigned long swapped; unsigned long alloced; /* data pages allocated to file */
unsigned long swapped; /* subtotal assigned to swap */
unsigned long flags; unsigned long flags;
struct list_head list; struct list_head list;
struct inode vfs_inode; struct inode vfs_inode;
......
...@@ -218,7 +218,7 @@ extern spinlock_t swaplock; ...@@ -218,7 +218,7 @@ extern spinlock_t swaplock;
#define swap_device_lock(p) spin_lock(&p->sdev_lock) #define swap_device_lock(p) spin_lock(&p->sdev_lock)
#define swap_device_unlock(p) spin_unlock(&p->sdev_lock) #define swap_device_unlock(p) spin_unlock(&p->sdev_lock)
extern void shmem_unuse(swp_entry_t entry, struct page *page); extern int shmem_unuse(swp_entry_t entry, struct page *page);
#endif /* __KERNEL__*/ #endif /* __KERNEL__*/
......
...@@ -555,8 +555,6 @@ static int init(void * unused) ...@@ -555,8 +555,6 @@ static int init(void * unused)
unlock_kernel(); unlock_kernel();
system_running = 1; system_running = 1;
kstat.pgfree = 0;
if (open("/dev/console", O_RDWR, 0) < 0) if (open("/dev/console", O_RDWR, 0) < 0)
printk("Warning: unable to open an initial console.\n"); printk("Warning: unable to open an initial console.\n");
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include <linux/kernel_stat.h> #include <linux/kernel_stat.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/notifier.h> #include <linux/notifier.h>
#include <linux/blkdev.h>
#include <linux/delay.h> #include <linux/delay.h>
#include <linux/timer.h> #include <linux/timer.h>
...@@ -866,6 +867,10 @@ void scheduler_tick(int user_ticks, int sys_ticks) ...@@ -866,6 +867,10 @@ void scheduler_tick(int user_ticks, int sys_ticks)
/* note: this timer irq context must be accounted for as well */ /* note: this timer irq context must be accounted for as well */
if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET) if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
kstat.per_cpu_system[cpu] += sys_ticks; kstat.per_cpu_system[cpu] += sys_ticks;
else if (atomic_read(&nr_iowait_tasks) > 0)
kstat.per_cpu_iowait[cpu] += sys_ticks;
else
kstat.per_cpu_idle[cpu] += sys_ticks;
#if CONFIG_SMP #if CONFIG_SMP
idle_tick(rq); idle_tick(rq);
#endif #endif
......
...@@ -43,6 +43,7 @@ struct radix_tree_path { ...@@ -43,6 +43,7 @@ struct radix_tree_path {
}; };
#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
/* /*
* Radix tree node cache. * Radix tree node cache.
...@@ -218,9 +219,113 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) ...@@ -218,9 +219,113 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
return (void *) *slot; return (void *) *slot;
} }
EXPORT_SYMBOL(radix_tree_lookup); EXPORT_SYMBOL(radix_tree_lookup);
static /* inline */ unsigned int
__lookup(struct radix_tree_root *root, void **results, unsigned long index,
unsigned int max_items, unsigned long *next_index,
unsigned long max_index)
{
unsigned int nr_found = 0;
unsigned int shift;
unsigned int height = root->height;
struct radix_tree_node *slot;
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
slot = root->rnode;
while (height > 0) {
unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
if (slot->slots[i] != NULL)
break;
index &= ~((1 << shift) - 1);
index += 1 << shift;
}
if (i == RADIX_TREE_MAP_SIZE)
goto out;
height--;
shift -= RADIX_TREE_MAP_SHIFT;
if (height == 0) {
/* Bottom level: grab some items */
unsigned long j;
BUG_ON((shift + RADIX_TREE_MAP_SHIFT) != 0);
j = index & RADIX_TREE_MAP_MASK;
for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
index++;
if (slot->slots[j]) {
results[nr_found++] = slot->slots[j];
if (nr_found == max_items)
goto out;
}
}
}
slot = slot->slots[i];
}
out:
*next_index = index;
return nr_found;
}
/**
* radix_tree_gang_lookup - perform multiple lookup on a radix tree
* @root: radix tree root
* @results: where the results of the lookup are placed
* @first_index: start the lookup from this key
* @max_items: place up to this many items at *results
*
* Performs an index-ascending scan of the tree for present items. Places
* them at *@results and returns the number of items which were placed at
* *@results.
*
* The implementation is naive.
*/
unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items)
{
const unsigned long max_index = radix_tree_maxindex(root->height);
unsigned long cur_index = first_index;
unsigned int ret = 0;
if (root->rnode == NULL)
goto out;
if (max_index == 0) { /* Bah. Special case */
if (first_index == 0) {
if (max_items > 0) {
*results = root->rnode;
ret = 1;
}
}
goto out;
}
while (ret < max_items) {
unsigned int nr_found;
unsigned long next_index; /* Index of next search */
if (cur_index > max_index)
break;
nr_found = __lookup(root, results + ret, cur_index,
max_items - ret, &next_index, max_index);
if (nr_found == 0) {
if (!(cur_index & RADIX_TREE_MAP_MASK))
break;
/*
* It could be that there simply were no items to the
* right of `cur_index' in the leaf node. So we still
* need to search for additional nodes to the right of
* this one.
*/
}
ret += nr_found;
cur_index = next_index;
}
out:
return ret;
}
EXPORT_SYMBOL(radix_tree_gang_lookup);
/** /**
* radix_tree_delete - delete an item from a radix tree * radix_tree_delete - delete an item from a radix tree
...@@ -231,7 +336,7 @@ EXPORT_SYMBOL(radix_tree_lookup); ...@@ -231,7 +336,7 @@ EXPORT_SYMBOL(radix_tree_lookup);
*/ */
int radix_tree_delete(struct radix_tree_root *root, unsigned long index) int radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{ {
struct radix_tree_path path[RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2], *pathp = path; struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
unsigned int height, shift; unsigned int height, shift;
height = root->height; height = root->height;
......
...@@ -9,6 +9,7 @@ obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ ...@@ -9,6 +9,7 @@ obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \ shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
pdflush.o page-writeback.o rmap.o madvise.o vcache.o pdflush.o page-writeback.o rmap.o madvise.o vcache.o \
truncate.o
include $(TOPDIR)/Rules.make include $(TOPDIR)/Rules.make
...@@ -1216,7 +1216,7 @@ static int do_swap_page(struct mm_struct * mm, ...@@ -1216,7 +1216,7 @@ static int do_swap_page(struct mm_struct * mm,
/* Had to read the page from swap area: Major fault */ /* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR; ret = VM_FAULT_MAJOR;
KERNEL_STAT_INC(pgmajfault); inc_page_state(pgmajfault);
} }
mark_page_accessed(page); mark_page_accessed(page);
...@@ -1461,7 +1461,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, ...@@ -1461,7 +1461,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
current->state = TASK_RUNNING; current->state = TASK_RUNNING;
pgd = pgd_offset(mm, address); pgd = pgd_offset(mm, address);
KERNEL_STAT_INC(pgfault); inc_page_state(pgfault);
/* /*
* We need the page table lock to synchronize with kswapd * We need the page table lock to synchronize with kswapd
* and the SMP-safe atomic PTE updates. * and the SMP-safe atomic PTE updates.
......
...@@ -186,8 +186,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, ...@@ -186,8 +186,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
/* /*
* Try to merge with the previous vma. * Try to merge with the previous vma.
*/ */
if (mprotect_attempt_merge(vma, *pprev, end, newflags)) if (mprotect_attempt_merge(vma, *pprev, end, newflags)) {
vma = *pprev;
goto success; goto success;
}
} else { } else {
error = split_vma(mm, vma, start, 1); error = split_vma(mm, vma, start, 1);
if (error) if (error)
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
*/ */
#include <linux/config.h> #include <linux/config.h>
#include <linux/kernel_stat.h> #include <linux/stddef.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/swap.h> #include <linux/swap.h>
#include <linux/interrupt.h> #include <linux/interrupt.h>
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <linux/suspend.h> #include <linux/suspend.h>
#include <linux/pagevec.h> #include <linux/pagevec.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/slab.h>
unsigned long totalram_pages; unsigned long totalram_pages;
unsigned long totalhigh_pages; unsigned long totalhigh_pages;
...@@ -86,7 +87,7 @@ void __free_pages_ok (struct page *page, unsigned int order) ...@@ -86,7 +87,7 @@ void __free_pages_ok (struct page *page, unsigned int order)
struct page *base; struct page *base;
struct zone *zone; struct zone *zone;
KERNEL_STAT_ADD(pgfree, 1<<order); mod_page_state(pgfree, 1<<order);
BUG_ON(PageLRU(page)); BUG_ON(PageLRU(page));
BUG_ON(PagePrivate(page)); BUG_ON(PagePrivate(page));
...@@ -324,7 +325,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, ...@@ -324,7 +325,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
if (gfp_mask & __GFP_WAIT) if (gfp_mask & __GFP_WAIT)
might_sleep(); might_sleep();
KERNEL_STAT_ADD(pgalloc, 1<<order); mod_page_state(pgalloc, 1<<order);
zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
classzone = zones[0]; classzone = zones[0];
...@@ -397,7 +398,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, ...@@ -397,7 +398,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
if (!(gfp_mask & __GFP_WAIT)) if (!(gfp_mask & __GFP_WAIT))
goto nopage; goto nopage;
KERNEL_STAT_INC(allocstall); inc_page_state(allocstall);
page = balance_classzone(classzone, gfp_mask, order, &freed); page = balance_classzone(classzone, gfp_mask, order, &freed);
if (page) if (page)
return page; return page;
...@@ -555,28 +556,39 @@ unsigned int nr_free_highpages (void) ...@@ -555,28 +556,39 @@ unsigned int nr_free_highpages (void)
struct page_state page_states[NR_CPUS] __cacheline_aligned; struct page_state page_states[NR_CPUS] __cacheline_aligned;
EXPORT_SYMBOL(page_states); EXPORT_SYMBOL(page_states);
void get_page_state(struct page_state *ret) void __get_page_state(struct page_state *ret, int nr)
{ {
int pcpu; int cpu;
memset(ret, 0, sizeof(*ret)); memset(ret, 0, sizeof(*ret));
for (pcpu = 0; pcpu < NR_CPUS; pcpu++) { for (cpu = 0; cpu < NR_CPUS; cpu++) {
struct page_state *ps; unsigned long *in, *out, off;
if (!cpu_online(pcpu)) if (!cpu_online(cpu))
continue; continue;
ps = &page_states[pcpu]; in = (unsigned long *)(page_states + cpu);
ret->nr_dirty += ps->nr_dirty; out = (unsigned long *)ret;
ret->nr_writeback += ps->nr_writeback; for (off = 0; off < nr; off++)
ret->nr_pagecache += ps->nr_pagecache; *out++ += *in++;
ret->nr_page_table_pages += ps->nr_page_table_pages;
ret->nr_reverse_maps += ps->nr_reverse_maps;
ret->nr_mapped += ps->nr_mapped;
ret->nr_slab += ps->nr_slab;
} }
} }
void get_page_state(struct page_state *ret)
{
int nr;
nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
nr /= sizeof(unsigned long);
__get_page_state(ret, nr + 1);
}
void get_full_page_state(struct page_state *ret)
{
__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
}
void get_zone_counts(unsigned long *active, unsigned long *inactive) void get_zone_counts(unsigned long *active, unsigned long *inactive)
{ {
struct zone *zone; struct zone *zone;
...@@ -1048,4 +1060,76 @@ struct seq_operations fragmentation_op = { ...@@ -1048,4 +1060,76 @@ struct seq_operations fragmentation_op = {
.show = frag_show, .show = frag_show,
}; };
static char *vmstat_text[] = {
"nr_dirty",
"nr_writeback",
"nr_pagecache",
"nr_page_table_pages",
"nr_reverse_maps",
"nr_mapped",
"nr_slab",
"pgpgin",
"pgpgout",
"pswpin",
"pswpout",
"pgalloc",
"pgfree",
"pgactivate",
"pgdeactivate",
"pgfault",
"pgmajfault",
"pgscan",
"pgrefill",
"pgsteal",
"kswapd_steal",
"pageoutrun",
"allocstall",
};
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
struct page_state *ps;
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
ps = kmalloc(sizeof(*ps), GFP_KERNEL);
m->private = ps;
if (!ps)
return ERR_PTR(-ENOMEM);
get_full_page_state(ps);
return (unsigned long *)ps + *pos;
}
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
{
(*pos)++;
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
return (unsigned long *)m->private + *pos;
}
static int vmstat_show(struct seq_file *m, void *arg)
{
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
return 0;
}
static void vmstat_stop(struct seq_file *m, void *arg)
{
kfree(m->private);
m->private = NULL;
}
struct seq_operations vmstat_op = {
.start = vmstat_start,
.next = vmstat_next,
.stop = vmstat_stop,
.show = vmstat_show,
};
#endif /* CONFIG_PROC_FS */ #endif /* CONFIG_PROC_FS */
...@@ -100,7 +100,7 @@ int swap_writepage(struct page *page) ...@@ -100,7 +100,7 @@ int swap_writepage(struct page *page)
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
kstat.pswpout++; inc_page_state(pswpout);
SetPageWriteback(page); SetPageWriteback(page);
unlock_page(page); unlock_page(page);
submit_bio(WRITE, bio); submit_bio(WRITE, bio);
...@@ -119,7 +119,7 @@ int swap_readpage(struct file *file, struct page *page) ...@@ -119,7 +119,7 @@ int swap_readpage(struct file *file, struct page *page)
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
} }
kstat.pswpin++; inc_page_state(pswpin);
submit_bio(READ, bio); submit_bio(READ, bio);
out: out:
return ret; return ret;
......
...@@ -274,11 +274,11 @@ void page_remove_rmap(struct page * page, pte_t * ptep) ...@@ -274,11 +274,11 @@ void page_remove_rmap(struct page * page, pte_t * ptep)
BUG(); BUG();
if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
return; return;
if (!page_mapped(page))
return; /* remap_page_range() from a driver? */
pte_chain_lock(page); pte_chain_lock(page);
BUG_ON(page->pte.direct == 0);
if (PageDirect(page)) { if (PageDirect(page)) {
if (page->pte.direct == pte_paddr) { if (page->pte.direct == pte_paddr) {
page->pte.direct = 0; page->pte.direct = 0;
......
...@@ -38,7 +38,7 @@ void activate_page(struct page *page) ...@@ -38,7 +38,7 @@ void activate_page(struct page *page)
del_page_from_inactive_list(zone, page); del_page_from_inactive_list(zone, page);
SetPageActive(page); SetPageActive(page);
add_page_to_active_list(zone, page); add_page_to_active_list(zone, page);
KERNEL_STAT_INC(pgactivate); inc_page_state(pgactivate);
} }
spin_unlock_irq(&zone->lru_lock); spin_unlock_irq(&zone->lru_lock);
} }
...@@ -238,6 +238,29 @@ void pagevec_strip(struct pagevec *pvec) ...@@ -238,6 +238,29 @@ void pagevec_strip(struct pagevec *pvec)
} }
} }
/**
* pagevec_lookup - gang pagecache lookup
* @pvec: Where the resulting pages are placed
* @mapping: The address_space to search
* @start: The starting page index
* @nr_pages: The maximum number of pages
*
* pagevec_lookup() will search for and return a group of up to @nr_pages pages
* in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a
* reference against the pages in @pvec.
*
* The search returns a group of mapping-contiguous pages with ascending
* indexes. There may be holes in the indices due to not-present pages.
*
* pagevec_lookup() returns the number of pages which were found.
*/
unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned int nr_pages)
{
pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
return pagevec_count(pvec);
}
/* /*
* Perform any setup for the swap system * Perform any setup for the swap system
*/ */
......
...@@ -105,7 +105,6 @@ void __delete_from_swap_cache(struct page *page) ...@@ -105,7 +105,6 @@ void __delete_from_swap_cache(struct page *page)
BUG_ON(!PageLocked(page)); BUG_ON(!PageLocked(page));
BUG_ON(!PageSwapCache(page)); BUG_ON(!PageSwapCache(page));
BUG_ON(PageWriteback(page)); BUG_ON(PageWriteback(page));
ClearPageDirty(page);
__remove_from_page_cache(page); __remove_from_page_cache(page);
INC_CACHE_INFO(del_total); INC_CACHE_INFO(del_total);
} }
...@@ -146,30 +145,30 @@ int add_to_swap(struct page * page) ...@@ -146,30 +145,30 @@ int add_to_swap(struct page * page)
pf_flags = current->flags; pf_flags = current->flags;
current->flags &= ~PF_MEMALLOC; current->flags &= ~PF_MEMALLOC;
current->flags |= PF_NOWARN; current->flags |= PF_NOWARN;
ClearPageUptodate(page); /* why? */
/* /*
* Add it to the swap cache and mark it dirty * Add it to the swap cache and mark it dirty
* (adding to the page cache will clear the dirty
* and uptodate bits, so we need to do it again)
*/ */
switch (add_to_swap_cache(page, entry)) { switch (add_to_page_cache(page, &swapper_space, entry.val)) {
case 0: /* Success */ case 0: /* Success */
current->flags = pf_flags; current->flags = pf_flags;
SetPageUptodate(page); SetPageUptodate(page);
ClearPageDirty(page);
set_page_dirty(page); set_page_dirty(page);
swap_free(entry); INC_CACHE_INFO(add_total);
return 1; return 1;
case -ENOMEM: /* radix-tree allocation */ case -EEXIST:
/* Raced with "speculative" read_swap_cache_async */
current->flags = pf_flags; current->flags = pf_flags;
INC_CACHE_INFO(exist_race);
swap_free(entry); swap_free(entry);
return 0; continue;
default: /* ENOENT: raced */ default:
break; /* -ENOMEM radix-tree allocation failure */
}
/* Raced with "speculative" read_swap_cache_async */
current->flags = pf_flags; current->flags = pf_flags;
swap_free(entry); swap_free(entry);
return 0;
}
} }
} }
...@@ -203,33 +202,13 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry) ...@@ -203,33 +202,13 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
void **pslot; void **pslot;
int err; int err;
if (!mapping)
BUG();
if (!swap_duplicate(entry)) {
INC_CACHE_INFO(noent_race);
return -ENOENT;
}
write_lock(&swapper_space.page_lock); write_lock(&swapper_space.page_lock);
write_lock(&mapping->page_lock); write_lock(&mapping->page_lock);
err = radix_tree_reserve(&swapper_space.page_tree, entry.val, &pslot); err = radix_tree_reserve(&swapper_space.page_tree, entry.val, &pslot);
if (!err) { if (!err) {
/* Remove it from the page cache */
__remove_from_page_cache(page); __remove_from_page_cache(page);
/* Add it to the swap cache */
*pslot = page; *pslot = page;
/*
* This code used to clear PG_uptodate, PG_error, PG_arch1,
* PG_referenced and PG_checked. What _should_ it clear?
*/
ClearPageUptodate(page);
ClearPageReferenced(page);
SetPageLocked(page);
ClearPageDirty(page);
___add_to_page_cache(page, &swapper_space, entry.val); ___add_to_page_cache(page, &swapper_space, entry.val);
} }
...@@ -237,21 +216,21 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry) ...@@ -237,21 +216,21 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
write_unlock(&swapper_space.page_lock); write_unlock(&swapper_space.page_lock);
if (!err) { if (!err) {
if (!swap_duplicate(entry))
BUG();
/* shift page from clean_pages to dirty_pages list */
BUG_ON(PageDirty(page));
set_page_dirty(page);
INC_CACHE_INFO(add_total); INC_CACHE_INFO(add_total);
return 0; } else if (err == -EEXIST)
}
swap_free(entry);
if (err == -EEXIST)
INC_CACHE_INFO(exist_race); INC_CACHE_INFO(exist_race);
return err; return err;
} }
int move_from_swap_cache(struct page *page, unsigned long index, int move_from_swap_cache(struct page *page, unsigned long index,
struct address_space *mapping) struct address_space *mapping)
{ {
swp_entry_t entry;
void **pslot; void **pslot;
int err; int err;
...@@ -259,44 +238,27 @@ int move_from_swap_cache(struct page *page, unsigned long index, ...@@ -259,44 +238,27 @@ int move_from_swap_cache(struct page *page, unsigned long index,
BUG_ON(PageWriteback(page)); BUG_ON(PageWriteback(page));
BUG_ON(page_has_buffers(page)); BUG_ON(page_has_buffers(page));
entry.val = page->index;
write_lock(&swapper_space.page_lock); write_lock(&swapper_space.page_lock);
write_lock(&mapping->page_lock); write_lock(&mapping->page_lock);
err = radix_tree_reserve(&mapping->page_tree, index, &pslot); err = radix_tree_reserve(&mapping->page_tree, index, &pslot);
if (!err) { if (!err) {
swp_entry_t entry;
entry.val = page->index;
__delete_from_swap_cache(page); __delete_from_swap_cache(page);
*pslot = page; *pslot = page;
/*
* This code used to clear PG_uptodate, PG_error, PG_referenced,
* PG_arch_1 and PG_checked. It's not really clear why.
*/
ClearPageUptodate(page);
ClearPageReferenced(page);
/*
* ___add_to_page_cache puts the page on ->clean_pages,
* but it's dirty. If it's on ->clean_pages, it will basically
* never get written out.
*/
SetPageDirty(page);
___add_to_page_cache(page, mapping, index); ___add_to_page_cache(page, mapping, index);
/* fix that up */ }
list_move(&page->list, &mapping->dirty_pages);
write_unlock(&mapping->page_lock); write_unlock(&mapping->page_lock);
write_unlock(&swapper_space.page_lock); write_unlock(&swapper_space.page_lock);
/* Do this outside ->page_lock */ if (!err) {
swap_free(entry); swap_free(entry);
return 0; /* shift page from clean_pages to dirty_pages list */
ClearPageDirty(page);
set_page_dirty(page);
} }
write_unlock(&mapping->page_lock);
write_unlock(&swapper_space.page_lock);
return err; return err;
} }
......
...@@ -531,6 +531,7 @@ static int try_to_unuse(unsigned int type) ...@@ -531,6 +531,7 @@ static int try_to_unuse(unsigned int type)
int i = 0; int i = 0;
int retval = 0; int retval = 0;
int reset_overflow = 0; int reset_overflow = 0;
int shmem;
/* /*
* When searching mms for an entry, a good strategy is to * When searching mms for an entry, a good strategy is to
...@@ -611,11 +612,12 @@ static int try_to_unuse(unsigned int type) ...@@ -611,11 +612,12 @@ static int try_to_unuse(unsigned int type)
* Whenever we reach init_mm, there's no address space * Whenever we reach init_mm, there's no address space
* to search, but use it as a reminder to search shmem. * to search, but use it as a reminder to search shmem.
*/ */
shmem = 0;
swcount = *swap_map; swcount = *swap_map;
if (swcount > 1) { if (swcount > 1) {
flush_page_to_ram(page); flush_page_to_ram(page);
if (start_mm == &init_mm) if (start_mm == &init_mm)
shmem_unuse(entry, page); shmem = shmem_unuse(entry, page);
else else
unuse_process(start_mm, entry, page); unuse_process(start_mm, entry, page);
} }
...@@ -632,7 +634,9 @@ static int try_to_unuse(unsigned int type) ...@@ -632,7 +634,9 @@ static int try_to_unuse(unsigned int type)
swcount = *swap_map; swcount = *swap_map;
if (mm == &init_mm) { if (mm == &init_mm) {
set_start_mm = 1; set_start_mm = 1;
shmem_unuse(entry, page); spin_unlock(&mmlist_lock);
shmem = shmem_unuse(entry, page);
spin_lock(&mmlist_lock);
} else } else
unuse_process(mm, entry, page); unuse_process(mm, entry, page);
if (set_start_mm && *swap_map < swcount) { if (set_start_mm && *swap_map < swcount) {
...@@ -681,15 +685,24 @@ static int try_to_unuse(unsigned int type) ...@@ -681,15 +685,24 @@ static int try_to_unuse(unsigned int type)
* read from disk into another page. Splitting into two * read from disk into another page. Splitting into two
* pages would be incorrect if swap supported "shared * pages would be incorrect if swap supported "shared
* private" pages, but they are handled by tmpfs files. * private" pages, but they are handled by tmpfs files.
* Note shmem_unuse already deleted its from swap cache. *
* Note shmem_unuse already deleted a swappage from
* the swap cache, unless the move to filepage failed:
* in which case it left swappage in cache, lowered its
* swap count to pass quickly through the loops above,
* and now we must reincrement count to try again later.
*/ */
if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
swap_writepage(page); swap_writepage(page);
lock_page(page); lock_page(page);
wait_on_page_writeback(page); wait_on_page_writeback(page);
} }
if (PageSwapCache(page)) if (PageSwapCache(page)) {
if (shmem)
swap_duplicate(entry);
else
delete_from_swap_cache(page); delete_from_swap_cache(page);
}
/* /*
* So we could skip searching mms once swap count went * So we could skip searching mms once swap count went
......
/*
* mm/truncate.c - code for taking down pages from address_spaces
*
* Copyright (C) 2002, Linus Torvalds
*
* 10Sep2002 akpm@zip.com.au
* Initial version.
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
block_invalidatepage */
static int do_invalidatepage(struct page *page, unsigned long offset)
{
int (*invalidatepage)(struct page *, unsigned long);
invalidatepage = page->mapping->a_ops->invalidatepage;
if (invalidatepage == NULL)
invalidatepage = block_invalidatepage;
return (*invalidatepage)(page, offset);
}
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
if (PagePrivate(page))
do_invalidatepage(page, partial);
}
/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes anonymous. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_nopage().
*/
static void truncate_complete_page(struct page *page)
{
if (PagePrivate(page))
do_invalidatepage(page, 0);
clear_page_dirty(page);
ClearPageUptodate(page);
remove_from_page_cache(page);
page_cache_release(page);
}
/**
* truncate_inode_pages - truncate *all* the pages from an offset
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
*
* Truncate the page cache at a set offset, removing the pages that are beyond
* that offset (and zeroing out partial pages).
*
* Truncate takes two passes - the first pass is nonblocking. It will not
* block on page locks and it will not block on writeback. The second pass
* will wait. This is to prevent as much IO as possible in the affected region.
* The first pass will remove most pages, so the search cost of the second pass
* is low.
*
* Called under (and serialised by) inode->i_sem.
*/
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
struct pagevec pvec;
pgoff_t next;
int i;
pagevec_init(&pvec);
next = start;
while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
next = page->index + 1;
if (TestSetPageLocked(page))
continue;
if (PageWriteback(page)) {
unlock_page(page);
continue;
}
truncate_complete_page(page);
unlock_page(page);
}
pagevec_release(&pvec);
cond_resched();
}
if (partial) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
wait_on_page_writeback(page);
truncate_partial_page(page, partial);
unlock_page(page);
page_cache_release(page);
}
}
next = start;
for ( ; ; ) {
if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
if (next == start)
break;
next = start;
continue;
}
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
lock_page(page);
wait_on_page_writeback(page);
next = page->index + 1;
truncate_complete_page(page);
unlock_page(page);
}
pagevec_release(&pvec);
}
if (lstart == 0 && mapping->nrpages)
printk("%s: I goofed!\n", __FUNCTION__);
}
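The two-pass structure documented above — a nonblocking sweep that removes whatever it can without waiting, followed by a blocking sweep for the stragglers — can be sketched in plain C. The struct and helpers below are hypothetical stand-ins, not kernel types:

```c
#include <assert.h>

/*
 * A toy page: "present" means still in the cache, "busy" stands in
 * for locked-or-under-writeback.
 */
struct vpage {
	int present;
	int busy;
};

/* Pass 1: nonblocking; skip busy pages. Returns pages removed. */
static int truncate_pass1(struct vpage *pages, int n)
{
	int i, removed = 0;

	for (i = 0; i < n; i++) {
		if (pages[i].present && !pages[i].busy) {
			pages[i].present = 0;
			removed++;
		}
	}
	return removed;
}

/* Pass 2: wait for each survivor, then remove it. Returns pages removed. */
static int truncate_pass2(struct vpage *pages, int n)
{
	int i, removed = 0;

	for (i = 0; i < n; i++) {
		if (pages[i].present) {
			pages[i].busy = 0;	/* stands in for wait_on_page_writeback() */
			pages[i].present = 0;
			removed++;
		}
	}
	return removed;
}
```

Since the first pass already removed every page that was not busy, the second, blocking pass only has to walk the (usually short) remainder, which is exactly the low-search-cost argument made in the comment above.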
/**
* invalidate_inode_pages - Invalidate all the unlocked pages of one inode
* @inode: the inode which pages we want to invalidate
*
* This function only removes the unlocked pages, if you want to
* remove all the pages of one inode, you must call truncate_inode_pages.
*
* invalidate_inode_pages() will not block on IO activity. It will not
* invalidate pages which are dirty, locked, under writeback or mapped into
* pagetables.
*/
void invalidate_inode_pages(struct address_space *mapping)
{
struct pagevec pvec;
pgoff_t next = 0;
int i;
pagevec_init(&pvec);
while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
if (TestSetPageLocked(page)) {
next++;
continue;
}
next = page->index + 1;
if (PageDirty(page) || PageWriteback(page))
goto unlock;
if (PagePrivate(page) && !try_to_release_page(page, 0))
goto unlock;
if (page_mapped(page))
goto unlock;
truncate_complete_page(page);
unlock:
unlock_page(page);
}
pagevec_release(&pvec);
cond_resched();
}
}
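The skip conditions in invalidate_inode_pages() amount to a single predicate: a page may be dropped only if it is not locked, not dirty, not under writeback, and not mapped into pagetables. A small sketch with made-up flag bits (not the kernel's page flags):

```c
#include <assert.h>

/* Hypothetical flag bits standing in for the page state tests above. */
enum {
	P_LOCKED    = 1 << 0,
	P_DIRTY     = 1 << 1,
	P_WRITEBACK = 1 << 2,
	P_MAPPED    = 1 << 3,
};

/* A page is invalidatable only when none of the blocking states hold. */
static int can_invalidate(unsigned int flags)
{
	return (flags & (P_LOCKED | P_DIRTY | P_WRITEBACK | P_MAPPED)) == 0;
}
```

Anything that fails the predicate is simply left in place, which is why truncate_inode_pages(), not this function, is the one that guarantees an empty mapping.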
/**
* invalidate_inode_pages2 - remove all unmapped pages from an address_space
 * @mapping: the address_space
*
* invalidate_inode_pages2() is like truncate_inode_pages(), except for the case
* where the page is seen to be mapped into process pagetables. In that case,
* the page is marked clean but is left attached to its address_space.
*
* FIXME: invalidate_inode_pages2() is probably trivially livelockable.
*/
void invalidate_inode_pages2(struct address_space *mapping)
{
struct pagevec pvec;
pgoff_t next = 0;
int i;
pagevec_init(&pvec);
	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
lock_page(page);
if (page->mapping) { /* truncate race? */
wait_on_page_writeback(page);
next = page->index + 1;
if (page_mapped(page))
clear_page_dirty(page);
else
truncate_complete_page(page);
}
unlock_page(page);
}
pagevec_release(&pvec);
cond_resched();
}
}
@@ -312,8 +312,10 @@ shrink_list(struct list_head *page_list, int nr_pages,
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
-	KERNEL_STAT_ADD(pgsteal, nr_pages_in - nr_pages);
-	KERNEL_STAT_ADD(pgactivate, pgactivate);
+	mod_page_state(pgsteal, nr_pages_in - nr_pages);
+	if (current->flags & PF_KSWAPD)
+		mod_page_state(kswapd_steal, nr_pages_in - nr_pages);
+	mod_page_state(pgactivate, pgactivate);
 	return nr_pages;
 }
@@ -380,7 +382,7 @@ shrink_cache(int nr_pages, struct zone *zone,
 		goto done;
 	max_scan -= nr_scan;
-	KERNEL_STAT_ADD(pgscan, nr_scan);
+	mod_page_state(pgscan, nr_scan);
 	nr_pages = shrink_list(&page_list, nr_pages,
 			gfp_mask, &max_scan, nr_mapped);
@@ -527,8 +529,8 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
 	spin_unlock_irq(&zone->lru_lock);
 	pagevec_release(&pvec);
-	KERNEL_STAT_ADD(pgscan, nr_pages_in - nr_pages);
-	KERNEL_STAT_ADD(pgdeactivate, pgdeactivate);
+	mod_page_state(pgrefill, nr_pages_in - nr_pages);
+	mod_page_state(pgdeactivate, pgdeactivate);
 }
 static /* inline */ int
@@ -641,7 +643,7 @@ try_to_free_pages(struct zone *classzone,
 	int priority = DEF_PRIORITY;
 	int nr_pages = SWAP_CLUSTER_MAX;
-	KERNEL_STAT_INC(pageoutrun);
+	inc_page_state(pageoutrun);
 	for (priority = DEF_PRIORITY; priority; priority--) {
 		int total_scanned = 0;
@@ -757,7 +759,7 @@ int kswapd(void *p)
 	 * us from recursively trying to free more memory as we're
 	 * trying to free the first piece of memory in the first place).
 	 */
-	tsk->flags |= PF_MEMALLOC;
+	tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
 	/*
 	 * Kswapd main loop.
...