Commit 2188a617 authored by Linus Torvalds

Merge home.transmeta.com:/home/torvalds/v2.5/akpm

into home.transmeta.com:/home/torvalds/v2.5/linux
parents 075ee978 c7ea169d
......@@ -963,13 +963,6 @@ Contains, as a percentage of total system memory, the number of pages at which
a process which is generating disk writes will itself start writing out dirty
data.
dirty_sync_ratio
----------------
Contains, as a percentage of total system memory, the number of pages at which
a process which is generating disk writes will itself start writing out dirty
data and waiting upon completion of that writeout.
dirty_writeback_centisecs
-------------------------
......
......@@ -21,13 +21,12 @@ Currently, these files are in /proc/sys/vm:
- dirty_async_ratio
- dirty_background_ratio
- dirty_expire_centisecs
- dirty_sync_ratio
- dirty_writeback_centisecs
==============================================================
dirty_async_ratio, dirty_background_ratio, dirty_expire_centisecs,
dirty_sync_ratio dirty_writeback_centisecs:
dirty_writeback_centisecs:
See Documentation/filesystems/proc.txt
......
......@@ -286,7 +286,6 @@ void __init paging_init(void)
for (nid = 0; nid < numnodes; nid++) {
unsigned long start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT;
unsigned long end_pfn = plat_node_bdata[nid].node_low_pfn;
unsigned long lmax_mapnr;
if (dma_local_pfn >= end_pfn - start_pfn)
zones_size[ZONE_DMA] = end_pfn - start_pfn;
......@@ -295,11 +294,6 @@ void __init paging_init(void)
zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn;
}
free_area_init_node(nid, NODE_DATA(nid), NULL, zones_size, start_pfn, NULL);
lmax_mapnr = PLAT_NODE_DATA_STARTNR(nid) + PLAT_NODE_DATA_SIZE(nid);
if (lmax_mapnr > max_mapnr) {
max_mapnr = lmax_mapnr;
DBGDCONT("Grow max_mapnr to %ld\n", max_mapnr);
}
}
/* Initialize the kernel's ZERO_PGE. */
......
......@@ -154,7 +154,7 @@ if [ "$CONFIG_MWINCHIP3D" = "y" ]; then
define_bool CONFIG_X86_OOSTORE y
fi
bool 'IA-32 Huge TLB Page Support (if available on processor)' CONFIG_HUGETLB_PAGE
bool 'Huge TLB Page Support' CONFIG_HUGETLB_PAGE
bool 'Symmetric multi-processing support' CONFIG_SMP
bool 'Preemptible Kernel' CONFIG_PREEMPT
......
......@@ -25,7 +25,7 @@ __asm__(".align 4\nvide: ret");
static void __init init_amd(struct cpuinfo_x86 *c)
{
u32 l, h;
int mbytes = max_mapnr >> (20-PAGE_SHIFT);
int mbytes = num_physpages >> (20-PAGE_SHIFT);
int r;
/*
......
......@@ -96,4 +96,7 @@ extern struct mtrr_ops * mtrr_if;
extern unsigned int num_var_ranges;
void finalize_mtrr_state(void);
void mtrr_state_warn(void);
extern char * mtrr_if_name[];
......@@ -58,7 +58,11 @@ EXPORT_SYMBOL(boot_cpu_data);
EXPORT_SYMBOL(EISA_bus);
#endif
EXPORT_SYMBOL(MCA_bus);
#ifdef CONFIG_MULTIQUAD
#ifdef CONFIG_DISCONTIGMEM
EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(pfn_to_nid);
#endif
#ifdef CONFIG_X86_NUMAQ
EXPORT_SYMBOL(xquad_portio);
#endif
EXPORT_SYMBOL(__verify_write);
......
......@@ -82,27 +82,19 @@ static void __init smp_dump_qct(void)
*/
int physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1};
#define MB_TO_ELEMENT(x) (x >> ELEMENT_REPRESENTS)
#define PA_TO_MB(pa) (pa >> 20) /* assumption: a physical address is in bytes */
#define PFN_TO_ELEMENT(pfn) (pfn / PAGES_PER_ELEMENT)
#define PA_TO_ELEMENT(pa) (PFN_TO_ELEMENT(pa >> PAGE_SHIFT))
int pa_to_nid(u64 pa)
int pfn_to_nid(unsigned long pfn)
{
int nid;
nid = physnode_map[MB_TO_ELEMENT(PA_TO_MB(pa))];
int nid = physnode_map[PFN_TO_ELEMENT(pfn)];
/* the physical address passed in is not in the map for the system */
if (nid == -1)
BUG();
BUG(); /* address is not present */
return nid;
}
int pfn_to_nid(unsigned long pfn)
{
return pa_to_nid(((u64)pfn) << PAGE_SHIFT);
}
/*
* for each node mark the regions
* TOPOFMEM = hi_shrd_mem_start + hi_shrd_mem_size
......@@ -132,7 +124,7 @@ static void __init initialize_physnode_map(void)
topofmem = eq->hi_shrd_mem_start + eq->hi_shrd_mem_size;
while (cur < topofmem) {
physnode_map[cur >> 8] = nid;
cur += (ELEMENT_REPRESENTS - 1);
cur ++;
}
}
}
......
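For context (this sketch is not part of the patch): the reworked lookup indexes physnode_map directly by pfn, where each array element covers one 256MB chunk, i.e. PAGES_PER_ELEMENT 4KB pages. A minimal stand-alone illustration of that arithmetic, with illustrative names:

#define SKETCH_MAX_ELEMENTS      256                /* 64GB / 256MB per element */
#define SKETCH_PAGES_PER_ELEMENT (16777216 / 256)   /* 65536 4KB pages == 256MB */

static int sketch_physnode_map[SKETCH_MAX_ELEMENTS];

static int sketch_pfn_to_nid(unsigned long pfn)
{
        /* one array lookup; the old code went pfn -> physical address -> MB first */
        return sketch_physnode_map[pfn / SKETCH_PAGES_PER_ELEMENT];
}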
......@@ -275,20 +275,9 @@ void __init set_highmem_pages_init(int bad_ppro)
void __init set_max_mapnr_init(void)
{
#ifdef CONFIG_HIGHMEM
unsigned long lmax_mapnr;
int nid;
highmem_start_page = mem_map + NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_start_mapnr;
highmem_start_page = NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_mem_map;
num_physpages = highend_pfn;
for (nid = 0; nid < numnodes; nid++) {
lmax_mapnr = node_startnr(nid) + node_size(nid);
if (lmax_mapnr > max_mapnr) {
max_mapnr = lmax_mapnr;
}
}
#else
max_mapnr = num_physpages = max_low_pfn;
num_physpages = max_low_pfn;
#endif
}
......@@ -319,7 +319,7 @@ set_new_inode(unsigned long len, int prot, int flag, int key)
}
if (i == MAX_ID)
return NULL;
inode = kmalloc(sizeof (struct inode), GFP_KERNEL);
inode = kmalloc(sizeof (struct inode), GFP_ATOMIC);
if (inode == NULL)
return NULL;
......@@ -502,7 +502,7 @@ set_hugetlb_mem_size(int count)
if (lcount > 0) { /* Increase the mem size. */
while (lcount--) {
page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER);
page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
if (page == NULL)
break;
map = page;
......
......@@ -440,8 +440,10 @@ void __init mem_init(void)
int tmp;
int bad_ppro;
#ifndef CONFIG_DISCONTIGMEM
if (!mem_map)
BUG();
#endif
bad_ppro = ppro_with_ram_bug();
......@@ -471,7 +473,7 @@ void __init mem_init(void)
printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
max_mapnr << (PAGE_SHIFT-10),
num_physpages << (PAGE_SHIFT-10),
codesize >> 10,
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
......@@ -504,7 +506,7 @@ void __init mem_init(void)
/*Will make this kernel command line. */
INIT_LIST_HEAD(&htlbpage_freelist);
for (i=0; i<htlbzone_pages; i++) {
page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER);
page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
if (page == NULL)
break;
map = page;
......
......@@ -22,26 +22,29 @@
void show_mem(void)
{
int pfn, total = 0, reserved = 0;
int total = 0, reserved = 0;
int shared = 0, cached = 0;
int highmem = 0;
struct page *page;
pg_data_t *pgdat;
unsigned long i;
printk("Mem-info:\n");
show_free_areas();
printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
pfn = max_mapnr;
while (pfn-- > 0) {
page = pfn_to_page(pfn);
total++;
if (PageHighMem(page))
highmem++;
if (PageReserved(page))
reserved++;
else if (PageSwapCache(page))
cached++;
else if (page_count(page))
shared += page_count(page) - 1;
for_each_pgdat(pgdat) {
for (i = 0; i < pgdat->node_size; ++i) {
page = pgdat->node_mem_map + i;
total++;
if (PageHighMem(page))
highmem++;
if (PageReserved(page))
reserved++;
else if (PageSwapCache(page))
cached++;
else if (page_count(page))
shared += page_count(page) - 1;
}
}
printk("%d pages of RAM\n", total);
printk("%d pages of HIGHMEM\n",highmem);
......
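A minimal sketch of the node-aware walk that replaces the flat 0..max_mapnr loop above, assuming for_each_pgdat(), node_size and node_mem_map behave as in the hunk (illustrative only):

#include <linux/mm.h>
#include <linux/mmzone.h>

static unsigned long count_reserved_pages_sketch(void)
{
        pg_data_t *pgdat;
        unsigned long i, reserved = 0;

        /* walk each node's own mem_map instead of one global array */
        for_each_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_size; i++)
                        if (PageReserved(pgdat->node_mem_map + i))
                                reserved++;
        }
        return reserved;
}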
......@@ -254,10 +254,6 @@ void __init paging_init(void)
zones_size[ZONE_DMA] = end_pfn + 1 - start_pfn;
free_area_init_node(node, NODE_DATA(node), 0, zones_size,
start_pfn, 0);
if ((PLAT_NODE_DATA_STARTNR(node) +
PLAT_NODE_DATA_SIZE(node)) > pagenr)
pagenr = PLAT_NODE_DATA_STARTNR(node) +
PLAT_NODE_DATA_SIZE(node);
}
}
......@@ -271,7 +267,6 @@ void __init mem_init(void)
unsigned long codesize, datasize, initsize;
int slot, numslots;
struct page *pg, *pslot;
pfn_t pgnr;
num_physpages = numpages; /* memory already sized by szmem */
max_mapnr = pagenr; /* already found during paging_init */
......@@ -293,7 +288,6 @@ void __init mem_init(void)
* We need to manually do the other slots.
*/
pg = NODE_DATA(nid)->node_mem_map + slot_getsize(nid, 0);
pgnr = PLAT_NODE_DATA_STARTNR(nid) + slot_getsize(nid, 0);
numslots = node_getlastslot(nid);
for (slot = 1; slot <= numslots; slot++) {
pslot = NODE_DATA(nid)->node_mem_map +
......@@ -304,7 +298,7 @@ void __init mem_init(void)
* free up the pages that hold the memmap entries.
*/
while (pg < pslot) {
pg++; pgnr++;
pg++;
}
/*
......@@ -312,8 +306,8 @@ void __init mem_init(void)
*/
pslot += slot_getsize(nid, slot);
while (pg < pslot) {
if (!page_is_ram(pgnr))
continue;
/* if (!page_is_ram(pgnr)) continue; */
/* commented out until page_is_ram works */
ClearPageReserved(pg);
atomic_set(&pg->count, 1);
__free_page(pg);
......
......@@ -1733,7 +1733,7 @@ void __init mem_init(void)
* Set up the zero page, mark it reserved, so that page count
* is not manipulated when freeing the page from user ptes.
*/
mem_map_zero = _alloc_pages(GFP_KERNEL, 0);
mem_map_zero = alloc_pages(GFP_KERNEL, 0);
if (mem_map_zero == NULL) {
prom_printf("paging_init: Cannot alloc zero page.\n");
prom_halt();
......
......@@ -36,7 +36,7 @@ static kmem_cache_t *request_cachep;
/*
* plug management
*/
static struct list_head blk_plug_list;
static LIST_HEAD(blk_plug_list);
static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
/* blk_dev_struct is:
......@@ -1875,27 +1875,16 @@ void end_that_request_last(struct request *req)
blk_put_request(req);
}
#define MB(kb) ((kb) << 10)
int __init blk_dev_init(void)
{
struct blk_dev_struct *dev;
int total_ram;
int total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
request_cachep = kmem_cache_create("blkdev_requests",
sizeof(struct request),
0, SLAB_HWCACHE_ALIGN, NULL, NULL);
sizeof(struct request), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!request_cachep)
panic("Can't create request pool slab cache\n");
for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
dev->queue = NULL;
memset(ro_bits,0,sizeof(ro_bits));
total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
/*
* Free request slots per queue.
* (Half for reads, half for writes)
......@@ -1911,17 +1900,12 @@ int __init blk_dev_init(void)
*/
if ((batch_requests = queue_nr_requests / 4) > 32)
batch_requests = 32;
printk("block: %d slots per queue, batch=%d\n", queue_nr_requests, batch_requests);
printk("block: %d slots per queue, batch=%d\n",
queue_nr_requests, batch_requests);
blk_max_low_pfn = max_low_pfn;
blk_max_pfn = max_pfn;
INIT_LIST_HEAD(&blk_plug_list);
#if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD)
hd_init();
#endif
return 0;
};
......
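Aside on the blk_plug_list change (a sketch, not the patch itself): LIST_HEAD() yields a list that is already valid at compile time, which is why the explicit INIT_LIST_HEAD() call disappears from blk_dev_init():

#include <linux/list.h>

static LIST_HEAD(example_static_list);          /* usable immediately, no init call */

static struct list_head example_runtime_list;   /* old style */

static void example_setup(void)
{
        INIT_LIST_HEAD(&example_runtime_list);  /* required before first use */
}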
......@@ -241,7 +241,7 @@ raw_read(struct file *filp, char *buf, size_t size, loff_t *offp)
static ssize_t
raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp)
{
struct iovec local_iov = { .iov_base = buf, .iov_len = size};
struct iovec local_iov = { .iov_base = (char *)buf, .iov_len = size};
return rw_raw_dev(WRITE, filp, &local_iov, 1, offp);
}
......
......@@ -846,7 +846,7 @@ static void __init hd_geninit(void)
}
}
int __init hd_init(void)
static int __init hd_init(void)
{
if (register_blkdev(MAJOR_NR,"hd",&hd_fops)) {
printk("hd: unable to get major %d for hard disk\n",MAJOR_NR);
......
......@@ -160,7 +160,7 @@ struct sa1100_pcmcia_socket {
*/
socket_state_t cs_state;
pccard_io_map io_map[MAX_IO_WIN];
pccard_mem_map mem_map[MAX_WIN];
pccard_mem_map pc_mem_map[MAX_WIN];
void (*handler)(void *, unsigned int);
void *handler_info;
......
......@@ -686,7 +686,7 @@ sa1100_pcmcia_get_mem_map(unsigned int sock, struct pccard_mem_map *map)
DEBUG(2, "%s() for sock %u\n", __FUNCTION__, sock);
if (map->map < MAX_WIN) {
*map = skt->mem_map[map->map];
*map = skt->pc_mem_map[map->map];
ret = 0;
}
......@@ -754,7 +754,7 @@ sa1100_pcmcia_set_mem_map(unsigned int sock, struct pccard_mem_map *map)
map->sys_stop += start;
map->sys_start = start;
skt->mem_map[map->map] = *map;
skt->pc_mem_map[map->map] = *map;
return 0;
} /* sa1100_pcmcia_set_mem_map() */
......
......@@ -2537,6 +2537,7 @@ struct scatterlist *scsi_alloc_sgtable(Scsi_Cmnd *SCpnt, int gfp_mask)
{
struct scsi_host_sg_pool *sgp;
struct scatterlist *sgl;
int pf_flags;
BUG_ON(!SCpnt->use_sg);
......@@ -2551,9 +2552,10 @@ struct scatterlist *scsi_alloc_sgtable(Scsi_Cmnd *SCpnt, int gfp_mask)
sgp = scsi_sg_pools + SCpnt->sglist_len;
pf_flags = current->flags;
current->flags |= PF_NOWARN;
sgl = mempool_alloc(sgp->pool, gfp_mask);
current->flags &= ~PF_NOWARN;
current->flags = pf_flags;
if (sgl) {
memset(sgl, 0, sgp->size);
return sgl;
......
......@@ -135,6 +135,7 @@ struct bio *bio_alloc(int gfp_mask, int nr_iovecs)
{
struct bio *bio;
struct bio_vec *bvl = NULL;
int pf_flags = current->flags;
current->flags |= PF_NOWARN;
bio = mempool_alloc(bio_pool, gfp_mask);
......@@ -151,7 +152,7 @@ struct bio *bio_alloc(int gfp_mask, int nr_iovecs)
mempool_free(bio, bio_pool);
bio = NULL;
out:
current->flags &= ~PF_NOWARN;
current->flags = pf_flags;
return bio;
}
......
......@@ -937,9 +937,11 @@ create_buffers(struct page * page, unsigned long size, int retry)
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
int pf_flags = current->flags;
current->flags |= PF_NOWARN;
bh = alloc_buffer_head();
current->flags &= ~PF_NOWARN;
current->flags = pf_flags;
if (!bh)
goto no_grow;
......
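The three hunks above (scsi_merge.c, fs/bio.c, fs/buffer.c) all switch to the same flag save/restore idiom; a minimal sketch of it, assuming PF_NOWARN and a mempool as used in the patch:

#include <linux/mempool.h>
#include <linux/sched.h>

static void *quiet_mempool_alloc(mempool_t *pool, int gfp_mask)
{
        int pf_flags = current->flags;  /* remember the caller's flag word */
        void *p;

        current->flags |= PF_NOWARN;    /* suppress allocation-failure warnings */
        p = mempool_alloc(pool, gfp_mask);
        current->flags = pf_flags;      /* restore, rather than blindly clearing PF_NOWARN */
        return p;
}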
......@@ -627,13 +627,13 @@ ext2_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
}
static int
ext2_writepages(struct address_space *mapping, int *nr_to_write)
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
int err;
ret = write_mapping_buffers(mapping);
err = mpage_writepages(mapping, nr_to_write, ext2_get_block);
err = mpage_writepages(mapping, wbc, ext2_get_block);
if (!ret)
ret = err;
return ret;
......
......@@ -1475,13 +1475,13 @@ struct address_space_operations ext3_aops = {
/* For writeback mode, we can use mpage_writepages() */
static int
ext3_writepages(struct address_space *mapping, int *nr_to_write)
ext3_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
int err;
ret = write_mapping_buffers(mapping);
err = mpage_writepages(mapping, nr_to_write, ext3_get_block);
err = mpage_writepages(mapping, wbc, ext3_get_block);
if (!ret)
ret = err;
return ret;
......
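For reference, a minimal sketch of a filesystem ->writepages() hook under the new interface; foo_get_block is a hypothetical block mapper, and ext2/ext3 above additionally flush their mapping buffers first:

#include <linux/fs.h>
#include <linux/mpage.h>
#include <linux/writeback.h>

/* hypothetical block mapper supplied by the filesystem */
extern int foo_get_block(struct inode *inode, sector_t iblock,
                         struct buffer_head *bh_result, int create);

static int foo_writepages(struct address_space *mapping,
                          struct writeback_control *wbc)
{
        /* mpage_writepages() decrements wbc->nr_to_write as pages go out */
        return mpage_writepages(mapping, wbc, foo_get_block);
}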
......@@ -111,8 +111,7 @@ static void write_inode(struct inode *inode, int sync)
/*
* Write a single inode's dirty pages and inode data out to disk.
* If `sync' is set, wait on the writeout.
* If `nr_to_write' is not NULL, subtract the number of written pages
* from *nr_to_write.
* Subtract the number of written pages from nr_to_write.
*
* Normally it is not legal for a single process to lock more than one
* page at a time, due to ab/ba deadlock problems. But writepages()
......@@ -127,7 +126,9 @@ static void write_inode(struct inode *inode, int sync)
*
* Called under inode_lock.
*/
static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
static void
__sync_single_inode(struct inode *inode, int wait,
struct writeback_control *wbc)
{
unsigned dirty;
unsigned long orig_dirtied_when;
......@@ -144,7 +145,7 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
mapping->dirtied_when = 0; /* assume it's whole-file writeback */
spin_unlock(&inode_lock);
do_writepages(mapping, nr_to_write);
do_writepages(mapping, wbc);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
......@@ -181,7 +182,8 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
* Write out an inode's dirty pages. Called under inode_lock.
*/
static void
__writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
__writeback_single_inode(struct inode *inode, int sync,
struct writeback_control *wbc)
{
if (current_is_pdflush() && (inode->i_state & I_LOCK))
return;
......@@ -193,7 +195,7 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
iput(inode);
spin_lock(&inode_lock);
}
__sync_single_inode(inode, sync, nr_to_write);
__sync_single_inode(inode, sync, wbc);
}
/*
......@@ -226,8 +228,7 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
* throttled threads: we don't want them all piling up on __wait_on_inode.
*/
static void
sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
int sync_mode, int *nr_to_write, unsigned long *older_than_this)
sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
{
struct list_head *tmp;
struct list_head *head;
......@@ -241,7 +242,7 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
struct backing_dev_info *bdi;
int really_sync;
if (single_bdi && mapping->backing_dev_info != single_bdi) {
if (wbc->bdi && mapping->backing_dev_info != wbc->bdi) {
if (sb != blockdev_superblock)
break; /* inappropriate superblock */
list_move(&inode->i_list, &sb->s_dirty);
......@@ -252,23 +253,20 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
if (time_after(mapping->dirtied_when, start))
break;
if (older_than_this &&
time_after(mapping->dirtied_when, *older_than_this))
if (wbc->older_than_this && time_after(mapping->dirtied_when,
*wbc->older_than_this))
goto out;
bdi = mapping->backing_dev_info;
if (current_is_pdflush() && !writeback_acquire(bdi))
break;
really_sync = (sync_mode == WB_SYNC_ALL);
if ((sync_mode == WB_SYNC_LAST) && (head->prev == head))
really_sync = 1;
really_sync = (wbc->sync_mode == WB_SYNC_ALL);
BUG_ON(inode->i_state & I_FREEING);
__iget(inode);
list_move(&inode->i_list, &sb->s_dirty);
__writeback_single_inode(inode, really_sync, nr_to_write);
if (sync_mode == WB_SYNC_HOLD) {
__writeback_single_inode(inode, really_sync, wbc);
if (wbc->sync_mode == WB_SYNC_HOLD) {
mapping->dirtied_when = jiffies;
list_move(&inode->i_list, &sb->s_dirty);
}
......@@ -277,7 +275,7 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
spin_unlock(&inode_lock);
iput(inode);
spin_lock(&inode_lock);
if (nr_to_write && *nr_to_write <= 0)
if (wbc->nr_to_write <= 0)
break;
}
out:
......@@ -288,16 +286,26 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
}
/*
* Start writeback of dirty pagecache data against all unlocked inodes.
*
* Note:
* We don't need to grab a reference to superblock here. If it has non-empty
* ->s_dirty it hasn't been killed yet and kill_super() won't proceed
* past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are
* empty. Since __sync_single_inode() regains inode_lock before it finally moves
* inode from superblock lists we are OK.
*
* If `older_than_this' is non-zero then only flush inodes which have a
* flushtime older than *older_than_this.
*
* If `bdi' is non-zero then we will scan the first inode against each
* superblock until we find the matching ones. One group will be the dirty
* inodes against a filesystem. Then when we hit the dummy blockdev superblock,
* sync_sb_inodes will seek out the blockdev which matches `bdi'. Maybe not
* super-efficient but we're about to do a ton of I/O...
*/
static void
__writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this)
void
writeback_inodes(struct writeback_control *wbc)
{
struct super_block *sb;
......@@ -307,54 +315,16 @@ __writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write,
for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) {
spin_unlock(&sb_lock);
sync_sb_inodes(bdi, sb, sync_mode, nr_to_write,
older_than_this);
sync_sb_inodes(sb, wbc);
spin_lock(&sb_lock);
}
if (nr_to_write && *nr_to_write <= 0)
if (wbc->nr_to_write <= 0)
break;
}
spin_unlock(&sb_lock);
spin_unlock(&inode_lock);
}
/*
* Start writeback of dirty pagecache data against all unlocked inodes.
*
* Note:
* We don't need to grab a reference to superblock here. If it has non-empty
* ->s_dirty it hasn't been killed yet and kill_super() won't proceed
* past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are
* empty. Since __sync_single_inode() regains inode_lock before it finally moves
* inode from superblock lists we are OK.
*
* If `older_than_this' is non-zero then only flush inodes which have a
* flushtime older than *older_than_this.
*
* This is a "memory cleansing" operation, not a "data integrity" operation.
*/
void writeback_unlocked_inodes(int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this)
{
__writeback_unlocked_inodes(NULL, nr_to_write,
sync_mode, older_than_this);
}
/*
* Perform writeback of dirty data against a particular queue.
*
* This is for writer throttling. We don't want processes to write back
* other processes' data, especially when the other data belongs to a
* different spindle.
*/
void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this)
{
__writeback_unlocked_inodes(bdi, nr_to_write,
sync_mode, older_than_this);
}
/*
* writeback and wait upon the filesystem's dirty inodes. The caller will
* do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is
......@@ -366,14 +336,17 @@ void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write,
void sync_inodes_sb(struct super_block *sb, int wait)
{
struct page_state ps;
int nr_to_write;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
.older_than_this = NULL,
.nr_to_write = 0,
};
get_page_state(&ps);
nr_to_write = ps.nr_dirty + ps.nr_dirty / 4;
wbc.nr_to_write = ps.nr_dirty + ps.nr_dirty / 4;
spin_lock(&inode_lock);
sync_sb_inodes(NULL, sb, wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
&nr_to_write, NULL);
sync_sb_inodes(sb, &wbc);
spin_unlock(&inode_lock);
}
......@@ -466,8 +439,12 @@ void sync_inodes(int wait)
void write_inode_now(struct inode *inode, int sync)
{
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
};
spin_lock(&inode_lock);
__writeback_single_inode(inode, sync, NULL);
__writeback_single_inode(inode, sync, &wbc);
spin_unlock(&inode_lock);
if (sync)
wait_on_inode(inode);
......
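To summarise the API change in this file (a sketch under the same assumptions as the patch, not kernel code as such): the old writeback_unlocked_inodes()/writeback_backing_dev() pair collapses into a single writeback_inodes() call, with the policy carried in the control structure:

#include <linux/writeback.h>

static void flush_some_pages(struct backing_dev_info *bdi, long nr)
{
        struct writeback_control wbc = {
                .bdi             = bdi,          /* NULL means "any queue" */
                .sync_mode       = WB_SYNC_NONE, /* memory cleansing, don't wait */
                .older_than_this = NULL,
                .nr_to_write     = nr,           /* decremented as pages are written */
        };

        writeback_inodes(&wbc);
}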
......@@ -282,9 +282,10 @@ static int jfs_writepage(struct page *page)
return block_write_full_page(page, jfs_get_block);
}
static int jfs_writepages(struct address_space *mapping, int *nr_to_write)
static int jfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
return mpage_writepages(mapping, nr_to_write, jfs_get_block);
return mpage_writepages(mapping, wbc, jfs_get_block);
}
static int jfs_readpage(struct file *file, struct page *page)
......
......@@ -484,7 +484,7 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
* address space and writepage() all of them.
*
* @mapping: address space structure to write
* @nr_to_write: subtract the number of written pages from *@nr_to_write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
* @get_block: the filesystem's block mapper function.
* If this is NULL then use a_ops->writepage. Otherwise, go
* direct-to-BIO.
......@@ -520,7 +520,7 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
*/
int
mpage_writepages(struct address_space *mapping,
int *nr_to_write, get_block_t get_block)
struct writeback_control *wbc, get_block_t get_block)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
......@@ -583,7 +583,7 @@ mpage_writepages(struct address_space *mapping,
__set_page_dirty_nobuffers(page);
ret = 0;
}
if (ret || (nr_to_write && --(*nr_to_write) <= 0))
if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
} else {
unlock_page(page);
......
......@@ -394,131 +394,40 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
return res;
}
static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, int * pages, int * shared, int * dirty, int * total)
int proc_pid_statm(task_t *task, char *buffer)
{
unsigned long end, pmd_end;
pte_t *pte;
if (pmd_none(*pmd))
return;
if (pmd_bad(*pmd)) {
pmd_ERROR(*pmd);
pmd_clear(pmd);
return;
}
preempt_disable();
pte = pte_offset_map(pmd, address);
end = address + size;
pmd_end = (address + PMD_SIZE) & PMD_MASK;
if (end > pmd_end)
end = pmd_end;
do {
pte_t page = *pte;
struct page *ptpage;
unsigned long pfn;
int size, resident, shared, text, lib, data, dirty;
struct mm_struct *mm = get_task_mm(task);
struct vm_area_struct * vma;
address += PAGE_SIZE;
pte++;
if (pte_none(page))
continue;
++*total;
if (!pte_present(page))
continue;
pfn = pte_pfn(page);
if (!pfn_valid(pfn))
continue;
ptpage = pfn_to_page(pfn);
if (PageReserved(ptpage))
continue;
++*pages;
if (pte_dirty(page))
++*dirty;
if (page_count(pte_page(page)) > 1)
++*shared;
} while (address < end);
pte_unmap(pte - 1);
preempt_enable();
}
size = resident = shared = text = lib = data = dirty = 0;
static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size,
int * pages, int * shared, int * dirty, int * total)
{
pmd_t * pmd;
unsigned long end;
if (pgd_none(*pgd))
return;
if (pgd_bad(*pgd)) {
pgd_ERROR(*pgd);
pgd_clear(pgd);
return;
}
pmd = pmd_offset(pgd, address);
address &= ~PGDIR_MASK;
end = address + size;
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
do {
statm_pte_range(pmd, address, end - address, pages, shared, dirty, total);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
}
static void statm_pgd_range(pgd_t * pgd, unsigned long address, unsigned long end,
int * pages, int * shared, int * dirty, int * total)
{
while (address < end) {
statm_pmd_range(pgd, address, end - address, pages, shared, dirty, total);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
pgd++;
}
}
if (!mm)
goto out;
int proc_pid_statm(struct task_struct *task, char * buffer)
{
int size=0, resident=0, share=0, trs=0, lrs=0, drs=0, dt=0;
struct mm_struct *mm = get_task_mm(task);
down_read(&mm->mmap_sem);
resident = mm->rss;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
if (mm) {
struct vm_area_struct * vma;
down_read(&mm->mmap_sem);
vma = mm->mmap;
while (vma) {
pgd_t *pgd = pgd_offset(mm, vma->vm_start);
int pages = 0, shared = 0, dirty = 0, total = 0;
if (is_vm_hugetlb_page(vma)) {
int num_pages = ((vma->vm_end - vma->vm_start)/PAGE_SIZE);
resident += num_pages;
if (!(vma->vm_flags & VM_DONTCOPY))
share += num_pages;
if (vma->vm_flags & VM_WRITE)
dt += num_pages;
drs += num_pages;
vma = vma->vm_next;
continue;
}
statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
resident += pages;
share += shared;
dt += dirty;
size += total;
if (vma->vm_flags & VM_EXECUTABLE)
trs += pages; /* text */
else if (vma->vm_flags & VM_GROWSDOWN)
drs += pages; /* stack */
else if (vma->vm_end > 0x60000000)
lrs += pages; /* library */
else
drs += pages;
vma = vma->vm_next;
size += pages;
if (is_vm_hugetlb_page(vma)) {
if (!(vma->vm_flags & VM_DONTCOPY))
shared += pages;
continue;
}
up_read(&mm->mmap_sem);
mmput(mm);
if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared))
shared += pages;
if (vma->vm_flags & VM_EXECUTABLE)
text += pages;
else
data += pages;
}
up_read(&mm->mmap_sem);
mmput(mm);
out:
return sprintf(buffer,"%d %d %d %d %d %d %d\n",
size, resident, share, trs, lrs, drs, dt);
size, resident, shared, text, lib, data, dirty);
}
/*
......
......@@ -36,18 +36,14 @@ extern plat_pg_data_t *plat_node_data[];
#ifdef CONFIG_ALPHA_WILDFIRE
# define ALPHA_PA_TO_NID(pa) ((pa) >> 36) /* 16 nodes max due 43bit kseg */
#define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */
#define MAX_NUMNODES WILDFIRE_MAX_QBB
# define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */
#else
# define ALPHA_PA_TO_NID(pa) (0)
#define NODE_MAX_MEM_SIZE (~0UL)
#define MAX_NUMNODES 1
# define NODE_MAX_MEM_SIZE (~0UL)
#endif
#define PHYSADDR_TO_NID(pa) ALPHA_PA_TO_NID(pa)
#define PLAT_NODE_DATA(n) (plat_node_data[(n)])
#define PLAT_NODE_DATA_STARTNR(n) \
(PLAT_NODE_DATA(n)->gendata.node_start_mapnr)
#define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size)
#if 1
......
#ifndef _ASM_MAX_NUMNODES_H
#define _ASM_MAX_NUMNODES_H
/*
* Currently the Wildfire is the only discontigmem/NUMA capable Alpha core.
*/
#if defined(CONFIG_ALPHA_WILDFIRE) || defined(CONFIG_ALPHA_GENERIC)
# include <asm/core_wildfire.h>
# define MAX_NUMNODES WILDFIRE_MAX_QBB
#endif
#endif /* _ASM_MAX_NUMNODES_H */
......@@ -6,12 +6,13 @@
#ifndef _ASM_MMZONE_H_
#define _ASM_MMZONE_H_
#include <asm/smp.h>
#ifdef CONFIG_DISCONTIGMEM
#ifdef CONFIG_X86_NUMAQ
#include <asm/numaq.h>
#else
#define pa_to_nid(pa) (0)
#define pfn_to_nid(pfn) (0)
#ifdef CONFIG_NUMA
#define _cpu_to_node(cpu) 0
......@@ -44,7 +45,6 @@ extern struct pglist_data *node_data[];
#define alloc_bootmem_low_pages_node(ignore, x) \
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
#define node_startnr(nid) (node_data[nid]->node_start_mapnr)
#define node_size(nid) (node_data[nid]->node_size)
#define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn)
......@@ -55,7 +55,7 @@ extern struct pglist_data *node_data[];
/*
* Given a kernel address, find the home node of the underlying memory.
*/
#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr))
#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
/*
* Return a pointer to the node data for node n.
......@@ -64,6 +64,8 @@ extern struct pglist_data *node_data[];
#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map)
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
NODE_DATA(nid)->node_size)
#define local_mapnr(kvaddr) \
( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) )
......@@ -74,5 +76,13 @@ extern struct pglist_data *node_data[];
#define pfn_to_page(pfn) (node_mem_map(pfn_to_nid(pfn)) + node_localnr(pfn, pfn_to_nid(pfn)))
#define page_to_pfn(page) ((page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn)
#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
/*
* pfn_valid should be made as fast as possible, and the current definition
* is valid for machines that are NUMA, but still contiguous, which is what
* is currently supported. A more generalised, but slower definition would
* be something like this - mbligh:
* ( pfn_to_pgdat(pfn) && (pfn < node_end_pfn(pfn_to_nid(pfn))) )
*/
#define pfn_valid(pfn) (pfn < num_physpages)
#endif /* CONFIG_DISCONTIGMEM */
#endif /* _ASM_MMZONE_H_ */
......@@ -32,17 +32,18 @@
/*
* for now assume that 64Gb is max amount of RAM for whole system
* 64Gb * 1024Mb/Gb = 65536 Mb
* 65536 Mb / 256Mb = 256
* 64Gb / 4096bytes/page = 16777216 pages
*/
#define MAX_NR_PAGES 16777216
#define MAX_ELEMENTS 256
#define ELEMENT_REPRESENTS 8 /* 256 Mb */
#define PAGES_PER_ELEMENT (16777216/256)
#define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn))
#define PHYSADDR_TO_NID(pa) pfn_to_nid(pa >> PAGE_SHIFT)
#define MAX_NUMNODES 8
#ifdef CONFIG_NUMA
#define _cpu_to_node(cpu) (cpu_to_logical_apicid(cpu) >> 4)
#endif /* CONFIG_NUMA */
extern int pa_to_nid(u64);
extern int pfn_to_nid(unsigned long);
extern void get_memcfg_numaq(void);
#define get_memcfg_numa() get_memcfg_numaq()
......
......@@ -145,10 +145,10 @@ static __inline__ int get_order(unsigned long size)
#ifndef CONFIG_DISCONTIGMEM
#define pfn_to_page(pfn) (mem_map + (pfn))
#define page_to_pfn(page) ((unsigned long)((page) - mem_map))
#define pfn_valid(pfn) ((pfn) < max_mapnr)
#endif /* !CONFIG_DISCONTIGMEM */
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#define pfn_valid(pfn) ((pfn) < max_mapnr)
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
......
......@@ -24,7 +24,6 @@ extern plat_pg_data_t *plat_node_data[];
#define PHYSADDR_TO_NID(pa) NASID_TO_COMPACT_NODEID(NASID_GET(pa))
#define PLAT_NODE_DATA(n) (plat_node_data[n])
#define PLAT_NODE_DATA_STARTNR(n) (PLAT_NODE_DATA(n)->gendata.node_start_mapnr)
#define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size)
#define PLAT_NODE_DATA_LOCALNR(p, n) \
(((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn)
......
......@@ -373,10 +373,10 @@ extern inline void pgd_clear(pgd_t *pgdp)
#ifndef CONFIG_DISCONTIGMEM
#define pte_page(x) (mem_map+(unsigned long)((pte_val(x) >> PAGE_SHIFT)))
#else
#define mips64_pte_pagenr(x) \
(PLAT_NODE_DATA_STARTNR(PHYSADDR_TO_NID(pte_val(x))) + \
PLAT_NODE_DATA_LOCALNR(pte_val(x), PHYSADDR_TO_NID(pte_val(x))))
#define pte_page(x) (mem_map+mips64_pte_pagenr(x))
#define pte_page(x) ( NODE_MEM_MAP(PHYSADDR_TO_NID(pte_val(x))) +
PLAT_NODE_DATA_LOCALNR(pte_val(x), PHYSADDR_TO_NID(pte_val(x))) )
#endif
/*
......
......@@ -279,6 +279,7 @@ struct iattr {
*/
struct page;
struct address_space;
struct writeback_control;
struct address_space_operations {
int (*writepage)(struct page *);
......@@ -286,10 +287,10 @@ struct address_space_operations {
int (*sync_page)(struct page *);
/* Write back some dirty pages from this mapping. */
int (*writepages)(struct address_space *, int *nr_to_write);
int (*writepages)(struct address_space *, struct writeback_control *);
/* Perform a writeback as a memory-freeing operation. */
int (*vm_writeback)(struct page *, int *nr_to_write);
int (*vm_writeback)(struct page *, struct writeback_control *);
/* Set a page dirty */
int (*set_page_dirty)(struct page *page);
......@@ -1259,7 +1260,8 @@ extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int generic_vm_writeback(struct page *page, int *nr_to_write);
extern int generic_vm_writeback(struct page *page,
struct writeback_control *wbc);
extern struct file_operations generic_ro_fops;
......
......@@ -39,18 +39,25 @@
* can allocate highmem pages, the *get*page*() variants return
* virtual kernel addresses to the allocated page(s).
*/
extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order));
extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, struct zonelist *zonelist));
extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order);
/*
* We get the zone list from the current node and the gfp_mask.
* This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
*
* For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
* optimized to &contig_page_data at compile-time.
*/
static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order)
{
/*
* Gets optimized away by the compiler.
*/
if (order >= MAX_ORDER)
pg_data_t *pgdat = NODE_DATA(numa_node_id());
unsigned int idx = (gfp_mask & GFP_ZONEMASK);
if (unlikely(order >= MAX_ORDER))
return NULL;
return _alloc_pages(gfp_mask, order);
return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + idx);
}
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
......
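A trivial usage sketch of the inlined allocator (illustrative only): the zonelist selection described in the comment above happens entirely inside alloc_pages():

#include <linux/mm.h>

static struct page *grab_one_page(void)
{
        /* order-0 allocation from the current node's GFP_KERNEL zonelist */
        struct page *page = alloc_pages(GFP_KERNEL, 0);

        return page;    /* caller releases it with __free_pages(page, 0) */
}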
......@@ -15,7 +15,10 @@
#include <linux/rbtree.h>
#include <linux/fs.h>
#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;
#endif
extern unsigned long num_physpages;
extern void * high_memory;
extern int page_cluster;
......@@ -345,8 +348,10 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_MINOR 1
#define VM_FAULT_MAJOR 2
/* The array of struct pages */
#ifndef CONFIG_DISCONTIGMEM
/* The array of struct pages - for discontigmem use pgdat->lmem_map */
extern struct page *mem_map;
#endif
extern void show_free_areas(void);
......
......@@ -10,11 +10,14 @@
#include <linux/wait.h>
#include <linux/cache.h>
#include <asm/atomic.h>
#ifdef CONFIG_DISCONTIGMEM
#include <asm/numnodes.h>
#endif
#ifndef MAX_NUMNODES
#define MAX_NUMNODES 1
#endif
/*
* Free memory management - zoned buddy allocator.
*/
/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
......@@ -112,7 +115,6 @@ struct zone {
struct page *zone_mem_map;
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
unsigned long zone_start_mapnr;
/*
* rarely used fields:
......@@ -138,7 +140,7 @@ struct zone {
* footprint of this construct is very small.
*/
struct zonelist {
struct zone *zones[MAX_NR_ZONES+1]; // NULL delimited
struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
};
#define GFP_ZONEMASK 0x0f
......@@ -163,7 +165,6 @@ typedef struct pglist_data {
unsigned long *valid_addr_bitmap;
struct bootmem_data *bdata;
unsigned long node_start_pfn;
unsigned long node_start_mapnr;
unsigned long node_size;
int node_id;
struct pglist_data *pgdat_next;
......@@ -187,10 +188,12 @@ memclass(struct zone *pgzone, struct zone *classzone)
* prototypes for the discontig memory code.
*/
struct page;
void free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
unsigned long *zones_size, unsigned long paddr, unsigned long *zholes_size,
struct page *pmap);
extern void calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size,
unsigned long *zholes_size);
extern void free_area_init_core(pg_data_t *pgdat, unsigned long *zones_size,
unsigned long *zholes_size);
void get_zone_counts(unsigned long *active, unsigned long *inactive);
extern void build_all_zonelists(void);
extern pg_data_t contig_page_data;
......
......@@ -10,14 +10,16 @@
* nested includes. Get it right in the .c file).
*/
struct writeback_control;
int mpage_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages, get_block_t get_block);
int mpage_readpage(struct page *page, get_block_t get_block);
int mpage_writepages(struct address_space *mapping,
int *nr_to_write, get_block_t get_block);
struct writeback_control *wbc, get_block_t get_block);
static inline int
generic_writepages(struct address_space *mapping, int *nr_to_write)
generic_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
return mpage_writepages(mapping, nr_to_write, NULL);
return mpage_writepages(mapping, wbc, NULL);
}
......@@ -128,7 +128,6 @@ enum
KERN_TAINTED=53, /* int: various kernel tainted flags */
KERN_CADPID=54, /* int: PID of the process to notify on CAD */
KERN_PIDMAX=55, /* int: PID # limit */
KERN_HUGETLB_PAGE_NUM=56, /* int: Number of available Huge Pages */
};
......@@ -147,12 +146,12 @@ enum
VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */
VM_DIRTY_BACKGROUND=11, /* dirty_background_ratio */
VM_DIRTY_ASYNC=12, /* dirty_async_ratio */
VM_DIRTY_SYNC=13, /* dirty_sync_ratio */
VM_DIRTY_WB_CS=14, /* dirty_writeback_centisecs */
VM_DIRTY_EXPIRE_CS=15, /* dirty_expire_centisecs */
VM_NR_PDFLUSH_THREADS=16, /* nr_pdflush_threads */
VM_OVERCOMMIT_RATIO=17, /* percent of RAM to allow overcommit in */
VM_PAGEBUF=18 /* struct: Control pagebuf parameters */
VM_DIRTY_WB_CS=13, /* dirty_writeback_centisecs */
VM_DIRTY_EXPIRE_CS=14, /* dirty_expire_centisecs */
VM_NR_PDFLUSH_THREADS=15, /* nr_pdflush_threads */
VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */
VM_PAGEBUF=17, /* struct: Control pagebuf parameters */
VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */
};
......
......@@ -35,7 +35,11 @@ struct iovec
#endif
/*
* Total number of bytes covered by an iovec
* Total number of bytes covered by an iovec.
*
* NOTE that it is not safe to use this function until all the iovec's
* segment lengths have been validated, because the individual lengths can
* overflow a size_t when added together.
*/
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
......
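A sketch of the overflow-aware validation the note above asks for, mirroring the checks this patch adds to mm/filemap.c (names here are illustrative):

#include <linux/errno.h>
#include <linux/uio.h>

static ssize_t iov_total_checked(const struct iovec *iov, unsigned long nr_segs)
{
        size_t total = 0;
        unsigned long seg;

        for (seg = 0; seg < nr_segs; seg++) {
                total += iov[seg].iov_len;
                /* fail if a single length or the running sum wraps negative */
                if ((ssize_t)(total | iov[seg].iov_len) < 0)
                        return -EINVAL;
        }
        return total;
}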
......@@ -27,22 +27,29 @@ static inline int current_is_pdflush(void)
* fs/fs-writeback.c
*/
enum writeback_sync_modes {
WB_SYNC_NONE = 0, /* Don't wait on anything */
WB_SYNC_LAST = 1, /* Wait on the last-written mapping */
WB_SYNC_ALL = 2, /* Wait on every mapping */
WB_SYNC_HOLD = 3, /* Hold the inode on sb_dirty for sys_sync() */
WB_SYNC_NONE, /* Don't wait on anything */
WB_SYNC_ALL, /* Wait on every mapping */
WB_SYNC_HOLD, /* Hold the inode on sb_dirty for sys_sync() */
};
void writeback_unlocked_inodes(int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this);
/*
* A control structure which tells the writeback code what to do
*/
struct writeback_control {
struct backing_dev_info *bdi; /* If !NULL, only write back this
queue */
enum writeback_sync_modes sync_mode;
unsigned long *older_than_this; /* If !NULL, only write back inodes
older than this */
long nr_to_write; /* Write this many pages, and decrement
this for each page written */
};
void writeback_inodes(struct writeback_control *wbc);
void wake_up_inode(struct inode *inode);
void __wait_on_inode(struct inode * inode);
void sync_inodes_sb(struct super_block *, int wait);
void sync_inodes(int wait);
void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this);
/* writeback.h requires fs.h; it, too, is not included from here. */
static inline void wait_on_inode(struct inode *inode)
......@@ -57,7 +64,6 @@ static inline void wait_on_inode(struct inode *inode)
/* These 5 are exported to sysctl. */
extern int dirty_background_ratio;
extern int dirty_async_ratio;
extern int dirty_sync_ratio;
extern int dirty_writeback_centisecs;
extern int dirty_expire_centisecs;
......@@ -65,7 +71,7 @@ extern int dirty_expire_centisecs;
void balance_dirty_pages(struct address_space *mapping);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
int do_writepages(struct address_space *mapping, int *nr_to_write);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
/* pdflush.c */
extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
......
......@@ -393,6 +393,7 @@ asmlinkage void __init start_kernel(void)
printk(linux_banner);
setup_arch(&command_line);
setup_per_cpu_areas();
build_all_zonelists();
printk("Kernel command line: %s\n", saved_command_line);
parse_options(command_line);
trap_init();
......
......@@ -91,7 +91,6 @@ EXPORT_SYMBOL(do_brk);
EXPORT_SYMBOL(exit_mm);
/* internal kernel memory management */
EXPORT_SYMBOL(_alloc_pages);
EXPORT_SYMBOL(__alloc_pages);
EXPORT_SYMBOL(alloc_pages_node);
EXPORT_SYMBOL(__get_free_pages);
......@@ -116,9 +115,12 @@ EXPORT_SYMBOL(vmalloc_32);
EXPORT_SYMBOL(vmap);
EXPORT_SYMBOL(vunmap);
EXPORT_SYMBOL(vmalloc_to_page);
EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(remap_page_range);
#ifndef CONFIG_DISCONTIGMEM
EXPORT_SYMBOL(contig_page_data);
EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(max_mapnr);
#endif
EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(vmtruncate);
EXPORT_SYMBOL(find_vma);
......
......@@ -525,11 +525,11 @@ void release_console_sem(void)
{
unsigned long flags;
unsigned long _con_start, _log_end;
unsigned long must_wake_klogd = 0;
unsigned long wake_klogd = 0;
for ( ; ; ) {
spin_lock_irqsave(&logbuf_lock, flags);
must_wake_klogd |= log_start - log_end;
wake_klogd |= log_start - log_end;
if (con_start == log_end)
break; /* Nothing to print */
_con_start = con_start;
......@@ -541,7 +541,7 @@ void release_console_sem(void)
console_may_schedule = 0;
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
if (must_wake_klogd && !oops_in_progress)
if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
wake_up_interruptible(&log_wait);
}
......
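The printk change above adds a cheap lockless check before waking klogd; the general idiom, as a sketch:

#include <linux/wait.h>

static void wake_readers_if_any(wait_queue_head_t *wq)
{
        /* waitqueue_active() avoids the cost of a wakeup when nobody is sleeping */
        if (waitqueue_active(wq))
                wake_up_interruptible(wq);
}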
......@@ -471,10 +471,12 @@ static int count_and_copy_data_pages(struct pbe *pagedir_p)
int nr_copy_pages = 0;
int pfn;
struct page *page;
#ifndef CONFIG_DISCONTIGMEM
if (max_mapnr != num_physpages)
panic("mapnr is not expected");
for (pfn = 0; pfn < max_mapnr; pfn++) {
#endif
for (pfn = 0; pfn < num_physpages; pfn++) {
page = pfn_to_page(pfn);
if (PageHighMem(page))
panic("Swsusp not supported on highmem boxes. Send 1GB of RAM to <pavel@ucw.cz> and try again ;-).");
......@@ -514,19 +516,20 @@ static int count_and_copy_data_pages(struct pbe *pagedir_p)
static void free_suspend_pagedir(unsigned long this_pagedir)
{
struct page *page = mem_map;
int i;
struct page *page;
int pfn;
unsigned long this_pagedir_end = this_pagedir +
(PAGE_SIZE << pagedir_order);
for(i=0; i < num_physpages; i++, page++) {
for(pfn = 0; pfn < num_physpages; pfn++) {
page = pfn_to_page(pfn);
if (!TestClearPageNosave(page))
continue;
if (ADDRESS(i) >= this_pagedir && ADDRESS(i) < this_pagedir_end)
if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end)
continue; /* old pagedir gets freed in one */
free_page(ADDRESS(i));
free_page(ADDRESS(pfn));
}
free_pages(this_pagedir, pagedir_order);
}
......
......@@ -99,8 +99,8 @@ extern int acct_parm[];
#endif
#ifdef CONFIG_HUGETLB_PAGE
extern int htlbpage_max;
extern int set_hugetlb_mem_size(int);
extern int htlbpage_max;
extern int set_hugetlb_mem_size(int);
#endif
static int parse_table(int *, int, void *, size_t *, void *, size_t,
......@@ -263,10 +263,6 @@ static ctl_table kern_table[] = {
#endif
{KERN_PIDMAX, "pid_max", &pid_max, sizeof (int),
0600, NULL, &proc_dointvec},
#ifdef CONFIG_HUGETLB_PAGE
{KERN_HUGETLB_PAGE_NUM, "numhugepages", &htlbpage_max, sizeof(int), 0644, NULL,
&proc_dointvec},
#endif
{0}
};
......@@ -292,9 +288,6 @@ static ctl_table vm_table[] = {
{VM_DIRTY_ASYNC, "dirty_async_ratio", &dirty_async_ratio,
sizeof(dirty_async_ratio), 0644, NULL, &proc_dointvec_minmax,
&sysctl_intvec, NULL, &zero, &one_hundred },
{VM_DIRTY_SYNC, "dirty_sync_ratio", &dirty_sync_ratio,
sizeof(dirty_sync_ratio), 0644, NULL, &proc_dointvec_minmax,
&sysctl_intvec, NULL, &zero, &one_hundred },
{VM_DIRTY_WB_CS, "dirty_writeback_centisecs",
&dirty_writeback_centisecs, sizeof(dirty_writeback_centisecs), 0644,
NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL,
......@@ -317,6 +310,10 @@ static ctl_table vm_table[] = {
{ VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads",
&nr_pdflush_threads, sizeof nr_pdflush_threads,
0444 /* read-only*/, NULL, &proc_dointvec},
#ifdef CONFIG_HUGETLB_PAGE
{VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, NULL,
&proc_dointvec},
#endif
{0}
};
......
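From userspace, the hugetlb tunable moves from /proc/sys/kernel/numhugepages to /proc/sys/vm/nr_hugepages; a small illustrative reader (not part of the patch):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/vm/nr_hugepages", "r");
        int pages;

        if (f && fscanf(f, "%d", &pages) == 1)
                printf("nr_hugepages = %d\n", pages);
        if (f)
                fclose(f);
        return 0;
}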
......@@ -487,9 +487,13 @@ EXPORT_SYMBOL(fail_writepage);
int filemap_fdatawrite(struct address_space *mapping)
{
int ret;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = mapping->nrpages * 2,
};
current->flags |= PF_SYNC;
ret = do_writepages(mapping, NULL);
ret = do_writepages(mapping, &wbc);
current->flags &= ~PF_SYNC;
return ret;
}
......@@ -1130,10 +1134,26 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
struct file *filp = iocb->ki_filp;
ssize_t retval;
unsigned long seg;
size_t count = iov_length(iov, nr_segs);
size_t count;
if ((ssize_t) count < 0)
return -EINVAL;
count = 0;
for (seg = 0; seg < nr_segs; seg++) {
const struct iovec *iv = &iov[seg];
/*
* If any segment has a negative length, or the cumulative
* length ever wraps negative then return -EINVAL.
*/
count += iv->iov_len;
if (unlikely((ssize_t)(count|iv->iov_len) < 0))
return -EINVAL;
if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
continue;
if (seg == 0)
return -EFAULT;
nr_segs = seg;
break;
}
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) {
......@@ -1162,11 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
goto out;
}
for (seg = 0; seg < nr_segs; seg++) {
if (!access_ok(VERIFY_WRITE,iov[seg].iov_base,iov[seg].iov_len))
return -EFAULT;
}
retval = 0;
if (count) {
for (seg = 0; seg < nr_segs; seg++) {
......@@ -1626,6 +1641,63 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
return left;
}
static inline int
__filemap_copy_from_user_iovec(char *vaddr,
const struct iovec *iov, size_t base, unsigned bytes)
{
int left = 0;
while (bytes) {
char *buf = iov->iov_base + base;
int copy = min(bytes, iov->iov_len - base);
base = 0;
if ((left = __copy_from_user(vaddr, buf, copy)))
break;
bytes -= copy;
vaddr += copy;
iov++;
}
return left;
}
static inline int
filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
const struct iovec *iov, size_t base, unsigned bytes)
{
char *kaddr;
int left;
kaddr = kmap_atomic(page, KM_USER0);
left = __filemap_copy_from_user_iovec(kaddr + offset, iov, base, bytes);
kunmap_atomic(kaddr, KM_USER0);
if (left != 0) {
kaddr = kmap(page);
left = __filemap_copy_from_user_iovec(kaddr + offset, iov, base, bytes);
kunmap(page);
}
return left;
}
static inline void
filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, unsigned bytes)
{
const struct iovec *iov = *iovp;
size_t base = *basep;
while (bytes) {
int copy = min(bytes, iov->iov_len - base);
bytes -= copy;
base += copy;
if (iov->iov_len == base) {
iov++;
base = 0;
}
}
*iovp = iov;
*basep = base;
}
/*
* Write to a file through the page cache.
*
......@@ -1641,8 +1713,8 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
{
struct address_space * mapping = file->f_dentry->d_inode->i_mapping;
struct address_space_operations *a_ops = mapping->a_ops;
const size_t ocount = iov_length(iov, nr_segs);
size_t count = ocount;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
long status = 0;
......@@ -1654,19 +1726,30 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
unsigned bytes;
time_t time_now;
struct pagevec lru_pvec;
struct iovec *cur_iov;
unsigned iov_bytes; /* Cumulative count to the end of the
current iovec */
const struct iovec *cur_iov = iov; /* current iovec */
unsigned iov_base = 0; /* offset in the current iovec */
unsigned long seg;
char *buf;
if (unlikely((ssize_t)count < 0))
return -EINVAL;
ocount = 0;
for (seg = 0; seg < nr_segs; seg++) {
if (!access_ok(VERIFY_READ,iov[seg].iov_base,iov[seg].iov_len))
const struct iovec *iv = &iov[seg];
/*
* If any segment has a negative length, or the cumulative
* length ever wraps negative then return -EINVAL.
*/
ocount += iv->iov_len;
if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
return -EINVAL;
if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
continue;
if (seg == 0)
return -EFAULT;
nr_segs = seg;
break;
}
count = ocount;
pos = *ppos;
if (unlikely(pos < 0))
......@@ -1788,9 +1871,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
goto out_status;
}
cur_iov = (struct iovec *)iov;
iov_bytes = cur_iov->iov_len;
buf = cur_iov->iov_base;
buf = iov->iov_base;
do {
unsigned long index;
unsigned long offset;
......@@ -1801,8 +1882,6 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
bytes = PAGE_CACHE_SIZE - offset;
if (bytes > count)
bytes = count;
if (bytes + written > iov_bytes)
bytes = iov_bytes - written;
/*
* Bring in the user page that we will copy from _first_.
......@@ -1830,7 +1909,12 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
vmtruncate(inode, inode->i_size);
break;
}
page_fault = filemap_copy_from_user(page, offset, buf, bytes);
if (likely(nr_segs == 1))
page_fault = filemap_copy_from_user(page, offset,
buf, bytes);
else
page_fault = filemap_copy_from_user_iovec(page, offset,
cur_iov, iov_base, bytes);
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (unlikely(page_fault)) {
......@@ -1844,11 +1928,9 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
count -= status;
pos += status;
buf += status;
if (written == iov_bytes && count) {
cur_iov++;
iov_bytes += cur_iov->iov_len;
buf = cur_iov->iov_base;
}
if (unlikely(nr_segs > 1))
filemap_set_next_iovec(&cur_iov,
&iov_base, status);
}
}
if (!PageReferenced(page))
......
......@@ -40,7 +40,6 @@
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/iobuf.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
......@@ -53,7 +52,12 @@
#include <linux/swapops.h>
#ifndef CONFIG_DISCONTIGMEM
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;
#endif
unsigned long num_physpages;
void * high_memory;
struct page *highmem_start_page;
......@@ -72,8 +76,6 @@ static inline void copy_cow_page(struct page * from, struct page * to, unsigned
copy_user_highpage(to, from, address);
}
struct page *mem_map;
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
......
......@@ -187,11 +187,12 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask)
int curr_nr;
DECLARE_WAITQUEUE(wait, current);
int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
int pf_flags = current->flags;
repeat_alloc:
current->flags |= PF_NOWARN;
element = pool->alloc(gfp_nowait, pool->pool_data);
current->flags &= ~PF_NOWARN;
current->flags = pf_flags;
if (likely(element != NULL))
return element;
......
......@@ -11,7 +11,6 @@
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
......@@ -444,6 +443,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
*/
vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED) {
if (!capable(CAP_IPC_LOCK))
return -EPERM;
vm_flags |= VM_LOCKED;
}
/* mlock MCL_FUTURE? */
if (vm_flags & VM_LOCKED) {
unsigned long locked = mm->locked_vm << PAGE_SHIFT;
......@@ -1073,7 +1077,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
* Jeremy Fitzhardine <jeremy@sw.oz.au>
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
......
......@@ -9,7 +9,6 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/swap.h>
......
......@@ -22,11 +22,21 @@ pg_data_t contig_page_data = { .bdata = &contig_bootmem_data };
* Should be invoked with parameters (0, 0, unsigned long *[], start_paddr).
*/
void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long *zones_size, unsigned long zone_start_pfn,
unsigned long *zones_size, unsigned long node_start_pfn,
unsigned long *zholes_size)
{
free_area_init_core(0, &contig_page_data, &mem_map, zones_size,
zone_start_pfn, zholes_size, pmap);
unsigned long size;
contig_page_data.node_id = 0;
contig_page_data.node_start_pfn = node_start_pfn;
calculate_totalpages (&contig_page_data, zones_size, zholes_size);
if (pmap == (struct page *)0) {
size = (pgdat->node_size + 1) * sizeof(struct page);
pmap = (struct page *) alloc_bootmem_node(pgdat, size);
}
contig_page_data.node_mem_map = pmap;
free_area_init_core(&contig_page_data, zones_size, zholes_size);
mem_map = contig_page_data.node_mem_map;
}
#endif /* !CONFIG_DISCONTIGMEM */
......@@ -48,22 +58,26 @@ struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int orde
* Nodes can be initialized in parallel, in no particular order.
*/
void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long *zones_size, unsigned long zone_start_pfn,
unsigned long *zones_size, unsigned long node_start_pfn,
unsigned long *zholes_size)
{
int i, size = 0;
struct page *discard;
if (mem_map == NULL)
mem_map = (struct page *)PAGE_OFFSET;
int i;
unsigned long size;
free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_pfn,
zholes_size, pmap);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
calculate_totalpages (pgdat, zones_size, zholes_size);
if (pmap == (struct page *)0) {
size = (pgdat->node_size + 1) * sizeof(struct page);
pmap = (struct page *) alloc_bootmem_node(pgdat, size);
}
pgdat->node_mem_map = pmap;
free_area_init_core(pgdat, zones_size, zholes_size);
/*
* Get space for the valid bitmap.
*/
size = 0;
for (i = 0; i < MAX_NR_ZONES; i++)
size += zones_size[i];
size = LONG_ALIGN((size + 7) >> 3);
......@@ -71,48 +85,4 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
memset(pgdat->valid_addr_bitmap, 0, size);
}
static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask,
unsigned int order)
{
return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK));
}
/*
* This can be refined. Currently it tries to do round robin; instead it
* should do a concentric-circle search, starting from the current node.
*/
struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order)
{
struct page *ret = 0;
pg_data_t *start, *temp;
#ifndef CONFIG_NUMA
unsigned long flags;
static pg_data_t *next = 0;
#endif
if (order >= MAX_ORDER)
return NULL;
#ifdef CONFIG_NUMA
temp = NODE_DATA(numa_node_id());
#else
if (!next)
next = pgdat_list;
temp = next;
next = next->pgdat_next;
#endif
start = temp;
while (temp) {
if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
return(ret);
temp = temp->pgdat_next;
}
temp = pgdat_list;
while (temp != start) {
if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
return(ret);
temp = temp->pgdat_next;
}
return(0);
}
#endif /* CONFIG_DISCONTIGMEM */
......@@ -51,7 +51,7 @@ static long total_pages;
* It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
* large amounts of I/O are submitted.
*/
static inline int sync_writeback_pages(void)
static inline long sync_writeback_pages(void)
{
return ratelimit_pages + ratelimit_pages / 2;
}
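As a quick illustration of the 1.5x batch above (the ratelimit_pages value here is an arbitrary example, not the tree's actual tuning):

#include <stdio.h>

int main(void)
{
	long ratelimit_pages = 1024;  /* example value only */
	long sync_writeback = ratelimit_pages + ratelimit_pages / 2;

	printf("writeback batch: %ld pages (1.5 * ratelimit_pages)\n",
	       sync_writeback);
	return 0;
}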
......@@ -72,11 +72,6 @@ int dirty_background_ratio = 10;
*/
int dirty_async_ratio = 40;
/*
* The generator of dirty data performs sync writeout at this level
*/
int dirty_sync_ratio = 50;
/*
* The interval between `kupdate'-style writebacks, in centiseconds
* (hundredths of a second)
......@@ -105,15 +100,11 @@ static void background_writeout(unsigned long _min_pages);
* - Does nothing at all.
*
* balance_dirty_pages() can sleep.
*
* FIXME: WB_SYNC_LAST doesn't actually work. It waits on the last dirty
* inode on the superblock list. It should wait when nr_to_write is
* exhausted. Doesn't seem to matter.
*/
void balance_dirty_pages(struct address_space *mapping)
{
struct page_state ps;
long background_thresh, async_thresh, sync_thresh;
long background_thresh, async_thresh;
unsigned long dirty_and_writeback;
struct backing_dev_info *bdi;
......@@ -122,18 +113,17 @@ void balance_dirty_pages(struct address_space *mapping)
background_thresh = (dirty_background_ratio * total_pages) / 100;
async_thresh = (dirty_async_ratio * total_pages) / 100;
sync_thresh = (dirty_sync_ratio * total_pages) / 100;
bdi = mapping->backing_dev_info;
if (dirty_and_writeback > sync_thresh) {
int nr_to_write = sync_writeback_pages();
if (dirty_and_writeback > async_thresh) {
struct writeback_control wbc = {
.bdi = bdi,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = sync_writeback_pages(),
};
writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_LAST, NULL);
get_page_state(&ps);
} else if (dirty_and_writeback > async_thresh) {
int nr_to_write = sync_writeback_pages();
writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_NONE, NULL);
writeback_inodes(&wbc);
get_page_state(&ps);
}
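To make the thresholds above concrete, a small stand-alone calculation: the ratios (10 and 40) are the defaults shown in this patch, while total_pages is an assumed machine size used only for illustration.

#include <stdio.h>

int main(void)
{
	long total_pages = 65536;         /* assumed: 256 MB of 4 KB pages */
	int dirty_background_ratio = 10;  /* default from this patch */
	int dirty_async_ratio = 40;       /* default from this patch */

	long background_thresh = (dirty_background_ratio * total_pages) / 100;
	long async_thresh = (dirty_async_ratio * total_pages) / 100;

	printf("pdflush background writeout starts above %ld dirty pages\n",
	       background_thresh);
	printf("the dirtying process writes back itself above %ld dirty pages\n",
	       async_thresh);
	return 0;
}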
......@@ -177,7 +167,12 @@ static void background_writeout(unsigned long _min_pages)
{
long min_pages = _min_pages;
long background_thresh;
int nr_to_write;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = 0,
};
CHECK_EMERGENCY_SYNC
......@@ -185,14 +180,13 @@ static void background_writeout(unsigned long _min_pages)
do {
struct page_state ps;
get_page_state(&ps);
if (ps.nr_dirty < background_thresh && min_pages <= 0)
break;
nr_to_write = MAX_WRITEBACK_PAGES;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
min_pages -= MAX_WRITEBACK_PAGES - nr_to_write;
} while (nr_to_write <= 0);
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc);
min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
} while (wbc.nr_to_write <= 0);
blk_run_queues();
}
......@@ -230,7 +224,12 @@ static void wb_kupdate(unsigned long arg)
unsigned long start_jif;
unsigned long next_jif;
struct page_state ps;
int nr_to_write;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = &oldest_jif,
.nr_to_write = 0,
};
sync_supers();
get_page_state(&ps);
......@@ -238,8 +237,8 @@ static void wb_kupdate(unsigned long arg)
oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
start_jif = jiffies;
next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
nr_to_write = ps.nr_dirty;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest_jif);
wbc.nr_to_write = ps.nr_dirty;
writeback_inodes(&wbc);
blk_run_queues();
yield();
......@@ -312,8 +311,6 @@ static int __init page_writeback_init(void)
dirty_background_ratio /= 100;
dirty_async_ratio *= correction;
dirty_async_ratio /= 100;
dirty_sync_ratio *= correction;
dirty_sync_ratio /= 100;
}
init_timer(&wb_timer);
......@@ -351,7 +348,7 @@ module_init(page_writeback_init);
* So. The proper fix is to leave the page locked-and-dirty and to pass
* it all the way down.
*/
int generic_vm_writeback(struct page *page, int *nr_to_write)
int generic_vm_writeback(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
......@@ -363,7 +360,7 @@ int generic_vm_writeback(struct page *page, int *nr_to_write)
unlock_page(page);
if (inode) {
do_writepages(inode->i_mapping, nr_to_write);
do_writepages(inode->i_mapping, wbc);
/*
* This iput() will internally call ext2_discard_prealloc(),
......@@ -392,11 +389,11 @@ int generic_vm_writeback(struct page *page, int *nr_to_write)
}
EXPORT_SYMBOL(generic_vm_writeback);
int do_writepages(struct address_space *mapping, int *nr_to_write)
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
if (mapping->a_ops->writepages)
return mapping->a_ops->writepages(mapping, nr_to_write);
return generic_writepages(mapping, nr_to_write);
return mapping->a_ops->writepages(mapping, wbc);
return generic_writepages(mapping, wbc);
}
/**
......
......@@ -256,14 +256,6 @@ int is_head_of_free_region(struct page *page)
}
#endif /* CONFIG_SOFTWARE_SUSPEND */
#ifndef CONFIG_DISCONTIGMEM
struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
{
return __alloc_pages(gfp_mask, order,
contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
}
#endif
static /* inline */ struct page *
balance_classzone(struct zone* classzone, unsigned int gfp_mask,
unsigned int order, int * freed)
......@@ -680,13 +672,41 @@ void show_free_areas(void)
/*
* Builds allocation fallback zone lists.
*/
static inline void build_zonelists(pg_data_t *pgdat)
static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
{
int i, j, k;
switch (k) {
struct zone *zone;
default:
BUG();
case ZONE_HIGHMEM:
zone = pgdat->node_zones + ZONE_HIGHMEM;
if (zone->size) {
#ifndef CONFIG_HIGHMEM
BUG();
#endif
zonelist->zones[j++] = zone;
}
case ZONE_NORMAL:
zone = pgdat->node_zones + ZONE_NORMAL;
if (zone->size)
zonelist->zones[j++] = zone;
case ZONE_DMA:
zone = pgdat->node_zones + ZONE_DMA;
if (zone->size)
zonelist->zones[j++] = zone;
}
return j;
}
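The switch above deliberately falls through, so a highmem request on a node that has all three zones ends up with the order HIGHMEM, NORMAL, DMA. A tiny stand-alone model of that fallthrough; the zone sizes are invented for illustration.

#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

static const char *zone_name[MAX_NR_ZONES] = { "DMA", "NORMAL", "HIGHMEM" };

/* Model of build_zonelists_node(): append one node's zones in falling
 * order, starting from the highest zone the request allows. */
static int add_node_zones(const unsigned long *zone_size, const char **list,
			  int j, int k)
{
	switch (k) {
	case ZONE_HIGHMEM:
		if (zone_size[ZONE_HIGHMEM])
			list[j++] = zone_name[ZONE_HIGHMEM];
		/* fall through */
	case ZONE_NORMAL:
		if (zone_size[ZONE_NORMAL])
			list[j++] = zone_name[ZONE_NORMAL];
		/* fall through */
	case ZONE_DMA:
		if (zone_size[ZONE_DMA])
			list[j++] = zone_name[ZONE_DMA];
	}
	return j;
}

int main(void)
{
	unsigned long zone_size[MAX_NR_ZONES] = { 4096, 221184, 32768 };
	const char *list[MAX_NR_ZONES] = { 0 };
	int i, j = add_node_zones(zone_size, list, 0, ZONE_HIGHMEM);

	for (i = 0; i < j; i++)
		printf("fallback %d: %s\n", i, list[i]);
	return 0;
}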
static void __init build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
local_node = pgdat->node_id;
printk("Building zonelist for node : %d\n", local_node);
for (i = 0; i <= GFP_ZONEMASK; i++) {
struct zonelist *zonelist;
struct zone *zone;
zonelist = pgdat->node_zonelists + i;
memset(zonelist, 0, sizeof(*zonelist));
......@@ -698,33 +718,49 @@ static inline void build_zonelists(pg_data_t *pgdat)
if (i & __GFP_DMA)
k = ZONE_DMA;
switch (k) {
default:
BUG();
/*
* fallthrough:
*/
case ZONE_HIGHMEM:
zone = pgdat->node_zones + ZONE_HIGHMEM;
if (zone->size) {
#ifndef CONFIG_HIGHMEM
BUG();
#endif
zonelist->zones[j++] = zone;
}
case ZONE_NORMAL:
zone = pgdat->node_zones + ZONE_NORMAL;
if (zone->size)
zonelist->zones[j++] = zone;
case ZONE_DMA:
zone = pgdat->node_zones + ZONE_DMA;
if (zone->size)
zonelist->zones[j++] = zone;
}
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
* of all the other nodes.
* We don't want to pressure a particular node, so when
* building the zones for node N, we make sure that the
* zones coming right after the local ones are those from
* node N+1 (modulo numnodes)
*/
for (node = local_node + 1; node < numnodes; node++)
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
for (node = 0; node < local_node; node++)
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
zonelist->zones[j++] = NULL;
}
}
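The two loops above give each node a rotation of the node list that starts just after itself, so allocation pressure from one node spills onto its successors first. With an assumed numnodes of 4, node 1 appends remote zones in node order 2, 3, 0. A stand-alone sketch of just that ordering:

#include <stdio.h>

#define NUMNODES 4  /* assumed node count for illustration */

int main(void)
{
	int local_node, node;

	for (local_node = 0; local_node < NUMNODES; local_node++) {
		printf("node %d falls back to:", local_node);
		for (node = local_node + 1; node < NUMNODES; node++)
			printf(" %d", node);
		for (node = 0; node < local_node; node++)
			printf(" %d", node);
		printf("\n");
	}
	return 0;
}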
void __init build_all_zonelists(void)
{
int i;
for(i = 0 ; i < numnodes ; i++)
build_zonelists(NODE_DATA(i));
}
void __init calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size,
unsigned long *zholes_size)
{
unsigned long realtotalpages, totalpages = 0;
int i;
for (i = 0; i < MAX_NR_ZONES; i++)
totalpages += zones_size[i];
pgdat->node_size = totalpages;
realtotalpages = totalpages;
if (zholes_size)
for (i = 0; i < MAX_NR_ZONES; i++)
realtotalpages -= zholes_size[i];
printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}
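calculate_totalpages() above just sums the per-zone sizes into node_size and subtracts the holes for the printed figure. A minimal stand-alone version with made-up zone and hole sizes:

#include <stdio.h>

#define MAX_NR_ZONES 3

int main(void)
{
	/* Example zone sizes and holes, in pages (invented numbers). */
	unsigned long zones_size[MAX_NR_ZONES] = { 4096, 126976, 0 };
	unsigned long zholes_size[MAX_NR_ZONES] = { 0, 2048, 0 };
	unsigned long totalpages = 0, realtotalpages;
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++)
		totalpages += zones_size[i];

	realtotalpages = totalpages;
	for (i = 0; i < MAX_NR_ZONES; i++)
		realtotalpages -= zholes_size[i];

	printf("node_size = %lu pages, totalpages (minus holes) = %lu\n",
	       totalpages, realtotalpages);
	return 0;
}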
/*
* Helper functions to size the waitqueue hash table.
* Essentially these want to choose hash table sizes sufficiently
......@@ -775,46 +811,18 @@ static inline unsigned long wait_table_bits(unsigned long size)
* - mark all memory queues empty
* - clear the memory bitmaps
*/
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
unsigned long *zones_size, unsigned long zone_start_pfn,
unsigned long *zholes_size, struct page *lmem_map)
void __init free_area_init_core(pg_data_t *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
unsigned long i, j;
unsigned long map_size;
unsigned long totalpages, offset, realtotalpages;
unsigned long local_offset;
const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
int nid = pgdat->node_id;
struct page *lmem_map = pgdat->node_mem_map;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
totalpages = 0;
for (i = 0; i < MAX_NR_ZONES; i++)
totalpages += zones_size[i];
realtotalpages = totalpages;
if (zholes_size)
for (i = 0; i < MAX_NR_ZONES; i++)
realtotalpages -= zholes_size[i];
printk("On node %d totalpages: %lu\n", nid, realtotalpages);
/*
* Some architectures (with lots of mem and discontinous memory
* maps) have to search for a good mem_map area:
* For discontigmem, the conceptual mem map array starts from
* PAGE_OFFSET, we need to align the actual array onto a mem map
* boundary, so that MAP_NR works.
*/
map_size = (totalpages + 1)*sizeof(struct page);
if (lmem_map == (struct page *)0) {
lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
lmem_map = (struct page *)(PAGE_OFFSET +
MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
}
*gmap = pgdat->node_mem_map = lmem_map;
pgdat->node_size = totalpages;
pgdat->node_start_pfn = zone_start_pfn;
pgdat->node_start_mapnr = (lmem_map - mem_map);
pgdat->nr_zones = 0;
offset = lmem_map - mem_map;
local_offset = 0; /* offset within lmem_map */
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long mask;
......@@ -866,8 +874,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone->pages_low = mask*2;
zone->pages_high = mask*3;
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_mem_map = lmem_map + local_offset;
zone->zone_start_pfn = zone_start_pfn;
if ((zone_start_pfn) & (zone_required_alignment-1))
......@@ -879,7 +886,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
* done. Non-atomic initialization, single-pass.
*/
for (i = 0; i < size; i++) {
struct page *page = mem_map + offset + i;
struct page *page = lmem_map + local_offset + i;
set_page_zone(page, nid * MAX_NR_ZONES + j);
set_page_count(page, 0);
SetPageReserved(page);
......@@ -893,7 +900,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone_start_pfn++;
}
offset += size;
local_offset += size;
for (i = 0; ; i++) {
unsigned long bitmap_size;
......@@ -932,13 +939,15 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
(unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
}
}
build_zonelists(pgdat);
}
#ifndef CONFIG_DISCONTIGMEM
void __init free_area_init(unsigned long *zones_size)
{
free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
free_area_init_node(0, &contig_page_data, NULL, zones_size, 0, NULL);
mem_map = contig_page_data.node_mem_map;
}
#endif
static int __init setup_mem_frac(char *str)
{
......
......@@ -131,12 +131,12 @@ int swap_readpage(struct file *file, struct page *page)
* Swap pages are !PageLocked and PageWriteback while under writeout so that
* memory allocators will throttle against them.
*/
static int swap_vm_writeback(struct page *page, int *nr_to_write)
static int swap_vm_writeback(struct page *page, struct writeback_control *wbc)
{
struct address_space *mapping = page->mapping;
unlock_page(page);
return generic_writepages(mapping, nr_to_write);
return generic_writepages(mapping, wbc);
}
struct address_space_operations swap_aops = {
......
......@@ -28,7 +28,6 @@
#include <linux/pagemap.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
......
......@@ -124,9 +124,9 @@ void release_pages(struct page **pages, int nr)
if (page_count(page) == 0) {
if (!pagevec_add(&pages_to_free, page)) {
spin_unlock_irq(&zone->lru_lock);
pagevec_free(&pages_to_free);
__pagevec_free(&pages_to_free);
pagevec_init(&pages_to_free);
spin_lock_irq(&zone->lru_lock);
zone = NULL; /* No lock is held */
}
}
}
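The change above replaces the immediate re-lock with zone = NULL, so the per-zone lru lock is only retaken lazily when the next page actually needs it. Below is a user-space model of that lazy-lock pattern, using a pthread mutex in place of the zone lru_lock; the zone count and page stream are invented for illustration (compile with -lpthread).

#include <pthread.h>
#include <stdio.h>

#define NR_ZONES 2

static pthread_mutex_t lru_lock[NR_ZONES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Pages are represented only by the zone they belong to. */
static const int page_zone_of[] = { 0, 0, 1, 1, 0 };

int main(void)
{
	int zone = -1;                 /* -1: no lock is held */
	int i, batched = 0;

	for (i = 0; i < 5; i++) {
		int pagezone = page_zone_of[i];

		if (pagezone != zone) {    /* lock lazily, on zone change */
			if (zone >= 0)
				pthread_mutex_unlock(&lru_lock[zone]);
			zone = pagezone;
			pthread_mutex_lock(&lru_lock[zone]);
		}

		if (++batched == 2) {      /* batch full: free it unlocked */
			pthread_mutex_unlock(&lru_lock[zone]);
			printf("freeing a batch of %d pages without the lock\n",
			       batched);
			batched = 0;
			zone = -1;         /* no lock is held any more */
		}
	}
	if (zone >= 0)
		pthread_mutex_unlock(&lru_lock[zone]);
	return 0;
}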
......@@ -165,8 +165,8 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
}
/*
* Move all the inactive pages to the head of the inactive list
* and release them. Reinitialises the caller's pagevec.
* Move all the inactive pages to the head of the inactive list and release
* them. Reinitialises the caller's pagevec.
*/
void pagevec_deactivate_inactive(struct pagevec *pvec)
{
......@@ -180,8 +180,6 @@ void pagevec_deactivate_inactive(struct pagevec *pvec)
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
if (PageActive(page) || !PageLRU(page))
continue;
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
......
......@@ -12,7 +12,6 @@
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h> /* block_sync_page() */
......@@ -119,7 +118,7 @@ void __delete_from_swap_cache(struct page *page)
int add_to_swap(struct page * page)
{
swp_entry_t entry;
int flags;
int pf_flags;
if (!PageLocked(page))
BUG();
......@@ -142,7 +141,7 @@ int add_to_swap(struct page * page)
* just not all of them.
*/
flags = current->flags;
pf_flags = current->flags;
current->flags &= ~PF_MEMALLOC;
current->flags |= PF_NOWARN;
ClearPageUptodate(page); /* why? */
......@@ -154,20 +153,20 @@ int add_to_swap(struct page * page)
*/
switch (add_to_swap_cache(page, entry)) {
case 0: /* Success */
current->flags = flags;
current->flags = pf_flags;
SetPageUptodate(page);
set_page_dirty(page);
swap_free(entry);
return 1;
case -ENOMEM: /* radix-tree allocation */
current->flags = flags;
current->flags = pf_flags;
swap_free(entry);
return 0;
default: /* ENOENT: raced */
break;
}
/* Raced with "speculative" read_swap_cache_async */
current->flags = flags;
current->flags = pf_flags;
swap_free(entry);
}
}
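The rename above only distinguishes the saved process flags from other uses of "flags"; the underlying pattern is save, clear PF_MEMALLOC and set PF_NOWARN around the allocation, then restore on every exit path. A user-space model of that save/modify/restore discipline follows; the flag bit values and the fake "current" task are invented for illustration.

#include <stdio.h>

#define PF_MEMALLOC 0x01   /* invented bit values, for illustration only */
#define PF_NOWARN   0x02

struct task { unsigned long flags; };
static struct task current_task = { .flags = PF_MEMALLOC };
#define current (&current_task)

static void risky_allocation(void)
{
	printf("allocating with flags 0x%lx\n", current->flags);
}

int main(void)
{
	unsigned long pf_flags = current->flags;   /* save */

	current->flags &= ~PF_MEMALLOC;            /* don't dip into reserves */
	current->flags |= PF_NOWARN;               /* failure here is expected */
	risky_allocation();

	current->flags = pf_flags;                 /* restore on the way out */
	printf("restored flags 0x%lx\n", current->flags);
	return 0;
}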
......
......@@ -7,7 +7,6 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
......
......@@ -15,7 +15,6 @@
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
......@@ -145,6 +144,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
if (!add_to_swap(page))
goto activate_locked;
pte_chain_lock(page);
mapping = page->mapping;
}
/*
......@@ -174,15 +174,18 @@ shrink_list(struct list_head *page_list, int nr_pages,
*/
if (PageDirty(page) && is_page_cache_freeable(page) &&
mapping && may_enter_fs) {
int (*writeback)(struct page *, int *);
int (*writeback)(struct page *,
struct writeback_control *);
const int cluster_size = SWAP_CLUSTER_MAX;
int nr_to_write = cluster_size;
struct writeback_control wbc = {
.nr_to_write = cluster_size,
};
writeback = mapping->a_ops->vm_writeback;
if (writeback == NULL)
writeback = generic_vm_writeback;
(*writeback)(page, &nr_to_write);
*max_scan -= (cluster_size - nr_to_write);
(*writeback)(page, &wbc);
*max_scan -= (cluster_size - wbc.nr_to_write);
goto keep;
}
......
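The hunk above applies the same int-pointer to writeback_control conversion to the VM scanner: it fills a small writeback_control limited to SWAP_CLUSTER_MAX, picks the address_space's vm_writeback method (falling back to generic_vm_writeback), and charges whatever was written against max_scan. A reduced user-space model of that hook-plus-default dispatch; all the types and the fake writeback method here are stand-ins, not the kernel's.

#include <stdio.h>

/* Minimal stand-ins for the kernel types involved. */
struct writeback_control { long nr_to_write; };
struct page;
struct address_space {
	int (*vm_writeback)(struct page *, struct writeback_control *);
};
struct page { struct address_space *mapping; };

#define SWAP_CLUSTER_MAX 32

/* Default used when the filesystem provides no vm_writeback method. */
static int generic_vm_writeback(struct page *page, struct writeback_control *wbc)
{
	(void)page;
	wbc->nr_to_write -= 5;   /* pretend five pages were written */
	return 0;
}

int main(void)
{
	struct address_space mapping = { .vm_writeback = NULL };
	struct page page = { .mapping = &mapping };
	long max_scan = 100;

	int (*writeback)(struct page *, struct writeback_control *);
	struct writeback_control wbc = { .nr_to_write = SWAP_CLUSTER_MAX };

	writeback = page.mapping->vm_writeback;
	if (writeback == NULL)
		writeback = generic_vm_writeback;

	(*writeback)(&page, &wbc);
	max_scan -= SWAP_CLUSTER_MAX - wbc.nr_to_write;

	printf("wrote %ld pages, max_scan now %ld\n",
	       SWAP_CLUSTER_MAX - wbc.nr_to_write, max_scan);
	return 0;
}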