Commit 2188a617 authored by Linus Torvalds

Merge home.transmeta.com:/home/torvalds/v2.5/akpm

into home.transmeta.com:/home/torvalds/v2.5/linux
parents 075ee978 c7ea169d
......@@ -963,13 +963,6 @@ Contains, as a percentage of total system memory, the number of pages at which
a process which is generating disk writes will itself start writing out dirty
data.
dirty_sync_ratio
----------------
Contains, as a percentage of total system memory, the number of pages at which
a process which is generating disk writes will itself start writing out dirty
data and waiting upon completion of that writeout.
dirty_writeback_centisecs
-------------------------
......
......@@ -21,13 +21,12 @@ Currently, these files are in /proc/sys/vm:
- dirty_async_ratio
- dirty_background_ratio
- dirty_expire_centisecs
- dirty_sync_ratio
- dirty_writeback_centisecs
==============================================================
dirty_async_ratio, dirty_background_ratio, dirty_expire_centisecs,
dirty_sync_ratio dirty_writeback_centisecs:
dirty_writeback_centisecs:
See Documentation/filesystems/proc.txt
......
......@@ -286,7 +286,6 @@ void __init paging_init(void)
for (nid = 0; nid < numnodes; nid++) {
unsigned long start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT;
unsigned long end_pfn = plat_node_bdata[nid].node_low_pfn;
unsigned long lmax_mapnr;
if (dma_local_pfn >= end_pfn - start_pfn)
zones_size[ZONE_DMA] = end_pfn - start_pfn;
......@@ -295,11 +294,6 @@ void __init paging_init(void)
zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn;
}
free_area_init_node(nid, NODE_DATA(nid), NULL, zones_size, start_pfn, NULL);
lmax_mapnr = PLAT_NODE_DATA_STARTNR(nid) + PLAT_NODE_DATA_SIZE(nid);
if (lmax_mapnr > max_mapnr) {
max_mapnr = lmax_mapnr;
DBGDCONT("Grow max_mapnr to %ld\n", max_mapnr);
}
}
/* Initialize the kernel's ZERO_PGE. */
......
......@@ -154,7 +154,7 @@ if [ "$CONFIG_MWINCHIP3D" = "y" ]; then
define_bool CONFIG_X86_OOSTORE y
fi
bool 'IA-32 Huge TLB Page Support (if available on processor)' CONFIG_HUGETLB_PAGE
bool 'Huge TLB Page Support' CONFIG_HUGETLB_PAGE
bool 'Symmetric multi-processing support' CONFIG_SMP
bool 'Preemptible Kernel' CONFIG_PREEMPT
......
......@@ -25,7 +25,7 @@ __asm__(".align 4\nvide: ret");
static void __init init_amd(struct cpuinfo_x86 *c)
{
u32 l, h;
int mbytes = max_mapnr >> (20-PAGE_SHIFT);
int mbytes = num_physpages >> (20-PAGE_SHIFT);
int r;
/*
......
......@@ -96,4 +96,7 @@ extern struct mtrr_ops * mtrr_if;
extern unsigned int num_var_ranges;
void finalize_mtrr_state(void);
void mtrr_state_warn(void);
extern char * mtrr_if_name[];
......@@ -58,7 +58,11 @@ EXPORT_SYMBOL(boot_cpu_data);
EXPORT_SYMBOL(EISA_bus);
#endif
EXPORT_SYMBOL(MCA_bus);
#ifdef CONFIG_MULTIQUAD
#ifdef CONFIG_DISCONTIGMEM
EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(pfn_to_nid);
#endif
#ifdef CONFIG_X86_NUMAQ
EXPORT_SYMBOL(xquad_portio);
#endif
EXPORT_SYMBOL(__verify_write);
......
......@@ -82,27 +82,19 @@ static void __init smp_dump_qct(void)
*/
int physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1};
#define MB_TO_ELEMENT(x) (x >> ELEMENT_REPRESENTS)
#define PA_TO_MB(pa) (pa >> 20) /* assumption: a physical address is in bytes */
#define PFN_TO_ELEMENT(pfn) (pfn / PAGES_PER_ELEMENT)
#define PA_TO_ELEMENT(pa) (PFN_TO_ELEMENT(pa >> PAGE_SHIFT))
int pa_to_nid(u64 pa)
int pfn_to_nid(unsigned long pfn)
{
int nid;
nid = physnode_map[MB_TO_ELEMENT(PA_TO_MB(pa))];
int nid = physnode_map[PFN_TO_ELEMENT(pfn)];
/* the physical address passed in is not in the map for the system */
if (nid == -1)
BUG();
BUG(); /* address is not present */
return nid;
}
int pfn_to_nid(unsigned long pfn)
{
return pa_to_nid(((u64)pfn) << PAGE_SHIFT);
}
/*
* for each node mark the regions
* TOPOFMEM = hi_shrd_mem_start + hi_shrd_mem_size
......@@ -132,7 +124,7 @@ static void __init initialize_physnode_map(void)
topofmem = eq->hi_shrd_mem_start + eq->hi_shrd_mem_size;
while (cur < topofmem) {
physnode_map[cur >> 8] = nid;
cur += (ELEMENT_REPRESENTS - 1);
cur ++;
}
}
}
......
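For context (this sketch is not part of the patch): the reworked lookup indexes physnode_map directly by pfn, where each array element covers one 256MB chunk, i.e. PAGES_PER_ELEMENT 4KB pages. A minimal stand-alone illustration of that arithmetic, with illustrative names:

#define SKETCH_MAX_ELEMENTS      256                /* 64GB / 256MB per element */
#define SKETCH_PAGES_PER_ELEMENT (16777216 / 256)   /* 65536 4KB pages == 256MB */

static int sketch_physnode_map[SKETCH_MAX_ELEMENTS];

static int sketch_pfn_to_nid(unsigned long pfn)
{
        /* one array lookup; the old code went pfn -> physical address -> MB first */
        return sketch_physnode_map[pfn / SKETCH_PAGES_PER_ELEMENT];
}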
......@@ -275,20 +275,9 @@ void __init set_highmem_pages_init(int bad_ppro)
void __init set_max_mapnr_init(void)
{
#ifdef CONFIG_HIGHMEM
unsigned long lmax_mapnr;
int nid;
highmem_start_page = mem_map + NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_start_mapnr;
highmem_start_page = NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_mem_map;
num_physpages = highend_pfn;
for (nid = 0; nid < numnodes; nid++) {
lmax_mapnr = node_startnr(nid) + node_size(nid);
if (lmax_mapnr > max_mapnr) {
max_mapnr = lmax_mapnr;
}
}
#else
max_mapnr = num_physpages = max_low_pfn;
num_physpages = max_low_pfn;
#endif
}
......@@ -319,7 +319,7 @@ set_new_inode(unsigned long len, int prot, int flag, int key)
}
if (i == MAX_ID)
return NULL;
inode = kmalloc(sizeof (struct inode), GFP_KERNEL);
inode = kmalloc(sizeof (struct inode), GFP_ATOMIC);
if (inode == NULL)
return NULL;
......@@ -502,7 +502,7 @@ set_hugetlb_mem_size(int count)
if (lcount > 0) { /* Increase the mem size. */
while (lcount--) {
page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER);
page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
if (page == NULL)
break;
map = page;
......
......@@ -440,8 +440,10 @@ void __init mem_init(void)
int tmp;
int bad_ppro;
#ifndef CONFIG_DISCONTIGMEM
if (!mem_map)
BUG();
#endif
bad_ppro = ppro_with_ram_bug();
......@@ -471,7 +473,7 @@ void __init mem_init(void)
printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
max_mapnr << (PAGE_SHIFT-10),
num_physpages << (PAGE_SHIFT-10),
codesize >> 10,
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
......@@ -504,7 +506,7 @@ void __init mem_init(void)
/*Will make this kernel command line. */
INIT_LIST_HEAD(&htlbpage_freelist);
for (i=0; i<htlbzone_pages; i++) {
page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER);
page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
if (page == NULL)
break;
map = page;
......
......@@ -22,26 +22,29 @@
void show_mem(void)
{
int pfn, total = 0, reserved = 0;
int total = 0, reserved = 0;
int shared = 0, cached = 0;
int highmem = 0;
struct page *page;
pg_data_t *pgdat;
unsigned long i;
printk("Mem-info:\n");
show_free_areas();
printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
pfn = max_mapnr;
while (pfn-- > 0) {
page = pfn_to_page(pfn);
total++;
if (PageHighMem(page))
highmem++;
if (PageReserved(page))
reserved++;
else if (PageSwapCache(page))
cached++;
else if (page_count(page))
shared += page_count(page) - 1;
for_each_pgdat(pgdat) {
for (i = 0; i < pgdat->node_size; ++i) {
page = pgdat->node_mem_map + i;
total++;
if (PageHighMem(page))
highmem++;
if (PageReserved(page))
reserved++;
else if (PageSwapCache(page))
cached++;
else if (page_count(page))
shared += page_count(page) - 1;
}
}
printk("%d pages of RAM\n", total);
printk("%d pages of HIGHMEM\n",highmem);
......
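A minimal sketch of the node-aware walk that replaces the flat 0..max_mapnr loop above, assuming for_each_pgdat(), node_size and node_mem_map behave as in the hunk (illustrative only):

#include <linux/mm.h>
#include <linux/mmzone.h>

static unsigned long count_reserved_pages_sketch(void)
{
        pg_data_t *pgdat;
        unsigned long i, reserved = 0;

        /* walk each node's own mem_map instead of one global array */
        for_each_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_size; i++)
                        if (PageReserved(pgdat->node_mem_map + i))
                                reserved++;
        }
        return reserved;
}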
......@@ -254,10 +254,6 @@ void __init paging_init(void)
zones_size[ZONE_DMA] = end_pfn + 1 - start_pfn;
free_area_init_node(node, NODE_DATA(node), 0, zones_size,
start_pfn, 0);
if ((PLAT_NODE_DATA_STARTNR(node) +
PLAT_NODE_DATA_SIZE(node)) > pagenr)
pagenr = PLAT_NODE_DATA_STARTNR(node) +
PLAT_NODE_DATA_SIZE(node);
}
}
......@@ -271,7 +267,6 @@ void __init mem_init(void)
unsigned long codesize, datasize, initsize;
int slot, numslots;
struct page *pg, *pslot;
pfn_t pgnr;
num_physpages = numpages; /* memory already sized by szmem */
max_mapnr = pagenr; /* already found during paging_init */
......@@ -293,7 +288,6 @@ void __init mem_init(void)
* We need to manually do the other slots.
*/
pg = NODE_DATA(nid)->node_mem_map + slot_getsize(nid, 0);
pgnr = PLAT_NODE_DATA_STARTNR(nid) + slot_getsize(nid, 0);
numslots = node_getlastslot(nid);
for (slot = 1; slot <= numslots; slot++) {
pslot = NODE_DATA(nid)->node_mem_map +
......@@ -304,7 +298,7 @@ void __init mem_init(void)
* free up the pages that hold the memmap entries.
*/
while (pg < pslot) {
pg++; pgnr++;
pg++;
}
/*
......@@ -312,8 +306,8 @@ void __init mem_init(void)
*/
pslot += slot_getsize(nid, slot);
while (pg < pslot) {
if (!page_is_ram(pgnr))
continue;
/* if (!page_is_ram(pgnr)) continue; */
/* commented out until page_is_ram works */
ClearPageReserved(pg);
atomic_set(&pg->count, 1);
__free_page(pg);
......
......@@ -1733,7 +1733,7 @@ void __init mem_init(void)
* Set up the zero page, mark it reserved, so that page count
* is not manipulated when freeing the page from user ptes.
*/
mem_map_zero = _alloc_pages(GFP_KERNEL, 0);
mem_map_zero = alloc_pages(GFP_KERNEL, 0);
if (mem_map_zero == NULL) {
prom_printf("paging_init: Cannot alloc zero page.\n");
prom_halt();
......
......@@ -36,7 +36,7 @@ static kmem_cache_t *request_cachep;
/*
* plug management
*/
static struct list_head blk_plug_list;
static LIST_HEAD(blk_plug_list);
static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
/* blk_dev_struct is:
......@@ -1875,27 +1875,16 @@ void end_that_request_last(struct request *req)
blk_put_request(req);
}
#define MB(kb) ((kb) << 10)
int __init blk_dev_init(void)
{
struct blk_dev_struct *dev;
int total_ram;
int total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
request_cachep = kmem_cache_create("blkdev_requests",
sizeof(struct request),
0, SLAB_HWCACHE_ALIGN, NULL, NULL);
sizeof(struct request), 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!request_cachep)
panic("Can't create request pool slab cache\n");
for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
dev->queue = NULL;
memset(ro_bits,0,sizeof(ro_bits));
total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
/*
* Free request slots per queue.
* (Half for reads, half for writes)
......@@ -1911,17 +1900,12 @@ int __init blk_dev_init(void)
*/
if ((batch_requests = queue_nr_requests / 4) > 32)
batch_requests = 32;
printk("block: %d slots per queue, batch=%d\n", queue_nr_requests, batch_requests);
printk("block: %d slots per queue, batch=%d\n",
queue_nr_requests, batch_requests);
blk_max_low_pfn = max_low_pfn;
blk_max_pfn = max_pfn;
INIT_LIST_HEAD(&blk_plug_list);
#if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD)
hd_init();
#endif
return 0;
};
......
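Aside on the blk_plug_list change (a sketch, not the patch itself): LIST_HEAD() yields a list that is already valid at compile time, which is why the explicit INIT_LIST_HEAD() call disappears from blk_dev_init():

#include <linux/list.h>

static LIST_HEAD(example_static_list);          /* usable immediately, no init call */

static struct list_head example_runtime_list;   /* old style */

static void example_setup(void)
{
        INIT_LIST_HEAD(&example_runtime_list);  /* required before first use */
}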
......@@ -241,7 +241,7 @@ raw_read(struct file *filp, char *buf, size_t size, loff_t *offp)
static ssize_t
raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp)
{
struct iovec local_iov = { .iov_base = buf, .iov_len = size};
struct iovec local_iov = { .iov_base = (char *)buf, .iov_len = size};
return rw_raw_dev(WRITE, filp, &local_iov, 1, offp);
}
......
......@@ -846,7 +846,7 @@ static void __init hd_geninit(void)
}
}
int __init hd_init(void)
static int __init hd_init(void)
{
if (register_blkdev(MAJOR_NR,"hd",&hd_fops)) {
printk("hd: unable to get major %d for hard disk\n",MAJOR_NR);
......
......@@ -160,7 +160,7 @@ struct sa1100_pcmcia_socket {
*/
socket_state_t cs_state;
pccard_io_map io_map[MAX_IO_WIN];
pccard_mem_map mem_map[MAX_WIN];
pccard_mem_map pc_mem_map[MAX_WIN];
void (*handler)(void *, unsigned int);
void *handler_info;
......
......@@ -686,7 +686,7 @@ sa1100_pcmcia_get_mem_map(unsigned int sock, struct pccard_mem_map *map)
DEBUG(2, "%s() for sock %u\n", __FUNCTION__, sock);
if (map->map < MAX_WIN) {
*map = skt->mem_map[map->map];
*map = skt->pc_mem_map[map->map];
ret = 0;
}
......@@ -754,7 +754,7 @@ sa1100_pcmcia_set_mem_map(unsigned int sock, struct pccard_mem_map *map)
map->sys_stop += start;
map->sys_start = start;
skt->mem_map[map->map] = *map;
skt->pc_mem_map[map->map] = *map;
return 0;
} /* sa1100_pcmcia_set_mem_map() */
......
......@@ -2537,6 +2537,7 @@ struct scatterlist *scsi_alloc_sgtable(Scsi_Cmnd *SCpnt, int gfp_mask)
{
struct scsi_host_sg_pool *sgp;
struct scatterlist *sgl;
int pf_flags;
BUG_ON(!SCpnt->use_sg);
......@@ -2551,9 +2552,10 @@ struct scatterlist *scsi_alloc_sgtable(Scsi_Cmnd *SCpnt, int gfp_mask)
sgp = scsi_sg_pools + SCpnt->sglist_len;
pf_flags = current->flags;
current->flags |= PF_NOWARN;
sgl = mempool_alloc(sgp->pool, gfp_mask);
current->flags &= ~PF_NOWARN;
current->flags = pf_flags;
if (sgl) {
memset(sgl, 0, sgp->size);
return sgl;
......
......@@ -135,6 +135,7 @@ struct bio *bio_alloc(int gfp_mask, int nr_iovecs)
{
struct bio *bio;
struct bio_vec *bvl = NULL;
int pf_flags = current->flags;
current->flags |= PF_NOWARN;
bio = mempool_alloc(bio_pool, gfp_mask);
......@@ -151,7 +152,7 @@ struct bio *bio_alloc(int gfp_mask, int nr_iovecs)
mempool_free(bio, bio_pool);
bio = NULL;
out:
current->flags &= ~PF_NOWARN;
current->flags = pf_flags;
return bio;
}
......
......@@ -937,9 +937,11 @@ create_buffers(struct page * page, unsigned long size, int retry)
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
int pf_flags = current->flags;
current->flags |= PF_NOWARN;
bh = alloc_buffer_head();
current->flags &= ~PF_NOWARN;
current->flags = pf_flags;
if (!bh)
goto no_grow;
......
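The three hunks above (scsi_merge.c, fs/bio.c, fs/buffer.c) all switch to the same flag save/restore idiom; a minimal sketch of it, assuming PF_NOWARN and a mempool as used in the patch:

#include <linux/mempool.h>
#include <linux/sched.h>

static void *quiet_mempool_alloc(mempool_t *pool, int gfp_mask)
{
        int pf_flags = current->flags;  /* remember the caller's flag word */
        void *p;

        current->flags |= PF_NOWARN;    /* suppress allocation-failure warnings */
        p = mempool_alloc(pool, gfp_mask);
        current->flags = pf_flags;      /* restore, rather than blindly clearing PF_NOWARN */
        return p;
}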
......@@ -627,13 +627,13 @@ ext2_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
}
static int
ext2_writepages(struct address_space *mapping, int *nr_to_write)
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
int err;
ret = write_mapping_buffers(mapping);
err = mpage_writepages(mapping, nr_to_write, ext2_get_block);
err = mpage_writepages(mapping, wbc, ext2_get_block);
if (!ret)
ret = err;
return ret;
......
......@@ -1475,13 +1475,13 @@ struct address_space_operations ext3_aops = {
/* For writeback mode, we can use mpage_writepages() */
static int
ext3_writepages(struct address_space *mapping, int *nr_to_write)
ext3_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
int err;
ret = write_mapping_buffers(mapping);
err = mpage_writepages(mapping, nr_to_write, ext3_get_block);
err = mpage_writepages(mapping, wbc, ext3_get_block);
if (!ret)
ret = err;
return ret;
......
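For reference, a minimal sketch of a filesystem ->writepages() hook under the new interface; foo_get_block is a hypothetical block mapper, and ext2/ext3 above additionally flush their mapping buffers first:

#include <linux/fs.h>
#include <linux/mpage.h>
#include <linux/writeback.h>

/* hypothetical block mapper supplied by the filesystem */
extern int foo_get_block(struct inode *inode, sector_t iblock,
                         struct buffer_head *bh_result, int create);

static int foo_writepages(struct address_space *mapping,
                          struct writeback_control *wbc)
{
        /* mpage_writepages() decrements wbc->nr_to_write as pages go out */
        return mpage_writepages(mapping, wbc, foo_get_block);
}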
......@@ -111,8 +111,7 @@ static void write_inode(struct inode *inode, int sync)
/*
* Write a single inode's dirty pages and inode data out to disk.
* If `sync' is set, wait on the writeout.
* If `nr_to_write' is not NULL, subtract the number of written pages
* from *nr_to_write.
* Subtract the number of written pages from nr_to_write.
*
* Normally it is not legal for a single process to lock more than one
* page at a time, due to ab/ba deadlock problems. But writepages()
......@@ -127,7 +126,9 @@ static void write_inode(struct inode *inode, int sync)
*
* Called under inode_lock.
*/
static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
static void
__sync_single_inode(struct inode *inode, int wait,
struct writeback_control *wbc)
{
unsigned dirty;
unsigned long orig_dirtied_when;
......@@ -144,7 +145,7 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
mapping->dirtied_when = 0; /* assume it's whole-file writeback */
spin_unlock(&inode_lock);
do_writepages(mapping, nr_to_write);
do_writepages(mapping, wbc);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
......@@ -181,7 +182,8 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
* Write out an inode's dirty pages. Called under inode_lock.
*/
static void
__writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
__writeback_single_inode(struct inode *inode, int sync,
struct writeback_control *wbc)
{
if (current_is_pdflush() && (inode->i_state & I_LOCK))
return;
......@@ -193,7 +195,7 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
iput(inode);
spin_lock(&inode_lock);
}
__sync_single_inode(inode, sync, nr_to_write);
__sync_single_inode(inode, sync, wbc);
}
/*
......@@ -226,8 +228,7 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
* throttled threads: we don't want them all piling up on __wait_on_inode.
*/
static void
sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
int sync_mode, int *nr_to_write, unsigned long *older_than_this)
sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
{
struct list_head *tmp;
struct list_head *head;
......@@ -241,7 +242,7 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
struct backing_dev_info *bdi;
int really_sync;
if (single_bdi && mapping->backing_dev_info != single_bdi) {
if (wbc->bdi && mapping->backing_dev_info != wbc->bdi) {
if (sb != blockdev_superblock)
break; /* inappropriate superblock */
list_move(&inode->i_list, &sb->s_dirty);
......@@ -252,23 +253,20 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
if (time_after(mapping->dirtied_when, start))
break;
if (older_than_this &&
time_after(mapping->dirtied_when, *older_than_this))
if (wbc->older_than_this && time_after(mapping->dirtied_when,
*wbc->older_than_this))
goto out;
bdi = mapping->backing_dev_info;
if (current_is_pdflush() && !writeback_acquire(bdi))
break;
really_sync = (sync_mode == WB_SYNC_ALL);
if ((sync_mode == WB_SYNC_LAST) && (head->prev == head))
really_sync = 1;
really_sync = (wbc->sync_mode == WB_SYNC_ALL);
BUG_ON(inode->i_state & I_FREEING);
__iget(inode);
list_move(&inode->i_list, &sb->s_dirty);
__writeback_single_inode(inode, really_sync, nr_to_write);
if (sync_mode == WB_SYNC_HOLD) {
__writeback_single_inode(inode, really_sync, wbc);
if (wbc->sync_mode == WB_SYNC_HOLD) {
mapping->dirtied_when = jiffies;
list_move(&inode->i_list, &sb->s_dirty);
}
......@@ -277,7 +275,7 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
spin_unlock(&inode_lock);
iput(inode);
spin_lock(&inode_lock);
if (nr_to_write && *nr_to_write <= 0)
if (wbc->nr_to_write <= 0)
break;
}
out:
......@@ -288,16 +286,26 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
}
/*
* Start writeback of dirty pagecache data against all unlocked inodes.
*
* Note:
* We don't need to grab a reference to superblock here. If it has non-empty
* ->s_dirty it hasn't been killed yet and kill_super() won't proceed
* past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are
* empty. Since __sync_single_inode() regains inode_lock before it finally moves
* inode from superblock lists we are OK.
*
* If `older_than_this' is non-zero then only flush inodes which have a
* flushtime older than *older_than_this.
*
* If `bdi' is non-zero then we will scan the first inode against each
* superblock until we find the matching ones. One group will be the dirty
* inodes against a filesystem. Then when we hit the dummy blockdev superblock,
* sync_sb_inodes will seek out the blockdev which matches `bdi'. Maybe not
* super-efficient but we're about to do a ton of I/O...
*/
static void
__writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this)
void
writeback_inodes(struct writeback_control *wbc)
{
struct super_block *sb;
......@@ -307,54 +315,16 @@ __writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write,
for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) {
spin_unlock(&sb_lock);
sync_sb_inodes(bdi, sb, sync_mode, nr_to_write,
older_than_this);
sync_sb_inodes(sb, wbc);
spin_lock(&sb_lock);
}
if (nr_to_write && *nr_to_write <= 0)
if (wbc->nr_to_write <= 0)
break;
}
spin_unlock(&sb_lock);
spin_unlock(&inode_lock);
}
/*
* Start writeback of dirty pagecache data against all unlocked inodes.
*
* Note:
* We don't need to grab a reference to superblock here. If it has non-empty
* ->s_dirty it hasn't been killed yet and kill_super() won't proceed
* past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are
* empty. Since __sync_single_inode() regains inode_lock before it finally moves
* inode from superblock lists we are OK.
*
* If `older_than_this' is non-zero then only flush inodes which have a
* flushtime older than *older_than_this.
*
* This is a "memory cleansing" operation, not a "data integrity" operation.
*/
void writeback_unlocked_inodes(int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this)
{
__writeback_unlocked_inodes(NULL, nr_to_write,
sync_mode, older_than_this);
}
/*
* Perform writeback of dirty data against a particular queue.
*
* This is for writer throttling. We don't want processes to write back
* other processes' data, especially when the other data belongs to a
* different spindle.
*/
void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this)
{
__writeback_unlocked_inodes(bdi, nr_to_write,
sync_mode, older_than_this);
}
/*
* writeback and wait upon the filesystem's dirty inodes. The caller will
* do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is
......@@ -366,14 +336,17 @@ void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write,
void sync_inodes_sb(struct super_block *sb, int wait)
{
struct page_state ps;
int nr_to_write;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
.older_than_this = NULL,
.nr_to_write = 0,
};
get_page_state(&ps);
nr_to_write = ps.nr_dirty + ps.nr_dirty / 4;
wbc.nr_to_write = ps.nr_dirty + ps.nr_dirty / 4;
spin_lock(&inode_lock);
sync_sb_inodes(NULL, sb, wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
&nr_to_write, NULL);
sync_sb_inodes(sb, &wbc);
spin_unlock(&inode_lock);
}
......@@ -466,8 +439,12 @@ void sync_inodes(int wait)
void write_inode_now(struct inode *inode, int sync)
{
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
};
spin_lock(&inode_lock);
__writeback_single_inode(inode, sync, NULL);
__writeback_single_inode(inode, sync, &wbc);
spin_unlock(&inode_lock);
if (sync)
wait_on_inode(inode);
......
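To summarise the API change in this file (a sketch under the same assumptions as the patch, not kernel code as such): the old writeback_unlocked_inodes()/writeback_backing_dev() pair collapses into a single writeback_inodes() call, with the policy carried in the control structure:

#include <linux/writeback.h>

static void flush_some_pages(struct backing_dev_info *bdi, long nr)
{
        struct writeback_control wbc = {
                .bdi             = bdi,          /* NULL means "any queue" */
                .sync_mode       = WB_SYNC_NONE, /* memory cleansing, don't wait */
                .older_than_this = NULL,
                .nr_to_write     = nr,           /* decremented as pages are written */
        };

        writeback_inodes(&wbc);
}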
......@@ -282,9 +282,10 @@ static int jfs_writepage(struct page *page)
return block_write_full_page(page, jfs_get_block);
}
static int jfs_writepages(struct address_space *mapping, int *nr_to_write)
static int jfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
return mpage_writepages(mapping, nr_to_write, jfs_get_block);
return mpage_writepages(mapping, wbc, jfs_get_block);
}
static int jfs_readpage(struct file *file, struct page *page)
......
......@@ -484,7 +484,7 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
* address space and writepage() all of them.
*
* @mapping: address space structure to write
* @nr_to_write: subtract the number of written pages from *@nr_to_write
* @wbc: subtract the number of written pages from *@wbc->nr_to_write
* @get_block: the filesystem's block mapper function.
* If this is NULL then use a_ops->writepage. Otherwise, go
* direct-to-BIO.
......@@ -520,7 +520,7 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
*/
int
mpage_writepages(struct address_space *mapping,
int *nr_to_write, get_block_t get_block)
struct writeback_control *wbc, get_block_t get_block)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
......@@ -583,7 +583,7 @@ mpage_writepages(struct address_space *mapping,
__set_page_dirty_nobuffers(page);
ret = 0;
}
if (ret || (nr_to_write && --(*nr_to_write) <= 0))
if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
} else {
unlock_page(page);
......
......@@ -394,131 +394,40 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
return res;
}
static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, int * pages, int * shared, int * dirty, int * total)
int proc_pid_statm(task_t *task, char *buffer)
{
unsigned long end, pmd_end;
pte_t *pte;
if (pmd_none(*pmd))
return;
if (pmd_bad(*pmd)) {
pmd_ERROR(*pmd);
pmd_clear(pmd);
return;
}
preempt_disable();
pte = pte_offset_map(pmd, address);
end = address + size;
pmd_end = (address + PMD_SIZE) & PMD_MASK;
if (end > pmd_end)
end = pmd_end;
do {
pte_t page = *pte;
struct page *ptpage;
unsigned long pfn;
int size, resident, shared, text, lib, data, dirty;
struct mm_struct *mm = get_task_mm(task);
struct vm_area_struct * vma;
address += PAGE_SIZE;
pte++;
if (pte_none(page))
continue;
++*total;
if (!pte_present(page))
continue;
pfn = pte_pfn(page);
if (!pfn_valid(pfn))
continue;
ptpage = pfn_to_page(pfn);
if (PageReserved(ptpage))
continue;
++*pages;
if (pte_dirty(page))
++*dirty;
if (page_count(pte_page(page)) > 1)
++*shared;
} while (address < end);
pte_unmap(pte - 1);
preempt_enable();
}
size = resident = shared = text = lib = data = dirty = 0;
static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size,
int * pages, int * shared, int * dirty, int * total)
{
pmd_t * pmd;
unsigned long end;
if (pgd_none(*pgd))
return;
if (pgd_bad(*pgd)) {
pgd_ERROR(*pgd);
pgd_clear(pgd);
return;
}
pmd = pmd_offset(pgd, address);
address &= ~PGDIR_MASK;
end = address + size;
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
do {
statm_pte_range(pmd, address, end - address, pages, shared, dirty, total);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
}
static void statm_pgd_range(pgd_t * pgd, unsigned long address, unsigned long end,
int * pages, int * shared, int * dirty, int * total)
{
while (address < end) {
statm_pmd_range(pgd, address, end - address, pages, shared, dirty, total);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
pgd++;
}
}
if (!mm)
goto out;
int proc_pid_statm(struct task_struct *task, char * buffer)
{
int size=0, resident=0, share=0, trs=0, lrs=0, drs=0, dt=0;
struct mm_struct *mm = get_task_mm(task);
down_read(&mm->mmap_sem);
resident = mm->rss;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
if (mm) {
struct vm_area_struct * vma;
down_read(&mm->mmap_sem);
vma = mm->mmap;
while (vma) {
pgd_t *pgd = pgd_offset(mm, vma->vm_start);
int pages = 0, shared = 0, dirty = 0, total = 0;
if (is_vm_hugetlb_page(vma)) {
int num_pages = ((vma->vm_end - vma->vm_start)/PAGE_SIZE);
resident += num_pages;
if (!(vma->vm_flags & VM_DONTCOPY))
share += num_pages;
if (vma->vm_flags & VM_WRITE)
dt += num_pages;
drs += num_pages;
vma = vma->vm_next;
continue;
}
statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
resident += pages;
share += shared;
dt += dirty;
size += total;
if (vma->vm_flags & VM_EXECUTABLE)
trs += pages; /* text */
else if (vma->vm_flags & VM_GROWSDOWN)
drs += pages; /* stack */
else if (vma->vm_end > 0x60000000)
lrs += pages; /* library */
else
drs += pages;
vma = vma->vm_next;
size += pages;
if (is_vm_hugetlb_page(vma)) {
if (!(vma->vm_flags & VM_DONTCOPY))
shared += pages;
continue;
}
up_read(&mm->mmap_sem);
mmput(mm);
if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared))
shared += pages;
if (vma->vm_flags & VM_EXECUTABLE)
text += pages;
else
data += pages;
}
up_read(&mm->mmap_sem);
mmput(mm);
out:
return sprintf(buffer,"%d %d %d %d %d %d %d\n",
size, resident, share, trs, lrs, drs, dt);
size, resident, shared, text, lib, data, dirty);
}
/*
......
......@@ -36,18 +36,14 @@ extern plat_pg_data_t *plat_node_data[];
#ifdef CONFIG_ALPHA_WILDFIRE
# define ALPHA_PA_TO_NID(pa) ((pa) >> 36) /* 16 nodes max due 43bit kseg */
#define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */
#define MAX_NUMNODES WILDFIRE_MAX_QBB
# define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */
#else
# define ALPHA_PA_TO_NID(pa) (0)
#define NODE_MAX_MEM_SIZE (~0UL)
#define MAX_NUMNODES 1
# define NODE_MAX_MEM_SIZE (~0UL)
#endif
#define PHYSADDR_TO_NID(pa) ALPHA_PA_TO_NID(pa)
#define PLAT_NODE_DATA(n) (plat_node_data[(n)])
#define PLAT_NODE_DATA_STARTNR(n) \
(PLAT_NODE_DATA(n)->gendata.node_start_mapnr)
#define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size)
#if 1
......
#ifndef _ASM_MAX_NUMNODES_H
#define _ASM_MAX_NUMNODES_H
/*
* Currently the Wildfire is the only discontigmem/NUMA capable Alpha core.
*/
#if defined(CONFIG_ALPHA_WILDFIRE) || defined(CONFIG_ALPHA_GENERIC)
# include <asm/core_wildfire.h>
# define MAX_NUMNODES WILDFIRE_MAX_QBB
#endif
#endif /* _ASM_MAX_NUMNODES_H */
......@@ -6,12 +6,13 @@
#ifndef _ASM_MMZONE_H_
#define _ASM_MMZONE_H_
#include <asm/smp.h>
#ifdef CONFIG_DISCONTIGMEM
#ifdef CONFIG_X86_NUMAQ
#include <asm/numaq.h>
#else
#define pa_to_nid(pa) (0)
#define pfn_to_nid(pfn) (0)
#ifdef CONFIG_NUMA
#define _cpu_to_node(cpu) 0
......@@ -44,7 +45,6 @@ extern struct pglist_data *node_data[];
#define alloc_bootmem_low_pages_node(ignore, x) \
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
#define node_startnr(nid) (node_data[nid]->node_start_mapnr)
#define node_size(nid) (node_data[nid]->node_size)
#define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn)
......@@ -55,7 +55,7 @@ extern struct pglist_data *node_data[];
/*
* Given a kernel address, find the home node of the underlying memory.
*/
#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr))
#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
/*
* Return a pointer to the node data for node n.
......@@ -64,6 +64,8 @@ extern struct pglist_data *node_data[];
#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map)
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
NODE_DATA(nid)->node_size)
#define local_mapnr(kvaddr) \
( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) )
......@@ -74,5 +76,13 @@ extern struct pglist_data *node_data[];
#define pfn_to_page(pfn) (node_mem_map(pfn_to_nid(pfn)) + node_localnr(pfn, pfn_to_nid(pfn)))
#define page_to_pfn(page) ((page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn)
#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
/*
* pfn_valid should be made as fast as possible, and the current definition
* is valid for machines that are NUMA, but still contiguous, which is what
* is currently supported. A more generalised, but slower definition would
* be something like this - mbligh:
* ( pfn_to_pgdat(pfn) && (pfn < node_end_pfn(pfn_to_nid(pfn))) )
*/
#define pfn_valid(pfn) (pfn < num_physpages)
#endif /* CONFIG_DISCONTIGMEM */
#endif /* _ASM_MMZONE_H_ */
......@@ -32,17 +32,18 @@
/*
* for now assume that 64Gb is max amount of RAM for whole system
* 64Gb * 1024Mb/Gb = 65536 Mb
* 65536 Mb / 256Mb = 256
* 64Gb / 4096bytes/page = 16777216 pages
*/
#define MAX_NR_PAGES 16777216
#define MAX_ELEMENTS 256
#define ELEMENT_REPRESENTS 8 /* 256 Mb */
#define PAGES_PER_ELEMENT (16777216/256)
#define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn))
#define PHYSADDR_TO_NID(pa) pfn_to_nid(pa >> PAGE_SHIFT)
#define MAX_NUMNODES 8
#ifdef CONFIG_NUMA
#define _cpu_to_node(cpu) (cpu_to_logical_apicid(cpu) >> 4)
#endif /* CONFIG_NUMA */
extern int pa_to_nid(u64);
extern int pfn_to_nid(unsigned long);
extern void get_memcfg_numaq(void);
#define get_memcfg_numa() get_memcfg_numaq()
......
......@@ -145,10 +145,10 @@ static __inline__ int get_order(unsigned long size)
#ifndef CONFIG_DISCONTIGMEM
#define pfn_to_page(pfn) (mem_map + (pfn))
#define page_to_pfn(page) ((unsigned long)((page) - mem_map))
#define pfn_valid(pfn) ((pfn) < max_mapnr)
#endif /* !CONFIG_DISCONTIGMEM */
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#define pfn_valid(pfn) ((pfn) < max_mapnr)
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
......
......@@ -24,7 +24,6 @@ extern plat_pg_data_t *plat_node_data[];
#define PHYSADDR_TO_NID(pa) NASID_TO_COMPACT_NODEID(NASID_GET(pa))
#define PLAT_NODE_DATA(n) (plat_node_data[n])
#define PLAT_NODE_DATA_STARTNR(n) (PLAT_NODE_DATA(n)->gendata.node_start_mapnr)
#define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size)
#define PLAT_NODE_DATA_LOCALNR(p, n) \
(((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn)
......
......@@ -373,10 +373,10 @@ extern inline void pgd_clear(pgd_t *pgdp)
#ifndef CONFIG_DISCONTIGMEM
#define pte_page(x) (mem_map+(unsigned long)((pte_val(x) >> PAGE_SHIFT)))
#else
#define mips64_pte_pagenr(x) \
(PLAT_NODE_DATA_STARTNR(PHYSADDR_TO_NID(pte_val(x))) + \
PLAT_NODE_DATA_LOCALNR(pte_val(x), PHYSADDR_TO_NID(pte_val(x))))
#define pte_page(x) (mem_map+mips64_pte_pagenr(x))
#define pte_page(x) ( NODE_MEM_MAP(PHYSADDR_TO_NID(pte_val(x))) +
PLAT_NODE_DATA_LOCALNR(pte_val(x), PHYSADDR_TO_NID(pte_val(x))) )
#endif
/*
......
......@@ -279,6 +279,7 @@ struct iattr {
*/
struct page;
struct address_space;
struct writeback_control;
struct address_space_operations {
int (*writepage)(struct page *);
......@@ -286,10 +287,10 @@ struct address_space_operations {
int (*sync_page)(struct page *);
/* Write back some dirty pages from this mapping. */
int (*writepages)(struct address_space *, int *nr_to_write);
int (*writepages)(struct address_space *, struct writeback_control *);
/* Perform a writeback as a memory-freeing operation. */
int (*vm_writeback)(struct page *, int *nr_to_write);
int (*vm_writeback)(struct page *, struct writeback_control *);
/* Set a page dirty */
int (*set_page_dirty)(struct page *page);
......@@ -1259,7 +1260,8 @@ extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int generic_vm_writeback(struct page *page, int *nr_to_write);
extern int generic_vm_writeback(struct page *page,
struct writeback_control *wbc);
extern struct file_operations generic_ro_fops;
......
......@@ -39,18 +39,25 @@
* can allocate highmem pages, the *get*page*() variants return
* virtual kernel addresses to the allocated page(s).
*/
extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order));
extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, struct zonelist *zonelist));
extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order);
/*
* We get the zone list from the current node and the gfp_mask.
* This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
*
* For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
* optimized to &contig_page_data at compile-time.
*/
static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order)
{
/*
* Gets optimized away by the compiler.
*/
if (order >= MAX_ORDER)
pg_data_t *pgdat = NODE_DATA(numa_node_id());
unsigned int idx = (gfp_mask & GFP_ZONEMASK);
if (unlikely(order >= MAX_ORDER))
return NULL;
return _alloc_pages(gfp_mask, order);
return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + idx);
}
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
......
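A trivial usage sketch of the inlined allocator (illustrative only): the zonelist selection described in the comment above happens entirely inside alloc_pages():

#include <linux/mm.h>

static struct page *grab_one_page(void)
{
        /* order-0 allocation from the current node's GFP_KERNEL zonelist */
        struct page *page = alloc_pages(GFP_KERNEL, 0);

        return page;    /* caller releases it with __free_pages(page, 0) */
}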
......@@ -15,7 +15,10 @@
#include <linux/rbtree.h>
#include <linux/fs.h>
#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;
#endif
extern unsigned long num_physpages;
extern void * high_memory;
extern int page_cluster;
......@@ -345,8 +348,10 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_MINOR 1
#define VM_FAULT_MAJOR 2
/* The array of struct pages */
#ifndef CONFIG_DISCONTIGMEM
/* The array of struct pages - for discontigmem use pgdat->lmem_map */
extern struct page *mem_map;
#endif
extern void show_free_areas(void);
......
......@@ -10,11 +10,14 @@
#include <linux/wait.h>
#include <linux/cache.h>
#include <asm/atomic.h>
#ifdef CONFIG_DISCONTIGMEM
#include <asm/numnodes.h>
#endif
#ifndef MAX_NUMNODES
#define MAX_NUMNODES 1
#endif
/*
* Free memory management - zoned buddy allocator.
*/
/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
......@@ -112,7 +115,6 @@ struct zone {
struct page *zone_mem_map;
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
unsigned long zone_start_mapnr;
/*
* rarely used fields:
......@@ -138,7 +140,7 @@ struct zone {
* footprint of this construct is very small.
*/
struct zonelist {
struct zone *zones[MAX_NR_ZONES+1]; // NULL delimited
struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
};
#define GFP_ZONEMASK 0x0f
......@@ -163,7 +165,6 @@ typedef struct pglist_data {
unsigned long *valid_addr_bitmap;
struct bootmem_data *bdata;
unsigned long node_start_pfn;
unsigned long node_start_mapnr;
unsigned long node_size;
int node_id;
struct pglist_data *pgdat_next;
......@@ -187,10 +188,12 @@ memclass(struct zone *pgzone, struct zone *classzone)
* prototypes for the discontig memory code.
*/
struct page;
void free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
unsigned long *zones_size, unsigned long paddr, unsigned long *zholes_size,
struct page *pmap);
extern void calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size,
unsigned long *zholes_size);
extern void free_area_init_core(pg_data_t *pgdat, unsigned long *zones_size,
unsigned long *zholes_size);
void get_zone_counts(unsigned long *active, unsigned long *inactive);
extern void build_all_zonelists(void);
extern pg_data_t contig_page_data;
......
......@@ -10,14 +10,16 @@
* nested includes. Get it right in the .c file).
*/
struct writeback_control;
int mpage_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages, get_block_t get_block);
int mpage_readpage(struct page *page, get_block_t get_block);
int mpage_writepages(struct address_space *mapping,
int *nr_to_write, get_block_t get_block);
struct writeback_control *wbc, get_block_t get_block);
static inline int
generic_writepages(struct address_space *mapping, int *nr_to_write)
generic_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
return mpage_writepages(mapping, nr_to_write, NULL);
return mpage_writepages(mapping, wbc, NULL);
}
......@@ -128,7 +128,6 @@ enum
KERN_TAINTED=53, /* int: various kernel tainted flags */
KERN_CADPID=54, /* int: PID of the process to notify on CAD */
KERN_PIDMAX=55, /* int: PID # limit */
KERN_HUGETLB_PAGE_NUM=56, /* int: Number of available Huge Pages */
};
......@@ -147,12 +146,12 @@ enum
VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */
VM_DIRTY_BACKGROUND=11, /* dirty_background_ratio */
VM_DIRTY_ASYNC=12, /* dirty_async_ratio */
VM_DIRTY_SYNC=13, /* dirty_sync_ratio */
VM_DIRTY_WB_CS=14, /* dirty_writeback_centisecs */
VM_DIRTY_EXPIRE_CS=15, /* dirty_expire_centisecs */
VM_NR_PDFLUSH_THREADS=16, /* nr_pdflush_threads */
VM_OVERCOMMIT_RATIO=17, /* percent of RAM to allow overcommit in */
VM_PAGEBUF=18 /* struct: Control pagebuf parameters */
VM_DIRTY_WB_CS=13, /* dirty_writeback_centisecs */
VM_DIRTY_EXPIRE_CS=14, /* dirty_expire_centisecs */
VM_NR_PDFLUSH_THREADS=15, /* nr_pdflush_threads */
VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */
VM_PAGEBUF=17, /* struct: Control pagebuf parameters */
VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */
};
......
......@@ -35,7 +35,11 @@ struct iovec
#endif
/*
* Total number of bytes covered by an iovec
* Total number of bytes covered by an iovec.
*
* NOTE that it is not safe to use this function until all the iovec's
* segment lengths have been validated, because the individual lengths can
* overflow a size_t when added together.
*/
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
......
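A sketch of the overflow-aware validation the note above asks for, mirroring the checks this patch adds to mm/filemap.c (names here are illustrative):

#include <linux/errno.h>
#include <linux/uio.h>

static ssize_t iov_total_checked(const struct iovec *iov, unsigned long nr_segs)
{
        size_t total = 0;
        unsigned long seg;

        for (seg = 0; seg < nr_segs; seg++) {
                total += iov[seg].iov_len;
                /* fail if a single length or the running sum wraps negative */
                if ((ssize_t)(total | iov[seg].iov_len) < 0)
                        return -EINVAL;
        }
        return total;
}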
......@@ -27,22 +27,29 @@ static inline int current_is_pdflush(void)
* fs/fs-writeback.c
*/
enum writeback_sync_modes {
WB_SYNC_NONE = 0, /* Don't wait on anything */
WB_SYNC_LAST = 1, /* Wait on the last-written mapping */
WB_SYNC_ALL = 2, /* Wait on every mapping */
WB_SYNC_HOLD = 3, /* Hold the inode on sb_dirty for sys_sync() */
WB_SYNC_NONE, /* Don't wait on anything */
WB_SYNC_ALL, /* Wait on every mapping */
WB_SYNC_HOLD, /* Hold the inode on sb_dirty for sys_sync() */
};
void writeback_unlocked_inodes(int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this);
/*
* A control structure which tells the writeback code what to do
*/
struct writeback_control {
struct backing_dev_info *bdi; /* If !NULL, only write back this
queue */
enum writeback_sync_modes sync_mode;
unsigned long *older_than_this; /* If !NULL, only write back inodes
older than this */
long nr_to_write; /* Write this many pages, and decrement
this for each page written */
};
void writeback_inodes(struct writeback_control *wbc);
void wake_up_inode(struct inode *inode);
void __wait_on_inode(struct inode * inode);
void sync_inodes_sb(struct super_block *, int wait);
void sync_inodes(int wait);
void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write,
enum writeback_sync_modes sync_mode,
unsigned long *older_than_this);
/* writeback.h requires fs.h; it, too, is not included from here. */
static inline void wait_on_inode(struct inode *inode)
......@@ -57,7 +64,6 @@ static inline void wait_on_inode(struct inode *inode)
/* These 5 are exported to sysctl. */
extern int dirty_background_ratio;
extern int dirty_async_ratio;
extern int dirty_sync_ratio;
extern int dirty_writeback_centisecs;
extern int dirty_expire_centisecs;
......@@ -65,7 +71,7 @@ extern int dirty_expire_centisecs;
void balance_dirty_pages(struct address_space *mapping);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
int do_writepages(struct address_space *mapping, int *nr_to_write);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
/* pdflush.c */
extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
......
......@@ -393,6 +393,7 @@ asmlinkage void __init start_kernel(void)
printk(linux_banner);
setup_arch(&command_line);
setup_per_cpu_areas();
build_all_zonelists();
printk("Kernel command line: %s\n", saved_command_line);
parse_options(command_line);
trap_init();
......
......@@ -91,7 +91,6 @@ EXPORT_SYMBOL(do_brk);
EXPORT_SYMBOL(exit_mm);
/* internal kernel memory management */
EXPORT_SYMBOL(_alloc_pages);
EXPORT_SYMBOL(__alloc_pages);
EXPORT_SYMBOL(alloc_pages_node);
EXPORT_SYMBOL(__get_free_pages);
......@@ -116,9 +115,12 @@ EXPORT_SYMBOL(vmalloc_32);
EXPORT_SYMBOL(vmap);
EXPORT_SYMBOL(vunmap);
EXPORT_SYMBOL(vmalloc_to_page);
EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(remap_page_range);
#ifndef CONFIG_DISCONTIGMEM
EXPORT_SYMBOL(contig_page_data);
EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(max_mapnr);
#endif
EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(vmtruncate);
EXPORT_SYMBOL(find_vma);
......
......@@ -525,11 +525,11 @@ void release_console_sem(void)
{
unsigned long flags;
unsigned long _con_start, _log_end;
unsigned long must_wake_klogd = 0;
unsigned long wake_klogd = 0;
for ( ; ; ) {
spin_lock_irqsave(&logbuf_lock, flags);
must_wake_klogd |= log_start - log_end;
wake_klogd |= log_start - log_end;
if (con_start == log_end)
break; /* Nothing to print */
_con_start = con_start;
......@@ -541,7 +541,7 @@ void release_console_sem(void)
console_may_schedule = 0;
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
if (must_wake_klogd && !oops_in_progress)
if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
wake_up_interruptible(&log_wait);
}
......
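The printk change above adds a cheap lockless check before waking klogd; the general idiom, as a sketch:

#include <linux/wait.h>

static void wake_readers_if_any(wait_queue_head_t *wq)
{
        /* waitqueue_active() avoids the cost of a wakeup when nobody is sleeping */
        if (waitqueue_active(wq))
                wake_up_interruptible(wq);
}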
......@@ -471,10 +471,12 @@ static int count_and_copy_data_pages(struct pbe *pagedir_p)
int nr_copy_pages = 0;
int pfn;
struct page *page;
#ifndef CONFIG_DISCONTIGMEM
if (max_mapnr != num_physpages)
panic("mapnr is not expected");
for (pfn = 0; pfn < max_mapnr; pfn++) {
#endif
for (pfn = 0; pfn < num_physpages; pfn++) {
page = pfn_to_page(pfn);
if (PageHighMem(page))
panic("Swsusp not supported on highmem boxes. Send 1GB of RAM to <pavel@ucw.cz> and try again ;-).");
......@@ -514,19 +516,20 @@ static int count_and_copy_data_pages(struct pbe *pagedir_p)
static void free_suspend_pagedir(unsigned long this_pagedir)
{
struct page *page = mem_map;
int i;
struct page *page;
int pfn;
unsigned long this_pagedir_end = this_pagedir +
(PAGE_SIZE << pagedir_order);
for(i=0; i < num_physpages; i++, page++) {
for(pfn = 0; pfn < num_physpages; pfn++) {
page = pfn_to_page(pfn);
if (!TestClearPageNosave(page))
continue;
if (ADDRESS(i) >= this_pagedir && ADDRESS(i) < this_pagedir_end)
if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end)
continue; /* old pagedir gets freed in one */
free_page(ADDRESS(i));
free_page(ADDRESS(pfn));
}
free_pages(this_pagedir, pagedir_order);
}
......
......@@ -99,8 +99,8 @@ extern int acct_parm[];
#endif
#ifdef CONFIG_HUGETLB_PAGE
extern int htlbpage_max;
extern int set_hugetlb_mem_size(int);
extern int htlbpage_max;
extern int set_hugetlb_mem_size(int);
#endif
static int parse_table(int *, int, void *, size_t *, void *, size_t,
......@@ -263,10 +263,6 @@ static ctl_table kern_table[] = {
#endif
{KERN_PIDMAX, "pid_max", &pid_max, sizeof (int),
0600, NULL, &proc_dointvec},
#ifdef CONFIG_HUGETLB_PAGE
{KERN_HUGETLB_PAGE_NUM, "numhugepages", &htlbpage_max, sizeof(int), 0644, NULL,
&proc_dointvec},
#endif
{0}
};
......@@ -292,9 +288,6 @@ static ctl_table vm_table[] = {
{VM_DIRTY_ASYNC, "dirty_async_ratio", &dirty_async_ratio,
sizeof(dirty_async_ratio), 0644, NULL, &proc_dointvec_minmax,
&sysctl_intvec, NULL, &zero, &one_hundred },
{VM_DIRTY_SYNC, "dirty_sync_ratio", &dirty_sync_ratio,
sizeof(dirty_sync_ratio), 0644, NULL, &proc_dointvec_minmax,
&sysctl_intvec, NULL, &zero, &one_hundred },
{VM_DIRTY_WB_CS, "dirty_writeback_centisecs",
&dirty_writeback_centisecs, sizeof(dirty_writeback_centisecs), 0644,
NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL,
......@@ -317,6 +310,10 @@ static ctl_table vm_table[] = {
{ VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads",
&nr_pdflush_threads, sizeof nr_pdflush_threads,
0444 /* read-only*/, NULL, &proc_dointvec},
#ifdef CONFIG_HUGETLB_PAGE
{VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, NULL,
&proc_dointvec},
#endif
{0}
};
......
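From userspace, the hugetlb tunable moves from /proc/sys/kernel/numhugepages to /proc/sys/vm/nr_hugepages; a small illustrative reader (not part of the patch):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/vm/nr_hugepages", "r");
        int pages;

        if (f && fscanf(f, "%d", &pages) == 1)
                printf("nr_hugepages = %d\n", pages);
        if (f)
                fclose(f);
        return 0;
}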
......@@ -487,9 +487,13 @@ EXPORT_SYMBOL(fail_writepage);
int filemap_fdatawrite(struct address_space *mapping)
{
int ret;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = mapping->nrpages * 2,
};
current->flags |= PF_SYNC;
ret = do_writepages(mapping, NULL);
ret = do_writepages(mapping, &wbc);
current->flags &= ~PF_SYNC;
return ret;
}
......@@ -1130,10 +1134,26 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
struct file *filp = iocb->ki_filp;
ssize_t retval;
unsigned long seg;
size_t count = iov_length(iov, nr_segs);
size_t count;
if ((ssize_t) count < 0)
return -EINVAL;
count = 0;
for (seg = 0; seg < nr_segs; seg++) {
const struct iovec *iv = &iov[seg];
/*
* If any segment has a negative length, or the cumulative
* length ever wraps negative then return -EINVAL.
*/
count += iv->iov_len;
if (unlikely((ssize_t)(count|iv->iov_len) < 0))
return -EINVAL;
if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
continue;
if (seg == 0)
return -EFAULT;
nr_segs = seg;
break;
}
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) {
......@@ -1162,11 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
goto out;
}
for (seg = 0; seg < nr_segs; seg++) {
if (!access_ok(VERIFY_WRITE,iov[seg].iov_base,iov[seg].iov_len))
return -EFAULT;
}
retval = 0;
if (count) {
for (seg = 0; seg < nr_segs; seg++) {
......@@ -1626,6 +1641,63 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
return left;
}
static inline int
__filemap_copy_from_user_iovec(char *vaddr,
const struct iovec *iov, size_t base, unsigned bytes)
{
int left = 0;
while (bytes) {
char *buf = iov->iov_base + base;
int copy = min(bytes, iov->iov_len - base);
base = 0;
if ((left = __copy_from_user(vaddr, buf, copy)))
break;
bytes -= copy;
vaddr += copy;
iov++;
}
return left;
}
static inline int
filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
const struct iovec *iov, size_t base, unsigned bytes)
{
char *kaddr;
int left;
kaddr = kmap_atomic(page, KM_USER0);
left = __filemap_copy_from_user_iovec(kaddr + offset, iov, base, bytes);
kunmap_atomic(kaddr, KM_USER0);
if (left != 0) {
kaddr = kmap(page);
left = __filemap_copy_from_user_iovec(kaddr + offset, iov, base, bytes);
kunmap(page);
}
return left;
}
static inline void
filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, unsigned bytes)
{
const struct iovec *iov = *iovp;
size_t base = *basep;
while (bytes) {
int copy = min(bytes, iov->iov_len - base);
bytes -= copy;
base += copy;
if (iov->iov_len == base) {
iov++;
base = 0;
}
}
*iovp = iov;
*basep = base;
}
/*
* Write to a file through the page cache.
*
......@@ -1641,8 +1713,8 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
{
struct address_space * mapping = file->f_dentry->d_inode->i_mapping;
struct address_space_operations *a_ops = mapping->a_ops;
const size_t ocount = iov_length(iov, nr_segs);
size_t count = ocount;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
long status = 0;
......@@ -1654,19 +1726,30 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
unsigned bytes;
time_t time_now;
struct pagevec lru_pvec;
struct iovec *cur_iov;
unsigned iov_bytes; /* Cumulative count to the end of the
current iovec */
const struct iovec *cur_iov = iov; /* current iovec */
unsigned iov_base = 0; /* offset in the current iovec */
unsigned long seg;
char *buf;
if (unlikely((ssize_t)count < 0))
return -EINVAL;
ocount = 0;
for (seg = 0; seg < nr_segs; seg++) {
if (!access_ok(VERIFY_READ,iov[seg].iov_base,iov[seg].iov_len))
const struct iovec *iv = &iov[seg];
/*
* If any segment has a negative length, or the cumulative
* length ever wraps negative then return -EINVAL.
*/
ocount += iv->iov_len;
if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
return -EINVAL;
if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
continue;
if (seg == 0)
return -EFAULT;
nr_segs = seg;
break;
}
count = ocount;
pos = *ppos;
if (unlikely(pos < 0))
......@@ -1788,9 +1871,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
goto out_status;
}
cur_iov = (struct iovec *)iov;
iov_bytes = cur_iov->iov_len;
buf = cur_iov->iov_base;
buf = iov->iov_base;
do {
unsigned long index;
unsigned long offset;
......@@ -1801,8 +1882,6 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
bytes = PAGE_CACHE_SIZE - offset;
if (bytes > count)
bytes = count;
if (bytes + written > iov_bytes)
bytes = iov_bytes - written;
/*
* Bring in the user page that we will copy from _first_.
......@@ -1830,7 +1909,12 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
vmtruncate(inode, inode->i_size);
break;
}
page_fault = filemap_copy_from_user(page, offset, buf, bytes);
if (likely(nr_segs == 1))
page_fault = filemap_copy_from_user(page, offset,
buf, bytes);
else
page_fault = filemap_copy_from_user_iovec(page, offset,
cur_iov, iov_base, bytes);
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (unlikely(page_fault)) {
......@@ -1844,11 +1928,9 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
count -= status;
pos += status;
buf += status;
if (written == iov_bytes && count) {
cur_iov++;
iov_bytes += cur_iov->iov_len;
buf = cur_iov->iov_base;
}
if (unlikely(nr_segs > 1))
filemap_set_next_iovec(&cur_iov,
&iov_base, status);
}
}
if (!PageReferenced(page))
......
......@@ -40,7 +40,6 @@
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/iobuf.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
......@@ -53,7 +52,12 @@
#include <linux/swapops.h>
#ifndef CONFIG_DISCONTIGMEM
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;
#endif
unsigned long num_physpages;
void * high_memory;
struct page *highmem_start_page;
......@@ -72,8 +76,6 @@ static inline void copy_cow_page(struct page * from, struct page * to, unsigned
copy_user_highpage(to, from, address);
}
struct page *mem_map;
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
......
......@@ -187,11 +187,12 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask)
int curr_nr;
DECLARE_WAITQUEUE(wait, current);
int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
int pf_flags = current->flags;
repeat_alloc:
current->flags |= PF_NOWARN;
element = pool->alloc(gfp_nowait, pool->pool_data);
current->flags &= ~PF_NOWARN;
current->flags = pf_flags;
if (likely(element != NULL))
return element;
......
......@@ -11,7 +11,6 @@
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
......@@ -444,6 +443,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
*/
vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED) {
if (!capable(CAP_IPC_LOCK))
return -EPERM;
vm_flags |= VM_LOCKED;
}
/* mlock MCL_FUTURE? */
if (vm_flags & VM_LOCKED) {
unsigned long locked = mm->locked_vm << PAGE_SHIFT;
......@@ -1073,7 +1077,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
* Jeremy Fitzhardine <jeremy@sw.oz.au>
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
......
......@@ -9,7 +9,6 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/swap.h>
......
......@@ -22,11 +22,21 @@ pg_data_t contig_page_data = { .bdata = &contig_bootmem_data };
* Should be invoked with parameters (0, 0, unsigned long *[], start_paddr).
*/
void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long *zones_size, unsigned long zone_start_pfn,
unsigned long *zones_size, unsigned long node_start_pfn,
unsigned long *zholes_size)
{
free_area_init_core(0, &contig_page_data, &mem_map, zones_size,
zone_start_pfn, zholes_size, pmap);
unsigned long size;
contig_page_data.node_id = 0;
contig_page_data.node_start_pfn = node_start_pfn;
calculate_totalpages (&contig_page_data, zones_size, zholes_size);
if (pmap == (struct page *)0) {
size = (pgdat->node_size + 1) * sizeof(struct page);
pmap = (struct page *) alloc_bootmem_node(pgdat, size);
}
contig_page_data.node_mem_map = pmap;
free_area_init_core(&contig_page_data, zones_size, zholes_size);
mem_map = contig_page_data.node_mem_map;
}
#endif /* !CONFIG_DISCONTIGMEM */
......@@ -48,22 +58,26 @@ struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int orde
* Nodes can be initialized in parallel, in no particular order.
*/
void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
unsigned long *zones_size, unsigned long zone_start_pfn,
unsigned long *zones_size, unsigned long node_start_pfn,
unsigned long *zholes_size)
{
int i, size = 0;
struct page *discard;
if (mem_map == NULL)
mem_map = (struct page *)PAGE_OFFSET;
int i;
unsigned long size;
free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_pfn,
zholes_size, pmap);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
calculate_totalpages (pgdat, zones_size, zholes_size);
if (pmap == (struct page *)0) {
size = (pgdat->node_size + 1) * sizeof(struct page);
pmap = (struct page *) alloc_bootmem_node(pgdat, size);
}
pgdat->node_mem_map = pmap;
free_area_init_core(pgdat, zones_size, zholes_size);
/*
* Get space for the valid bitmap.
*/
size = 0;
for (i = 0; i < MAX_NR_ZONES; i++)
size += zones_size[i];
size = LONG_ALIGN((size + 7) >> 3);
......@@ -71,48 +85,4 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
memset(pgdat->valid_addr_bitmap, 0, size);
}
static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask,
unsigned int order)
{
return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK));
}
/*
* This can be refined. Currently it tries to do round robin; instead it
* should do a concentric-circle search, starting from the current node.
*/
struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order)
{
struct page *ret = 0;
pg_data_t *start, *temp;
#ifndef CONFIG_NUMA
unsigned long flags;
static pg_data_t *next = 0;
#endif
if (order >= MAX_ORDER)
return NULL;
#ifdef CONFIG_NUMA
temp = NODE_DATA(numa_node_id());
#else
if (!next)
next = pgdat_list;
temp = next;
next = next->pgdat_next;
#endif
start = temp;
while (temp) {
if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
return(ret);
temp = temp->pgdat_next;
}
temp = pgdat_list;
while (temp != start) {
if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
return(ret);
temp = temp->pgdat_next;
}
return(0);
}
#endif /* CONFIG_DISCONTIGMEM */
......@@ -51,7 +51,7 @@ static long total_pages;
* It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
* large amounts of I/O are submitted.
*/
static inline int sync_writeback_pages(void)
static inline long sync_writeback_pages(void)
{
return ratelimit_pages + ratelimit_pages / 2;
}
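As a quick illustration of the 1.5x batch above (the ratelimit_pages value here is an arbitrary example, not the tree's actual tuning):

#include <stdio.h>

int main(void)
{
	long ratelimit_pages = 1024;  /* example value only */
	long sync_writeback = ratelimit_pages + ratelimit_pages / 2;

	printf("writeback batch: %ld pages (1.5 * ratelimit_pages)\n",
	       sync_writeback);
	return 0;
}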
......@@ -72,11 +72,6 @@ int dirty_background_ratio = 10;
*/
int dirty_async_ratio = 40;
/*
* The generator of dirty data performs sync writeout at this level
*/
int dirty_sync_ratio = 50;
/*
* The interval between `kupdate'-style writebacks, in centiseconds
* (hundredths of a second)
......@@ -105,15 +100,11 @@ static void background_writeout(unsigned long _min_pages);
* - Does nothing at all.
*
* balance_dirty_pages() can sleep.
*
* FIXME: WB_SYNC_LAST doesn't actually work. It waits on the last dirty
* inode on the superblock list. It should wait when nr_to_write is
* exhausted. Doesn't seem to matter.
*/
void balance_dirty_pages(struct address_space *mapping)
{
struct page_state ps;
long background_thresh, async_thresh, sync_thresh;
long background_thresh, async_thresh;
unsigned long dirty_and_writeback;
struct backing_dev_info *bdi;
......@@ -122,18 +113,17 @@ void balance_dirty_pages(struct address_space *mapping)
background_thresh = (dirty_background_ratio * total_pages) / 100;
async_thresh = (dirty_async_ratio * total_pages) / 100;
sync_thresh = (dirty_sync_ratio * total_pages) / 100;
bdi = mapping->backing_dev_info;
if (dirty_and_writeback > sync_thresh) {
int nr_to_write = sync_writeback_pages();
if (dirty_and_writeback > async_thresh) {
struct writeback_control wbc = {
.bdi = bdi,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = sync_writeback_pages(),
};
writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_LAST, NULL);
get_page_state(&ps);
} else if (dirty_and_writeback > async_thresh) {
int nr_to_write = sync_writeback_pages();
writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_NONE, NULL);
writeback_inodes(&wbc);
get_page_state(&ps);
}
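To make the thresholds above concrete, a small stand-alone calculation: the ratios (10 and 40) are the defaults shown in this patch, while total_pages is an assumed machine size used only for illustration.

#include <stdio.h>

int main(void)
{
	long total_pages = 65536;         /* assumed: 256 MB of 4 KB pages */
	int dirty_background_ratio = 10;  /* default from this patch */
	int dirty_async_ratio = 40;       /* default from this patch */

	long background_thresh = (dirty_background_ratio * total_pages) / 100;
	long async_thresh = (dirty_async_ratio * total_pages) / 100;

	printf("pdflush background writeout starts above %ld dirty pages\n",
	       background_thresh);
	printf("the dirtying process writes back itself above %ld dirty pages\n",
	       async_thresh);
	return 0;
}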
......@@ -177,7 +167,12 @@ static void background_writeout(unsigned long _min_pages)
{
long min_pages = _min_pages;
long background_thresh;
int nr_to_write;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = 0,
};
CHECK_EMERGENCY_SYNC
......@@ -185,14 +180,13 @@ static void background_writeout(unsigned long _min_pages)
do {
struct page_state ps;
get_page_state(&ps);
if (ps.nr_dirty < background_thresh && min_pages <= 0)
break;
nr_to_write = MAX_WRITEBACK_PAGES;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
min_pages -= MAX_WRITEBACK_PAGES - nr_to_write;
} while (nr_to_write <= 0);
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc);
min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
} while (wbc.nr_to_write <= 0);
blk_run_queues();
}
......@@ -230,7 +224,12 @@ static void wb_kupdate(unsigned long arg)
unsigned long start_jif;
unsigned long next_jif;
struct page_state ps;
int nr_to_write;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = &oldest_jif,
.nr_to_write = 0,
};
sync_supers();
get_page_state(&ps);
......@@ -238,8 +237,8 @@ static void wb_kupdate(unsigned long arg)
oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
start_jif = jiffies;
next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
nr_to_write = ps.nr_dirty;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest_jif);
wbc.nr_to_write = ps.nr_dirty;
writeback_inodes(&wbc);
blk_run_queues();
yield();
......@@ -312,8 +311,6 @@ static int __init page_writeback_init(void)
dirty_background_ratio /= 100;
dirty_async_ratio *= correction;
dirty_async_ratio /= 100;
dirty_sync_ratio *= correction;
dirty_sync_ratio /= 100;
}
init_timer(&wb_timer);
......@@ -351,7 +348,7 @@ module_init(page_writeback_init);
* So. The proper fix is to leave the page locked-and-dirty and to pass
* it all the way down.
*/
int generic_vm_writeback(struct page *page, int *nr_to_write)
int generic_vm_writeback(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
......@@ -363,7 +360,7 @@ int generic_vm_writeback(struct page *page, int *nr_to_write)
unlock_page(page);
if (inode) {
do_writepages(inode->i_mapping, nr_to_write);
do_writepages(inode->i_mapping, wbc);
/*
* This iput() will internally call ext2_discard_prealloc(),
......@@ -392,11 +389,11 @@ int generic_vm_writeback(struct page *page, int *nr_to_write)
}
EXPORT_SYMBOL(generic_vm_writeback);
int do_writepages(struct address_space *mapping, int *nr_to_write)
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
if (mapping->a_ops->writepages)
return mapping->a_ops->writepages(mapping, nr_to_write);
return generic_writepages(mapping, nr_to_write);
return mapping->a_ops->writepages(mapping, wbc);
return generic_writepages(mapping, wbc);
}
/**
......
......@@ -256,14 +256,6 @@ int is_head_of_free_region(struct page *page)
}
#endif /* CONFIG_SOFTWARE_SUSPEND */
#ifndef CONFIG_DISCONTIGMEM
struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
{
return __alloc_pages(gfp_mask, order,
contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
}
#endif
static /* inline */ struct page *
balance_classzone(struct zone* classzone, unsigned int gfp_mask,
unsigned int order, int * freed)
......@@ -680,13 +672,41 @@ void show_free_areas(void)
/*
* Builds allocation fallback zone lists.
*/
static inline void build_zonelists(pg_data_t *pgdat)
static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
{
int i, j, k;
switch (k) {
struct zone *zone;
default:
BUG();
case ZONE_HIGHMEM:
zone = pgdat->node_zones + ZONE_HIGHMEM;
if (zone->size) {
#ifndef CONFIG_HIGHMEM
BUG();
#endif
zonelist->zones[j++] = zone;
}
case ZONE_NORMAL:
zone = pgdat->node_zones + ZONE_NORMAL;
if (zone->size)
zonelist->zones[j++] = zone;
case ZONE_DMA:
zone = pgdat->node_zones + ZONE_DMA;
if (zone->size)
zonelist->zones[j++] = zone;
}
return j;
}
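The switch above deliberately falls through, so a highmem request on a node that has all three zones ends up with the order HIGHMEM, NORMAL, DMA. A tiny stand-alone model of that fallthrough; the zone sizes are invented for illustration.

#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

static const char *zone_name[MAX_NR_ZONES] = { "DMA", "NORMAL", "HIGHMEM" };

/* Model of build_zonelists_node(): append one node's zones in falling
 * order, starting from the highest zone the request allows. */
static int add_node_zones(const unsigned long *zone_size, const char **list,
			  int j, int k)
{
	switch (k) {
	case ZONE_HIGHMEM:
		if (zone_size[ZONE_HIGHMEM])
			list[j++] = zone_name[ZONE_HIGHMEM];
		/* fall through */
	case ZONE_NORMAL:
		if (zone_size[ZONE_NORMAL])
			list[j++] = zone_name[ZONE_NORMAL];
		/* fall through */
	case ZONE_DMA:
		if (zone_size[ZONE_DMA])
			list[j++] = zone_name[ZONE_DMA];
	}
	return j;
}

int main(void)
{
	unsigned long zone_size[MAX_NR_ZONES] = { 4096, 221184, 32768 };
	const char *list[MAX_NR_ZONES] = { 0 };
	int i, j = add_node_zones(zone_size, list, 0, ZONE_HIGHMEM);

	for (i = 0; i < j; i++)
		printf("fallback %d: %s\n", i, list[i]);
	return 0;
}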
static void __init build_zonelists(pg_data_t *pgdat)
{
int i, j, k, node, local_node;
local_node = pgdat->node_id;
printk("Building zonelist for node : %d\n", local_node);
for (i = 0; i <= GFP_ZONEMASK; i++) {
struct zonelist *zonelist;
struct zone *zone;
zonelist = pgdat->node_zonelists + i;
memset(zonelist, 0, sizeof(*zonelist));
......@@ -698,33 +718,49 @@ static inline void build_zonelists(pg_data_t *pgdat)
if (i & __GFP_DMA)
k = ZONE_DMA;
switch (k) {
default:
BUG();
/*
* fallthrough:
*/
case ZONE_HIGHMEM:
zone = pgdat->node_zones + ZONE_HIGHMEM;
if (zone->size) {
#ifndef CONFIG_HIGHMEM
BUG();
#endif
zonelist->zones[j++] = zone;
}
case ZONE_NORMAL:
zone = pgdat->node_zones + ZONE_NORMAL;
if (zone->size)
zonelist->zones[j++] = zone;
case ZONE_DMA:
zone = pgdat->node_zones + ZONE_DMA;
if (zone->size)
zonelist->zones[j++] = zone;
}
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
* of all the other nodes.
* We don't want to pressure a particular node, so when
* building the zones for node N, we make sure that the
* zones coming right after the local ones are those from
* node N+1 (modulo numnodes)
*/
for (node = local_node + 1; node < numnodes; node++)
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
for (node = 0; node < local_node; node++)
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
zonelist->zones[j++] = NULL;
}
}
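The two loops above give each node a rotation of the node list that starts just after itself, so allocation pressure from one node spills onto its successors first. With an assumed numnodes of 4, node 1 appends remote zones in node order 2, 3, 0. A stand-alone sketch of just that ordering:

#include <stdio.h>

#define NUMNODES 4  /* assumed node count for illustration */

int main(void)
{
	int local_node, node;

	for (local_node = 0; local_node < NUMNODES; local_node++) {
		printf("node %d falls back to:", local_node);
		for (node = local_node + 1; node < NUMNODES; node++)
			printf(" %d", node);
		for (node = 0; node < local_node; node++)
			printf(" %d", node);
		printf("\n");
	}
	return 0;
}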
void __init build_all_zonelists(void)
{
int i;
for(i = 0 ; i < numnodes ; i++)
build_zonelists(NODE_DATA(i));
}
void __init calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size,
unsigned long *zholes_size)
{
unsigned long realtotalpages, totalpages = 0;
int i;
for (i = 0; i < MAX_NR_ZONES; i++)
totalpages += zones_size[i];
pgdat->node_size = totalpages;
realtotalpages = totalpages;
if (zholes_size)
for (i = 0; i < MAX_NR_ZONES; i++)
realtotalpages -= zholes_size[i];
printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
}
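calculate_totalpages() above just sums the per-zone sizes into node_size and subtracts the holes for the printed figure. A minimal stand-alone version with made-up zone and hole sizes:

#include <stdio.h>

#define MAX_NR_ZONES 3

int main(void)
{
	/* Example zone sizes and holes, in pages (invented numbers). */
	unsigned long zones_size[MAX_NR_ZONES] = { 4096, 126976, 0 };
	unsigned long zholes_size[MAX_NR_ZONES] = { 0, 2048, 0 };
	unsigned long totalpages = 0, realtotalpages;
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++)
		totalpages += zones_size[i];

	realtotalpages = totalpages;
	for (i = 0; i < MAX_NR_ZONES; i++)
		realtotalpages -= zholes_size[i];

	printf("node_size = %lu pages, totalpages (minus holes) = %lu\n",
	       totalpages, realtotalpages);
	return 0;
}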
/*
* Helper functions to size the waitqueue hash table.
* Essentially these want to choose hash table sizes sufficiently
......@@ -775,46 +811,18 @@ static inline unsigned long wait_table_bits(unsigned long size)
* - mark all memory queues empty
* - clear the memory bitmaps
*/
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
unsigned long *zones_size, unsigned long zone_start_pfn,
unsigned long *zholes_size, struct page *lmem_map)
void __init free_area_init_core(pg_data_t *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
unsigned long i, j;
unsigned long map_size;
unsigned long totalpages, offset, realtotalpages;
unsigned long local_offset;
const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
int nid = pgdat->node_id;
struct page *lmem_map = pgdat->node_mem_map;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
totalpages = 0;
for (i = 0; i < MAX_NR_ZONES; i++)
totalpages += zones_size[i];
realtotalpages = totalpages;
if (zholes_size)
for (i = 0; i < MAX_NR_ZONES; i++)
realtotalpages -= zholes_size[i];
printk("On node %d totalpages: %lu\n", nid, realtotalpages);
/*
* Some architectures (with lots of mem and discontinous memory
* maps) have to search for a good mem_map area:
* For discontigmem, the conceptual mem map array starts from
* PAGE_OFFSET, we need to align the actual array onto a mem map
* boundary, so that MAP_NR works.
*/
map_size = (totalpages + 1)*sizeof(struct page);
if (lmem_map == (struct page *)0) {
lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
lmem_map = (struct page *)(PAGE_OFFSET +
MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
}
*gmap = pgdat->node_mem_map = lmem_map;
pgdat->node_size = totalpages;
pgdat->node_start_pfn = zone_start_pfn;
pgdat->node_start_mapnr = (lmem_map - mem_map);
pgdat->nr_zones = 0;
offset = lmem_map - mem_map;
local_offset = 0; /* offset within lmem_map */
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long mask;
......@@ -866,8 +874,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone->pages_low = mask*2;
zone->pages_high = mask*3;
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_mem_map = lmem_map + local_offset;
zone->zone_start_pfn = zone_start_pfn;
if ((zone_start_pfn) & (zone_required_alignment-1))
......@@ -879,7 +886,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
* done. Non-atomic initialization, single-pass.
*/
for (i = 0; i < size; i++) {
struct page *page = mem_map + offset + i;
struct page *page = lmem_map + local_offset + i;
set_page_zone(page, nid * MAX_NR_ZONES + j);
set_page_count(page, 0);
SetPageReserved(page);
......@@ -893,7 +900,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone_start_pfn++;
}
offset += size;
local_offset += size;
for (i = 0; ; i++) {
unsigned long bitmap_size;
......@@ -932,13 +939,15 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
(unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
}
}
build_zonelists(pgdat);
}
#ifndef CONFIG_DISCONTIGMEM
void __init free_area_init(unsigned long *zones_size)
{
free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
free_area_init_node(0, &contig_page_data, NULL, zones_size, 0, NULL);
mem_map = contig_page_data.node_mem_map;
}
#endif
static int __init setup_mem_frac(char *str)
{
......
......@@ -131,12 +131,12 @@ int swap_readpage(struct file *file, struct page *page)
* Swap pages are !PageLocked and PageWriteback while under writeout so that
* memory allocators will throttle against them.
*/
static int swap_vm_writeback(struct page *page, int *nr_to_write)
static int swap_vm_writeback(struct page *page, struct writeback_control *wbc)
{
struct address_space *mapping = page->mapping;
unlock_page(page);
return generic_writepages(mapping, nr_to_write);
return generic_writepages(mapping, wbc);
}
struct address_space_operations swap_aops = {
......
......@@ -28,7 +28,6 @@
#include <linux/pagemap.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
......
......@@ -124,9 +124,9 @@ void release_pages(struct page **pages, int nr)
if (page_count(page) == 0) {
if (!pagevec_add(&pages_to_free, page)) {
spin_unlock_irq(&zone->lru_lock);
pagevec_free(&pages_to_free);
__pagevec_free(&pages_to_free);
pagevec_init(&pages_to_free);
spin_lock_irq(&zone->lru_lock);
zone = NULL; /* No lock is held */
}
}
}
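The change above replaces the immediate re-lock with zone = NULL, so the per-zone lru lock is only retaken lazily when the next page actually needs it. Below is a user-space model of that lazy-lock pattern, using a pthread mutex in place of the zone lru_lock; the zone count and page stream are invented for illustration (compile with -lpthread).

#include <pthread.h>
#include <stdio.h>

#define NR_ZONES 2

static pthread_mutex_t lru_lock[NR_ZONES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Pages are represented only by the zone they belong to. */
static const int page_zone_of[] = { 0, 0, 1, 1, 0 };

int main(void)
{
	int zone = -1;                 /* -1: no lock is held */
	int i, batched = 0;

	for (i = 0; i < 5; i++) {
		int pagezone = page_zone_of[i];

		if (pagezone != zone) {    /* lock lazily, on zone change */
			if (zone >= 0)
				pthread_mutex_unlock(&lru_lock[zone]);
			zone = pagezone;
			pthread_mutex_lock(&lru_lock[zone]);
		}

		if (++batched == 2) {      /* batch full: free it unlocked */
			pthread_mutex_unlock(&lru_lock[zone]);
			printf("freeing a batch of %d pages without the lock\n",
			       batched);
			batched = 0;
			zone = -1;         /* no lock is held any more */
		}
	}
	if (zone >= 0)
		pthread_mutex_unlock(&lru_lock[zone]);
	return 0;
}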
......@@ -165,8 +165,8 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
}
/*
* Move all the inactive pages to the head of the inactive list
* and release them. Reinitialises the caller's pagevec.
* Move all the inactive pages to the head of the inactive list and release
* them. Reinitialises the caller's pagevec.
*/
void pagevec_deactivate_inactive(struct pagevec *pvec)
{
......@@ -180,8 +180,6 @@ void pagevec_deactivate_inactive(struct pagevec *pvec)
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
if (PageActive(page) || !PageLRU(page))
continue;
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
......
......@@ -12,7 +12,6 @@
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h> /* block_sync_page() */
......@@ -119,7 +118,7 @@ void __delete_from_swap_cache(struct page *page)
int add_to_swap(struct page * page)
{
swp_entry_t entry;
int flags;
int pf_flags;
if (!PageLocked(page))
BUG();
......@@ -142,7 +141,7 @@ int add_to_swap(struct page * page)
* just not all of them.
*/
flags = current->flags;
pf_flags = current->flags;
current->flags &= ~PF_MEMALLOC;
current->flags |= PF_NOWARN;
ClearPageUptodate(page); /* why? */
......@@ -154,20 +153,20 @@ int add_to_swap(struct page * page)
*/
switch (add_to_swap_cache(page, entry)) {
case 0: /* Success */
current->flags = flags;
current->flags = pf_flags;
SetPageUptodate(page);
set_page_dirty(page);
swap_free(entry);
return 1;
case -ENOMEM: /* radix-tree allocation */
current->flags = flags;
current->flags = pf_flags;
swap_free(entry);
return 0;
default: /* ENOENT: raced */
break;
}
/* Raced with "speculative" read_swap_cache_async */
current->flags = flags;
current->flags = pf_flags;
swap_free(entry);
}
}
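The rename above only distinguishes the saved process flags from other uses of "flags"; the underlying pattern is save, clear PF_MEMALLOC and set PF_NOWARN around the allocation, then restore on every exit path. A user-space model of that save/modify/restore discipline follows; the flag bit values and the fake "current" task are invented for illustration.

#include <stdio.h>

#define PF_MEMALLOC 0x01   /* invented bit values, for illustration only */
#define PF_NOWARN   0x02

struct task { unsigned long flags; };
static struct task current_task = { .flags = PF_MEMALLOC };
#define current (&current_task)

static void risky_allocation(void)
{
	printf("allocating with flags 0x%lx\n", current->flags);
}

int main(void)
{
	unsigned long pf_flags = current->flags;   /* save */

	current->flags &= ~PF_MEMALLOC;            /* don't dip into reserves */
	current->flags |= PF_NOWARN;               /* failure here is expected */
	risky_allocation();

	current->flags = pf_flags;                 /* restore on the way out */
	printf("restored flags 0x%lx\n", current->flags);
	return 0;
}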
......
......@@ -7,7 +7,6 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
......
......@@ -15,7 +15,6 @@
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
......@@ -145,6 +144,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
if (!add_to_swap(page))
goto activate_locked;
pte_chain_lock(page);
mapping = page->mapping;
}
/*
......@@ -174,15 +174,18 @@ shrink_list(struct list_head *page_list, int nr_pages,
*/
if (PageDirty(page) && is_page_cache_freeable(page) &&
mapping && may_enter_fs) {
int (*writeback)(struct page *, int *);
int (*writeback)(struct page *,
struct writeback_control *);
const int cluster_size = SWAP_CLUSTER_MAX;
int nr_to_write = cluster_size;
struct writeback_control wbc = {
.nr_to_write = cluster_size,
};
writeback = mapping->a_ops->vm_writeback;
if (writeback == NULL)
writeback = generic_vm_writeback;
(*writeback)(page, &nr_to_write);
*max_scan -= (cluster_size - nr_to_write);
(*writeback)(page, &wbc);
*max_scan -= (cluster_size - wbc.nr_to_write);
goto keep;
}
......
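The hunk above applies the same int-pointer to writeback_control conversion to the VM scanner: it fills a small writeback_control limited to SWAP_CLUSTER_MAX, picks the address_space's vm_writeback method (falling back to generic_vm_writeback), and charges whatever was written against max_scan. A reduced user-space model of that hook-plus-default dispatch; all the types and the fake writeback method here are stand-ins, not the kernel's.

#include <stdio.h>

/* Minimal stand-ins for the kernel types involved. */
struct writeback_control { long nr_to_write; };
struct page;
struct address_space {
	int (*vm_writeback)(struct page *, struct writeback_control *);
};
struct page { struct address_space *mapping; };

#define SWAP_CLUSTER_MAX 32

/* Default used when the filesystem provides no vm_writeback method. */
static int generic_vm_writeback(struct page *page, struct writeback_control *wbc)
{
	(void)page;
	wbc->nr_to_write -= 5;   /* pretend five pages were written */
	return 0;
}

int main(void)
{
	struct address_space mapping = { .vm_writeback = NULL };
	struct page page = { .mapping = &mapping };
	long max_scan = 100;

	int (*writeback)(struct page *, struct writeback_control *);
	struct writeback_control wbc = { .nr_to_write = SWAP_CLUSTER_MAX };

	writeback = page.mapping->vm_writeback;
	if (writeback == NULL)
		writeback = generic_vm_writeback;

	(*writeback)(&page, &wbc);
	max_scan -= SWAP_CLUSTER_MAX - wbc.nr_to_write;

	printf("wrote %ld pages, max_scan now %ld\n",
	       SWAP_CLUSTER_MAX - wbc.nr_to_write, max_scan);
	return 0;
}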