Commit e7144e64 authored by Linus Torvalds

Merge master.kernel.org:/home/davem/BK/net-2.5

into home.transmeta.com:/home/torvalds/v2.5/linux
parents da29f6a8 407ee6c8
...@@ -46,13 +46,76 @@ static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; ...@@ -46,13 +46,76 @@ static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */ struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
/* /*
* How many reqeusts do we allocate per queue, * Number of requests per queue. This many for reads and for writes (twice
* and how many do we "batch" on freeing them? * this number, total).
*/ */
int queue_nr_requests, batch_requests; static int queue_nr_requests;
/*
* How many free requests must be available before we wake a process which
* is waiting for a request?
*/
static int batch_requests;
unsigned long blk_max_low_pfn, blk_max_pfn; unsigned long blk_max_low_pfn, blk_max_pfn;
int blk_nohighio = 0; int blk_nohighio = 0;
static struct congestion_state {
wait_queue_head_t wqh;
atomic_t nr_congested_queues;
} congestion_states[2];
/*
* Return the threshold (number of free requests) at which the queue is
 * considered to be congested. It includes a little hysteresis to keep the
* context switch rate down.
*/
static inline int queue_congestion_on_threshold(void)
{
int ret;
ret = queue_nr_requests / 4 - 1;
if (ret < 0)
ret = 1;
return ret;
}
/*
* The threshold at which a queue is considered to be uncongested
*/
static inline int queue_congestion_off_threshold(void)
{
int ret;
ret = queue_nr_requests / 4 + 1;
if (ret > queue_nr_requests)
ret = queue_nr_requests;
return ret;
}
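Editorial note, not part of the patch: the two thresholds above differ by two requests, which is the whole hysteresis. A small standalone C sketch of the same arithmetic (queue sizes are illustrative) shows the gap:

#include <stdio.h>

static int on_threshold(int queue_nr_requests)
{
	int ret = queue_nr_requests / 4 - 1;

	if (ret < 0)
		ret = 1;
	return ret;
}

static int off_threshold(int queue_nr_requests)
{
	int ret = queue_nr_requests / 4 + 1;

	if (ret > queue_nr_requests)
		ret = queue_nr_requests;
	return ret;
}

int main(void)
{
	int sizes[] = { 16, 64, 128 };
	int i;

	/* e.g. 128 requests: congested below 31 free, cleared again at 33 free */
	for (i = 0; i < 3; i++)
		printf("%3d requests: congest below %d free, clear at %d free\n",
			sizes[i], on_threshold(sizes[i]), off_threshold(sizes[i]));
	return 0;
}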
static void clear_queue_congested(request_queue_t *q, int rw)
{
enum bdi_state bit;
struct congestion_state *cs = &congestion_states[rw];
bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
if (test_and_clear_bit(bit, &q->backing_dev_info.state))
atomic_dec(&cs->nr_congested_queues);
if (waitqueue_active(&cs->wqh))
wake_up(&cs->wqh);
}
static void set_queue_congested(request_queue_t *q, int rw)
{
enum bdi_state bit;
bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
if (!test_and_set_bit(bit, &q->backing_dev_info.state))
atomic_inc(&congestion_states[rw].nr_congested_queues);
}
/** /**
* bdev_get_queue: - return the queue that matches the given device * bdev_get_queue: - return the queue that matches the given device
* @bdev: device * @bdev: device
...@@ -360,8 +423,8 @@ int blk_queue_init_tags(request_queue_t *q, int depth) ...@@ -360,8 +423,8 @@ int blk_queue_init_tags(request_queue_t *q, int depth)
struct blk_queue_tag *tags; struct blk_queue_tag *tags;
int bits, i; int bits, i;
if (depth > queue_nr_requests) { if (depth > (queue_nr_requests*2)) {
depth = queue_nr_requests; depth = (queue_nr_requests*2);
printk("blk_queue_init_tags: adjusted depth to %d\n", depth); printk("blk_queue_init_tags: adjusted depth to %d\n", depth);
} }
...@@ -1019,7 +1082,7 @@ static int __blk_cleanup_queue(struct request_list *list) ...@@ -1019,7 +1082,7 @@ static int __blk_cleanup_queue(struct request_list *list)
**/ **/
void blk_cleanup_queue(request_queue_t * q) void blk_cleanup_queue(request_queue_t * q)
{ {
int count = queue_nr_requests; int count = (queue_nr_requests*2);
count -= __blk_cleanup_queue(&q->rq[READ]); count -= __blk_cleanup_queue(&q->rq[READ]);
count -= __blk_cleanup_queue(&q->rq[WRITE]); count -= __blk_cleanup_queue(&q->rq[WRITE]);
...@@ -1050,7 +1113,7 @@ static int blk_init_free_list(request_queue_t *q) ...@@ -1050,7 +1113,7 @@ static int blk_init_free_list(request_queue_t *q)
* Divide requests in half between read and write * Divide requests in half between read and write
*/ */
rl = &q->rq[READ]; rl = &q->rq[READ];
for (i = 0; i < queue_nr_requests; i++) { for (i = 0; i < (queue_nr_requests*2); i++) {
rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL); rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL);
if (!rq) if (!rq)
goto nomem; goto nomem;
...@@ -1058,7 +1121,7 @@ static int blk_init_free_list(request_queue_t *q) ...@@ -1058,7 +1121,7 @@ static int blk_init_free_list(request_queue_t *q)
/* /*
* half way through, switch to WRITE list * half way through, switch to WRITE list
*/ */
if (i == queue_nr_requests / 2) if (i == queue_nr_requests)
rl = &q->rq[WRITE]; rl = &q->rq[WRITE];
memset(rq, 0, sizeof(struct request)); memset(rq, 0, sizeof(struct request));
...@@ -1144,7 +1207,7 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock) ...@@ -1144,7 +1207,7 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock)
* Get a free request. queue lock must be held and interrupts * Get a free request. queue lock must be held and interrupts
* disabled on the way in. * disabled on the way in.
*/ */
static inline struct request *get_request(request_queue_t *q, int rw) static struct request *get_request(request_queue_t *q, int rw)
{ {
struct request *rq = NULL; struct request *rq = NULL;
struct request_list *rl = q->rq + rw; struct request_list *rl = q->rq + rw;
...@@ -1153,6 +1216,8 @@ static inline struct request *get_request(request_queue_t *q, int rw) ...@@ -1153,6 +1216,8 @@ static inline struct request *get_request(request_queue_t *q, int rw)
rq = blkdev_free_rq(&rl->free); rq = blkdev_free_rq(&rl->free);
list_del(&rq->queuelist); list_del(&rq->queuelist);
rl->count--; rl->count--;
if (rl->count < queue_congestion_on_threshold())
set_queue_congested(q, rw);
rq->flags = 0; rq->flags = 0;
rq->rq_status = RQ_ACTIVE; rq->rq_status = RQ_ACTIVE;
rq->special = NULL; rq->special = NULL;
...@@ -1365,13 +1430,50 @@ void blk_put_request(struct request *req) ...@@ -1365,13 +1430,50 @@ void blk_put_request(struct request *req)
* it didn't come out of our reserved rq pools * it didn't come out of our reserved rq pools
*/ */
if (rl) { if (rl) {
int rw = 0;
list_add(&req->queuelist, &rl->free); list_add(&req->queuelist, &rl->free);
if (++rl->count >= batch_requests &&waitqueue_active(&rl->wait)) if (rl == &q->rq[WRITE])
rw = WRITE;
else if (rl == &q->rq[READ])
rw = READ;
else
BUG();
rl->count++;
if (rl->count >= queue_congestion_off_threshold())
clear_queue_congested(q, rw);
if (rl->count >= batch_requests && waitqueue_active(&rl->wait))
wake_up(&rl->wait); wake_up(&rl->wait);
} }
} }
/**
* blk_congestion_wait - wait for a queue to become uncongested
* @rw: READ or WRITE
* @timeout: timeout in jiffies
*
* Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
* If no queues are congested then just return, in the hope that the caller
* will submit some more IO.
*/
void blk_congestion_wait(int rw, long timeout)
{
DECLARE_WAITQUEUE(wait, current);
struct congestion_state *cs = &congestion_states[rw];
if (atomic_read(&cs->nr_congested_queues) == 0)
return;
blk_run_queues();
set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(&cs->wqh, &wait);
if (atomic_read(&cs->nr_congested_queues) != 0)
schedule_timeout(timeout);
set_current_state(TASK_RUNNING);
remove_wait_queue(&cs->wqh, &wait);
}
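Editorial note, not part of the patch: the intended calling pattern (used later in this patch by background_writeout(), wb_kupdate() and try_to_free_pages()) is to retry writeback and, if nothing was written because every target queue was congested, sleep in blk_congestion_wait() instead of spinning. A condensed sketch assuming kernel context; `pages_left' is a hypothetical local standing in for the callers' own counters:

	long pages_left = ps.nr_dirty;		/* hypothetical, as in wb_kupdate() */

	while (pages_left > 0) {
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		writeback_inodes(&wbc);
		if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
			/* Wrote nothing: either all clean, or every queue is full */
			if (wbc.encountered_congestion)
				blk_congestion_wait(WRITE, HZ/10);
			else
				break;
		}
		pages_left -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
	}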
/* /*
* Has to be called with the request spinlock acquired * Has to be called with the request spinlock acquired
*/ */
...@@ -1868,6 +1970,7 @@ void end_that_request_last(struct request *req) ...@@ -1868,6 +1970,7 @@ void end_that_request_last(struct request *req)
int __init blk_dev_init(void) int __init blk_dev_init(void)
{ {
int total_ram = nr_free_pages() << (PAGE_SHIFT - 10); int total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
int i;
request_cachep = kmem_cache_create("blkdev_requests", request_cachep = kmem_cache_create("blkdev_requests",
sizeof(struct request), 0, sizeof(struct request), 0,
...@@ -1876,26 +1979,33 @@ int __init blk_dev_init(void) ...@@ -1876,26 +1979,33 @@ int __init blk_dev_init(void)
panic("Can't create request pool slab cache\n"); panic("Can't create request pool slab cache\n");
/* /*
* Free request slots per queue. * Free request slots per queue. One per quarter-megabyte.
* (Half for reads, half for writes) * We use this many requests for reads, and this many for writes.
*/
queue_nr_requests = (total_ram >> 8) & ~15; /* One per quarter-megabyte */
if (queue_nr_requests < 32)
queue_nr_requests = 32;
if (queue_nr_requests > 256)
queue_nr_requests = 256;
/*
* Batch frees according to queue length
*/ */
if ((batch_requests = queue_nr_requests / 4) > 32) queue_nr_requests = (total_ram >> 9) & ~7;
batch_requests = 32; if (queue_nr_requests < 16)
printk("block: %d slots per queue, batch=%d\n", queue_nr_requests = 16;
queue_nr_requests, batch_requests); if (queue_nr_requests > 128)
queue_nr_requests = 128;
batch_requests = queue_nr_requests / 8;
if (batch_requests > 8)
batch_requests = 8;
printk("block request queues:\n");
printk(" %d requests per read queue\n", queue_nr_requests);
printk(" %d requests per write queue\n", queue_nr_requests);
printk(" %d requests per batch\n", batch_requests);
printk(" enter congestion at %d\n", queue_congestion_on_threshold());
printk(" exit congestion at %d\n", queue_congestion_off_threshold());
blk_max_low_pfn = max_low_pfn; blk_max_low_pfn = max_low_pfn;
blk_max_pfn = max_pfn; blk_max_pfn = max_pfn;
for (i = 0; i < ARRAY_SIZE(congestion_states); i++) {
init_waitqueue_head(&congestion_states[i].wqh);
atomic_set(&congestion_states[i].nr_congested_queues, 0);
}
return 0; return 0;
}; };
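Editorial note, not part of the patch: the new sizing works out to one request slot per quarter-megabyte of RAM (counting the read and write pools together), clamped to 16..128 per pool. A standalone C sketch of blk_dev_init()'s arithmetic, with illustrative RAM sizes:

#include <stdio.h>

int main(void)
{
	/* total_ram is in kilobytes, as nr_free_pages() << (PAGE_SHIFT - 10) yields */
	int samples_kb[] = { 16 * 1024, 128 * 1024, 1024 * 1024 };
	int i;

	for (i = 0; i < 3; i++) {
		int total_ram = samples_kb[i];
		/* one slot per 512 KB per direction, i.e. one per 256 KB overall */
		int queue_nr_requests = (total_ram >> 9) & ~7;
		int batch_requests;

		if (queue_nr_requests < 16)
			queue_nr_requests = 16;
		if (queue_nr_requests > 128)
			queue_nr_requests = 128;
		batch_requests = queue_nr_requests / 8;
		if (batch_requests > 8)
			batch_requests = 8;
		printf("%7d KB RAM: %3d requests per read/write queue, batch=%d\n",
			total_ram, queue_nr_requests, batch_requests);
	}
	return 0;
}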
......
...@@ -28,7 +28,8 @@ ...@@ -28,7 +28,8 @@
#include <linux/smp_lock.h> #include <linux/smp_lock.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/suspend.h> #include <linux/suspend.h>
#include <linux/buffer_head.h> /* for fsync_bdev()/wakeup_bdflush() */ #include <linux/writeback.h>
#include <linux/buffer_head.h> /* for fsync_bdev() */
#include <linux/spinlock.h> #include <linux/spinlock.h>
...@@ -227,7 +228,7 @@ static void sysrq_handle_sync(int key, struct pt_regs *pt_regs, ...@@ -227,7 +228,7 @@ static void sysrq_handle_sync(int key, struct pt_regs *pt_regs,
struct tty_struct *tty) struct tty_struct *tty)
{ {
emergency_sync_scheduled = EMERG_SYNC; emergency_sync_scheduled = EMERG_SYNC;
wakeup_bdflush(); wakeup_bdflush(0);
} }
static struct sysrq_key_op sysrq_sync_op = { static struct sysrq_key_op sysrq_sync_op = {
handler: sysrq_handle_sync, handler: sysrq_handle_sync,
...@@ -239,7 +240,7 @@ static void sysrq_handle_mountro(int key, struct pt_regs *pt_regs, ...@@ -239,7 +240,7 @@ static void sysrq_handle_mountro(int key, struct pt_regs *pt_regs,
struct tty_struct *tty) struct tty_struct *tty)
{ {
emergency_sync_scheduled = EMERG_REMOUNT; emergency_sync_scheduled = EMERG_REMOUNT;
wakeup_bdflush(); wakeup_bdflush(0);
} }
static struct sysrq_key_op sysrq_mountro_op = { static struct sysrq_key_op sysrq_mountro_op = {
handler: sysrq_handle_mountro, handler: sysrq_handle_mountro,
......
...@@ -458,19 +458,17 @@ void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers) ...@@ -458,19 +458,17 @@ void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
} }
/* /*
* FIXME: What is this function actually trying to do? Why "zones[0]"? * Kick pdflush then try to free up some ZONE_NORMAL memory.
* Is it still correct/needed if/when blockdev mappings use GFP_HIGHUSER?
*/ */
static void free_more_memory(void) static void free_more_memory(void)
{ {
struct zone *zone; struct zone *zone;
zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0]; zone = contig_page_data.node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones[0];
wakeup_bdflush(1024);
wakeup_bdflush();
try_to_free_pages(zone, GFP_NOFS, 0);
blk_run_queues(); blk_run_queues();
yield(); yield();
try_to_free_pages(zone, GFP_NOFS, 0);
} }
/* /*
......
...@@ -16,9 +16,9 @@ ...@@ -16,9 +16,9 @@
#include "ext2.h" #include "ext2.h"
#include <linux/quotaops.h> #include <linux/quotaops.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h> #include <linux/buffer_head.h>
/* /*
* ialloc.c contains the inodes allocation and deallocation routines * ialloc.c contains the inodes allocation and deallocation routines
*/ */
...@@ -169,6 +169,13 @@ static void ext2_preread_inode(struct inode *inode) ...@@ -169,6 +169,13 @@ static void ext2_preread_inode(struct inode *inode)
unsigned long block; unsigned long block;
struct buffer_head *bh; struct buffer_head *bh;
struct ext2_group_desc * gdp; struct ext2_group_desc * gdp;
struct backing_dev_info *bdi;
bdi = inode->i_mapping->backing_dev_info;
if (bdi_read_congested(bdi))
return;
if (bdi_write_congested(bdi))
return;
block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
gdp = ext2_get_group_desc(inode->i_sb, block_group, &bh); gdp = ext2_get_group_desc(inode->i_sb, block_group, &bh);
......
...@@ -1473,7 +1473,7 @@ struct address_space_operations ext3_aops = { ...@@ -1473,7 +1473,7 @@ struct address_space_operations ext3_aops = {
}; };
/* For writeback mode, we can use mpage_writepages() */ /* For writeback mode, we can use mpage_writepages() */
#if 0 /* Doesn't work for shared mappings */
static int static int
ext3_writepages(struct address_space *mapping, struct writeback_control *wbc) ext3_writepages(struct address_space *mapping, struct writeback_control *wbc)
{ {
...@@ -1486,12 +1486,12 @@ ext3_writepages(struct address_space *mapping, struct writeback_control *wbc) ...@@ -1486,12 +1486,12 @@ ext3_writepages(struct address_space *mapping, struct writeback_control *wbc)
ret = err; ret = err;
return ret; return ret;
} }
#endif
struct address_space_operations ext3_writeback_aops = { struct address_space_operations ext3_writeback_aops = {
.readpage = ext3_readpage, /* BKL not held. Don't need */ .readpage = ext3_readpage, /* BKL not held. Don't need */
.readpages = ext3_readpages, /* BKL not held. Don't need */ .readpages = ext3_readpages, /* BKL not held. Don't need */
.writepage = ext3_writepage, /* BKL not held. We take it */ .writepage = ext3_writepage, /* BKL not held. We take it */
.writepages = ext3_writepages, /* BKL not held. Don't need */
.sync_page = block_sync_page, .sync_page = block_sync_page,
.prepare_write = ext3_prepare_write, /* BKL not held. We take it */ .prepare_write = ext3_prepare_write, /* BKL not held. We take it */
.commit_write = ext3_commit_write, /* BKL not held. We take it */ .commit_write = ext3_commit_write, /* BKL not held. We take it */
......
...@@ -220,44 +220,52 @@ __writeback_single_inode(struct inode *inode, int sync, ...@@ -220,44 +220,52 @@ __writeback_single_inode(struct inode *inode, int sync,
* *
 * FIXME: this linear search could get expensive with many filesystems. But * how to fix? We need to go from an address_space to all inodes which share
* how to fix? We need to go from an address_space to all inodes which share * how to fix? We need to go from an address_space to all inodes which share
* a queue with that address_space. * a queue with that address_space. (Easy: have a global "dirty superblocks"
* list).
* *
* The inodes to be written are parked on sb->s_io. They are moved back onto * The inodes to be written are parked on sb->s_io. They are moved back onto
* sb->s_dirty as they are selected for writing. This way, none can be missed * sb->s_dirty as they are selected for writing. This way, none can be missed
* on the writer throttling path, and we get decent balancing between many * on the writer throttling path, and we get decent balancing between many
 * thrlttled threads: we don't want them all piling up on __wait_on_inode. * throttled threads: we don't want them all piling up on __wait_on_inode.
*/ */
static void static void
sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
{ {
struct list_head *tmp;
struct list_head *head;
const unsigned long start = jiffies; /* livelock avoidance */ const unsigned long start = jiffies; /* livelock avoidance */
list_splice_init(&sb->s_dirty, &sb->s_io); list_splice_init(&sb->s_dirty, &sb->s_io);
head = &sb->s_io; while (!list_empty(&sb->s_io)) {
while ((tmp = head->prev) != head) { struct inode *inode = list_entry(sb->s_io.prev,
struct inode *inode = list_entry(tmp, struct inode, i_list); struct inode, i_list);
struct address_space *mapping = inode->i_mapping; struct address_space *mapping = inode->i_mapping;
struct backing_dev_info *bdi; struct backing_dev_info *bdi = mapping->backing_dev_info;
int really_sync; int really_sync;
if (wbc->bdi && mapping->backing_dev_info != wbc->bdi) { if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
if (sb != blockdev_superblock) if (sb != blockdev_superblock)
break; /* inappropriate superblock */ break; /* Skip a congested fs */
list_move(&inode->i_list, &sb->s_dirty); list_move(&inode->i_list, &sb->s_dirty);
continue; /* not this blockdev */ continue; /* Skip a congested blockdev */
}
if (wbc->bdi && bdi != wbc->bdi) {
if (sb != blockdev_superblock)
break; /* fs has the wrong queue */
list_move(&inode->i_list, &sb->s_dirty);
continue; /* blockdev has wrong queue */
} }
/* Was this inode dirtied after sync_sb_inodes was called? */ /* Was this inode dirtied after sync_sb_inodes was called? */
if (time_after(mapping->dirtied_when, start)) if (time_after(mapping->dirtied_when, start))
break; break;
/* Was this inode dirtied too recently? */
if (wbc->older_than_this && time_after(mapping->dirtied_when, if (wbc->older_than_this && time_after(mapping->dirtied_when,
*wbc->older_than_this)) *wbc->older_than_this))
goto out; break;
bdi = mapping->backing_dev_info; /* Is another pdflush already flushing this queue? */
if (current_is_pdflush() && !writeback_acquire(bdi)) if (current_is_pdflush() && !writeback_acquire(bdi))
break; break;
...@@ -278,11 +286,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) ...@@ -278,11 +286,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
if (wbc->nr_to_write <= 0) if (wbc->nr_to_write <= 0)
break; break;
} }
out: return; /* Leave any unwritten inodes on s_io */
/*
* Leave any unwritten inodes on s_io.
*/
return;
} }
/* /*
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <linux/prefetch.h> #include <linux/prefetch.h>
#include <linux/mpage.h> #include <linux/mpage.h>
#include <linux/writeback.h> #include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h> #include <linux/pagevec.h>
/* /*
...@@ -522,6 +523,7 @@ int ...@@ -522,6 +523,7 @@ int
mpage_writepages(struct address_space *mapping, mpage_writepages(struct address_space *mapping,
struct writeback_control *wbc, get_block_t get_block) struct writeback_control *wbc, get_block_t get_block)
{ {
struct backing_dev_info *bdi = mapping->backing_dev_info;
struct bio *bio = NULL; struct bio *bio = NULL;
sector_t last_block_in_bio = 0; sector_t last_block_in_bio = 0;
int ret = 0; int ret = 0;
...@@ -530,6 +532,12 @@ mpage_writepages(struct address_space *mapping, ...@@ -530,6 +532,12 @@ mpage_writepages(struct address_space *mapping,
struct pagevec pvec; struct pagevec pvec;
int (*writepage)(struct page *); int (*writepage)(struct page *);
if (wbc->nonblocking && bdi_write_congested(bdi)) {
blk_run_queues();
wbc->encountered_congestion = 1;
return 0;
}
writepage = NULL; writepage = NULL;
if (get_block == NULL) if (get_block == NULL)
writepage = mapping->a_ops->writepage; writepage = mapping->a_ops->writepage;
...@@ -585,6 +593,11 @@ mpage_writepages(struct address_space *mapping, ...@@ -585,6 +593,11 @@ mpage_writepages(struct address_space *mapping,
} }
if (ret || (--(wbc->nr_to_write) <= 0)) if (ret || (--(wbc->nr_to_write) <= 0))
done = 1; done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
blk_run_queues();
wbc->encountered_congestion = 1;
done = 1;
}
} else { } else {
unlock_page(page); unlock_page(page);
} }
......
...@@ -8,11 +8,15 @@ ...@@ -8,11 +8,15 @@
#ifndef _LINUX_BACKING_DEV_H #ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H
#include <asm/atomic.h>
/* /*
* Bits in backing_dev_info.state * Bits in backing_dev_info.state
*/ */
enum bdi_state { enum bdi_state {
BDI_pdflush, /* A pdflush thread is working this device */ BDI_pdflush, /* A pdflush thread is working this device */
BDI_write_congested, /* The write queue is getting full */
BDI_read_congested, /* The read queue is getting full */
BDI_unused, /* Available bits start here */ BDI_unused, /* Available bits start here */
}; };
...@@ -28,4 +32,14 @@ int writeback_acquire(struct backing_dev_info *bdi); ...@@ -28,4 +32,14 @@ int writeback_acquire(struct backing_dev_info *bdi);
int writeback_in_progress(struct backing_dev_info *bdi); int writeback_in_progress(struct backing_dev_info *bdi);
void writeback_release(struct backing_dev_info *bdi); void writeback_release(struct backing_dev_info *bdi);
static inline int bdi_read_congested(struct backing_dev_info *bdi)
{
return test_bit(BDI_read_congested, &bdi->state);
}
static inline int bdi_write_congested(struct backing_dev_info *bdi)
{
return test_bit(BDI_write_congested, &bdi->state);
}
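Editorial note, not part of the patch: these helpers let submitters of optional I/O test a device's congestion state without taking any locks. A condensed sketch of the typical caller, mirroring the ext2_preread_inode() hunk in this patch (kernel context assumed; `inode' comes from the caller):

	struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;

	if (bdi_read_congested(bdi) || bdi_write_congested(bdi))
		return;		/* the preread is only an optimisation; skip it under load */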
#endif /* _LINUX_BACKING_DEV_H */ #endif /* _LINUX_BACKING_DEV_H */
...@@ -345,6 +345,7 @@ extern void blk_queue_end_tag(request_queue_t *, struct request *); ...@@ -345,6 +345,7 @@ extern void blk_queue_end_tag(request_queue_t *, struct request *);
extern int blk_queue_init_tags(request_queue_t *, int); extern int blk_queue_init_tags(request_queue_t *, int);
extern void blk_queue_free_tags(request_queue_t *); extern void blk_queue_free_tags(request_queue_t *);
extern void blk_queue_invalidate_tags(request_queue_t *); extern void blk_queue_invalidate_tags(request_queue_t *);
extern void blk_congestion_wait(int rw, long timeout);
#define MAX_PHYS_SEGMENTS 128 #define MAX_PHYS_SEGMENTS 128
#define MAX_HW_SEGMENTS 128 #define MAX_HW_SEGMENTS 128
......
...@@ -163,7 +163,6 @@ struct buffer_head * __getblk(struct block_device *, sector_t, int); ...@@ -163,7 +163,6 @@ struct buffer_head * __getblk(struct block_device *, sector_t, int);
void __brelse(struct buffer_head *); void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *); void __bforget(struct buffer_head *);
struct buffer_head *__bread(struct block_device *, sector_t block, int size); struct buffer_head *__bread(struct block_device *, sector_t block, int size);
void wakeup_bdflush(void);
struct buffer_head *alloc_buffer_head(void); struct buffer_head *alloc_buffer_head(void);
void free_buffer_head(struct buffer_head * bh); void free_buffer_head(struct buffer_head * bh);
void FASTCALL(unlock_buffer(struct buffer_head *bh)); void FASTCALL(unlock_buffer(struct buffer_head *bh));
......
...@@ -273,6 +273,7 @@ extern struct user_struct root_user; ...@@ -273,6 +273,7 @@ extern struct user_struct root_user;
#define INIT_USER (&root_user) #define INIT_USER (&root_user)
typedef struct prio_array prio_array_t; typedef struct prio_array prio_array_t;
struct backing_dev_info;
struct task_struct { struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
...@@ -398,6 +399,7 @@ struct task_struct { ...@@ -398,6 +399,7 @@ struct task_struct {
/* journalling filesystem info */ /* journalling filesystem info */
void *journal_info; void *journal_info;
struct dentry *proc_dentry; struct dentry *proc_dentry;
struct backing_dev_info *backing_dev_info;
}; };
extern void __put_task_struct(struct task_struct *tsk); extern void __put_task_struct(struct task_struct *tsk);
......
...@@ -43,6 +43,8 @@ struct writeback_control { ...@@ -43,6 +43,8 @@ struct writeback_control {
older than this */ older than this */
long nr_to_write; /* Write this many pages, and decrement long nr_to_write; /* Write this many pages, and decrement
this for each page written */ this for each page written */
int nonblocking; /* Don't get stuck on request queues */
int encountered_congestion; /* An output: a queue is full */
}; };
void writeback_inodes(struct writeback_control *wbc); void writeback_inodes(struct writeback_control *wbc);
...@@ -61,6 +63,8 @@ static inline void wait_on_inode(struct inode *inode) ...@@ -61,6 +63,8 @@ static inline void wait_on_inode(struct inode *inode)
/* /*
* mm/page-writeback.c * mm/page-writeback.c
*/ */
int wakeup_bdflush(long nr_pages);
/* These 5 are exported to sysctl. */ /* These 5 are exported to sysctl. */
extern int dirty_background_ratio; extern int dirty_background_ratio;
extern int dirty_async_ratio; extern int dirty_async_ratio;
......
...@@ -81,7 +81,6 @@ unsigned char software_suspend_enabled = 0; ...@@ -81,7 +81,6 @@ unsigned char software_suspend_enabled = 0;
#define TIMEOUT (6 * HZ) /* Timeout for stopping processes */ #define TIMEOUT (6 * HZ) /* Timeout for stopping processes */
#define ADDRESS(x) ((unsigned long) phys_to_virt(((x) << PAGE_SHIFT))) #define ADDRESS(x) ((unsigned long) phys_to_virt(((x) << PAGE_SHIFT)))
extern void wakeup_bdflush(void);
extern int C_A_D; extern int C_A_D;
/* References to section boundaries */ /* References to section boundaries */
......
...@@ -1755,6 +1755,9 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, ...@@ -1755,6 +1755,9 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
if (unlikely(pos < 0)) if (unlikely(pos < 0))
return -EINVAL; return -EINVAL;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
pagevec_init(&lru_pvec); pagevec_init(&lru_pvec);
if (unlikely(file->f_error)) { if (unlikely(file->f_error)) {
...@@ -1959,6 +1962,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, ...@@ -1959,6 +1962,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
err = written ? written : status; err = written ? written : status;
out: out:
pagevec_lru_add(&lru_pvec); pagevec_lru_add(&lru_pvec);
current->backing_dev_info = 0;
return err; return err;
} }
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/buffer_head.h> /* for wakeup_bdflush() */ #include <linux/writeback.h>
static void add_element(mempool_t *pool, void *element) static void add_element(mempool_t *pool, void *element)
{ {
...@@ -210,7 +210,7 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask) ...@@ -210,7 +210,7 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask)
/* /*
* Kick the VM at this point. * Kick the VM at this point.
*/ */
wakeup_bdflush(); wakeup_bdflush(0);
spin_lock_irqsave(&pool->lock, flags); spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) { if (likely(pool->curr_nr)) {
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <linux/init.h> #include <linux/init.h>
#include <linux/sysrq.h> #include <linux/sysrq.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/mpage.h> #include <linux/mpage.h>
#include <linux/notifier.h> #include <linux/notifier.h>
#include <linux/smp.h> #include <linux/smp.h>
...@@ -172,33 +173,47 @@ static void background_writeout(unsigned long _min_pages) ...@@ -172,33 +173,47 @@ static void background_writeout(unsigned long _min_pages)
.sync_mode = WB_SYNC_NONE, .sync_mode = WB_SYNC_NONE,
.older_than_this = NULL, .older_than_this = NULL,
.nr_to_write = 0, .nr_to_write = 0,
.nonblocking = 1,
}; };
CHECK_EMERGENCY_SYNC CHECK_EMERGENCY_SYNC
background_thresh = (dirty_background_ratio * total_pages) / 100; background_thresh = (dirty_background_ratio * total_pages) / 100;
for ( ; ; ) {
do {
struct page_state ps; struct page_state ps;
get_page_state(&ps); get_page_state(&ps);
if (ps.nr_dirty < background_thresh && min_pages <= 0) if (ps.nr_dirty < background_thresh && min_pages <= 0)
break; break;
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc); writeback_inodes(&wbc);
min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
} while (wbc.nr_to_write <= 0); if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
/* Wrote nothing */
if (wbc.encountered_congestion)
blk_congestion_wait(WRITE, HZ/10);
else
break;
}
}
blk_run_queues(); blk_run_queues();
} }
/* /*
* Start heavy writeback of everything. * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
* -1 if all pdflush threads were busy.
*/ */
void wakeup_bdflush(void) int wakeup_bdflush(long nr_pages)
{ {
if (nr_pages == 0) {
struct page_state ps; struct page_state ps;
get_page_state(&ps); get_page_state(&ps);
pdflush_operation(background_writeout, ps.nr_dirty); nr_pages = ps.nr_dirty;
}
return pdflush_operation(background_writeout, nr_pages);
} }
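Editorial note, not part of the patch: with the new signature, callers ask for an amount of writeback rather than an all-or-nothing flush. The two idioms used elsewhere in this patch are:

	wakeup_bdflush(0);	/* write back all dirty pages (sysrq, mempool_alloc) */
	wakeup_bdflush(1024);	/* write back roughly 1024 pages (free_more_memory) */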
static struct timer_list wb_timer; static struct timer_list wb_timer;
...@@ -223,25 +238,36 @@ static void wb_kupdate(unsigned long arg) ...@@ -223,25 +238,36 @@ static void wb_kupdate(unsigned long arg)
unsigned long oldest_jif; unsigned long oldest_jif;
unsigned long start_jif; unsigned long start_jif;
unsigned long next_jif; unsigned long next_jif;
long nr_to_write;
struct page_state ps; struct page_state ps;
struct writeback_control wbc = { struct writeback_control wbc = {
.bdi = NULL, .bdi = NULL,
.sync_mode = WB_SYNC_NONE, .sync_mode = WB_SYNC_NONE,
.older_than_this = &oldest_jif, .older_than_this = &oldest_jif,
.nr_to_write = 0, .nr_to_write = 0,
.nonblocking = 1,
}; };
sync_supers(); sync_supers();
get_page_state(&ps);
get_page_state(&ps);
oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
start_jif = jiffies; start_jif = jiffies;
next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
wbc.nr_to_write = ps.nr_dirty; nr_to_write = ps.nr_dirty;
while (nr_to_write > 0) {
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc); writeback_inodes(&wbc);
if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
if (wbc.encountered_congestion)
blk_congestion_wait(WRITE, HZ);
else
break; /* All the old data is written */
}
nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
}
blk_run_queues(); blk_run_queues();
yield();
if (time_before(next_jif, jiffies + HZ)) if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ; next_jif = jiffies + HZ;
mod_timer(&wb_timer, next_jif); mod_timer(&wb_timer, next_jif);
...@@ -493,7 +519,6 @@ int __set_page_dirty_buffers(struct page *page) ...@@ -493,7 +519,6 @@ int __set_page_dirty_buffers(struct page *page)
buffer_error(); buffer_error();
spin_lock(&mapping->private_lock); spin_lock(&mapping->private_lock);
if (page_has_buffers(page)) { if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page); struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head; struct buffer_head *bh = head;
...@@ -506,6 +531,7 @@ int __set_page_dirty_buffers(struct page *page) ...@@ -506,6 +531,7 @@ int __set_page_dirty_buffers(struct page *page)
bh = bh->b_this_page; bh = bh->b_this_page;
} while (bh != head); } while (bh != head);
} }
spin_unlock(&mapping->private_lock);
if (!TestSetPageDirty(page)) { if (!TestSetPageDirty(page)) {
write_lock(&mapping->page_lock); write_lock(&mapping->page_lock);
...@@ -519,7 +545,6 @@ int __set_page_dirty_buffers(struct page *page) ...@@ -519,7 +545,6 @@ int __set_page_dirty_buffers(struct page *page)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
} }
spin_unlock(&mapping->private_lock);
out: out:
return ret; return ret;
} }
......
...@@ -918,6 +918,26 @@ static int setup_swap_extents(struct swap_info_struct *sis) ...@@ -918,6 +918,26 @@ static int setup_swap_extents(struct swap_info_struct *sis)
return ret; return ret;
} }
#if 0 /* We don't need this yet */
#include <linux/backing-dev.h>
int page_queue_congested(struct page *page)
{
struct backing_dev_info *bdi;
BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
bdi = page->mapping->backing_dev_info;
if (PageSwapCache(page)) {
swp_entry_t entry = { .val = page->index };
struct swap_info_struct *sis;
sis = get_swap_info_struct(swp_type(entry));
bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
}
return bdi_write_congested(bdi);
}
#endif
asmlinkage long sys_swapoff(const char * specialfile) asmlinkage long sys_swapoff(const char * specialfile)
{ {
struct swap_info_struct * p = NULL; struct swap_info_struct * p = NULL;
......
...@@ -21,9 +21,11 @@ ...@@ -21,9 +21,11 @@
#include <linux/file.h> #include <linux/file.h>
#include <linux/writeback.h> #include <linux/writeback.h>
#include <linux/suspend.h> #include <linux/suspend.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for try_to_release_page() */ #include <linux/buffer_head.h> /* for try_to_release_page() */
#include <linux/mm_inline.h> #include <linux/mm_inline.h>
#include <linux/pagevec.h> #include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap-locking.h> #include <linux/rmap-locking.h>
#include <asm/pgalloc.h> #include <asm/pgalloc.h>
...@@ -32,11 +34,11 @@ ...@@ -32,11 +34,11 @@
/* /*
* The "priority" of VM scanning is how much of the queues we * The "priority" of VM scanning is how much of the queues we
* will scan in one go. A value of 6 for DEF_PRIORITY implies * will scan in one go. A value of 12 for DEF_PRIORITY implies
* that we'll scan 1/64th of the queues ("queue_length >> 6") * that we'll scan 1/4096th of the queues ("queue_length >> 12")
* during a normal aging round. * during a normal aging round.
*/ */
#define DEF_PRIORITY (6) #define DEF_PRIORITY 12
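Editorial note, not part of the patch: with max_scan derived from zone->nr_inactive >> priority and priority counting down from DEF_PRIORITY, each pass roughly doubles the scan window. A standalone C illustration with a hypothetical inactive-list length:

#include <stdio.h>

#define DEF_PRIORITY	12

int main(void)
{
	long nr_inactive = 100000;	/* example inactive-list length, not from the patch */
	int priority;

	for (priority = DEF_PRIORITY; priority > 0; priority--)
		printf("priority %2d: scan up to %ld pages\n",
			priority, nr_inactive >> priority);
	return 0;
}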
#ifdef ARCH_HAS_PREFETCH #ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \ #define prefetch_prev_lru_page(_page, _base, _field) \
...@@ -95,7 +97,7 @@ static inline int is_page_cache_freeable(struct page *page) ...@@ -95,7 +97,7 @@ static inline int is_page_cache_freeable(struct page *page)
static /* inline */ int static /* inline */ int
shrink_list(struct list_head *page_list, int nr_pages, shrink_list(struct list_head *page_list, int nr_pages,
unsigned int gfp_mask, int priority, int *max_scan) unsigned int gfp_mask, int *max_scan)
{ {
struct address_space *mapping; struct address_space *mapping;
LIST_HEAD(ret_pages); LIST_HEAD(ret_pages);
...@@ -117,10 +119,21 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -117,10 +119,21 @@ shrink_list(struct list_head *page_list, int nr_pages,
BUG_ON(PageActive(page)); BUG_ON(PageActive(page));
may_enter_fs = (gfp_mask & __GFP_FS) || may_enter_fs = (gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (gfp_mask & __GFP_IO)); (PageSwapCache(page) && (gfp_mask & __GFP_IO));
/*
* If the page is mapped into pagetables then wait on it, to
* throttle this allocator to the rate at which we can clear
* MAP_SHARED data. This will also throttle against swapcache
* writes.
*/
if (PageWriteback(page)) { if (PageWriteback(page)) {
if (may_enter_fs) if (may_enter_fs) {
wait_on_page_writeback(page); /* throttling */ if (page->pte.direct ||
else page->mapping->backing_dev_info ==
current->backing_dev_info) {
wait_on_page_writeback(page);
}
}
goto keep_locked; goto keep_locked;
} }
...@@ -172,15 +185,43 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -172,15 +185,43 @@ shrink_list(struct list_head *page_list, int nr_pages,
* will write it. So we're back to page-at-a-time writepage * will write it. So we're back to page-at-a-time writepage
* in LRU order. * in LRU order.
*/ */
if (PageDirty(page) && is_page_cache_freeable(page) && /*
mapping && may_enter_fs) { * If the page is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
* stalled by pagecache activity. But note that there may be
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
* If this process is currently in generic_file_write() against
* this page's queue, we can perform writeback even if that
* will block.
*
* If the page is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
* See swapfile.c:page_queue_congested().
*/
if (PageDirty(page)) {
int (*writeback)(struct page *, int (*writeback)(struct page *,
struct writeback_control *); struct writeback_control *);
struct backing_dev_info *bdi;
const int cluster_size = SWAP_CLUSTER_MAX; const int cluster_size = SWAP_CLUSTER_MAX;
struct writeback_control wbc = { struct writeback_control wbc = {
.nr_to_write = cluster_size, .nr_to_write = cluster_size,
}; };
if (!is_page_cache_freeable(page))
goto keep_locked;
if (!mapping)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
bdi = mapping->backing_dev_info;
if (bdi != current->backing_dev_info &&
bdi_write_congested(bdi))
goto keep_locked;
writeback = mapping->a_ops->vm_writeback; writeback = mapping->a_ops->vm_writeback;
if (writeback == NULL) if (writeback == NULL)
writeback = generic_vm_writeback; writeback = generic_vm_writeback;
...@@ -279,7 +320,7 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -279,7 +320,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
*/ */
static /* inline */ int static /* inline */ int
shrink_cache(int nr_pages, struct zone *zone, shrink_cache(int nr_pages, struct zone *zone,
unsigned int gfp_mask, int priority, int max_scan) unsigned int gfp_mask, int max_scan)
{ {
LIST_HEAD(page_list); LIST_HEAD(page_list);
struct pagevec pvec; struct pagevec pvec;
...@@ -298,9 +339,11 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -298,9 +339,11 @@ shrink_cache(int nr_pages, struct zone *zone,
spin_lock_irq(&zone->lru_lock); spin_lock_irq(&zone->lru_lock);
while (max_scan > 0 && nr_pages > 0) { while (max_scan > 0 && nr_pages > 0) {
struct page *page; struct page *page;
int n = 0; int nr_taken = 0;
int nr_scan = 0;
while (n < nr_to_process && !list_empty(&zone->inactive_list)) { while (nr_scan++ < nr_to_process &&
!list_empty(&zone->inactive_list)) {
page = list_entry(zone->inactive_list.prev, page = list_entry(zone->inactive_list.prev,
struct page, lru); struct page, lru);
...@@ -318,18 +361,17 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -318,18 +361,17 @@ shrink_cache(int nr_pages, struct zone *zone,
} }
list_add(&page->lru, &page_list); list_add(&page->lru, &page_list);
page_cache_get(page); page_cache_get(page);
n++; nr_taken++;
} }
zone->nr_inactive -= n; zone->nr_inactive -= nr_taken;
spin_unlock_irq(&zone->lru_lock); spin_unlock_irq(&zone->lru_lock);
if (list_empty(&page_list)) if (nr_taken == 0)
goto done; goto done;
max_scan -= n; max_scan -= nr_scan;
KERNEL_STAT_ADD(pgscan, n); KERNEL_STAT_ADD(pgscan, nr_scan);
nr_pages = shrink_list(&page_list, nr_pages, nr_pages = shrink_list(&page_list,nr_pages,gfp_mask,&max_scan);
gfp_mask, priority, &max_scan);
if (nr_pages <= 0 && list_empty(&page_list)) if (nr_pages <= 0 && list_empty(&page_list))
goto done; goto done;
...@@ -420,6 +462,15 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -420,6 +462,15 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
} }
pte_chain_unlock(page); pte_chain_unlock(page);
} }
/*
* FIXME: need to consider page_count(page) here if/when we
* reap orphaned pages via the LRU (Daniel's locking stuff)
*/
if (total_swap_pages == 0 && !page->mapping &&
!PagePrivate(page)) {
list_add(&page->lru, &l_active);
continue;
}
list_add(&page->lru, &l_inactive); list_add(&page->lru, &l_inactive);
pgdeactivate++; pgdeactivate++;
} }
...@@ -470,11 +521,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -470,11 +521,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
} }
static /* inline */ int static /* inline */ int
shrink_zone(struct zone *zone, int priority, shrink_zone(struct zone *zone, int max_scan,
unsigned int gfp_mask, int nr_pages) unsigned int gfp_mask, int nr_pages)
{ {
unsigned long ratio; unsigned long ratio;
int max_scan;
/* This is bogus for ZONE_HIGHMEM? */ /* This is bogus for ZONE_HIGHMEM? */
if (kmem_cache_reap(gfp_mask) >= nr_pages) if (kmem_cache_reap(gfp_mask) >= nr_pages)
...@@ -497,43 +547,50 @@ shrink_zone(struct zone *zone, int priority, ...@@ -497,43 +547,50 @@ shrink_zone(struct zone *zone, int priority,
atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter); atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter);
refill_inactive_zone(zone, SWAP_CLUSTER_MAX); refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
} }
nr_pages = shrink_cache(nr_pages, zone, gfp_mask, max_scan);
max_scan = zone->nr_inactive / priority;
nr_pages = shrink_cache(nr_pages, zone,
gfp_mask, priority, max_scan);
if (nr_pages <= 0)
return 0;
wakeup_bdflush();
shrink_dcache_memory(priority, gfp_mask);
/* After shrinking the dcache, get rid of unused inodes too .. */
shrink_icache_memory(1, gfp_mask);
#ifdef CONFIG_QUOTA
shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif
return nr_pages; return nr_pages;
} }
static int static int
shrink_caches(struct zone *classzone, int priority, shrink_caches(struct zone *classzone, int priority,
int gfp_mask, int nr_pages) int *total_scanned, int gfp_mask, int nr_pages)
{ {
struct zone *first_classzone; struct zone *first_classzone;
struct zone *zone; struct zone *zone;
first_classzone = classzone->zone_pgdat->node_zones; first_classzone = classzone->zone_pgdat->node_zones;
zone = classzone; for (zone = classzone; zone >= first_classzone; zone--) {
while (zone >= first_classzone && nr_pages > 0) { int max_scan;
if (zone->free_pages <= zone->pages_high) { int to_reclaim;
nr_pages = shrink_zone(zone, priority, int unreclaimed;
gfp_mask, nr_pages);
} to_reclaim = zone->pages_high - zone->free_pages;
zone--; if (to_reclaim < 0)
continue; /* zone has enough memory */
if (to_reclaim > SWAP_CLUSTER_MAX)
to_reclaim = SWAP_CLUSTER_MAX;
if (to_reclaim < nr_pages)
to_reclaim = nr_pages;
/*
* If we cannot reclaim `nr_pages' pages by scanning twice
* that many pages then fall back to the next zone.
*/
max_scan = zone->nr_inactive >> priority;
if (max_scan < to_reclaim * 2)
max_scan = to_reclaim * 2;
unreclaimed = shrink_zone(zone, max_scan, gfp_mask, to_reclaim);
nr_pages -= to_reclaim - unreclaimed;
*total_scanned += max_scan;
} }
shrink_dcache_memory(priority, gfp_mask);
shrink_icache_memory(1, gfp_mask);
#ifdef CONFIG_QUOTA
shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif
return nr_pages; return nr_pages;
} }
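Editorial note, not part of the patch: the per-zone targets above ("reclaim nr_pages by scanning at most twice that many, else move on to the next zone") reduce to a few clamps. A standalone C sketch with illustrative zone numbers:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32

int main(void)
{
	/* example zone state, not from the patch */
	long pages_high = 1024, free_pages = 400, nr_inactive = 50000;
	long to_reclaim, max_scan;
	int nr_pages = 32, priority = 12;

	to_reclaim = pages_high - free_pages;
	if (to_reclaim < 0)
		return 0;			/* zone has enough memory */
	if (to_reclaim > SWAP_CLUSTER_MAX)
		to_reclaim = SWAP_CLUSTER_MAX;
	if (to_reclaim < nr_pages)
		to_reclaim = nr_pages;

	max_scan = nr_inactive >> priority;
	if (max_scan < to_reclaim * 2)
		max_scan = to_reclaim * 2;

	printf("reclaim %ld pages, scanning at most %ld\n", to_reclaim, max_scan);
	return 0;
}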
...@@ -564,12 +621,25 @@ try_to_free_pages(struct zone *classzone, ...@@ -564,12 +621,25 @@ try_to_free_pages(struct zone *classzone,
KERNEL_STAT_INC(pageoutrun); KERNEL_STAT_INC(pageoutrun);
for (priority = DEF_PRIORITY; priority; priority--) { for (priority = DEF_PRIORITY; priority; priority--) {
nr_pages = shrink_caches(classzone, priority, int total_scanned = 0;
nr_pages = shrink_caches(classzone, priority, &total_scanned,
gfp_mask, nr_pages); gfp_mask, nr_pages);
if (nr_pages <= 0) if (nr_pages <= 0)
return 1; return 1;
if (total_scanned == 0)
return 1; /* All zones had enough free memory */
if (!(gfp_mask & __GFP_FS)) if (!(gfp_mask & __GFP_FS))
break; break; /* Let the caller handle it */
/*
* Try to write back as many pages as we just scanned. Not
* sure if that makes sense, but it's an attempt to avoid
* creating IO storms unnecessarily
*/
wakeup_bdflush(total_scanned);
/* Take a nap, wait for some writeback to complete */
blk_congestion_wait(WRITE, HZ/4);
} }
if (gfp_mask & __GFP_FS) if (gfp_mask & __GFP_FS)
out_of_memory(); out_of_memory();
......