Commit e7144e64 authored by Linus Torvalds

Merge master.kernel.org:/home/davem/BK/net-2.5

into home.transmeta.com:/home/torvalds/v2.5/linux
parents da29f6a8 407ee6c8
......@@ -46,13 +46,76 @@ static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
/*
* How many reqeusts do we allocate per queue,
* and how many do we "batch" on freeing them?
* Number of requests per queue. This many for reads and for writes (twice
* this number, total).
*/
int queue_nr_requests, batch_requests;
static int queue_nr_requests;
/*
* How many free requests must be available before we wake a process which
* is waiting for a request?
*/
static int batch_requests;
unsigned long blk_max_low_pfn, blk_max_pfn;
int blk_nohighio = 0;
/*
* Global congestion bookkeeping shared by all request queues. There are two
* entries and the array is indexed directly by the rw value (READ or WRITE)
* passed to set_queue_congested(), clear_queue_congested() and
* blk_congestion_wait().
*/
static struct congestion_state {
/* Tasks sleeping in blk_congestion_wait(); woken by clear_queue_congested() */
wait_queue_head_t wqh;
/* Incremented/decremented when a queue's congested bit is set/cleared */
atomic_t nr_congested_queues;
} congestion_states[2];
/*
 * Return the threshold (number of free requests) below which the queue is
 * considered to be congested.  It includes a little hysteresis relative to
 * queue_congestion_off_threshold() to keep the context switch rate down.
 *
 * The value is queue_nr_requests / 4 - 1, clamped to a minimum of 1.
 * get_request() compares with a strict "count < threshold", so a threshold
 * of zero could never trigger; the clamp therefore has to catch zero as well
 * as negative results.
 */
static inline int queue_congestion_on_threshold(void)
{
	int ret;

	ret = queue_nr_requests / 4 - 1;
	if (ret < 1)
		ret = 1;
	return ret;
}
/*
 * The free-request count at (or above) which a queue is considered to be
 * uncongested again: one more than a quarter of queue_nr_requests, but never
 * more than queue_nr_requests itself.
 */
static inline int queue_congestion_off_threshold(void)
{
	const int limit = queue_nr_requests / 4 + 1;

	return (limit > queue_nr_requests) ? queue_nr_requests : limit;
}
/*
 * Clear the read or write congestion bit (selected by @rw) in @q's
 * backing_dev_info and wake anybody sleeping on the matching global
 * congestion wait queue.
 */
static void clear_queue_congested(request_queue_t *q, int rw)
{
	struct congestion_state *state = &congestion_states[rw];
	enum bdi_state flag;

	if (rw == WRITE)
		flag = BDI_write_congested;
	else
		flag = BDI_read_congested;

	/* Only drop the global count if this call really cleared the bit */
	if (test_and_clear_bit(flag, &q->backing_dev_info.state))
		atomic_dec(&state->nr_congested_queues);

	/* Waiters are woken whether or not the bit had been set */
	if (waitqueue_active(&state->wqh))
		wake_up(&state->wqh);
}
/*
 * Set the read or write congestion bit (selected by @rw) in @q's
 * backing_dev_info.  The global per-direction count of congested queues is
 * bumped only on the clear -> set transition.
 */
static void set_queue_congested(request_queue_t *q, int rw)
{
	enum bdi_state flag;

	if (rw == WRITE)
		flag = BDI_write_congested;
	else
		flag = BDI_read_congested;

	if (test_and_set_bit(flag, &q->backing_dev_info.state))
		return;		/* already marked congested */

	atomic_inc(&congestion_states[rw].nr_congested_queues);
}
/**
* bdev_get_queue: - return the queue that matches the given device
* @bdev: device
......@@ -360,8 +423,8 @@ int blk_queue_init_tags(request_queue_t *q, int depth)
struct blk_queue_tag *tags;
int bits, i;
if (depth > queue_nr_requests) {
depth = queue_nr_requests;
if (depth > (queue_nr_requests*2)) {
depth = (queue_nr_requests*2);
printk("blk_queue_init_tags: adjusted depth to %d\n", depth);
}
......@@ -1019,7 +1082,7 @@ static int __blk_cleanup_queue(struct request_list *list)
**/
void blk_cleanup_queue(request_queue_t * q)
{
int count = queue_nr_requests;
int count = (queue_nr_requests*2);
count -= __blk_cleanup_queue(&q->rq[READ]);
count -= __blk_cleanup_queue(&q->rq[WRITE]);
......@@ -1050,7 +1113,7 @@ static int blk_init_free_list(request_queue_t *q)
* Divide requests in half between read and write
*/
rl = &q->rq[READ];
for (i = 0; i < queue_nr_requests; i++) {
for (i = 0; i < (queue_nr_requests*2); i++) {
rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL);
if (!rq)
goto nomem;
......@@ -1058,7 +1121,7 @@ static int blk_init_free_list(request_queue_t *q)
/*
* half way through, switch to WRITE list
*/
if (i == queue_nr_requests / 2)
if (i == queue_nr_requests)
rl = &q->rq[WRITE];
memset(rq, 0, sizeof(struct request));
......@@ -1144,7 +1207,7 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock)
* Get a free request. queue lock must be held and interrupts
* disabled on the way in.
*/
static inline struct request *get_request(request_queue_t *q, int rw)
static struct request *get_request(request_queue_t *q, int rw)
{
struct request *rq = NULL;
struct request_list *rl = q->rq + rw;
......@@ -1153,6 +1216,8 @@ static inline struct request *get_request(request_queue_t *q, int rw)
rq = blkdev_free_rq(&rl->free);
list_del(&rq->queuelist);
rl->count--;
if (rl->count < queue_congestion_on_threshold())
set_queue_congested(q, rw);
rq->flags = 0;
rq->rq_status = RQ_ACTIVE;
rq->special = NULL;
......@@ -1365,13 +1430,50 @@ void blk_put_request(struct request *req)
* it didn't come out of our reserved rq pools
*/
if (rl) {
int rw = 0;
list_add(&req->queuelist, &rl->free);
if (++rl->count >= batch_requests &&waitqueue_active(&rl->wait))
if (rl == &q->rq[WRITE])
rw = WRITE;
else if (rl == &q->rq[READ])
rw = READ;
else
BUG();
rl->count++;
if (rl->count >= queue_congestion_off_threshold())
clear_queue_congested(q, rw);
if (rl->count >= batch_requests && waitqueue_active(&rl->wait))
wake_up(&rl->wait);
}
}
/**
* blk_congestion_wait - wait for a queue to become uncongested
* @rw: READ or WRITE; used directly as the index into congestion_states[]
* @timeout: timeout in jiffies
*
* Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
* If no queues are congested then just return, in the hope that the caller
* will submit some more IO.
*
* The sleep is uninterruptible and happens at most once: the function returns
* after the first wakeup on the congestion wait queue or when the timeout
* expires, without re-checking that the congested-queue count reached zero.
* Wakeups on that wait queue are issued by clear_queue_congested().
*/
void blk_congestion_wait(int rw, long timeout)
{
DECLARE_WAITQUEUE(wait, current);
struct congestion_state *cs = &congestion_states[rw];
/* Fast path: nothing is congested in this direction, so do not sleep */
if (atomic_read(&cs->nr_congested_queues) == 0)
return;
/* NOTE(review): presumably kicks pending I/O so congestion can drain - confirm */
blk_run_queues();
/*
* Change task state and join the wait queue before re-checking the
* counter, so the re-check happens only after we are visible on the
* wait queue to a concurrent clear_queue_congested().
*/
set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(&cs->wqh, &wait);
if (atomic_read(&cs->nr_congested_queues) != 0)
schedule_timeout(timeout);
/* Covers the path where the re-check found no congestion and we never slept */
set_current_state(TASK_RUNNING);
remove_wait_queue(&cs->wqh, &wait);
}
/*
* Has to be called with the request spinlock acquired
*/
......@@ -1868,6 +1970,7 @@ void end_that_request_last(struct request *req)
int __init blk_dev_init(void)
{
int total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
int i;
request_cachep = kmem_cache_create("blkdev_requests",
sizeof(struct request), 0,
......@@ -1876,26 +1979,33 @@ int __init blk_dev_init(void)
panic("Can't create request pool slab cache\n");
/*
* Free request slots per queue.
* (Half for reads, half for writes)
*/
queue_nr_requests = (total_ram >> 8) & ~15; /* One per quarter-megabyte */
if (queue_nr_requests < 32)
queue_nr_requests = 32;
if (queue_nr_requests > 256)
queue_nr_requests = 256;
/*
* Batch frees according to queue length
* Free request slots per queue. One per quarter-megabyte.
* We use this many requests for reads, and this many for writes.
*/
if ((batch_requests = queue_nr_requests / 4) > 32)
batch_requests = 32;
printk("block: %d slots per queue, batch=%d\n",
queue_nr_requests, batch_requests);
queue_nr_requests = (total_ram >> 9) & ~7;
if (queue_nr_requests < 16)
queue_nr_requests = 16;
if (queue_nr_requests > 128)
queue_nr_requests = 128;
batch_requests = queue_nr_requests / 8;
if (batch_requests > 8)
batch_requests = 8;
printk("block request queues:\n");
printk(" %d requests per read queue\n", queue_nr_requests);
printk(" %d requests per write queue\n", queue_nr_requests);
printk(" %d requests per batch\n", batch_requests);
printk(" enter congestion at %d\n", queue_congestion_on_threshold());
printk(" exit congestion at %d\n", queue_congestion_off_threshold());
blk_max_low_pfn = max_low_pfn;
blk_max_pfn = max_pfn;
for (i = 0; i < ARRAY_SIZE(congestion_states); i++) {
init_waitqueue_head(&congestion_states[i].wqh);
atomic_set(&congestion_states[i].nr_congested_queues, 0);
}
return 0;
};
......
......@@ -28,7 +28,8 @@
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h> /* for fsync_bdev()/wakeup_bdflush() */
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* for fsync_bdev() */
#include <linux/spinlock.h>
......@@ -227,7 +228,7 @@ static void sysrq_handle_sync(int key, struct pt_regs *pt_regs,
struct tty_struct *tty)
{
emergency_sync_scheduled = EMERG_SYNC;
wakeup_bdflush();
wakeup_bdflush(0);
}
static struct sysrq_key_op sysrq_sync_op = {
handler: sysrq_handle_sync,
......@@ -239,7 +240,7 @@ static void sysrq_handle_mountro(int key, struct pt_regs *pt_regs,
struct tty_struct *tty)
{
emergency_sync_scheduled = EMERG_REMOUNT;
wakeup_bdflush();
wakeup_bdflush(0);
}
static struct sysrq_key_op sysrq_mountro_op = {
handler: sysrq_handle_mountro,
......
......@@ -458,19 +458,17 @@ void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
}
/*
* FIXME: What is this function actually trying to do? Why "zones[0]"?
* Is it still correct/needed if/when blockdev mappings use GFP_HIGHUSER?
* Kick pdflush then try to free up some ZONE_NORMAL memory.
*/
static void free_more_memory(void)
{
struct zone *zone;
zone = contig_page_data.node_zonelists[GFP_NOFS & GFP_ZONEMASK].zones[0];
wakeup_bdflush();
try_to_free_pages(zone, GFP_NOFS, 0);
zone = contig_page_data.node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones[0];
wakeup_bdflush(1024);
blk_run_queues();
yield();
try_to_free_pages(zone, GFP_NOFS, 0);
}
/*
......
......@@ -16,9 +16,9 @@
#include "ext2.h"
#include <linux/quotaops.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
/*
* ialloc.c contains the inodes allocation and deallocation routines
*/
......@@ -169,6 +169,13 @@ static void ext2_preread_inode(struct inode *inode)
unsigned long block;
struct buffer_head *bh;
struct ext2_group_desc * gdp;
struct backing_dev_info *bdi;
bdi = inode->i_mapping->backing_dev_info;
if (bdi_read_congested(bdi))
return;
if (bdi_write_congested(bdi))
return;
block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
gdp = ext2_get_group_desc(inode->i_sb, block_group, &bh);
......
......@@ -1473,7 +1473,7 @@ struct address_space_operations ext3_aops = {
};
/* For writeback mode, we can use mpage_writepages() */
#if 0 /* Doesn't work for shared mappings */
static int
ext3_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
......@@ -1486,12 +1486,12 @@ ext3_writepages(struct address_space *mapping, struct writeback_control *wbc)
ret = err;
return ret;
}
#endif
struct address_space_operations ext3_writeback_aops = {
.readpage = ext3_readpage, /* BKL not held. Don't need */
.readpages = ext3_readpages, /* BKL not held. Don't need */
.writepage = ext3_writepage, /* BKL not held. We take it */
.writepages = ext3_writepages, /* BKL not held. Don't need */
.sync_page = block_sync_page,
.prepare_write = ext3_prepare_write, /* BKL not held. We take it */
.commit_write = ext3_commit_write, /* BKL not held. We take it */
......
......@@ -220,44 +220,52 @@ __writeback_single_inode(struct inode *inode, int sync,
*
* FIXME: this linear search could get expensive with many filesystems. But
* how to fix? We need to go from an address_space to all inodes which share
* a queue with that address_space.
* a queue with that address_space. (Easy: have a global "dirty superblocks"
* list).
*
* The inodes to be written are parked on sb->s_io. They are moved back onto
* sb->s_dirty as they are selected for writing. This way, none can be missed
* on the writer throttling path, and we get decent balancing between many
* thrlttled threads: we don't want them all piling up on __wait_on_inode.
* throttled threads: we don't want them all piling up on __wait_on_inode.
*/
static void
sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
{
struct list_head *tmp;
struct list_head *head;
const unsigned long start = jiffies; /* livelock avoidance */
list_splice_init(&sb->s_dirty, &sb->s_io);
head = &sb->s_io;
while ((tmp = head->prev) != head) {
struct inode *inode = list_entry(tmp, struct inode, i_list);
while (!list_empty(&sb->s_io)) {
struct inode *inode = list_entry(sb->s_io.prev,
struct inode, i_list);
struct address_space *mapping = inode->i_mapping;
struct backing_dev_info *bdi;
struct backing_dev_info *bdi = mapping->backing_dev_info;
int really_sync;
if (wbc->bdi && mapping->backing_dev_info != wbc->bdi) {
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
if (sb != blockdev_superblock)
break; /* inappropriate superblock */
break; /* Skip a congested fs */
list_move(&inode->i_list, &sb->s_dirty);
continue; /* not this blockdev */
continue; /* Skip a congested blockdev */
}
if (wbc->bdi && bdi != wbc->bdi) {
if (sb != blockdev_superblock)
break; /* fs has the wrong queue */
list_move(&inode->i_list, &sb->s_dirty);
continue; /* blockdev has wrong queue */
}
/* Was this inode dirtied after sync_sb_inodes was called? */
if (time_after(mapping->dirtied_when, start))
break;
/* Was this inode dirtied too recently? */
if (wbc->older_than_this && time_after(mapping->dirtied_when,
*wbc->older_than_this))
goto out;
break;
bdi = mapping->backing_dev_info;
/* Is another pdflush already flushing this queue? */
if (current_is_pdflush() && !writeback_acquire(bdi))
break;
......@@ -278,11 +286,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
if (wbc->nr_to_write <= 0)
break;
}
out:
/*
* Leave any unwritten inodes on s_io.
*/
return;
return; /* Leave any unwritten inodes on s_io */
}
/*
......
......@@ -22,6 +22,7 @@
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
/*
......@@ -522,6 +523,7 @@ int
mpage_writepages(struct address_space *mapping,
struct writeback_control *wbc, get_block_t get_block)
{
struct backing_dev_info *bdi = mapping->backing_dev_info;
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
int ret = 0;
......@@ -530,6 +532,12 @@ mpage_writepages(struct address_space *mapping,
struct pagevec pvec;
int (*writepage)(struct page *);
if (wbc->nonblocking && bdi_write_congested(bdi)) {
blk_run_queues();
wbc->encountered_congestion = 1;
return 0;
}
writepage = NULL;
if (get_block == NULL)
writepage = mapping->a_ops->writepage;
......@@ -585,6 +593,11 @@ mpage_writepages(struct address_space *mapping,
}
if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
blk_run_queues();
wbc->encountered_congestion = 1;
done = 1;
}
} else {
unlock_page(page);
}
......
......@@ -8,11 +8,15 @@
#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H
#include <asm/atomic.h>
/*
* Bits in backing_dev_info.state
*
* The enumerators are bit numbers, not masks: they are passed to
* test_bit()/test_and_set_bit()/test_and_clear_bit() on the state word.
*/
enum bdi_state {
BDI_pdflush, /* A pdflush thread is working this device */
BDI_write_congested, /* The write queue is getting full */
BDI_read_congested, /* The read queue is getting full */
BDI_unused, /* Available bits start here */
};
......@@ -28,4 +32,14 @@ int writeback_acquire(struct backing_dev_info *bdi);
int writeback_in_progress(struct backing_dev_info *bdi);
void writeback_release(struct backing_dev_info *bdi);
/* Report whether the BDI_read_congested bit is set in @bdi's state word. */
static inline int bdi_read_congested(struct backing_dev_info *bdi)
{
	const int congested = test_bit(BDI_read_congested, &bdi->state);

	return congested;
}
/* Report whether the BDI_write_congested bit is set in @bdi's state word. */
static inline int bdi_write_congested(struct backing_dev_info *bdi)
{
	const int congested = test_bit(BDI_write_congested, &bdi->state);

	return congested;
}
#endif /* _LINUX_BACKING_DEV_H */
......@@ -345,6 +345,7 @@ extern void blk_queue_end_tag(request_queue_t *, struct request *);
extern int blk_queue_init_tags(request_queue_t *, int);
extern void blk_queue_free_tags(request_queue_t *);
extern void blk_queue_invalidate_tags(request_queue_t *);
extern void blk_congestion_wait(int rw, long timeout);
#define MAX_PHYS_SEGMENTS 128
#define MAX_HW_SEGMENTS 128
......
......@@ -163,7 +163,6 @@ struct buffer_head * __getblk(struct block_device *, sector_t, int);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
struct buffer_head *__bread(struct block_device *, sector_t block, int size);
void wakeup_bdflush(void);
struct buffer_head *alloc_buffer_head(void);
void free_buffer_head(struct buffer_head * bh);
void FASTCALL(unlock_buffer(struct buffer_head *bh));
......
......@@ -273,6 +273,7 @@ extern struct user_struct root_user;
#define INIT_USER (&root_user)
typedef struct prio_array prio_array_t;
struct backing_dev_info;
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
......@@ -398,6 +399,7 @@ struct task_struct {
/* journalling filesystem info */
void *journal_info;
struct dentry *proc_dentry;
struct backing_dev_info *backing_dev_info;
};
extern void __put_task_struct(struct task_struct *tsk);
......
......@@ -43,6 +43,8 @@ struct writeback_control {
older than this */
long nr_to_write; /* Write this many pages, and decrement
this for each page written */
int nonblocking; /* Don't get stuck on request queues */
int encountered_congestion; /* An output: a queue is full */
};
void writeback_inodes(struct writeback_control *wbc);
......@@ -61,6 +63,8 @@ static inline void wait_on_inode(struct inode *inode)
/*
* mm/page-writeback.c
*/
int wakeup_bdflush(long nr_pages);
/* These 5 are exported to sysctl. */
extern int dirty_background_ratio;
extern int dirty_async_ratio;
......
......@@ -81,7 +81,6 @@ unsigned char software_suspend_enabled = 0;
#define TIMEOUT (6 * HZ) /* Timeout for stopping processes */
#define ADDRESS(x) ((unsigned long) phys_to_virt(((x) << PAGE_SHIFT)))
extern void wakeup_bdflush(void);
extern int C_A_D;
/* References to section boundaries */
......
......@@ -1755,6 +1755,9 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
if (unlikely(pos < 0))
return -EINVAL;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
pagevec_init(&lru_pvec);
if (unlikely(file->f_error)) {
......@@ -1959,6 +1962,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
err = written ? written : status;
out:
pagevec_lru_add(&lru_pvec);
current->backing_dev_info = 0;
return err;
}
......
......@@ -12,7 +12,7 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/buffer_head.h> /* for wakeup_bdflush() */
#include <linux/writeback.h>
static void add_element(mempool_t *pool, void *element)
{
......@@ -210,7 +210,7 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask)
/*
* Kick the VM at this point.
*/
wakeup_bdflush();
wakeup_bdflush(0);
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
......
......@@ -21,6 +21,7 @@
#include <linux/init.h>
#include <linux/sysrq.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/notifier.h>
#include <linux/smp.h>
......@@ -172,33 +173,47 @@ static void background_writeout(unsigned long _min_pages)
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = 0,
.nonblocking = 1,
};
CHECK_EMERGENCY_SYNC
background_thresh = (dirty_background_ratio * total_pages) / 100;
do {
for ( ; ; ) {
struct page_state ps;
get_page_state(&ps);
if (ps.nr_dirty < background_thresh && min_pages <= 0)
break;
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc);
min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
} while (wbc.nr_to_write <= 0);
if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
/* Wrote nothing */
if (wbc.encountered_congestion)
blk_congestion_wait(WRITE, HZ/10);
else
break;
}
}
blk_run_queues();
}
/*
* Start heavy writeback of everything.
* Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
* -1 if all pdflush threads were busy.
*/
void wakeup_bdflush(void)
int wakeup_bdflush(long nr_pages)
{
struct page_state ps;
if (nr_pages == 0) {
struct page_state ps;
get_page_state(&ps);
pdflush_operation(background_writeout, ps.nr_dirty);
get_page_state(&ps);
nr_pages = ps.nr_dirty;
}
return pdflush_operation(background_writeout, nr_pages);
}
static struct timer_list wb_timer;
......@@ -223,25 +238,36 @@ static void wb_kupdate(unsigned long arg)
unsigned long oldest_jif;
unsigned long start_jif;
unsigned long next_jif;
long nr_to_write;
struct page_state ps;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = &oldest_jif,
.nr_to_write = 0,
.nonblocking = 1,
};
sync_supers();
get_page_state(&ps);
get_page_state(&ps);
oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
start_jif = jiffies;
next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
wbc.nr_to_write = ps.nr_dirty;
writeback_inodes(&wbc);
nr_to_write = ps.nr_dirty;
while (nr_to_write > 0) {
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc);
if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
if (wbc.encountered_congestion)
blk_congestion_wait(WRITE, HZ);
else
break; /* All the old data is written */
}
nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
}
blk_run_queues();
yield();
if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ;
mod_timer(&wb_timer, next_jif);
......@@ -493,7 +519,6 @@ int __set_page_dirty_buffers(struct page *page)
buffer_error();
spin_lock(&mapping->private_lock);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
......@@ -506,6 +531,7 @@ int __set_page_dirty_buffers(struct page *page)
bh = bh->b_this_page;
} while (bh != head);
}
spin_unlock(&mapping->private_lock);
if (!TestSetPageDirty(page)) {
write_lock(&mapping->page_lock);
......@@ -519,7 +545,6 @@ int __set_page_dirty_buffers(struct page *page)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
spin_unlock(&mapping->private_lock);
out:
return ret;
}
......
......@@ -918,6 +918,26 @@ static int setup_swap_extents(struct swap_info_struct *sis)
return ret;
}
#if 0 /* We don't need this yet */
#include <linux/backing-dev.h>
/*
 * Report whether the write queue behind @page is congested.  For a swapcache
 * page the backing_dev_info is taken from the swap device's block-device
 * inode rather than from page->mapping.  The page must be locked.
 */
int page_queue_congested(struct page *page)
{
	struct backing_dev_info *queue_info;

	/* It pins the swap_info_struct */
	BUG_ON(!PageLocked(page));

	queue_info = page->mapping->backing_dev_info;
	if (PageSwapCache(page)) {
		swp_entry_t entry = { .val = page->index };
		struct swap_info_struct *sis = get_swap_info_struct(swp_type(entry));

		queue_info = sis->bdev->bd_inode->i_mapping->backing_dev_info;
	}

	return bdi_write_congested(queue_info);
}
#endif
asmlinkage long sys_swapoff(const char * specialfile)
{
struct swap_info_struct * p = NULL;
......
......@@ -21,9 +21,11 @@
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/suspend.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h> /* for try_to_release_page() */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap-locking.h>
#include <asm/pgalloc.h>
......@@ -32,11 +34,11 @@
/*
* The "priority" of VM scanning is how much of the queues we
* will scan in one go. A value of 6 for DEF_PRIORITY implies
* that we'll scan 1/64th of the queues ("queue_length >> 6")
* will scan in one go. A value of 12 for DEF_PRIORITY implies
* that we'll scan 1/4096th of the queues ("queue_length >> 12")
* during a normal aging round.
*/
#define DEF_PRIORITY (6)
#define DEF_PRIORITY 12
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
......@@ -95,7 +97,7 @@ static inline int is_page_cache_freeable(struct page *page)
static /* inline */ int
shrink_list(struct list_head *page_list, int nr_pages,
unsigned int gfp_mask, int priority, int *max_scan)
unsigned int gfp_mask, int *max_scan)
{
struct address_space *mapping;
LIST_HEAD(ret_pages);
......@@ -117,11 +119,22 @@ shrink_list(struct list_head *page_list, int nr_pages,
BUG_ON(PageActive(page));
may_enter_fs = (gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (gfp_mask & __GFP_IO));
/*
* If the page is mapped into pagetables then wait on it, to
* throttle this allocator to the rate at which we can clear
* MAP_SHARED data. This will also throttle against swapcache
* writes.
*/
if (PageWriteback(page)) {
if (may_enter_fs)
wait_on_page_writeback(page); /* throttling */
else
goto keep_locked;
if (may_enter_fs) {
if (page->pte.direct ||
page->mapping->backing_dev_info ==
current->backing_dev_info) {
wait_on_page_writeback(page);
}
}
goto keep_locked;
}
pte_chain_lock(page);
......@@ -172,15 +185,43 @@ shrink_list(struct list_head *page_list, int nr_pages,
* will write it. So we're back to page-at-a-time writepage
* in LRU order.
*/
if (PageDirty(page) && is_page_cache_freeable(page) &&
mapping && may_enter_fs) {
/*
* If the page is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
* stalled by pagecache activity. But note that there may be
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
* If this process is currently in generic_file_write() against
* this page's queue, we can perform writeback even if that
* will block.
*
* If the page is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
* See swapfile.c:page_queue_congested().
*/
if (PageDirty(page)) {
int (*writeback)(struct page *,
struct writeback_control *);
struct backing_dev_info *bdi;
const int cluster_size = SWAP_CLUSTER_MAX;
struct writeback_control wbc = {
.nr_to_write = cluster_size,
};
if (!is_page_cache_freeable(page))
goto keep_locked;
if (!mapping)
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
bdi = mapping->backing_dev_info;
if (bdi != current->backing_dev_info &&
bdi_write_congested(bdi))
goto keep_locked;
writeback = mapping->a_ops->vm_writeback;
if (writeback == NULL)
writeback = generic_vm_writeback;
......@@ -279,7 +320,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
*/
static /* inline */ int
shrink_cache(int nr_pages, struct zone *zone,
unsigned int gfp_mask, int priority, int max_scan)
unsigned int gfp_mask, int max_scan)
{
LIST_HEAD(page_list);
struct pagevec pvec;
......@@ -298,11 +339,13 @@ shrink_cache(int nr_pages, struct zone *zone,
spin_lock_irq(&zone->lru_lock);
while (max_scan > 0 && nr_pages > 0) {
struct page *page;
int n = 0;
int nr_taken = 0;
int nr_scan = 0;
while (n < nr_to_process && !list_empty(&zone->inactive_list)) {
while (nr_scan++ < nr_to_process &&
!list_empty(&zone->inactive_list)) {
page = list_entry(zone->inactive_list.prev,
struct page, lru);
struct page, lru);
prefetchw_prev_lru_page(page,
&zone->inactive_list, flags);
......@@ -318,18 +361,17 @@ shrink_cache(int nr_pages, struct zone *zone,
}
list_add(&page->lru, &page_list);
page_cache_get(page);
n++;
nr_taken++;
}
zone->nr_inactive -= n;
zone->nr_inactive -= nr_taken;
spin_unlock_irq(&zone->lru_lock);
if (list_empty(&page_list))
if (nr_taken == 0)
goto done;
max_scan -= n;
KERNEL_STAT_ADD(pgscan, n);
nr_pages = shrink_list(&page_list, nr_pages,
gfp_mask, priority, &max_scan);
max_scan -= nr_scan;
KERNEL_STAT_ADD(pgscan, nr_scan);
nr_pages = shrink_list(&page_list,nr_pages,gfp_mask,&max_scan);
if (nr_pages <= 0 && list_empty(&page_list))
goto done;
......@@ -420,6 +462,15 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
}
pte_chain_unlock(page);
}
/*
* FIXME: need to consider page_count(page) here if/when we
* reap orphaned pages via the LRU (Daniel's locking stuff)
*/
if (total_swap_pages == 0 && !page->mapping &&
!PagePrivate(page)) {
list_add(&page->lru, &l_active);
continue;
}
list_add(&page->lru, &l_inactive);
pgdeactivate++;
}
......@@ -470,11 +521,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
}
static /* inline */ int
shrink_zone(struct zone *zone, int priority,
unsigned int gfp_mask, int nr_pages)
shrink_zone(struct zone *zone, int max_scan,
unsigned int gfp_mask, int nr_pages)
{
unsigned long ratio;
int max_scan;
/* This is bogus for ZONE_HIGHMEM? */
if (kmem_cache_reap(gfp_mask) >= nr_pages)
......@@ -497,43 +547,50 @@ shrink_zone(struct zone *zone, int priority,
atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter);
refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
}
max_scan = zone->nr_inactive / priority;
nr_pages = shrink_cache(nr_pages, zone,
gfp_mask, priority, max_scan);
if (nr_pages <= 0)
return 0;
wakeup_bdflush();
shrink_dcache_memory(priority, gfp_mask);
/* After shrinking the dcache, get rid of unused inodes too .. */
shrink_icache_memory(1, gfp_mask);
#ifdef CONFIG_QUOTA
shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif
nr_pages = shrink_cache(nr_pages, zone, gfp_mask, max_scan);
return nr_pages;
}
static int
shrink_caches(struct zone *classzone, int priority,
int gfp_mask, int nr_pages)
int *total_scanned, int gfp_mask, int nr_pages)
{
struct zone *first_classzone;
struct zone *zone;
first_classzone = classzone->zone_pgdat->node_zones;
zone = classzone;
while (zone >= first_classzone && nr_pages > 0) {
if (zone->free_pages <= zone->pages_high) {
nr_pages = shrink_zone(zone, priority,
gfp_mask, nr_pages);
}
zone--;
for (zone = classzone; zone >= first_classzone; zone--) {
int max_scan;
int to_reclaim;
int unreclaimed;
to_reclaim = zone->pages_high - zone->free_pages;
if (to_reclaim < 0)
continue; /* zone has enough memory */
if (to_reclaim > SWAP_CLUSTER_MAX)
to_reclaim = SWAP_CLUSTER_MAX;
if (to_reclaim < nr_pages)
to_reclaim = nr_pages;
/*
* If we cannot reclaim `nr_pages' pages by scanning twice
* that many pages then fall back to the next zone.
*/
max_scan = zone->nr_inactive >> priority;
if (max_scan < to_reclaim * 2)
max_scan = to_reclaim * 2;
unreclaimed = shrink_zone(zone, max_scan, gfp_mask, to_reclaim);
nr_pages -= to_reclaim - unreclaimed;
*total_scanned += max_scan;
}
shrink_dcache_memory(priority, gfp_mask);
shrink_icache_memory(1, gfp_mask);
#ifdef CONFIG_QUOTA
shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif
return nr_pages;
}
......@@ -564,12 +621,25 @@ try_to_free_pages(struct zone *classzone,
KERNEL_STAT_INC(pageoutrun);
for (priority = DEF_PRIORITY; priority; priority--) {
nr_pages = shrink_caches(classzone, priority,
int total_scanned = 0;
nr_pages = shrink_caches(classzone, priority, &total_scanned,
gfp_mask, nr_pages);
if (nr_pages <= 0)
return 1;
if (total_scanned == 0)
return 1; /* All zones had enough free memory */
if (!(gfp_mask & __GFP_FS))
break;
break; /* Let the caller handle it */
/*
* Try to write back as many pages as we just scanned. Not
* sure if that makes sense, but it's an attempt to avoid
* creating IO storms unnecessarily
*/
wakeup_bdflush(total_scanned);
/* Take a nap, wait for some writeback to complete */
blk_congestion_wait(WRITE, HZ/4);
}
if (gfp_mask & __GFP_FS)
out_of_memory();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment