Commit c9b22619 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] use the congestion APIs in pdflush

The key concept here is that pdflush does not block on request queues
any more.  Instead, it circulates across the queues, keeping any
non-congested queues full of write data.  When all queues are full,
pdflush takes a nap, to be woken when *any* queue exits write
congestion.

This code can keep sixty spindles saturated - we've never been able to
do that before.
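
As a rough illustration only, here is a minimal userspace C sketch of that
circulate-and-nap policy. The queue array, the write_chunk() helper and the
one-second sleep are hypothetical stand-ins for the real request queues,
per-queue writeback and the congestion wakeup:

    #include <stdbool.h>
    #include <unistd.h>

    #define NQUEUES 4

    struct queue {
    	long dirty;		/* pages still to be written */
    	bool congested;		/* stand-in for bdi_write_congested() */
    };

    /* Hypothetical stand-in: issue one chunk of writes against a queue. */
    static void write_chunk(struct queue *q)
    {
    	q->dirty = q->dirty > 16 ? q->dirty - 16 : 0;
    }

    int main(void)
    {
    	struct queue queues[NQUEUES] = {
    		{ 100, false }, { 50, false }, { 75, false }, { 25, false },
    	};

    	for (;;) {
    		bool progress = false, dirty_left = false;
    		int i;

    		/* Circulate: top up every non-congested queue; block on none. */
    		for (i = 0; i < NQUEUES; i++) {
    			if (queues[i].dirty == 0)
    				continue;
    			dirty_left = true;
    			if (queues[i].congested)
    				continue;	/* skip it, revisit next pass */
    			write_chunk(&queues[i]);
    			progress = true;
    		}
    		if (!dirty_left)
    			break;		/* everything is clean */
    		if (!progress)
    			sleep(1);	/* nap until a queue decongests */
    	}
    	return 0;
    }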

 - Add the `nonblocking' flag to struct writeback_control, and teach
   the writeback paths to honour it.

 - Add the `encountered_congestion' flag to struct writeback_control
   and teach the writeback paths to set it.

So as soon as a mapping's backing_dev_info indicates that it is getting
congested, bale out of writeback.  And don't even start writeback
against filesystems whose queues are congested.
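
In condensed form, the check that the writeback paths now perform looks like
this (taken from the mpage_writepages() hunk below; sync_sb_inodes() does the
same test but skips the congested filesystem instead of returning):

    if (wbc->nonblocking && bdi_write_congested(bdi)) {
    	wbc->encountered_congestion = 1;	/* report back to the caller */
    	return 0;		/* bale out rather than block on the queue */
    }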

 - Convert pdflush's background_writeback() function to use
   nonblocking writeback.

This way, a single pdflush thread will circulate around all the
dirty queues, keeping them filled.
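
Condensed from the background_writeout() hunk below: wbc.nr_to_write is
decremented for each page written, so an unchanged value means nothing went
out, and encountered_congestion then distinguishes "everything is congested,
take a nap" from "nothing left to write, stop". The enough_written() exit
test is a hypothetical stand-in for the real dirty-threshold check:

    for ( ; ; ) {
    	if (enough_written())
    		break;
    	wbc.encountered_congestion = 0;
    	wbc.nr_to_write = MAX_WRITEBACK_PAGES;
    	writeback_inodes(&wbc);
    	if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
    		/* Wrote nothing */
    		if (wbc.encountered_congestion)
    			blk_congestion_wait(WRITE, HZ/10);
    		else
    			break;
    	}
    }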

 - Convert the pdflush `kupdate' function to do the same thing.

This solves the problem of pdflush thread pool exhaustion.

It solves the problem of pdflush startup latency.

It solves the (minor) problem wherein `kupdate' writeback only writes
back a single disk at a time (it was getting blocked on each queue in
turn).

It probably means that we only ever need a single pdflush thread.
parent f3332384
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -220,44 +220,52 @@ __writeback_single_inode(struct inode *inode, int sync,
  *
  * FIXME: this linear search could get expensive with many fileystems. But
  * how to fix? We need to go from an address_space to all inodes which share
- * a queue with that address_space.
+ * a queue with that address_space. (Easy: have a global "dirty superblocks"
+ * list).
  *
  * The inodes to be written are parked on sb->s_io. They are moved back onto
  * sb->s_dirty as they are selected for writing. This way, none can be missed
  * on the writer throttling path, and we get decent balancing between many
- * thrlttled threads: we don't want them all piling up on __wait_on_inode.
+ * throlttled threads: we don't want them all piling up on __wait_on_inode.
  */
 static void
 sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 {
-	struct list_head *tmp;
-	struct list_head *head;
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
 	list_splice_init(&sb->s_dirty, &sb->s_io);
-	head = &sb->s_io;
-	while ((tmp = head->prev) != head) {
-		struct inode *inode = list_entry(tmp, struct inode, i_list);
+	while (!list_empty(&sb->s_io)) {
+		struct inode *inode = list_entry(sb->s_io.prev,
+						struct inode, i_list);
 		struct address_space *mapping = inode->i_mapping;
-		struct backing_dev_info *bdi;
+		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		int really_sync;
 
-		if (wbc->bdi && mapping->backing_dev_info != wbc->bdi) {
+		if (wbc->nonblocking && bdi_write_congested(bdi)) {
+			wbc->encountered_congestion = 1;
 			if (sb != blockdev_superblock)
-				break;		/* inappropriate superblock */
+				break;		/* Skip a congested fs */
 			list_move(&inode->i_list, &sb->s_dirty);
-			continue;		/* not this blockdev */
+			continue;		/* Skip a congested blockdev */
+		}
+
+		if (wbc->bdi && bdi != wbc->bdi) {
+			if (sb != blockdev_superblock)
+				break;		/* fs has the wrong queue */
+			list_move(&inode->i_list, &sb->s_dirty);
+			continue;		/* blockdev has wrong queue */
 		}
 
 		/* Was this inode dirtied after sync_sb_inodes was called? */
 		if (time_after(mapping->dirtied_when, start))
 			break;
 
+		/* Was this inode dirtied too recently? */
 		if (wbc->older_than_this && time_after(mapping->dirtied_when,
 						*wbc->older_than_this))
-			goto out;
+			break;
 
-		bdi = mapping->backing_dev_info;
+		/* Is another pdflush already flushing this queue? */
 		if (current_is_pdflush() && !writeback_acquire(bdi))
 			break;
@@ -278,11 +286,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 		if (wbc->nr_to_write <= 0)
 			break;
 	}
-out:
-	/*
-	 * Leave any unwritten inodes on s_io.
-	 */
-	return;
+	return;		/* Leave any unwritten inodes on s_io */
 }
 
 /*
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -22,6 +22,7 @@
 #include <linux/prefetch.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
+#include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 
 /*
@@ -522,6 +523,7 @@ int
 mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block)
 {
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct bio *bio = NULL;
 	sector_t last_block_in_bio = 0;
 	int ret = 0;
@@ -530,6 +532,12 @@ mpage_writepages(struct address_space *mapping,
 	struct pagevec pvec;
 	int (*writepage)(struct page *);
 
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		blk_run_queues();
+		wbc->encountered_congestion = 1;
+		return 0;
+	}
+
 	writepage = NULL;
 	if (get_block == NULL)
 		writepage = mapping->a_ops->writepage;
@@ -585,6 +593,11 @@ mpage_writepages(struct address_space *mapping,
 			}
 			if (ret || (--(wbc->nr_to_write) <= 0))
 				done = 1;
+			if (wbc->nonblocking && bdi_write_congested(bdi)) {
+				blk_run_queues();
+				wbc->encountered_congestion = 1;
+				done = 1;
+			}
 		} else {
 			unlock_page(page);
 		}
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -43,6 +43,8 @@ struct writeback_control {
 					older than this */
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
+	int nonblocking;		/* Don't get stuck on request queues */
+	int encountered_congestion;	/* An output: a queue is full */
 };
 
 void writeback_inodes(struct writeback_control *wbc);
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/sysrq.h>
 #include <linux/backing-dev.h>
+#include <linux/blkdev.h>
 #include <linux/mpage.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
@@ -172,21 +173,30 @@ static void background_writeout(unsigned long _min_pages)
 		.sync_mode	= WB_SYNC_NONE,
 		.older_than_this = NULL,
 		.nr_to_write	= 0,
+		.nonblocking	= 1,
 	};
 
 	CHECK_EMERGENCY_SYNC
 
 	background_thresh = (dirty_background_ratio * total_pages) / 100;
-	do {
+	for ( ; ; ) {
 		struct page_state ps;
 
 		get_page_state(&ps);
 		if (ps.nr_dirty < background_thresh && min_pages <= 0)
 			break;
+		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		writeback_inodes(&wbc);
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-	} while (wbc.nr_to_write <= 0);
+		if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
+			/* Wrote nothing */
+			if (wbc.encountered_congestion)
+				blk_congestion_wait(WRITE, HZ/10);
+			else
+				break;
+		}
+	}
 	blk_run_queues();
 }
@@ -223,25 +233,36 @@ static void wb_kupdate(unsigned long arg)
 	unsigned long oldest_jif;
 	unsigned long start_jif;
 	unsigned long next_jif;
+	long nr_to_write;
 	struct page_state ps;
 	struct writeback_control wbc = {
 		.bdi		= NULL,
 		.sync_mode	= WB_SYNC_NONE,
 		.older_than_this = &oldest_jif,
 		.nr_to_write	= 0,
+		.nonblocking	= 1,
 	};
 
 	sync_supers();
-	get_page_state(&ps);
 
+	get_page_state(&ps);
 	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
 	start_jif = jiffies;
 	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
-	wbc.nr_to_write = ps.nr_dirty;
-	writeback_inodes(&wbc);
+	nr_to_write = ps.nr_dirty;
+	while (nr_to_write > 0) {
+		wbc.encountered_congestion = 0;
+		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		writeback_inodes(&wbc);
+		if (wbc.nr_to_write == MAX_WRITEBACK_PAGES) {
+			if (wbc.encountered_congestion)
+				blk_congestion_wait(WRITE, HZ);
+			else
+				break;	/* All the old data is written */
+		}
+		nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+	}
 	blk_run_queues();
-	yield();
 
 	if (time_before(next_jif, jiffies + HZ))
 		next_jif = jiffies + HZ;
 	mod_timer(&wb_timer, next_jif);