diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2afbdb794943780da90d5ec84ce8eab6b4ec8849..aa496e1bdd299686e28bb1f22390d9bfbad79eed 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -196,6 +196,7 @@ static struct file_system_type bd_type = {
 };
 
 static struct vfsmount *bd_mnt;
+struct super_block *blockdev_superblock;
 
 /*
  * bdev cache handling - shamelessly stolen from inode.c
@@ -251,6 +252,7 @@ void __init bdev_cache_init(void)
	err = PTR_ERR(bd_mnt);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
+	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
 }
 
 /*
@@ -567,13 +569,6 @@ static int do_open(struct block_device *bdev, struct inode *inode, struct file *
			}
		}
	}
-	if (bdev->bd_inode->i_data.backing_dev_info ==
-			&default_backing_dev_info) {
-		struct backing_dev_info *bdi = blk_get_backing_dev_info(bdev);
-		if (bdi == NULL)
-			bdi = &default_backing_dev_info;
-		inode->i_data.backing_dev_info = bdi;
-	}
	if (bdev->bd_op->open) {
		ret = bdev->bd_op->open(inode, file);
		if (ret)
@@ -594,6 +589,16 @@ static int do_open(struct block_device *bdev, struct inode *inode, struct file *
			bdev->bd_queue = p->queue(dev);
		else
			bdev->bd_queue = &p->request_queue;
+		if (bdev->bd_inode->i_data.backing_dev_info ==
+				&default_backing_dev_info) {
+			struct backing_dev_info *bdi;
+
+			bdi = blk_get_backing_dev_info(bdev);
+			if (bdi == NULL)
+				bdi = &default_backing_dev_info;
+			inode->i_data.backing_dev_info = bdi;
+			bdev->bd_inode->i_data.backing_dev_info = bdi;
+		}
	}
	bdev->bd_openers++;
	unlock_kernel();
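
The do_open() change is an ordering fix as much as a relocation:
blk_get_backing_dev_info() can only return something useful once
bdev->bd_queue has been set up, and the result must be planted in *both* the
opener's mapping and the block device's own internal inode, so that later
writeback against either one routes to the real queue rather than the
default.  A minimal userspace sketch of the invariant being established (all
structs and field names here are illustrative stand-ins, not kernel code):

#include <assert.h>
#include <stddef.h>

struct backing_dev_info { int ra_pages; };
struct address_space { struct backing_dev_info *backing_dev_info; };
struct inode { struct address_space i_data; };
struct block_device {
	struct inode *bd_inode;
	struct backing_dev_info *queue_bdi;	/* stand-in for the queue's bdi */
};

static struct backing_dev_info default_bdi = { .ra_pages = 32 };

/* Stand-in for blk_get_backing_dev_info(): may return NULL. */
static struct backing_dev_info *get_queue_bdi(struct block_device *bdev)
{
	return bdev->queue_bdi;
}

static void set_bdi_on_open(struct block_device *bdev, struct inode *opener)
{
	if (bdev->bd_inode->i_data.backing_dev_info == &default_bdi) {
		struct backing_dev_info *bdi = get_queue_bdi(bdev);

		if (bdi == NULL)
			bdi = &default_bdi;
		/* Both mappings must point at the same queue's bdi. */
		opener->i_data.backing_dev_info = bdi;
		bdev->bd_inode->i_data.backing_dev_info = bdi;
	}
}

int main(void)
{
	struct backing_dev_info disk_bdi = { .ra_pages = 128 };
	struct inode bd_inode = { .i_data = { &default_bdi } };
	struct block_device bdev = { .bd_inode = &bd_inode, .queue_bdi = &disk_bdi };
	struct inode opener = { .i_data = { &default_bdi } };

	set_bdi_on_open(&bdev, &opener);
	assert(opener.i_data.backing_dev_info == &disk_bdi);
	assert(bd_inode.i_data.backing_dev_info == &disk_bdi);
	return 0;
}

The second assignment appears to be the point of the fix: the old code tested
bdev->bd_inode's mapping but only ever updated the opener's, so the bdev's
own pagecache kept claiming the default backing_dev_info.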
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d5a3e1ffdb8186e66831ba483d808eb6711e4557..c738abdc2cdf4eca33ad2bf512c8cd419faf750e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,9 +19,12 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 
+extern struct super_block *blockdev_superblock;
+
 /**
  * __mark_inode_dirty - internal function
  * @inode: inode to mark
@@ -91,10 +94,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
		 * If the inode was already on s_dirty, don't reposition
		 * it (that would break s_dirty time-ordering).
		 */
-		if (!was_dirty) {
-			list_del(&inode->i_list);
-			list_add(&inode->i_list, &sb->s_dirty);
-		}
+		if (!was_dirty)
+			list_move(&inode->i_list, &sb->s_dirty);
	}
 out:
	spin_unlock(&inode_lock);
@@ -133,8 +134,7 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
	struct address_space *mapping = inode->i_mapping;
	struct super_block *sb = inode->i_sb;
 
-	list_del(&inode->i_list);
-	list_add(&inode->i_list, &sb->s_locked_inodes);
+	list_move(&inode->i_list, &sb->s_locked_inodes);
 
	BUG_ON(inode->i_state & I_LOCK);
 
@@ -212,9 +212,19 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
 * that it can be located for waiting on in __writeback_single_inode().
 *
 * Called under inode_lock.
+ *
+ * If `single_bdi' is non-zero then we're being asked to write back a
+ * specific queue.  This function assumes that the blockdev superblock's
+ * inodes are backed by a variety of queues, so all inodes are searched.
+ * For other superblocks, assume that all inodes are backed by the same queue.
+ *
+ * FIXME: this linear search could get expensive with many filesystems.  But
+ * how to fix?  We need to go from an address_space to all inodes which share
+ * a queue with that address_space.
 */
-static void sync_sb_inodes(struct super_block *sb, int sync_mode,
-		int *nr_to_write, unsigned long *older_than_this)
+static void
+sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
+		int sync_mode, int *nr_to_write, unsigned long *older_than_this)
 {
	struct list_head *tmp;
	struct list_head *head;
@@ -228,7 +238,14 @@ static void sync_sb_inodes(struct super_block *sb, int sync_mode,
		struct backing_dev_info *bdi;
		int really_sync;
 
-		/* Was this inode dirtied after __sync_list was called? */
+		if (single_bdi && mapping->backing_dev_info != single_bdi) {
+			if (sb != blockdev_superblock)
+				break;		/* inappropriate superblock */
+			list_move(&inode->i_list, &inode->i_sb->s_dirty);
+			continue;	/* not this blockdev */
+		}
+
+		/* Was this inode dirtied after sync_sb_inodes was called? */
		if (time_after(mapping->dirtied_when, start))
			break;
 
@@ -249,8 +266,7 @@ static void sync_sb_inodes(struct super_block *sb, int sync_mode,
		__writeback_single_inode(inode, really_sync, nr_to_write);
		if (sync_mode == WB_SYNC_HOLD) {
			mapping->dirtied_when = jiffies;
-			list_del(&inode->i_list);
-			list_add(&inode->i_list, &inode->i_sb->s_dirty);
+			list_move(&inode->i_list, &inode->i_sb->s_dirty);
		}
		if (current_is_pdflush())
			writeback_release(bdi);
@@ -269,23 +285,16 @@ static void sync_sb_inodes(struct super_block *sb, int sync_mode,
 }
 
 /*
- * Start writeback of dirty pagecache data against all unlocked inodes.
- *
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until both ->s_dirty and ->s_locked_inodes are
- * empty. Since __sync_single_inode() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
- *
- * If `older_than_this' is non-zero then only flush inodes which have a
- * flushtime older than *older_than_this.
- *
- * This is a "memory cleansing" operation, not a "data integrity" operation.
+ * If `bdi' is non-zero then we will scan the first inode against each
+ * superblock until we find the matching ones. One group will be the dirty
+ * inodes against a filesystem. Then when we hit the dummy blockdev superblock,
+ * sync_sb_inodes will seek out the blockdev which matches `bdi'. Maybe not
+ * super-efficient but we're about to do a ton of I/O...
 */
-void writeback_unlocked_inodes(int *nr_to_write,
-		enum writeback_sync_modes sync_mode,
-		unsigned long *older_than_this)
+static void
+__writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write,
+			enum writeback_sync_modes sync_mode,
+			unsigned long *older_than_this)
 {
	struct super_block *sb;
 
@@ -295,7 +304,7 @@ void writeback_unlocked_inodes(int *nr_to_write,
	for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
		if (!list_empty(&sb->s_dirty)) {
			spin_unlock(&sb_lock);
-			sync_sb_inodes(sb, sync_mode, nr_to_write,
+			sync_sb_inodes(bdi, sb, sync_mode, nr_to_write,
					older_than_this);
			spin_lock(&sb_lock);
		}
@@ -306,6 +315,43 @@ void writeback_unlocked_inodes(int *nr_to_write,
	spin_unlock(&inode_lock);
 }
 
+/*
+ * Start writeback of dirty pagecache data against all unlocked inodes.
+ *
+ * Note:
+ * We don't need to grab a reference to superblock here. If it has non-empty
+ * ->s_dirty it hadn't been killed yet and kill_super() won't proceed
+ * past sync_inodes_sb() until both ->s_dirty and ->s_locked_inodes are
+ * empty. Since __sync_single_inode() regains inode_lock before it finally moves
+ * the inode from the superblock lists we are OK.
+ *
+ * If `older_than_this' is non-zero then only flush inodes which have a
+ * flushtime older than *older_than_this.
+ *
+ * This is a "memory cleansing" operation, not a "data integrity" operation.
+ */
+void writeback_unlocked_inodes(int *nr_to_write,
+		enum writeback_sync_modes sync_mode,
+		unsigned long *older_than_this)
+{
+	__writeback_unlocked_inodes(NULL, nr_to_write,
+			sync_mode, older_than_this);
+}
+/*
+ * Perform writeback of dirty data against a particular queue.
+ *
+ * This is for writer throttling. We don't want processes to write back
+ * other processes' data, especially when the other data belongs to a
+ * different spindle.
+ */
+void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write,
+		enum writeback_sync_modes sync_mode,
+		unsigned long *older_than_this)
+{
+	__writeback_unlocked_inodes(bdi, nr_to_write,
+			sync_mode, older_than_this);
+}
+
 static void __wait_on_locked(struct list_head *head)
 {
	struct list_head * tmp;
@@ -336,7 +382,7 @@ void sync_inodes_sb(struct super_block *sb, int wait)
	nr_to_write = ps.nr_dirty + ps.nr_dirty / 4;
 
	spin_lock(&inode_lock);
-	sync_sb_inodes(sb, wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+	sync_sb_inodes(NULL, sb, wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
			&nr_to_write, NULL);
	if (wait)
		__wait_on_locked(&sb->s_locked_inodes);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b34e983207959204a71036f2ccff4e640a158587..5ebb5673154e3c02576da0495e8c4d35a0e7d1f3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,7 +37,7 @@
 * will look to see if it needs to force writeback or throttling. Probably
 * should be scaled by memory size.
 */
-#define RATELIMIT_PAGES		1000
+#define RATELIMIT_PAGES		((512 * 1024) / PAGE_SIZE)
 
 /*
 * When balance_dirty_pages decides that the caller needs to perform some
@@ -45,7 +45,7 @@
 * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
 * large amounts of I/O are submitted.
 */
-#define SYNC_WRITEBACK_PAGES	1500
+#define SYNC_WRITEBACK_PAGES	((RATELIMIT_PAGES * 3) / 2)
 
 /* The following parameters are exported via /proc/sys/vm */
 
@@ -108,6 +108,7 @@ void balance_dirty_pages(struct address_space *mapping)
	struct page_state ps;
	int background_thresh, async_thresh, sync_thresh;
	unsigned long dirty_and_writeback;
+	struct backing_dev_info *bdi;
 
	get_page_state(&ps);
	dirty_and_writeback = ps.nr_dirty + ps.nr_writeback;
@@ -115,21 +116,21 @@
	background_thresh = (dirty_background_ratio * tot) / 100;
	async_thresh = (dirty_async_ratio * tot) / 100;
	sync_thresh = (dirty_sync_ratio * tot) / 100;
+	bdi = mapping->backing_dev_info;
 
	if (dirty_and_writeback > sync_thresh) {
		int nr_to_write = SYNC_WRITEBACK_PAGES;
 
-		writeback_unlocked_inodes(&nr_to_write, WB_SYNC_LAST, NULL);
+		writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_LAST, NULL);
		get_page_state(&ps);
	} else if (dirty_and_writeback > async_thresh) {
		int nr_to_write = SYNC_WRITEBACK_PAGES;
 
-		writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
+		writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_NONE, NULL);
		get_page_state(&ps);
	}
 
-	if (!writeback_in_progress(mapping->backing_dev_info) &&
-			ps.nr_dirty > background_thresh)
+	if (!writeback_in_progress(bdi) && ps.nr_dirty > background_thresh)
		pdflush_operation(background_writeout, 0);
 }
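
On the page-writeback side, the constants become page-size invariant: the
old values ratelimited a writer every 1000 pages (~4MB of dirtyings with 4K
pages, ~64MB with 64K pages), while the new definitions pin the ratelimit
interval at 512KB of data and the writeback chunk at 768KB whatever the page
size.  A quick standalone check of that arithmetic (the PAGE_SIZE values are
just examples):

#include <stdio.h>

#define RATELIMIT_PAGES(ps)		((512 * 1024) / (ps))
#define SYNC_WRITEBACK_PAGES(ps)	((RATELIMIT_PAGES(ps) * 3) / 2)

int main(void)
{
	long sizes[] = { 4096, 8192, 65536 };

	for (int i = 0; i < 3; i++) {
		long ps = sizes[i];

		printf("PAGE_SIZE %3ldK: ratelimit %4ld pages (%ldKB), "
		       "sync chunk %4ld pages (%ldKB)\n",
		       ps / 1024, RATELIMIT_PAGES(ps),
		       RATELIMIT_PAGES(ps) * ps / 1024,
		       SYNC_WRITEBACK_PAGES(ps),
		       SYNC_WRITEBACK_PAGES(ps) * ps / 1024);
	}
	return 0;
}

With 4K pages this works out to 128 and 192 pages, roughly an eighth of the
old hardcoded 1000/1500, and the data amounts stay constant on every row.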