Commit 17a74e88 authored by Andrew Morton, committed by Arnaldo Carvalho de Melo

[PATCH] pdflush exclusion

Use the pdflush exclusion infrastructure to ensure that only one
pdflush thread is ever performing writeback against a particular
request_queue.
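
The exclusion primitives themselves come from an earlier patch; this one only
calls them.  Roughly, they amount to an atomic test-and-set on a flag in the
backing_dev_info, along the lines of the sketch below (the BDI_pdflush bit and
the state field are assumptions based on how the helpers are used in this
diff, not something shown here):

	/* Sketch of the exclusion helpers used below.  Assumes backing_dev_info
	 * carries an "unsigned long state" word with a BDI_pdflush bit; the
	 * real definitions live in the earlier pdflush-exclusion patch. */
	static inline int writeback_in_progress(struct backing_dev_info *bdi)
	{
		return test_bit(BDI_pdflush, &bdi->state);
	}

	static inline int writeback_acquire(struct backing_dev_info *bdi)
	{
		/* Nonzero return means we now own writeback on this queue */
		return !test_and_set_bit(BDI_pdflush, &bdi->state);
	}

	static inline void writeback_release(struct backing_dev_info *bdi)
	{
		clear_bit(BDI_pdflush, &bdi->state);
	}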

This works rather well.  It requires a lot of activity against a lot of
disks to cause more pdflush threads to start up.  Possibly the
thread-creation logic is a little weak: it starts more threads when a
pdflush thread goes back to sleep.  It may be better to start new
threads within pdflush_operation().
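
pdflush_operation() is the dispatch point in question: it hands a function and
argument to an idle pdflush worker and returns nonzero if none is available
(the old prune_icache() code below relies on that return convention), and that
failure path is where new-thread creation could plausibly live.  A caller-side
sketch for illustration only; the helper names and the WB_SYNC_NONE mode are
assumptions, not part of this patch:

	/* Hypothetical caller: push background writeback to a pdflush
	 * thread, falling back to doing the work inline if no worker
	 * is idle. */
	static void background_writeback_fn(unsigned long nr_pages)
	{
		int nr = nr_pages;

		writeback_unlocked_inodes(&nr, WB_SYNC_NONE, NULL);
	}

	static void kick_background_writeback(unsigned long nr_pages)
	{
		if (pdflush_operation(background_writeback_fn, nr_pages))
			background_writeback_fn(nr_pages);  /* no idle worker */
	}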

All non-request_queue-backed address_spaces share the global
default_backing_dev_info structure.  So at present only a single
pdflush instance will be available for background writeback of *all*
NFS filesystems (for example).

If there is benefit in concurrent background writeback for multiple NFS
mounts then NFS would need to create per-mount backing_dev_info
structures and install those into new inodes' address_spaces in some
manner.
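
A purely hypothetical sketch of what that might look like; the field and
helper names below are invented here and nothing in it is part of this patch:

	/* Hypothetical only: give each NFS mount its own backing_dev_info so
	 * pdflush exclusion is per mount instead of shared through
	 * default_backing_dev_info. */
	struct nfs_server {
		struct backing_dev_info backing_dev_info;	/* one per mount */
		/* ... existing per-mount state ... */
	};

	static void nfs_set_backing_dev(struct inode *inode,
					struct nfs_server *server)
	{
		/* Called when the inode is set up, so its address_space no
		 * longer points at the shared default_backing_dev_info. */
		inode->i_mapping->backing_dev_info = &server->backing_dev_info;
	}
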
parent 1f6acea0
@@ -187,6 +187,9 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
static void
__writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
{
if (current_is_pdflush() && (inode->i_state & I_LOCK))
return;
while (inode->i_state & I_LOCK) {
__iget(inode);
spin_unlock(&inode_lock);
@@ -213,6 +216,9 @@ void writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
* had their first dirtying at a time earlier than *older_than_this.
*
* Called under inode_lock.
*
* If we're a pdflush thread, then implement pdflush collision avoidance
* against the entire list.
*/
static void __sync_list(struct list_head *head, int sync_mode,
int *nr_to_write, unsigned long *older_than_this)
@@ -223,6 +229,8 @@ static void __sync_list(struct list_head *head, int sync_mode,
while ((tmp = head->prev) != head) {
struct inode *inode = list_entry(tmp, struct inode, i_list);
struct address_space *mapping = inode->i_mapping;
struct backing_dev_info *bdi;
int really_sync;
/* Was this inode dirtied after __sync_list was called? */
@@ -233,10 +241,18 @@ static void __sync_list(struct list_head *head, int sync_mode,
time_after(mapping->dirtied_when, *older_than_this))
break;
bdi = mapping->backing_dev_info;
if (current_is_pdflush() && !writeback_acquire(bdi))
break;
really_sync = (sync_mode == WB_SYNC_ALL);
if ((sync_mode == WB_SYNC_LAST) && (head->prev == head))
really_sync = 1;
__writeback_single_inode(inode, really_sync, nr_to_write);
if (current_is_pdflush())
writeback_release(bdi);
if (nr_to_write && *nr_to_write == 0)
break;
}
@@ -255,6 +271,8 @@ static void __sync_list(struct list_head *head, int sync_mode,
*
* If `older_than_this' is non-zero then only flush inodes which have a
* flushtime older than *older_than_this.
*
* This is a "memory cleansing" operation, not a "data integrity" operation.
*/
void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
unsigned long *older_than_this)
@@ -276,29 +294,12 @@ void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
if (sb->s_writeback_gen == writeback_gen)
continue;
sb->s_writeback_gen = writeback_gen;
if (current->flags & PF_FLUSHER) {
if (sb->s_flags & MS_FLUSHING) {
/*
* There's no point in two pdflush threads
* flushing the same device. But for other
* callers, we want to perform the flush
* because the fdatasync is how we implement
* writer throttling.
*/
continue;
}
sb->s_flags |= MS_FLUSHING;
}
if (!list_empty(&sb->s_dirty)) {
spin_unlock(&sb_lock);
__sync_list(&sb->s_dirty, sync_mode,
nr_to_write, older_than_this);
spin_lock(&sb_lock);
}
if (current->flags & PF_FLUSHER)
sb->s_flags &= ~MS_FLUSHING;
if (nr_to_write && *nr_to_write == 0)
break;
}
@@ -307,7 +308,7 @@ void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
}
/*
* Called under inode_lock
* Called under inode_lock.
*/
static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes)
{
@@ -318,7 +319,17 @@ static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes)
inode = list_entry(tmp, struct inode, i_list);
if (!atomic_read(&inode->i_count)) {
struct backing_dev_info *bdi;
bdi = inode->i_mapping->backing_dev_info;
if (current_is_pdflush() && !writeback_acquire(bdi))
goto out;
__sync_single_inode(inode, 0, NULL);
if (current_is_pdflush())
writeback_release(bdi);
nr_inodes--;
/*
@@ -328,7 +339,7 @@ static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes)
tmp = head;
}
}
out:
return nr_inodes;
}
@@ -421,7 +432,11 @@ void sync_inodes(void)
}
}
void try_to_writeback_unused_inodes(unsigned long pexclusive)
/*
* FIXME: the try_to_writeback_unused functions look dreadfully similar to
* writeback_unlocked_inodes...
*/
void try_to_writeback_unused_inodes(unsigned long unused)
{
struct super_block * sb;
int nr_inodes = inodes_stat.nr_unused;
@@ -440,7 +455,6 @@ void try_to_writeback_unused_inodes(unsigned long pexclusive)
}
spin_unlock(&sb_lock);
spin_unlock(&inode_lock);
clear_bit(0, (unsigned long *)pexclusive);
}
/**
......
@@ -404,21 +404,14 @@ void prune_icache(int goal)
dispose_list(freeable);
/*
* If we didn't freed enough clean inodes schedule
* a sync of the dirty inodes, we cannot do it
* from here or we're either synchronously dogslow
* or we deadlock with oom.
* If we didn't free enough clean inodes then schedule writeback of
* the dirty inodes. We cannot do it from here or we're either
* synchronously dogslow or we deadlock with oom.
*/
if (goal) {
static unsigned long exclusive;
if (!test_and_set_bit(0, &exclusive)) {
if (pdflush_operation(try_to_writeback_unused_inodes,
(unsigned long)&exclusive))
clear_bit(0, &exclusive);
}
}
if (goal)
pdflush_operation(try_to_writeback_unused_inodes, 0);
}
/*
* This is called from kswapd when we think we need some
* more memory, but aren't really sure how much. So we
......
@@ -112,7 +112,6 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
#define MS_MOVE 8192
#define MS_REC 16384
#define MS_VERBOSE 32768
#define MS_FLUSHING (1<<16) /* inodes are currently under writeout */
#define MS_ACTIVE (1<<30)
#define MS_NOUSER (1<<31)
@@ -156,7 +155,6 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || ((inode)->i_flags & S_SYNC))
#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK)
#define IS_FLUSHING(inode) __IS_FLG(inode, MS_FLUSHING)
#define IS_QUOTAINIT(inode) ((inode)->i_flags & S_QUOTA)
#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
......
@@ -12,6 +12,15 @@ extern spinlock_t inode_lock;
extern struct list_head inode_in_use;
extern struct list_head inode_unused;
/*
* Yes, writeback.h requires sched.h
* No, sched.h is not included from here.
*/
static inline int current_is_pdflush(void)
{
return current->flags & PF_FLUSHER;
}
/*
* fs/fs-writeback.c
*/
......
@@ -20,6 +20,7 @@
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/sysrq.h>
#include <linux/backing-dev.h>
/*
* Memory thresholds, in percentages
@@ -86,10 +87,7 @@ void balance_dirty_pages(struct address_space *mapping)
wake_pdflush = 1;
}
if (wake_pdflush && !IS_FLUSHING(mapping->host)) {
/*
* There is no flush thread against this device. Start one now.
*/
if (wake_pdflush && !writeback_in_progress(mapping->backing_dev_info)) {
if (dirty_and_writeback > async_thresh) {
pdflush_flush(dirty_and_writeback - async_thresh);
yield();
......