Commit 17a74e88 authored by Andrew Morton, committed by Arnaldo Carvalho de Melo

[PATCH] pdflush exclusion

Use the pdflush exclusion infrastructure to ensure that only one
pdflush thread is ever performing writeback against a particular
request_queue.
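
The exclusion primitives this relies on amount to an atomic
test-and-set bit in the backing_dev_info.  A minimal sketch, assuming
a BDI_pdflush bit in bdi->state -- not necessarily the exact code in
include/linux/backing-dev.h:

        static inline int writeback_in_progress(struct backing_dev_info *bdi)
        {
                return test_bit(BDI_pdflush, &bdi->state);
        }

        /* Nonzero return: we won the race to flush this queue */
        static inline int writeback_acquire(struct backing_dev_info *bdi)
        {
                return !test_and_set_bit(BDI_pdflush, &bdi->state);
        }

        static inline void writeback_release(struct backing_dev_info *bdi)
        {
                clear_bit(BDI_pdflush, &bdi->state);
        }

Note that writeback_acquire() fails rather than blocks when another
pdflush thread already owns the queue, so a colliding flusher simply
moves on.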

This works rather well.  It requires a lot of activity against a lot of
disks to cause more pdflush threads to start up.  Possibly the
thread-creation logic is a little weak: it starts more threads when a
pdflush thread goes back to sleep.  It may be better to start new
threads within pdflush_operation().
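
For context, pdflush_operation() is the dispatch point: it hands a
callback plus one argument to an idle pdflush thread and returns
nonzero if no worker was available.  A usage sketch -- my_writeback_op()
and its caller are hypothetical, only the pdflush_operation() signature
is taken from this patch:

        /* Runs in pdflush context, with PF_FLUSHER set in current->flags */
        static void my_writeback_op(unsigned long arg)
        {
                /* perform background writeback here */
        }

        static void kick_background_writeback(void)
        {
                if (pdflush_operation(my_writeback_op, 0))
                        my_writeback_op(0);     /* no worker free: do it ourselves */
        }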

All non-request_queue-backed address_spaces share the global
default_backing_dev_info structure.  So at present only a single
pdflush instance will be available for background writeback of *all*
NFS filesystems (for example).

If there is benefit in concurrent background writeback for multiple NFS
mounts then NFS would need to create per-mount backing_dev_info
structures and install those into new inodes' address_spaces in some
manner.
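
Were that done, the shape of the change might be roughly as follows --
a sketch only; the field placement in nfs_server and the helper are
assumptions, not part of this patch:

        /* Hypothetical: give each NFS mount its own backing_dev_info */
        struct nfs_server {
                /* ... existing fields ... */
                struct backing_dev_info backing_dev_info;
        };

        /* Hypothetical: called wherever NFS sets up a new inode's mapping */
        static void nfs_set_bdi(struct inode *inode, struct nfs_server *server)
        {
                inode->i_mapping->backing_dev_info = &server->backing_dev_info;
        }

Each mount would then get its own pdflush exclusion, independent of
default_backing_dev_info.
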
parent 1f6acea0
@@ -187,6 +187,9 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
 static void
 __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
 {
+        if (current_is_pdflush() && (inode->i_state & I_LOCK))
+                return;
+
         while (inode->i_state & I_LOCK) {
                 __iget(inode);
                 spin_unlock(&inode_lock);
@@ -213,6 +216,9 @@ void writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
  * had their first dirtying at a time earlier than *older_than_this.
  *
  * Called under inode_lock.
+ *
+ * If we're a pdflush thread, then implement pdflush collision avoidance
+ * against the entire list.
  */
 static void __sync_list(struct list_head *head, int sync_mode,
                 int *nr_to_write, unsigned long *older_than_this)
@@ -223,6 +229,8 @@ static void __sync_list(struct list_head *head, int sync_mode,
         while ((tmp = head->prev) != head) {
                 struct inode *inode = list_entry(tmp, struct inode, i_list);
                 struct address_space *mapping = inode->i_mapping;
+                struct backing_dev_info *bdi;
                 int really_sync;

                 /* Was this inode dirtied after __sync_list was called? */
@@ -233,10 +241,18 @@ static void __sync_list(struct list_head *head, int sync_mode,
                     time_after(mapping->dirtied_when, *older_than_this))
                         break;
+
+                bdi = mapping->backing_dev_info;
+                if (current_is_pdflush() && !writeback_acquire(bdi))
+                        break;
+
                 really_sync = (sync_mode == WB_SYNC_ALL);
                 if ((sync_mode == WB_SYNC_LAST) && (head->prev == head))
                         really_sync = 1;
                 __writeback_single_inode(inode, really_sync, nr_to_write);
+
+                if (current_is_pdflush())
+                        writeback_release(bdi);
+
                 if (nr_to_write && *nr_to_write == 0)
                         break;
         }
@@ -255,6 +271,8 @@ static void __sync_list(struct list_head *head, int sync_mode,
  *
  * If `older_than_this' is non-zero then only flush inodes which have a
  * flushtime older than *older_than_this.
+ *
+ * This is a "memory cleansing" operation, not a "data integrity" operation.
  */
 void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
                 unsigned long *older_than_this)
@@ -276,29 +294,12 @@ void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
                 if (sb->s_writeback_gen == writeback_gen)
                         continue;
                 sb->s_writeback_gen = writeback_gen;
-                if (current->flags & PF_FLUSHER) {
-                        if (sb->s_flags & MS_FLUSHING) {
-                                /*
-                                 * There's no point in two pdflush threads
-                                 * flushing the same device.  But for other
-                                 * callers, we want to perform the flush
-                                 * because the fdatasync is how we implement
-                                 * writer throttling.
-                                 */
-                                continue;
-                        }
-                        sb->s_flags |= MS_FLUSHING;
-                }
                 if (!list_empty(&sb->s_dirty)) {
                         spin_unlock(&sb_lock);
                         __sync_list(&sb->s_dirty, sync_mode,
                                         nr_to_write, older_than_this);
                         spin_lock(&sb_lock);
                 }
-                if (current->flags & PF_FLUSHER)
-                        sb->s_flags &= ~MS_FLUSHING;
                 if (nr_to_write && *nr_to_write == 0)
                         break;
         }
@@ -307,7 +308,7 @@ void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
 }

 /*
- * Called under inode_lock
+ * Called under inode_lock.
  */
 static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes)
 {
@@ -318,7 +319,17 @@ static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes)
                 inode = list_entry(tmp, struct inode, i_list);

                 if (!atomic_read(&inode->i_count)) {
+                        struct backing_dev_info *bdi;
+
+                        bdi = inode->i_mapping->backing_dev_info;
+                        if (current_is_pdflush() && !writeback_acquire(bdi))
+                                goto out;
+
                         __sync_single_inode(inode, 0, NULL);
+
+                        if (current_is_pdflush())
+                                writeback_release(bdi);
+
                         nr_inodes--;

                         /*
@@ -328,7 +339,7 @@ static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes)
                         tmp = head;
                 }
         }
-
+out:
         return nr_inodes;
 }
@@ -421,7 +432,11 @@ void sync_inodes(void)
         }
 }

-void try_to_writeback_unused_inodes(unsigned long pexclusive)
+/*
+ * FIXME: the try_to_writeback_unused functions look dreadfully similar to
+ * writeback_unlocked_inodes...
+ */
+void try_to_writeback_unused_inodes(unsigned long unused)
 {
         struct super_block * sb;
         int nr_inodes = inodes_stat.nr_unused;
@@ -440,7 +455,6 @@ void try_to_writeback_unused_inodes(unsigned long pexclusive)
         }
         spin_unlock(&sb_lock);
         spin_unlock(&inode_lock);
-        clear_bit(0, (unsigned long *)pexclusive);
 }

 /**
...
@@ -404,21 +404,14 @@ void prune_icache(int goal)
         dispose_list(freeable);

         /*
-         * If we didn't freed enough clean inodes schedule
-         * a sync of the dirty inodes, we cannot do it
-         * from here or we're either synchronously dogslow
-         * or we deadlock with oom.
+         * If we didn't free enough clean inodes then schedule writeback of
+         * the dirty inodes. We cannot do it from here or we're either
+         * synchronously dogslow or we deadlock with oom.
          */
-        if (goal) {
-                static unsigned long exclusive;
-                if (!test_and_set_bit(0, &exclusive)) {
-                        if (pdflush_operation(try_to_writeback_unused_inodes,
-                                        (unsigned long)&exclusive))
-                                clear_bit(0, &exclusive);
-                }
-        }
+        if (goal)
+                pdflush_operation(try_to_writeback_unused_inodes, 0);
 }

 /*
  * This is called from kswapd when we think we need some
  * more memory, but aren't really sure how much. So we
...
@@ -112,7 +112,6 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
 #define MS_MOVE         8192
 #define MS_REC          16384
 #define MS_VERBOSE      32768
-#define MS_FLUSHING     (1<<16) /* inodes are currently under writeout */
 #define MS_ACTIVE       (1<<30)
 #define MS_NOUSER       (1<<31)
@@ -156,7 +155,6 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
 #define IS_RDONLY(inode)        ((inode)->i_sb->s_flags & MS_RDONLY)
 #define IS_SYNC(inode)          (__IS_FLG(inode, MS_SYNCHRONOUS) || ((inode)->i_flags & S_SYNC))
 #define IS_MANDLOCK(inode)      __IS_FLG(inode, MS_MANDLOCK)
-#define IS_FLUSHING(inode)      __IS_FLG(inode, MS_FLUSHING)
 #define IS_QUOTAINIT(inode)     ((inode)->i_flags & S_QUOTA)
 #define IS_NOQUOTA(inode)       ((inode)->i_flags & S_NOQUOTA)
...
@@ -12,6 +12,15 @@ extern spinlock_t inode_lock;
 extern struct list_head inode_in_use;
 extern struct list_head inode_unused;

+/*
+ * Yes, writeback.h requires sched.h
+ * No, sched.h is not included from here.
+ */
+static inline int current_is_pdflush(void)
+{
+        return current->flags & PF_FLUSHER;
+}
+
 /*
  * fs/fs-writeback.c
  */
...
@@ -20,6 +20,7 @@
 #include <linux/writeback.h>
 #include <linux/init.h>
 #include <linux/sysrq.h>
+#include <linux/backing-dev.h>

 /*
  * Memory thresholds, in percentages
@@ -86,10 +87,7 @@ void balance_dirty_pages(struct address_space *mapping)
                 wake_pdflush = 1;
         }

-        if (wake_pdflush && !IS_FLUSHING(mapping->host)) {
-                /*
-                 * There is no flush thread against this device.  Start one now.
-                 */
+        if (wake_pdflush && !writeback_in_progress(mapping->backing_dev_info)) {
                 if (dirty_and_writeback > async_thresh) {
                         pdflush_flush(dirty_and_writeback - async_thresh);
                         yield();
...