Commit 17a74e88 authored by Andrew Morton, committed by Arnaldo Carvalho de Melo

[PATCH] pdflush exclusion

Use the pdflush exclusion infrastructure to ensure that only one
pdflush thread is ever performing writeback against a particular
request_queue.
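
As a quick illustration, the exclusion pattern this patch applies in
__sync_list() and __try_to_writeback_unused_list() boils down to the
sketch below.  This is only a minimal sketch assembled from the hunks in
this commit; flush_one_mapping() is a made-up caller, not a kernel
function, and writeback_acquire()/writeback_release() are assumed to be
the simple per-backing_dev_info try-lock helpers from the pdflush
exclusion infrastructure.

	/* Sketch: a pdflush thread skips a backing device that another
	 * pdflush thread already owns; other callers always proceed. */
	static void flush_one_mapping(struct address_space *mapping,
				      int really_sync, int *nr_to_write)
	{
		struct backing_dev_info *bdi = mapping->backing_dev_info;

		if (current_is_pdflush() && !writeback_acquire(bdi))
			return;		/* someone else is flushing this queue */

		/* inode_lock handling omitted; the real callers hold it here */
		__writeback_single_inode(mapping->host, really_sync, nr_to_write);

		if (current_is_pdflush())
			writeback_release(bdi);
	}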

This works rather well.  It requires a lot of activity against a lot of
disks to cause more pdflush threads to start up.  Possibly the
thread-creation logic is a little weak: it starts more threads when a
pdflush thread goes back to sleep.  It may be better to start new
threads within pdflush_operation().

All non-request_queue-backed address_spaces share the global
default_backing_dev_info structure.  So at present only a single
pdflush instance will be available for background writeback of *all*
NFS filesystems (for example).

If there is benefit in concurrent background writeback for multiple NFS
mounts then NFS would need to create per-mount backing_dev_info
structures and install them into new inodes' address_spaces in some
manner.
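
Purely as a sketch of what that might look like (this is not part of this
patch; the extra backing_dev_info field in struct nfs_server and the helper
name nfs_attach_bdi() are assumptions for illustration only):

	/* Hypothetical: one backing_dev_info per NFS mount */
	struct nfs_server {
		/* ... existing fields ... */
		struct backing_dev_info backing_dev_info;
	};

	static void nfs_attach_bdi(struct nfs_server *server, struct inode *inode)
	{
		/* a per-mount bdi makes pdflush exclusion per mount, not global */
		inode->i_mapping->backing_dev_info = &server->backing_dev_info;
	}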
parent 1f6acea0
@@ -187,6 +187,9 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
static void
__writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
{
if (current_is_pdflush() && (inode->i_state & I_LOCK))
return;
while (inode->i_state & I_LOCK) {
__iget(inode);
spin_unlock(&inode_lock);
@@ -213,6 +216,9 @@ void writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
* had their first dirtying at a time earlier than *older_than_this.
*
* Called under inode_lock.
*
* If we're a pdflush thread, then implement pdflush collision avoidance
* against the entire list.
*/
static void __sync_list(struct list_head *head, int sync_mode,
int *nr_to_write, unsigned long *older_than_this)
@@ -223,6 +229,8 @@ static void __sync_list(struct list_head *head, int sync_mode,
while ((tmp = head->prev) != head) {
struct inode *inode = list_entry(tmp, struct inode, i_list);
struct address_space *mapping = inode->i_mapping;
struct backing_dev_info *bdi;
int really_sync;
/* Was this inode dirtied after __sync_list was called? */
@@ -233,10 +241,18 @@ static void __sync_list(struct list_head *head, int sync_mode,
time_after(mapping->dirtied_when, *older_than_this))
break;
bdi = mapping->backing_dev_info;
if (current_is_pdflush() && !writeback_acquire(bdi))
break;
really_sync = (sync_mode == WB_SYNC_ALL);
if ((sync_mode == WB_SYNC_LAST) && (head->prev == head))
really_sync = 1;
__writeback_single_inode(inode, really_sync, nr_to_write);
if (current_is_pdflush())
writeback_release(bdi);
if (nr_to_write && *nr_to_write == 0)
break;
}
@@ -255,6 +271,8 @@ static void __sync_list(struct list_head *head, int sync_mode,
*
* If `older_than_this' is non-zero then only flush inodes which have a
* flushtime older than *older_than_this.
*
* This is a "memory cleansing" operation, not a "data integrity" operation.
*/
void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
unsigned long *older_than_this)
@@ -276,29 +294,12 @@ void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
if (sb->s_writeback_gen == writeback_gen)
continue;
sb->s_writeback_gen = writeback_gen;
if (current->flags & PF_FLUSHER) {
if (sb->s_flags & MS_FLUSHING) {
/*
* There's no point in two pdflush threads
* flushing the same device. But for other
* callers, we want to perform the flush
* because the fdatasync is how we implement
* writer throttling.
*/
continue;
}
sb->s_flags |= MS_FLUSHING;
}
if (!list_empty(&sb->s_dirty)) {
spin_unlock(&sb_lock);
__sync_list(&sb->s_dirty, sync_mode,
nr_to_write, older_than_this);
spin_lock(&sb_lock);
}
if (current->flags & PF_FLUSHER)
sb->s_flags &= ~MS_FLUSHING;
if (nr_to_write && *nr_to_write == 0)
break;
}
@@ -307,7 +308,7 @@ void writeback_unlocked_inodes(int *nr_to_write, int sync_mode,
}
/*
* Called under inode_lock
* Called under inode_lock.
*/
static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes)
{
@@ -318,7 +319,17 @@ static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes)
inode = list_entry(tmp, struct inode, i_list);
if (!atomic_read(&inode->i_count)) {
struct backing_dev_info *bdi;
bdi = inode->i_mapping->backing_dev_info;
if (current_is_pdflush() && !writeback_acquire(bdi))
goto out;
__sync_single_inode(inode, 0, NULL);
if (current_is_pdflush())
writeback_release(bdi);
nr_inodes--;
/*
@@ -328,7 +339,7 @@ static int __try_to_writeback_unused_list(struct list_head *head, int nr_inodes)
tmp = head;
}
}
out:
return nr_inodes;
}
@@ -421,7 +432,11 @@ void sync_inodes(void)
}
}
void try_to_writeback_unused_inodes(unsigned long pexclusive)
/*
* FIXME: the try_to_writeback_unused functions look dreadfully similar to
* writeback_unlocked_inodes...
*/
void try_to_writeback_unused_inodes(unsigned long unused)
{
struct super_block * sb;
int nr_inodes = inodes_stat.nr_unused;
@@ -440,7 +455,6 @@ void try_to_writeback_unused_inodes(unsigned long pexclusive)
}
spin_unlock(&sb_lock);
spin_unlock(&inode_lock);
clear_bit(0, (unsigned long *)pexclusive);
}
/**
@@ -404,21 +404,14 @@ void prune_icache(int goal)
dispose_list(freeable);
/*
* If we didn't freed enough clean inodes schedule
* a sync of the dirty inodes, we cannot do it
* from here or we're either synchronously dogslow
* or we deadlock with oom.
*/
if (goal) {
static unsigned long exclusive;
if (!test_and_set_bit(0, &exclusive)) {
if (pdflush_operation(try_to_writeback_unused_inodes,
(unsigned long)&exclusive))
clear_bit(0, &exclusive);
}
}
* If we didn't free enough clean inodes then schedule writeback of
* the dirty inodes. We cannot do it from here or we're either
* synchronously dogslow or we deadlock with oom.
*/
if (goal)
pdflush_operation(try_to_writeback_unused_inodes, 0);
}
/*
* This is called from kswapd when we think we need some
* more memory, but aren't really sure how much. So we
@@ -112,7 +112,6 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
#define MS_MOVE 8192
#define MS_REC 16384
#define MS_VERBOSE 32768
#define MS_FLUSHING (1<<16) /* inodes are currently under writeout */
#define MS_ACTIVE (1<<30)
#define MS_NOUSER (1<<31)
@@ -156,7 +155,6 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || ((inode)->i_flags & S_SYNC))
#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK)
#define IS_FLUSHING(inode) __IS_FLG(inode, MS_FLUSHING)
#define IS_QUOTAINIT(inode) ((inode)->i_flags & S_QUOTA)
#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
@@ -12,6 +12,15 @@ extern spinlock_t inode_lock;
extern struct list_head inode_in_use;
extern struct list_head inode_unused;
/*
* Yes, writeback.h requires sched.h
* No, sched.h is not included from here.
*/
static inline int current_is_pdflush(void)
{
return current->flags & PF_FLUSHER;
}
/*
* fs/fs-writeback.c
*/
@@ -20,6 +20,7 @@
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/sysrq.h>
#include <linux/backing-dev.h>
/*
* Memory thresholds, in percentages
@@ -86,10 +87,7 @@ void balance_dirty_pages(struct address_space *mapping)
wake_pdflush = 1;
}
if (wake_pdflush && !IS_FLUSHING(mapping->host)) {
/*
* There is no flush thread against this device. Start one now.
*/
if (wake_pdflush && !writeback_in_progress(mapping->backing_dev_info)) {
if (dirty_and_writeback > async_thresh) {
pdflush_flush(dirty_and_writeback - async_thresh);
yield();