[PATCH] Don't remove inode from hash until filesystem has

From: Neil Brown <neilb@cse.unsw.edu.au> When an NFS request arrives, it contains a filehandle which needs to be converted to a dentry. Many filesystems use find_exported_dentry in fs/exportfs/expfs.c. A key part of this on filesystem where a 32bit inode number uniquely locates a file is export_iget which calls iget(sb, inum). iget will either: 1/ find the inode in the inode cache and return it or 2/ create a new inode and call ->read_inode to load it from the storage device. export_iget then verifies the inode is really a good inode (->read_inode didn't detect any problems) and the right inode (base on generation number from the file handle). For this to work reliably, it is important that whenever an inode is *not* in the cache, the on-device version is up-to-date. Otherwise, when read_inode loads the inode it will get bad data. For a file that has not been deleted, this condition always holds: a dirty inode is always flushed to disc before the inode is unhashed. However for a file that is being deleted this condition doesn't (didn't) hold. When iput -> iput_final -> generic_drop_inode -> generic_delete_inode is called we would unhash the inode before calling into the filesytem through ->delete_inode. So there is a small window between when generic_delete_inode unhashes the inode, and when ->delete_inode writes something to disc, where a call to ->read_inode (for export_iget) might discover what it thinks is a valid inode, but is really one that is in the process of being destroyed. It is this window that I want to close by moving the unhashing to the end of generic_delete_inode.

[PATCH] Don't remove inode from hash until filesystem has
From: Neil Brown <neilb@cse.unsw.edu.au> When an NFS request arrives, it contains a filehandle which needs to be converted to a dentry. Many filesystems use find_exported_dentry in fs/exportfs/expfs.c. A key part of this on filesystem where a 32bit inode number uniquely locates a file is export_iget which calls iget(sb, inum). iget will either: 1/ find the inode in the inode cache and return it or 2/ create a new inode and call ->read_inode to load it from the storage device. export_iget then verifies the inode is really a good inode (->read_inode didn't detect any problems) and the right inode (base on generation number from the file handle). For this to work reliably, it is important that whenever an inode is *not* in the cache, the on-device version is up-to-date. Otherwise, when read_inode loads the inode it will get bad data. For a file that has not been deleted, this condition always holds: a dirty inode is always flushed to disc before the inode is unhashed. However for a file that is being deleted this condition doesn't (didn't) hold. When iput -> iput_final -> generic_drop_inode -> generic_delete_inode is called we would unhash the inode before calling into the filesytem through ->delete_inode. So there is a small window between when generic_delete_inode unhashes the inode, and when ->delete_inode writes something to disc, where a call to ->read_inode (for export_iget) might discover what it thinks is a valid inode, but is really one that is in the process of being destroyed. It is this window that I want to close by moving the unhashing to the end of generic_delete_inode.
d6686d54 · Andrew Morton · Linus Torvalds · 2eb4051e · d6686d54 · d6686d54
Commit d6686d54 authored May 25, 2003 by Andrew Morton Committed by Linus Torvalds May 25, 2003
Show whitespace changes
Inline Side-by-side

Showing with 55 additions and 2 deletions

fs/fs-writeback.c fs/fs-writeback.c +2 -1

fs/inode.c fs/inode.c +53 -1

No files found.
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -90,7 +90,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * Only add valid (hashed) inodes to the superblock's
 		 * dirty list.  Add blockdev inodes as well.
 		 */
-		if (hlist_unhashed(&inode->i_hash) && !S_ISBLK(inode->i_mode))
+		if ((hlist_unhashed(&inode->i_hash) || (inode->i_state & (I_FREEING|I_CLEAR)))
+		    && !S_ISBLK(inode->i_mode))
 			goto out;

 		/*

--- a/fs/inode.c
+++ b/fs/inode.c
@@ -470,6 +470,7 @@ static int shrink_icache_memory(int nr, unsigned int gfp_mask)
 	return inodes_stat.nr_unused;
 }

+static void __wait_on_freeing_inode(struct inode *inode);
 /*
 * Called with the inode lock held.
 * NOTE: we are not increasing the inode-refcount, you must call __iget()
@@ -481,6 +482,7 @@ static struct inode * find_inode(struct super_block * sb, struct hlist_head *hea
 	struct hlist_node *node;
 	struct inode * inode = NULL;

+repeat:
 	hlist_for_each (node, head) { 
 		prefetch(node->next);
 		inode = hlist_entry(node, struct inode, i_hash);
@@ -488,6 +490,10 @@ static struct inode * find_inode(struct super_block * sb, struct hlist_head *hea
 			continue;
 		if (!test(inode, data))
 			continue;
+		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+			__wait_on_freeing_inode(inode);
+			goto repeat;
+		}
 		break;
 	}
 	return node ? inode : NULL;
@@ -502,6 +508,7 @@ static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head
 	struct hlist_node *node;
 	struct inode * inode = NULL;

+repeat:
 	hlist_for_each (node, head) {
 		prefetch(node->next);
 		inode = list_entry(node, struct inode, i_hash);
@@ -509,6 +516,10 @@ static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head
 			continue;
 		if (inode->i_sb != sb)
 			continue;
+		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+			__wait_on_freeing_inode(inode);
+			goto repeat;
+		}
 		break;
 	}
 	return node ? inode : NULL;
@@ -937,11 +948,22 @@ void remove_inode_hash(struct inode *inode)
 	spin_unlock(&inode_lock);
 }

+/*
+ * Tell the filesystem that this inode is no longer of any interest and should
+ * be completely destroyed.
+ *
+ * We leave the inode in the inode hash table until *after* the filesystem's
+ * ->delete_inode completes.  This ensures that an iget (such as nfsd might
+ * instigate) will always find up-to-date information either in the hash or on
+ * disk.
+ *
+ * I_FREEING is set so that no-one will take a new reference to the inode while
+ * it is being deleted.
+ */
 void generic_delete_inode(struct inode *inode)
 {
 	struct super_operations *op = inode->i_sb->s_op;

-	hlist_del_init(&inode->i_hash);
 	list_del_init(&inode->i_list);
 	inode->i_state|=I_FREEING;
 	inodes_stat.nr_inodes--;
@@ -960,6 +982,10 @@ void generic_delete_inode(struct inode *inode)
 		delete(inode);
 	} else
 		clear_inode(inode);
+	spin_lock(&inode_lock);
+	hlist_del_init(&inode->i_hash);
+	spin_unlock(&inode_lock);
+	wake_up_inode(inode);
 	if (inode->i_state != I_CLEAR)
 		BUG();
 	destroy_inode(inode);
@@ -1233,6 +1259,32 @@ void __wait_on_inode(struct inode *inode)
 	__set_current_state(TASK_RUNNING);
 }

+/*
+ * If we try to find an inode in the inode hash while it is being deleted, we
+ * have to wait until the filesystem completes its deletion before reporting
+ * that it isn't found.  This is because iget will immediately call
+ * ->read_inode, and we want to be sure that evidence of the deletion is found
+ * by ->read_inode.
+ *
+ * This call might return early if an inode which shares the waitq is woken up.
+ * This is most easily handled by the caller which will loop around again
+ * looking for the inode.
+ *
+ * This is called with inode_lock held.
+ */
+static void __wait_on_freeing_inode(struct inode *inode)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	wait_queue_head_t *wq = i_waitq_head(inode);
+
+	add_wait_queue(wq, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_unlock(&inode_lock);
+	schedule();
+	remove_wait_queue(wq, &wait);
+	spin_lock(&inode_lock);
+}
+
 void wake_up_inode(struct inode *inode)
 {
 	wait_queue_head_t *wq = i_waitq_head(inode);