Commit 02511a5a authored by Dave Chinner, committed by Darrick J. Wong

xfs: clean up inode reclaim comments

Inode reclaim is quite different now to the way described in various
comments, so update all the comments explaining what it does and how
it works.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
parent 4d0bab3a
...@@ -141,11 +141,8 @@ xfs_inode_free( ...@@ -141,11 +141,8 @@ xfs_inode_free(
} }
/* /*
* Queue a new inode reclaim pass if there are reclaimable inodes and there * Queue background inode reclaim work if there are reclaimable inodes and there
* isn't a reclaim pass already in progress. By default it runs every 5s based * isn't reclaim work already scheduled or in progress.
* on the xfs periodic sync default of 30s. Perhaps this should have it's own
* tunable, but that can be done if this method proves to be ineffective or too
* aggressive.
*/ */
static void static void
xfs_reclaim_work_queue( xfs_reclaim_work_queue(
...@@ -600,48 +597,31 @@ xfs_iget_cache_miss( ...@@ -600,48 +597,31 @@ xfs_iget_cache_miss(
} }
/* /*
* Look up an inode by number in the given file system. * Look up an inode by number in the given file system. The inode is looked up
* The inode is looked up in the cache held in each AG. * in the cache held in each AG. If the inode is found in the cache, initialise
* If the inode is found in the cache, initialise the vfs inode * the vfs inode if necessary.
* if necessary.
* *
* If it is not in core, read it in from the file system's device, * If it is not in core, read it in from the file system's device, add it to the
* add it to the cache and initialise the vfs inode. * cache and initialise the vfs inode.
* *
* The inode is locked according to the value of the lock_flags parameter. * The inode is locked according to the value of the lock_flags parameter.
* This flag parameter indicates how and if the inode's IO lock and inode lock * Inode lookup is only done during metadata operations and not as part of the
* should be taken. * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
*
* mp -- the mount point structure for the current file system. It points
* to the inode hash table.
* tp -- a pointer to the current transaction if there is one. This is
* simply passed through to the xfs_iread() call.
* ino -- the number of the inode desired. This is the unique identifier
* within the file system for the inode being requested.
* lock_flags -- flags indicating how to lock the inode. See the comment
* for xfs_ilock() for a list of valid values.
*/ */
int int
xfs_iget( xfs_iget(
xfs_mount_t *mp, struct xfs_mount *mp,
xfs_trans_t *tp, struct xfs_trans *tp,
xfs_ino_t ino, xfs_ino_t ino,
uint flags, uint flags,
uint lock_flags, uint lock_flags,
xfs_inode_t **ipp) struct xfs_inode **ipp)
{ {
xfs_inode_t *ip; struct xfs_inode *ip;
int error; struct xfs_perag *pag;
xfs_perag_t *pag; xfs_agino_t agino;
xfs_agino_t agino; int error;
/*
* xfs_reclaim_inode() uses the ILOCK to ensure an inode
* doesn't get freed while it's being referenced during a
* radix tree traversal here. It assumes this function
* aqcuires only the ILOCK (and therefore it has no need to
* involve the IOLOCK in this synchronization).
*/
ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
/* reject inode numbers outside existing AGs */ /* reject inode numbers outside existing AGs */
...@@ -758,15 +738,7 @@ xfs_inode_walk_ag_grab( ...@@ -758,15 +738,7 @@ xfs_inode_walk_ag_grab(
ASSERT(rcu_read_lock_held()); ASSERT(rcu_read_lock_held());
/* /* Check for stale RCU freed inode */
* check for stale RCU freed inode
*
* If the inode has been reallocated, it doesn't matter if it's not in
* the AG we are walking - we are walking for writeback, so if it
* passes all the "valid inode" checks and is dirty, then we'll write
* it back anyway. If it has been reallocated and still being
* initialised, the XFS_INEW check below will catch it.
*/
spin_lock(&ip->i_flags_lock); spin_lock(&ip->i_flags_lock);
if (!ip->i_ino) if (!ip->i_ino)
goto out_unlock_noent; goto out_unlock_noent;
...@@ -1044,43 +1016,16 @@ xfs_reclaim_inode_grab( ...@@ -1044,43 +1016,16 @@ xfs_reclaim_inode_grab(
} }
/* /*
* Inodes in different states need to be treated differently. The following * Inode reclaim is non-blocking, so the default action if progress cannot be
* table lists the inode states and the reclaim actions necessary: * made is to "requeue" the inode for reclaim by unlocking it and clearing the
* * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
* inode state iflush ret required action * blocking anymore and hence we can wait for the inode to be able to reclaim
* --------------- ---------- --------------- * it.
* bad - reclaim
* shutdown EIO unpin and reclaim
* clean, unpinned 0 reclaim
* stale, unpinned 0 reclaim
* clean, pinned(*) 0 requeue
* stale, pinned EAGAIN requeue
* dirty, async - requeue
* dirty, sync 0 reclaim
* *
* (*) dgc: I don't think the clean, pinned state is possible but it gets * We do no IO here - if callers require inodes to be cleaned they must push the
* handled anyway given the order of checks implemented. * AIL first to trigger writeback of dirty inodes. This enables writeback to be
* * done in the background in a non-blocking manner, and enables memory reclaim
* Also, because we get the flush lock first, we know that any inode that has * to make progress without blocking.
* been flushed delwri has had the flush completed by the time we check that
* the inode is clean.
*
* Note that because the inode is flushed delayed write by AIL pushing, the
* flush lock may already be held here and waiting on it can result in very
* long latencies. Hence for sync reclaims, where we wait on the flush lock,
* the caller should push the AIL first before trying to reclaim inodes to
* minimise the amount of time spent waiting. For background relaim, we only
* bother to reclaim clean inodes anyway.
*
* Hence the order of actions after gaining the locks should be:
* bad => reclaim
* shutdown => unpin and reclaim
* pinned, async => requeue
* pinned, sync => unpin
* stale => reclaim
* clean => reclaim
* dirty, async => requeue
* dirty, sync => flush, wait and reclaim
*/ */
static void static void
xfs_reclaim_inode( xfs_reclaim_inode(
...@@ -1271,13 +1216,11 @@ xfs_reclaim_inodes( ...@@ -1271,13 +1216,11 @@ xfs_reclaim_inodes(
} }
/* /*
* Scan a certain number of inodes for reclaim. * The shrinker infrastructure determines how many inodes we should scan for
* * reclaim. We want as many clean inodes ready to reclaim as possible, so we
* When called we make sure that there is a background (fast) inode reclaim in * push the AIL here. We also want to proactively free up memory if we can to
* progress, while we will throttle the speed of reclaim via doing synchronous * minimise the amount of work memory reclaim has to do so we kick the
* reclaim of inodes. That means if we come across dirty inodes, we wait for * background reclaim if it isn't already scheduled.
* them to be cleaned, which we hope will not be very long due to the
* background walker having already kicked the IO off on those dirty inodes.
*/ */
long long
xfs_reclaim_inodes_nr( xfs_reclaim_inodes_nr(
...@@ -1390,8 +1333,7 @@ xfs_inode_matches_eofb( ...@@ -1390,8 +1333,7 @@ xfs_inode_matches_eofb(
* This is a fast pass over the inode cache to try to get reclaim moving on as * This is a fast pass over the inode cache to try to get reclaim moving on as
* many inodes as possible in a short period of time. It kicks itself every few * many inodes as possible in a short period of time. It kicks itself every few
* seconds, as well as being kicked by the inode cache shrinker when memory * seconds, as well as being kicked by the inode cache shrinker when memory
* goes low. It scans as quickly as possible avoiding locked inodes or those * goes low.
* already being flushed, and once done schedules a future pass.
*/ */
void void
xfs_reclaim_worker( xfs_reclaim_worker(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment