Commit 1a3e8f3d authored by Dave Chinner, committed by Dave Chinner

xfs: convert inode cache lookups to use RCU locking

With delayed logging greatly increasing the sustained parallelism of inode
operations, the inode cache locking is showing significant read vs write
contention when inode reclaim runs at the same time as lookups. There are
also a lot more write lock acquisitions than there are read locks (4:1 ratio)
so the read locking is not really buying us much in the way of parallelism.

To avoid the read vs write contention, change the cache to use RCU locking on
the read side. To avoid needing to RCU free every single inode, use the built
in slab RCU freeing mechanism. This requires us to be able to detect lookups of
freed inodes, so ensure that every freed inode has an inode number of zero and
the XFS_IRECLAIM flag set. We already check the XFS_IRECLAIM flag in the cache
hit lookup path, but add a check for a zero inode number as well.

We can then convert all the read locking lookups to use RCU read side locking
and hence remove all read side locking.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
parent d95b7aaf
...@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab( ...@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
{ {
struct inode *inode = VFS_I(ip); struct inode *inode = VFS_I(ip);
ASSERT(rcu_read_lock_held());
/*
* check for stale RCU freed inode
*
* If the inode has been reallocated, it doesn't matter if it's not in
* the AG we are walking - we are walking for writeback, so if it
* passes all the "valid inode" checks and is dirty, then we'll write
* it back anyway. If it has been reallocated and still being
* initialised, the XFS_INEW check below will catch it.
*/
spin_lock(&ip->i_flags_lock);
if (!ip->i_ino)
goto out_unlock_noent;
/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
goto out_unlock_noent;
spin_unlock(&ip->i_flags_lock);
/* nothing to sync during shutdown */ /* nothing to sync during shutdown */
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return EFSCORRUPTED; return EFSCORRUPTED;
/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
return ENOENT;
/* If we can't grab the inode, it must on it's way to reclaim. */ /* If we can't grab the inode, it must on it's way to reclaim. */
if (!igrab(inode)) if (!igrab(inode))
return ENOENT; return ENOENT;
...@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab( ...@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
/* inode is valid */ /* inode is valid */
return 0; return 0;
out_unlock_noent:
spin_unlock(&ip->i_flags_lock);
return ENOENT;
} }
STATIC int STATIC int
...@@ -98,12 +118,12 @@ xfs_inode_ag_walk( ...@@ -98,12 +118,12 @@ xfs_inode_ag_walk(
int error = 0; int error = 0;
int i; int i;
read_lock(&pag->pag_ici_lock); rcu_read_lock();
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)batch, first_index, (void **)batch, first_index,
XFS_LOOKUP_BATCH); XFS_LOOKUP_BATCH);
if (!nr_found) { if (!nr_found) {
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
break; break;
} }
...@@ -118,18 +138,26 @@ xfs_inode_ag_walk( ...@@ -118,18 +138,26 @@ xfs_inode_ag_walk(
batch[i] = NULL; batch[i] = NULL;
/* /*
* Update the index for the next lookup. Catch overflows * Update the index for the next lookup. Catch
* into the next AG range which can occur if we have inodes * overflows into the next AG range which can occur if
* in the last block of the AG and we are currently * we have inodes in the last block of the AG and we
* pointing to the last inode. * are currently pointing to the last inode.
*
* Because we may see inodes that are from the wrong AG
* due to RCU freeing and reallocation, only update the
* index if it lies in this AG. It was a race that lead
* us to see this inode, so another lookup from the
* same index will not find it again.
*/ */
if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
done = 1; done = 1;
} }
/* unlock now we've grabbed the inodes. */ /* unlock now we've grabbed the inodes. */
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
for (i = 0; i < nr_found; i++) { for (i = 0; i < nr_found; i++) {
if (!batch[i]) if (!batch[i])
...@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab( ...@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
struct xfs_inode *ip, struct xfs_inode *ip,
int flags) int flags)
{ {
ASSERT(rcu_read_lock_held());
/* quick check for stale RCU freed inode */
if (!ip->i_ino)
return 1;
/* /*
* do some unlocked checks first to avoid unnecceary lock traffic. * do some unlocked checks first to avoid unnecessary lock traffic.
* The first is a flush lock check, the second is a already in reclaim * The first is a flush lock check, the second is a already in reclaim
* check. Only do these checks if we are not going to block on locks. * check. Only do these checks if we are not going to block on locks.
*/ */
...@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab( ...@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
* The radix tree lock here protects a thread in xfs_iget from racing * The radix tree lock here protects a thread in xfs_iget from racing
* with us starting reclaim on the inode. Once we have the * with us starting reclaim on the inode. Once we have the
* XFS_IRECLAIM flag set it will not touch us. * XFS_IRECLAIM flag set it will not touch us.
*
* Due to RCU lookup, we may find inodes that have been freed and only
* have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
* aren't candidates for reclaim at all, so we must check the
* XFS_IRECLAIMABLE is set first before proceeding to reclaim.
*/ */
spin_lock(&ip->i_flags_lock); spin_lock(&ip->i_flags_lock);
ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { __xfs_iflags_test(ip, XFS_IRECLAIM)) {
/* ignore as it is already under reclaim */ /* not a reclaim candidate. */
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
return 1; return 1;
} }
...@@ -864,14 +902,14 @@ xfs_reclaim_inodes_ag( ...@@ -864,14 +902,14 @@ xfs_reclaim_inodes_ag(
struct xfs_inode *batch[XFS_LOOKUP_BATCH]; struct xfs_inode *batch[XFS_LOOKUP_BATCH];
int i; int i;
write_lock(&pag->pag_ici_lock); rcu_read_lock();
nr_found = radix_tree_gang_lookup_tag( nr_found = radix_tree_gang_lookup_tag(
&pag->pag_ici_root, &pag->pag_ici_root,
(void **)batch, first_index, (void **)batch, first_index,
XFS_LOOKUP_BATCH, XFS_LOOKUP_BATCH,
XFS_ICI_RECLAIM_TAG); XFS_ICI_RECLAIM_TAG);
if (!nr_found) { if (!nr_found) {
write_unlock(&pag->pag_ici_lock); rcu_read_unlock();
break; break;
} }
...@@ -891,14 +929,24 @@ xfs_reclaim_inodes_ag( ...@@ -891,14 +929,24 @@ xfs_reclaim_inodes_ag(
* occur if we have inodes in the last block of * occur if we have inodes in the last block of
* the AG and we are currently pointing to the * the AG and we are currently pointing to the
* last inode. * last inode.
*
* Because we may see inodes that are from the
* wrong AG due to RCU freeing and
* reallocation, only update the index if it
* lies in this AG. It was a race that lead us
* to see this inode, so another lookup from
* the same index will not find it again.
*/ */
if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
pag->pag_agno)
continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
done = 1; done = 1;
} }
/* unlock now we've grabbed the inodes. */ /* unlock now we've grabbed the inodes. */
write_unlock(&pag->pag_ici_lock); rcu_read_unlock();
for (i = 0; i < nr_found; i++) { for (i = 0; i < nr_found; i++) {
if (!batch[i]) if (!batch[i])
......
...@@ -80,6 +80,7 @@ xfs_inode_alloc( ...@@ -80,6 +80,7 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush)); ASSERT(completion_done(&ip->i_flush));
ASSERT(ip->i_ino == 0);
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
lockdep_set_class_and_name(&ip->i_iolock.mr_lock, lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
...@@ -98,9 +99,6 @@ xfs_inode_alloc( ...@@ -98,9 +99,6 @@ xfs_inode_alloc(
ip->i_size = 0; ip->i_size = 0;
ip->i_new_size = 0; ip->i_new_size = 0;
/* prevent anyone from using this yet */
VFS_I(ip)->i_state = I_NEW;
return ip; return ip;
} }
...@@ -159,6 +157,16 @@ xfs_inode_free( ...@@ -159,6 +157,16 @@ xfs_inode_free(
ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush)); ASSERT(completion_done(&ip->i_flush));
/*
* Because we use RCU freeing we need to ensure the inode always
* appears to be reclaimed with an invalid inode number when in the
* free state. The ip->i_flags_lock provides the barrier against lookup
* races.
*/
spin_lock(&ip->i_flags_lock);
ip->i_flags = XFS_IRECLAIM;
ip->i_ino = 0;
spin_unlock(&ip->i_flags_lock);
call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free); call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free);
} }
...@@ -169,14 +177,29 @@ static int ...@@ -169,14 +177,29 @@ static int
xfs_iget_cache_hit( xfs_iget_cache_hit(
struct xfs_perag *pag, struct xfs_perag *pag,
struct xfs_inode *ip, struct xfs_inode *ip,
xfs_ino_t ino,
int flags, int flags,
int lock_flags) __releases(pag->pag_ici_lock) int lock_flags) __releases(RCU)
{ {
struct inode *inode = VFS_I(ip); struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount; struct xfs_mount *mp = ip->i_mount;
int error; int error;
/*
* check for re-use of an inode within an RCU grace period due to the
* radix tree nodes not being updated yet. We monitor for this by
* setting the inode number to zero before freeing the inode structure.
* If the inode has been reallocated and set up, then the inode number
* will not match, so check for that, too.
*/
spin_lock(&ip->i_flags_lock); spin_lock(&ip->i_flags_lock);
if (ip->i_ino != ino) {
trace_xfs_iget_skip(ip);
XFS_STATS_INC(xs_ig_frecycle);
error = EAGAIN;
goto out_error;
}
/* /*
* If we are racing with another cache hit that is currently * If we are racing with another cache hit that is currently
...@@ -219,7 +242,7 @@ xfs_iget_cache_hit( ...@@ -219,7 +242,7 @@ xfs_iget_cache_hit(
ip->i_flags |= XFS_IRECLAIM; ip->i_flags |= XFS_IRECLAIM;
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
error = -inode_init_always(mp->m_super, inode); error = -inode_init_always(mp->m_super, inode);
if (error) { if (error) {
...@@ -227,7 +250,7 @@ xfs_iget_cache_hit( ...@@ -227,7 +250,7 @@ xfs_iget_cache_hit(
* Re-initializing the inode failed, and we are in deep * Re-initializing the inode failed, and we are in deep
* trouble. Try to re-add it to the reclaim list. * trouble. Try to re-add it to the reclaim list.
*/ */
read_lock(&pag->pag_ici_lock); rcu_read_lock();
spin_lock(&ip->i_flags_lock); spin_lock(&ip->i_flags_lock);
ip->i_flags &= ~XFS_INEW; ip->i_flags &= ~XFS_INEW;
...@@ -261,7 +284,7 @@ xfs_iget_cache_hit( ...@@ -261,7 +284,7 @@ xfs_iget_cache_hit(
/* We've got a live one. */ /* We've got a live one. */
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
trace_xfs_iget_hit(ip); trace_xfs_iget_hit(ip);
} }
...@@ -275,7 +298,7 @@ xfs_iget_cache_hit( ...@@ -275,7 +298,7 @@ xfs_iget_cache_hit(
out_error: out_error:
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
return error; return error;
} }
...@@ -397,7 +420,7 @@ xfs_iget( ...@@ -397,7 +420,7 @@ xfs_iget(
xfs_agino_t agino; xfs_agino_t agino;
/* reject inode numbers outside existing AGs */ /* reject inode numbers outside existing AGs */
if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return EINVAL; return EINVAL;
/* get the perag structure and ensure that it's inode capable */ /* get the perag structure and ensure that it's inode capable */
...@@ -406,15 +429,15 @@ xfs_iget( ...@@ -406,15 +429,15 @@ xfs_iget(
again: again:
error = 0; error = 0;
read_lock(&pag->pag_ici_lock); rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, agino); ip = radix_tree_lookup(&pag->pag_ici_root, agino);
if (ip) { if (ip) {
error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
if (error) if (error)
goto out_error_or_again; goto out_error_or_again;
} else { } else {
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
XFS_STATS_INC(xs_ig_missed); XFS_STATS_INC(xs_ig_missed);
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
......
...@@ -2000,16 +2000,32 @@ xfs_ifree_cluster( ...@@ -2000,16 +2000,32 @@ xfs_ifree_cluster(
*/ */
for (i = 0; i < ninodes; i++) { for (i = 0; i < ninodes; i++) {
retry: retry:
read_lock(&pag->pag_ici_lock); rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, ip = radix_tree_lookup(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, (inum + i))); XFS_INO_TO_AGINO(mp, (inum + i)));
/* Inode not in memory or stale, nothing to do */ /* Inode not in memory, nothing to do */
if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { if (!ip) {
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
continue; continue;
} }
/*
* because this is an RCU protected lookup, we could
* find a recently freed or even reallocated inode
* during the lookup. We need to check under the
* i_flags_lock for a valid inode here. Skip it if it
* is not valid, the wrong inode or stale.
*/
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != inum + i ||
__xfs_iflags_test(ip, XFS_ISTALE)) {
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
continue;
}
spin_unlock(&ip->i_flags_lock);
/* /*
* Don't try to lock/unlock the current inode, but we * Don't try to lock/unlock the current inode, but we
* _cannot_ skip the other inodes that we did not find * _cannot_ skip the other inodes that we did not find
...@@ -2019,11 +2035,11 @@ xfs_ifree_cluster( ...@@ -2019,11 +2035,11 @@ xfs_ifree_cluster(
*/ */
if (ip != free_ip && if (ip != free_ip &&
!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
delay(1); delay(1);
goto retry; goto retry;
} }
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
xfs_iflock(ip); xfs_iflock(ip);
xfs_iflags_set(ip, XFS_ISTALE); xfs_iflags_set(ip, XFS_ISTALE);
...@@ -2629,7 +2645,7 @@ xfs_iflush_cluster( ...@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
read_lock(&pag->pag_ici_lock); rcu_read_lock();
/* really need a gang lookup range call here */ /* really need a gang lookup range call here */
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
first_index, inodes_per_cluster); first_index, inodes_per_cluster);
...@@ -2640,9 +2656,21 @@ xfs_iflush_cluster( ...@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
iq = ilist[i]; iq = ilist[i];
if (iq == ip) if (iq == ip)
continue; continue;
/* if the inode lies outside this cluster, we're done. */
if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) /*
break; * because this is an RCU protected lookup, we could find a
* recently freed or even reallocated inode during the lookup.
* We need to check under the i_flags_lock for a valid inode
* here. Skip it if it is not valid or the wrong inode.
*/
spin_lock(&ip->i_flags_lock);
if (!ip->i_ino ||
(XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
spin_unlock(&ip->i_flags_lock);
continue;
}
spin_unlock(&ip->i_flags_lock);
/* /*
* Do an un-protected check to see if the inode is dirty and * Do an un-protected check to see if the inode is dirty and
* is a candidate for flushing. These checks will be repeated * is a candidate for flushing. These checks will be repeated
...@@ -2692,7 +2720,7 @@ xfs_iflush_cluster( ...@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
} }
out_free: out_free:
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
kmem_free(ilist); kmem_free(ilist);
out_put: out_put:
xfs_perag_put(pag); xfs_perag_put(pag);
...@@ -2704,7 +2732,7 @@ xfs_iflush_cluster( ...@@ -2704,7 +2732,7 @@ xfs_iflush_cluster(
* Corruption detected in the clustering loop. Invalidate the * Corruption detected in the clustering loop. Invalidate the
* inode buffer and shut down the filesystem. * inode buffer and shut down the filesystem.
*/ */
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
/* /*
* Clean up the buffer. If it was B_DELWRI, just release it -- * Clean up the buffer. If it was B_DELWRI, just release it --
* brelse can handle it with no problems. If not, shut down the * brelse can handle it with no problems. If not, shut down the
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment