Commit c3155097 authored by Brian Foster, committed by Darrick J. Wong

xfs: sync eofblocks scans under iolock are livelock prone

The xfs_eofblocks.eof_scan_owner field is an internal field that
facilitates invoking eofb scans from the kernel while under the iolock.
This is necessary because the eofb scan acquires the iolock of each
inode it visits. Synchronous scans are invoked on certain buffered write
failures while under the iolock. In such cases, the scan owner indicates
that the scanning context already owns that particular iolock, which
prevents a double-lock deadlock.

eofblocks scans under the iolock are still livelock prone in the event
of multiple parallel scans, however. If buffered writes to different
inodes fail and invoke eofblocks scans at the same time, each scan
avoids deadlocking against its own inode by virtue of the
eof_scan_owner field, but it can never acquire the iolock of the inode
held across the parallel scan. Because the low free space scans are
invoked with SYNC_WAIT, a scan does not return until it has processed
every tagged inode, and thus both scans spin indefinitely on the iolock
held across the opposite scan. This problem can be reproduced reliably
by generic/224 on systems with relatively high CPU counts (e.g., 16).
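
Concretely, two racing sync scans interleave like this (an illustrative
timeline, not code from the patch):

	/*
	 * Task 1: buffered write to inode A    Task 2: buffered write to inode B
	 * fails, holds iolock(A)               fails, holds iolock(B)
	 * starts SYNC_WAIT eofblocks scan      starts SYNC_WAIT eofblocks scan
	 * skips A (eof_scan_owner == A)        skips B (eof_scan_owner == B)
	 * trylock iolock(B) fails -> -EAGAIN   trylock iolock(A) fails -> -EAGAIN
	 * SYNC_WAIT: revisit inode B ...       SYNC_WAIT: revisit inode A ...
	 *
	 * Neither scan can finish until the other task drops its iolock, and
	 * neither task drops its iolock until its own scan finishes.
	 */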

To avoid this problem, simplify the semantics of eofblocks scans: never
invoke a scan while under the iolock. This means that the buffered
write context must drop the iolock before the scan. It must then
reacquire the lock before the write retry and also repeat the initial
write checks, as the original state may no longer be valid once the
iolock has been dropped.
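
Condensed from the fs/xfs/xfs_file.c hunks below, the buffered write
path now follows this retry pattern (quota handling and error paths
elided):

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	xfs_ilock(ip, iolock);
	ret = xfs_file_aio_write_checks(iocb, from, &iolock);	/* redone per retry */
	...
	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
	if (ret == -ENOSPC && !enospc) {
		struct xfs_eofblocks eofb = {0};

		enospc = 1;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);	/* drop iolock before the scan */
		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
		goto write_retry;		/* relock and redo the checks */
	}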

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
parent a36b9261
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -614,8 +614,10 @@ xfs_file_buffered_aio_write(
 	struct xfs_inode	*ip = XFS_I(inode);
 	ssize_t			ret;
 	int			enospc = 0;
-	int			iolock = XFS_IOLOCK_EXCL;
+	int			iolock;
 
+write_retry:
+	iolock = XFS_IOLOCK_EXCL;
 	xfs_ilock(ip, iolock);
 
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
@@ -625,7 +627,6 @@ xfs_file_buffered_aio_write(
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
 
-write_retry:
 	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
 	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
 	if (likely(ret >= 0))
@@ -641,18 +642,21 @@ xfs_file_buffered_aio_write(
 	 * running at the same time.
 	 */
 	if (ret == -EDQUOT && !enospc) {
+		xfs_iunlock(ip, iolock);
 		enospc = xfs_inode_free_quota_eofblocks(ip);
 		if (enospc)
 			goto write_retry;
 		enospc = xfs_inode_free_quota_cowblocks(ip);
 		if (enospc)
 			goto write_retry;
+		iolock = 0;
 	} else if (ret == -ENOSPC && !enospc) {
 		struct xfs_eofblocks eofb = {0};
 
 		enospc = 1;
 		xfs_flush_inodes(ip->i_mount);
-		eofb.eof_scan_owner = ip->i_ino; /* for locking */
+
+		xfs_iunlock(ip, iolock);
 		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
 		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
 		goto write_retry;
@@ -660,7 +664,8 @@ xfs_file_buffered_aio_write(
 
 	current->backing_dev_info = NULL;
 out:
-	xfs_iunlock(ip, iolock);
+	if (iolock)
+		xfs_iunlock(ip, iolock);
 	return ret;
 }
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1324,11 +1324,8 @@ xfs_inode_free_eofblocks(
 {
 	int ret = 0;
 	struct xfs_eofblocks *eofb = args;
-	bool need_iolock = true;
 	int match;
 
-	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
-
 	if (!xfs_can_free_eofblocks(ip, false)) {
 		/* inode could be preallocated or append-only */
 		trace_xfs_inode_free_eofblocks_invalid(ip);
@@ -1356,27 +1353,19 @@ xfs_inode_free_eofblocks(
 		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
 		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
 			return 0;
-
-		/*
-		 * A scan owner implies we already hold the iolock. Skip it here
-		 * to avoid deadlock.
-		 */
-		if (eofb->eof_scan_owner == ip->i_ino)
-			need_iolock = false;
 	}
 
 	/*
 	 * If the caller is waiting, return -EAGAIN to keep the background
 	 * scanner moving and revisit the inode in a subsequent pass.
 	 */
-	if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
 		if (flags & SYNC_WAIT)
 			ret = -EAGAIN;
 		return ret;
 	}
 	ret = xfs_free_eofblocks(ip);
-	if (need_iolock)
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
 	return ret;
 }
@@ -1423,15 +1412,10 @@ __xfs_inode_free_quota_eofblocks(
 	struct xfs_eofblocks eofb = {0};
 	struct xfs_dquot *dq;
 
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-
 	/*
-	 * Set the scan owner to avoid a potential livelock. Otherwise, the scan
-	 * can repeatedly trylock on the inode we're currently processing. We
-	 * run a sync scan to increase effectiveness and use the union filter to
+	 * Run a sync scan to increase effectiveness and use the union filter to
 	 * cover all applicable quotas in a single scan.
 	 */
-	eofb.eof_scan_owner = ip->i_ino;
 	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
 
 	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
@@ -1583,12 +1567,9 @@ xfs_inode_free_cowblocks(
 {
 	int ret;
 	struct xfs_eofblocks *eofb = args;
-	bool need_iolock = true;
 	int match;
 	struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 
-	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
-
 	/*
 	 * Just clear the tag if we have an empty cow fork or none at all. It's
 	 * possible the inode was fully unshared since it was originally tagged.
@@ -1621,28 +1602,16 @@ xfs_inode_free_cowblocks(
 		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
 		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
 			return 0;
-
-		/*
-		 * A scan owner implies we already hold the iolock. Skip it in
-		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
-		 * the possibility of EAGAIN being returned.
-		 */
-		if (eofb->eof_scan_owner == ip->i_ino)
-			need_iolock = false;
 	}
 
 	/* Free the CoW blocks */
-	if (need_iolock) {
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-	}
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
 
 	ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
 
-	if (need_iolock) {
-		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	}
+	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
 	return ret;
 }
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -27,7 +27,6 @@ struct xfs_eofblocks {
 	kgid_t		eof_gid;
 	prid_t		eof_prid;
 	__u64		eof_min_file_size;
-	xfs_ino_t	eof_scan_owner;
 };
 
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
@@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user(
 	dst->eof_flags = src->eof_flags;
 	dst->eof_prid = src->eof_prid;
 	dst->eof_min_file_size = src->eof_min_file_size;
-	dst->eof_scan_owner = NULLFSINO;
 
 	dst->eof_uid = INVALID_UID;
 	if (src->eof_flags & XFS_EOF_FLAGS_UID) {