Commit 20855e4c authored by Linus Torvalds

Merge tag 'xfs-5.19-fixes-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Darrick Wong:
 "This fixes some stalling problems and corrects the last of the
  problems (I hope) observed during testing of the new atomic xattr
  update feature.

   - Fix statfs blocking on background inode gc workers

   - Fix some broken inode lock assertion code

   - Fix xattr leaf buffer leaks when cancelling a deferred xattr update
     operation

   - Clean up xattr recovery to make it easier to understand.

   - Fix xattr leaf block verifiers tripping over empty blocks.

   - Remove complicated and error prone xattr leaf block bholding mess.

   - Fix a bug where an rt extent crossing EOF was treated as "posteof"
     blocks and cleaned unnecessarily.

   - Fix a UAF when log shutdown races with unmount"

* tag 'xfs-5.19-fixes-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: prevent a UAF when log IO errors race with unmount
  xfs: dont treat rt extents beyond EOF as eofblocks to be cleared
  xfs: don't hold xattr leaf buffers across transaction rolls
  xfs: empty xattr leaf header blocks are not corruption
  xfs: clean up the end of xfs_attri_item_recover
  xfs: always free xattri_leaf_bp when cancelling a deferred op
  xfs: use invalidate_lock to check the state of mmap_lock
  xfs: factor out the common lock flags assert
  xfs: introduce xfs_inodegc_push()
  xfs: bound maximum wait time for inodegc work
parents 69cb6c65 7561cea5
......@@ -50,7 +50,7 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp);
STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args);
/*
* Internal routines when attribute list is more than one block.
......@@ -393,16 +393,10 @@ xfs_attr_sf_addname(
* It won't fit in the shortform, transform to a leaf block. GROT:
* another possible req'mt for a double-split btree op.
*/
error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp);
error = xfs_attr_shortform_to_leaf(args);
if (error)
return error;
/*
* Prevent the leaf buffer from being unlocked so that a concurrent AIL
* push cannot grab the half-baked leaf buffer and run into problems
* with the write verifier.
*/
xfs_trans_bhold(args->trans, attr->xattri_leaf_bp);
attr->xattri_dela_state = XFS_DAS_LEAF_ADD;
out:
trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp);
......@@ -447,11 +441,9 @@ xfs_attr_leaf_addname(
/*
* Use the leaf buffer we may already hold locked as a result of
* a sf-to-leaf conversion. The held buffer is no longer valid
* after this call, regardless of the result.
* a sf-to-leaf conversion.
*/
error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp);
attr->xattri_leaf_bp = NULL;
error = xfs_attr_leaf_try_add(args);
if (error == -ENOSPC) {
error = xfs_attr3_leaf_to_node(args);
......@@ -497,8 +489,6 @@ xfs_attr_node_addname(
struct xfs_da_args *args = attr->xattri_da_args;
int error;
ASSERT(!attr->xattri_leaf_bp);
error = xfs_attr_node_addname_find_attr(attr);
if (error)
return error;
......@@ -1215,24 +1205,14 @@ xfs_attr_restore_rmt_blk(
*/
STATIC int
xfs_attr_leaf_try_add(
struct xfs_da_args *args,
struct xfs_buf *bp)
struct xfs_da_args *args)
{
struct xfs_buf *bp;
int error;
/*
* If the caller provided a buffer to us, it is locked and held in
* the transaction because it just did a shortform to leaf conversion.
* Hence we don't need to read it again. Otherwise read in the leaf
* buffer.
*/
if (bp) {
xfs_trans_bhold_release(args->trans, bp);
} else {
error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
if (error)
return error;
}
/*
* Look up the xattr name to set the insertion point for the new xattr.
......
......@@ -515,11 +515,6 @@ struct xfs_attr_intent {
*/
struct xfs_attri_log_nameval *xattri_nameval;
/*
* Used by xfs_attr_set to hold a leaf buffer across a transaction roll
*/
struct xfs_buf *xattri_leaf_bp;
/* Used to keep track of current state of delayed operation */
enum xfs_delattr_state xattri_dela_state;
......
......@@ -289,6 +289,23 @@ xfs_attr3_leaf_verify_entry(
return NULL;
}
/*
* Validate an attribute leaf block.
*
* Empty leaf blocks can occur under the following circumstances:
*
* 1. setxattr adds a new extended attribute to a file;
* 2. The file has zero existing attributes;
* 3. The attribute is too large to fit in the attribute fork;
* 4. The attribute is small enough to fit in a leaf block;
* 5. A log flush occurs after committing the transaction that creates
* the (empty) leaf block; and
* 6. The filesystem goes down after the log flush but before the new
* attribute can be committed to the leaf block.
*
* Hence we need to ensure that we don't fail the validation purely
* because the leaf is empty.
*/
static xfs_failaddr_t
xfs_attr3_leaf_verify(
struct xfs_buf *bp)
......@@ -310,15 +327,6 @@ xfs_attr3_leaf_verify(
if (fa)
return fa;
/*
* Empty leaf blocks should never occur; they imply the existence of a
* software bug that needs fixing. xfs_repair also flags them as a
* corruption that needs fixing, so we should never let these go to
* disk.
*/
if (ichdr.count == 0)
return __this_address;
/*
* firstused is the block offset of the first name info structure.
* Make sure it doesn't go off the block or crash into the header.
......@@ -922,14 +930,10 @@ xfs_attr_shortform_getvalue(
return -ENOATTR;
}
/*
* Convert from using the shortform to the leaf. On success, return the
* buffer so that we can keep it locked until we're totally done with it.
*/
/* Convert from using the shortform to the leaf format. */
int
xfs_attr_shortform_to_leaf(
struct xfs_da_args *args,
struct xfs_buf **leaf_bp)
struct xfs_da_args *args)
{
struct xfs_inode *dp;
struct xfs_attr_shortform *sf;
......@@ -991,7 +995,6 @@ xfs_attr_shortform_to_leaf(
sfe = xfs_attr_sf_nextentry(sfe);
}
error = 0;
*leaf_bp = bp;
out:
kmem_free(tmpbuffer);
return error;
......
......@@ -49,8 +49,7 @@ void xfs_attr_shortform_create(struct xfs_da_args *args);
void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
struct xfs_buf **leaf_bp);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_sf_removename(struct xfs_da_args *args);
int xfs_attr_sf_findname(struct xfs_da_args *args,
struct xfs_attr_sf_entry **sfep,
......
......@@ -576,7 +576,7 @@ xfs_attri_item_recover(
struct xfs_trans_res tres;
struct xfs_attri_log_format *attrp;
struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
int error, ret = 0;
int error;
int total;
int local;
struct xfs_attrd_log_item *done_item = NULL;
......@@ -655,28 +655,31 @@ xfs_attri_item_recover(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
ret = xfs_xattri_finish_update(attr, done_item);
if (ret == -EAGAIN) {
/* There's more work to do, so add it to this transaction */
error = xfs_xattri_finish_update(attr, done_item);
if (error == -EAGAIN) {
/*
* There's more work to do, so add the intent item to this
* transaction so that we can continue it later.
*/
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
} else
error = ret;
error = xfs_defer_ops_capture_and_commit(tp, capture_list);
if (error)
goto out_unlock;
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
return 0;
}
if (error) {
xfs_trans_cancel(tp);
goto out_unlock;
}
error = xfs_defer_ops_capture_and_commit(tp, capture_list);
out_unlock:
if (attr->xattri_leaf_bp)
xfs_buf_relse(attr->xattri_leaf_bp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
out:
if (ret != -EAGAIN)
xfs_attr_free_item(attr);
return error;
}
......
......@@ -686,6 +686,8 @@ xfs_can_free_eofblocks(
* forever.
*/
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (last_fsb <= end_fsb)
return false;
......
......@@ -440,7 +440,7 @@ xfs_inodegc_queue_all(
for_each_online_cpu(cpu) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list))
queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
}
}
......@@ -1841,8 +1841,8 @@ void
xfs_inodegc_worker(
struct work_struct *work)
{
struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc,
work);
struct xfs_inodegc *gc = container_of(to_delayed_work(work),
struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n;
......@@ -1862,19 +1862,29 @@ xfs_inodegc_worker(
}
/*
* Force all currently queued inode inactivation work to run immediately and
* wait for the work to finish.
* Expedite all pending inodegc work to run immediately. This does not wait for
* completion of the work.
*/
void
xfs_inodegc_flush(
xfs_inodegc_push(
struct xfs_mount *mp)
{
if (!xfs_is_inodegc_enabled(mp))
return;
trace_xfs_inodegc_push(mp, __return_address);
xfs_inodegc_queue_all(mp);
}
/*
* Force all currently queued inode inactivation work to run immediately and
* wait for the work to finish.
*/
void
xfs_inodegc_flush(
struct xfs_mount *mp)
{
xfs_inodegc_push(mp);
trace_xfs_inodegc_flush(mp, __return_address);
xfs_inodegc_queue_all(mp);
flush_workqueue(mp->m_inodegc_wq);
}
......@@ -2014,6 +2024,7 @@ xfs_inodegc_queue(
struct xfs_inodegc *gc;
int items;
unsigned int shrinker_hits;
unsigned long queue_delay = 1;
trace_xfs_inode_set_need_inactive(ip);
spin_lock(&ip->i_flags_lock);
......@@ -2025,19 +2036,26 @@ xfs_inodegc_queue(
items = READ_ONCE(gc->items);
WRITE_ONCE(gc->items, items + 1);
shrinker_hits = READ_ONCE(gc->shrinker_hits);
put_cpu_ptr(gc);
if (!xfs_is_inodegc_enabled(mp))
/*
* We queue the work while holding the current CPU so that the work
* is scheduled to run on this CPU.
*/
if (!xfs_is_inodegc_enabled(mp)) {
put_cpu_ptr(gc);
return;
}
if (xfs_inodegc_want_queue_work(ip, items))
queue_delay = 0;
if (xfs_inodegc_want_queue_work(ip, items)) {
trace_xfs_inodegc_queue(mp, __return_address);
queue_work(mp->m_inodegc_wq, &gc->work);
}
mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
put_cpu_ptr(gc);
if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
trace_xfs_inodegc_throttle(mp, __return_address);
flush_work(&gc->work);
flush_delayed_work(&gc->work);
}
}
......@@ -2054,7 +2072,7 @@ xfs_inodegc_cpu_dead(
unsigned int count = 0;
dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
cancel_work_sync(&dead_gc->work);
cancel_delayed_work_sync(&dead_gc->work);
if (llist_empty(&dead_gc->list))
return;
......@@ -2073,12 +2091,12 @@ xfs_inodegc_cpu_dead(
llist_add_batch(first, last, &gc->list);
count += READ_ONCE(gc->items);
WRITE_ONCE(gc->items, count);
put_cpu_ptr(gc);
if (xfs_is_inodegc_enabled(mp)) {
trace_xfs_inodegc_queue(mp, __return_address);
queue_work(mp->m_inodegc_wq, &gc->work);
mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
}
put_cpu_ptr(gc);
}
/*
......@@ -2173,7 +2191,7 @@ xfs_inodegc_shrinker_scan(
unsigned int h = READ_ONCE(gc->shrinker_hits);
WRITE_ONCE(gc->shrinker_hits, h + 1);
queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
no_items = false;
}
}
......
......@@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp);
void xfs_blockgc_start(struct xfs_mount *mp);
void xfs_inodegc_worker(struct work_struct *work);
void xfs_inodegc_push(struct xfs_mount *mp);
void xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
......
......@@ -131,6 +131,26 @@ xfs_ilock_attr_map_shared(
return lock_mode;
}
/*
* Sanity-check a lock_flags argument on behalf of the inode lock/unlock
* routines.
*
* You can't set both SHARED and EXCL for the same lock,
* and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
* XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
* to set in lock_flags. Bits covered by XFS_LOCK_SUBCLASS_MASK are also
* accepted, and lock_flags must not be zero.
*/
static inline void
xfs_lock_flags_assert(
uint lock_flags)
{
/* Each lock may be requested SHARED or EXCL, but never both at once. */
ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
/* Reject any bit that lies outside the lock and subclass masks. */
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
/* A zero lock_flags value is never valid. */
ASSERT(lock_flags != 0);
}
/*
* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
* multi-reader locks: invalidate_lock and the i_lock. This routine allows
......@@ -168,18 +188,7 @@ xfs_ilock(
{
trace_xfs_ilock(ip, lock_flags, _RET_IP_);
/*
* You can't set both SHARED and EXCL for the same lock,
* and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
* and XFS_ILOCK_EXCL are valid values to set in lock_flags.
*/
ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
down_write_nested(&VFS_I(ip)->i_rwsem,
......@@ -222,18 +231,7 @@ xfs_ilock_nowait(
{
trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
/*
* You can't set both SHARED and EXCL for the same lock,
* and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
* and XFS_ILOCK_EXCL are valid values to set in lock_flags.
*/
ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
......@@ -291,19 +289,7 @@ xfs_iunlock(
xfs_inode_t *ip,
uint lock_flags)
{
/*
* You can't set both SHARED and EXCL for the same lock,
* and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
* and XFS_ILOCK_EXCL are valid values to set in lock_flags.
*/
ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
ASSERT(lock_flags != 0);
xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL)
up_write(&VFS_I(ip)->i_rwsem);
......@@ -379,8 +365,8 @@ xfs_isilocked(
}
if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
(lock_flags & XFS_IOLOCK_SHARED));
return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
(lock_flags & XFS_MMAPLOCK_SHARED));
}
if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
......
......@@ -2092,8 +2092,6 @@ xlog_dealloc_log(
xlog_in_core_t *iclog, *next_iclog;
int i;
xlog_cil_destroy(log);
/*
* Cycle all the iclogbuf locks to make sure all log IO completion
* is done before we tear down these buffers.
......@@ -2105,6 +2103,13 @@ xlog_dealloc_log(
iclog = iclog->ic_next;
}
/*
* Destroy the CIL after waiting for iclog IO completion because an
* iclog EIO error will try to shut down the log, which accesses the
* CIL to wake up the waiters.
*/
xlog_cil_destroy(log);
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
next_iclog = iclog->ic_next;
......
......@@ -61,7 +61,7 @@ struct xfs_error_cfg {
*/
struct xfs_inodegc {
struct llist_head list;
struct work_struct work;
struct delayed_work work;
/* approximate count of inodes in the list */
unsigned int items;
......
......@@ -454,9 +454,12 @@ xfs_qm_scall_getquota(
struct xfs_dquot *dqp;
int error;
/* Flush inodegc work at the start of a quota reporting scan. */
/*
* Expedite pending inodegc work at the start of a quota reporting
* scan but don't block waiting for it to complete.
*/
if (id == 0)
xfs_inodegc_flush(mp);
xfs_inodegc_push(mp);
/*
* Try to get the dquot. We don't want it allocated on disk, so don't
......@@ -498,7 +501,7 @@ xfs_qm_scall_getquota_next(
/* Flush inodegc work at the start of a quota reporting scan. */
if (*id == 0)
xfs_inodegc_flush(mp);
xfs_inodegc_push(mp);
error = xfs_qm_dqget_next(mp, *id, type, &dqp);
if (error)
......
......@@ -797,8 +797,11 @@ xfs_fs_statfs(
xfs_extlen_t lsize;
int64_t ffree;
/* Wait for whatever inactivations are in progress. */
xfs_inodegc_flush(mp);
/*
* Expedite background inodegc but don't wait. We do not want to block
* here waiting hours for a billion extent file to be truncated.
*/
xfs_inodegc_push(mp);
statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
......@@ -1074,7 +1077,7 @@ xfs_inodegc_init_percpu(
gc = per_cpu_ptr(mp->m_inodegc, cpu);
init_llist_head(&gc->list);
gc->items = 0;
INIT_WORK(&gc->work, xfs_inodegc_worker);
INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
}
return 0;
}
......
......@@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \
TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
TP_ARGS(mp, caller_ip))
DEFINE_FS_EVENT(xfs_inodegc_flush);
DEFINE_FS_EVENT(xfs_inodegc_push);
DEFINE_FS_EVENT(xfs_inodegc_start);
DEFINE_FS_EVENT(xfs_inodegc_stop);
DEFINE_FS_EVENT(xfs_inodegc_queue);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment