Commit 86286e48 authored by Linus Torvalds

Merge tag 'for-5.17-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
 "A few fixes and error handling improvements:

   - fix deadlock between quota disable and qgroup rescan worker

   - fix use-after-free after failure to create a snapshot

   - skip warning on unmount after log cleanup failure

   - don't start transaction for scrub if the fs is mounted read-only

   - tree checker verifies item sizes"

* tag 'for-5.17-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: skip reserved bytes warning on unmount after log cleanup failure
  btrfs: fix use of uninitialized variable at rm device ioctl
  btrfs: fix use-after-free after failure to create a snapshot
  btrfs: tree-checker: check item_size for dev_item
  btrfs: tree-checker: check item_size for inode_item
  btrfs: fix deadlock between quota disable and qgroup rescan worker
  btrfs: don't start transaction for scrub if the fs is mounted read-only
parents b0bc0cb8 40cdc509
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -124,6 +124,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
 {
     if (refcount_dec_and_test(&cache->refs)) {
         WARN_ON(cache->pinned > 0);
-        WARN_ON(cache->reserved > 0);
+        /*
+         * If there was a failure to cleanup a log tree, very likely due
+         * to an IO failure on a writeback attempt of one or more of its
+         * extent buffers, we could not do proper (and cheap) unaccounting
+         * of their reserved space, so don't warn on reserved > 0 in that
+         * case.
+         */
+        if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+            !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
+            WARN_ON(cache->reserved > 0);
         /*
@@ -2544,6 +2553,19 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
     int ret;
     bool dirty_bg_running;
 
+    /*
+     * This can only happen when we are doing read-only scrub on read-only
+     * mount.
+     * In that case we should not start a new transaction on read-only fs.
+     * Thus here we skip all chunk allocations.
+     */
+    if (sb_rdonly(fs_info->sb)) {
+        mutex_lock(&fs_info->ro_block_group_mutex);
+        ret = inc_block_group_ro(cache, 0);
+        mutex_unlock(&fs_info->ro_block_group_mutex);
+        return ret;
+    }
+
     do {
         trans = btrfs_join_transaction(root);
         if (IS_ERR(trans))
@@ -3974,9 +3996,22 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
          * important and indicates a real bug if this happens.
          */
         if (WARN_ON(space_info->bytes_pinned > 0 ||
-                    space_info->bytes_reserved > 0 ||
                     space_info->bytes_may_use > 0))
             btrfs_dump_space_info(info, space_info, 0, 0);
+
+        /*
+         * If there was a failure to cleanup a log tree, very likely due
+         * to an IO failure on a writeback attempt of one or more of its
+         * extent buffers, we could not do proper (and cheap) unaccounting
+         * of their reserved space, so don't warn on bytes_reserved > 0 in
+         * that case.
+         */
+        if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+            !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
+            if (WARN_ON(space_info->bytes_reserved > 0))
+                btrfs_dump_space_info(info, space_info, 0, 0);
+        }
+
         WARN_ON(space_info->reclaim_size > 0);
         list_del(&space_info->list);
         btrfs_sysfs_remove_space_info(space_info);
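Aside (editor's illustration, not part of the commit): the btrfs_inc_block_group_ro() hunk above hinges on one invariant: a read-only mount must never start a transaction, so the scrub path flips the block group read-only directly and skips chunk allocation. A minimal self-contained C model of that gate, with every name an illustrative stand-in rather than the kernel API:

    /* Model of the read-only gate; all names are illustrative stand-ins. */
    #include <stdbool.h>
    #include <stdio.h>

    #define EROFS 30

    struct fs { bool rdonly; };

    /* Stand-in for btrfs_join_transaction(): illegal on a read-only fs. */
    static int join_transaction(struct fs *fs)
    {
        return fs->rdonly ? -EROFS : 0;
    }

    /* Stand-in for the core that marks a block group RO: needs no transaction. */
    static int set_block_group_ro(struct fs *fs)
    {
        (void)fs;
        return 0;
    }

    static int make_ro_for_scrub(struct fs *fs)
    {
        if (fs->rdonly)
            return set_block_group_ro(fs);  /* skip chunk allocation entirely */

        /* Read-write mount: may have to allocate a chunk, so join a transaction. */
        int ret = join_transaction(fs);
        if (ret)
            return ret;
        return set_block_group_ro(fs);
    }

    int main(void)
    {
        struct fs ro_mount = { .rdonly = true };
        printf("ro scrub: %d\n", make_ro_for_scrub(&ro_mount));  /* 0, no transaction */
        return 0;
    }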
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -145,6 +145,9 @@ enum {
     BTRFS_FS_STATE_DUMMY_FS_INFO,
     BTRFS_FS_STATE_NO_CSUMS,
+
+    /* Indicates there was an error cleaning up a log tree. */
+    BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
 };
 
 #define BTRFS_BACKREF_REV_MAX 256
@@ -3593,6 +3596,9 @@ do { \
 #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
                                                    &(fs_info)->fs_state)))
+#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
+    (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
+                       &(fs_info)->fs_state)))
 
 __printf(5, 6)
 __cold
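Aside: the new BTRFS_FS_LOG_CLEANUP_ERROR() predicate follows the same shape as the existing BTRFS_FS_ERROR() just above it: a test_bit() on fs_state wrapped in unlikely(). A userspace sketch of that pattern, using a plain bitmask in place of the kernel's atomic bitops (all names are stand-ins):

    #include <stdbool.h>

    enum fs_state_bits { STATE_ERROR, STATE_LOG_CLEANUP_ERROR };

    struct fs_info { unsigned long fs_state; };

    /* Userspace stand-in for test_bit(); the kernel call is an atomic bitop. */
    static inline bool fs_state_test(const struct fs_info *fs, int bit)
    {
        return (fs->fs_state >> bit) & 1UL;
    }

    #define FS_LOG_CLEANUP_ERROR(fs) fs_state_test((fs), STATE_LOG_CLEANUP_ERROR)

    int main(void)
    {
        struct fs_info fs = { .fs_state = 1UL << STATE_LOG_CLEANUP_ERROR };
        return FS_LOG_CLEANUP_ERROR(&fs) ? 0 : 1;
    }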
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -805,10 +805,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
         goto fail;
     }
 
-    spin_lock(&fs_info->trans_lock);
-    list_add(&pending_snapshot->list,
-             &trans->transaction->pending_snapshots);
-    spin_unlock(&fs_info->trans_lock);
+    trans->pending_snapshot = pending_snapshot;
 
     ret = btrfs_commit_transaction(trans);
     if (ret)
@@ -3354,7 +3351,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
     struct block_device *bdev = NULL;
     fmode_t mode;
     int ret;
-    bool cancel;
+    bool cancel = false;
 
     if (!capable(CAP_SYS_ADMIN))
         return -EPERM;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1185,9 +1185,24 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
     struct btrfs_trans_handle *trans = NULL;
     int ret = 0;
 
+    /*
+     * We need to have subvol_sem write locked, to prevent races between
+     * concurrent tasks trying to disable quotas, because we will unlock
+     * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+     */
+    lockdep_assert_held_write(&fs_info->subvol_sem);
+
     mutex_lock(&fs_info->qgroup_ioctl_lock);
     if (!fs_info->quota_root)
         goto out;
+
+    /*
+     * Request qgroup rescan worker to complete and wait for it. This wait
+     * must be done before transaction start for quota disable since it may
+     * deadlock with transaction by the qgroup rescan worker.
+     */
+    clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+    btrfs_qgroup_wait_for_completion(fs_info, false);
     mutex_unlock(&fs_info->qgroup_ioctl_lock);
 
     /*
@@ -1205,14 +1220,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
     if (IS_ERR(trans)) {
         ret = PTR_ERR(trans);
         trans = NULL;
+        set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
         goto out;
     }
 
     if (!fs_info->quota_root)
         goto out;
 
-    clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
-    btrfs_qgroup_wait_for_completion(fs_info, false);
     spin_lock(&fs_info->qgroup_lock);
     quota_root = fs_info->quota_root;
     fs_info->quota_root = NULL;
@@ -3383,6 +3397,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
         btrfs_warn(fs_info,
                    "qgroup rescan init failed, qgroup is not enabled");
         ret = -EINVAL;
+    } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+        /* Quota disable is in progress */
+        ret = -EBUSY;
     }
 
     if (ret) {
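Aside: the three qgroup.c hunks cooperate to break a wait cycle. Quota disable now quiesces the rescan worker before any transaction exists, re-arms the enabled bit if starting the transaction fails, and rescan init refuses with -EBUSY while a disable is in flight. A single-threaded C model of that ordering (stand-in types, no real locking, purely illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    #define EBUSY 16

    struct fs { bool quota_enabled; bool quota_root; bool rescan_running; };

    /* Stand-in for btrfs_qgroup_wait_for_completion(); the real call blocks. */
    static void wait_for_rescan(struct fs *fs)
    {
        fs->rescan_running = false;
    }

    /* Stand-in for qgroup_rescan_init() after this fix. */
    static int rescan_init(struct fs *fs)
    {
        if (!fs->quota_enabled)
            return -EBUSY;              /* quota disable is in progress */
        fs->rescan_running = true;
        return 0;
    }

    static int quota_disable(struct fs *fs)
    {
        if (!fs->quota_root)
            return 0;
        /*
         * Quiesce the worker while holding no transaction. The old ordering
         * waited for the worker after starting a transaction, and the worker
         * could in turn block on that same transaction: a deadlock.
         */
        fs->quota_enabled = false;
        wait_for_rescan(fs);
        /* ...only now start a transaction and delete the quota tree... */
        fs->quota_root = false;
        return 0;
    }

    int main(void)
    {
        struct fs fs = { .quota_enabled = true, .quota_root = true };
        quota_disable(&fs);
        printf("rescan after disable: %d\n", rescan_init(&fs));  /* -16 (EBUSY) */
        return 0;
    }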
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2000,6 +2000,27 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
         btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 }
 
+/*
+ * Add a pending snapshot associated with the given transaction handle to the
+ * respective handle. This must be called after the transaction commit started
+ * and while holding fs_info->trans_lock.
+ * This serves to guarantee a caller of btrfs_commit_transaction() that it can
+ * safely free the pending snapshot pointer in case btrfs_commit_transaction()
+ * returns an error.
+ */
+static void add_pending_snapshot(struct btrfs_trans_handle *trans)
+{
+    struct btrfs_transaction *cur_trans = trans->transaction;
+
+    if (!trans->pending_snapshot)
+        return;
+
+    lockdep_assert_held(&trans->fs_info->trans_lock);
+    ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START);
+
+    list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
 {
     struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2073,6 +2094,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
     if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
         enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
 
+        add_pending_snapshot(trans);
+
         spin_unlock(&fs_info->trans_lock);
         refcount_inc(&cur_trans->use_count);
@@ -2163,6 +2186,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
      * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
      */
     spin_lock(&fs_info->trans_lock);
+    add_pending_snapshot(trans);
     cur_trans->state = TRANS_STATE_COMMIT_DOING;
     spin_unlock(&fs_info->trans_lock);
     wait_event(cur_trans->writer_wait,
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -123,6 +123,8 @@ struct btrfs_trans_handle {
     struct btrfs_transaction *transaction;
     struct btrfs_block_rsv *block_rsv;
     struct btrfs_block_rsv *orig_rsv;
+    /* Set by a task that wants to create a snapshot. */
+    struct btrfs_pending_snapshot *pending_snapshot;
     refcount_t use_count;
     unsigned int type;
     /*
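Aside: taken with the earlier ioctl.c hunk, the transaction.{c,h} changes move ownership of the pending snapshot. create_snapshot() now only parks a pointer on its own handle, and the commit path links it into the shared list once the commit is genuinely underway, so a failed commit can no longer leave a freed snapshot on the transaction's list. A self-contained model of that handoff (illustrative names, a plain singly linked list in place of the kernel's list_head):

    #include <stdlib.h>

    struct pending_snapshot { struct pending_snapshot *next; };
    struct transaction { struct pending_snapshot *pending_snapshots; };
    struct trans_handle {
        struct transaction *transaction;
        struct pending_snapshot *pending_snapshot;  /* set by the creator */
    };

    /* Stand-in for add_pending_snapshot(): runs only once the commit proceeds. */
    static void add_pending_snapshot(struct trans_handle *h)
    {
        if (!h->pending_snapshot)
            return;
        h->pending_snapshot->next = h->transaction->pending_snapshots;
        h->transaction->pending_snapshots = h->pending_snapshot;
    }

    /* Stand-in for btrfs_commit_transaction(). */
    static int commit_transaction(struct trans_handle *h, int early_error)
    {
        if (early_error)
            return -1;           /* the snapshot never reached the shared list */
        add_pending_snapshot(h); /* point of no return: the transaction owns it */
        return 0;
    }

    int main(void)
    {
        struct transaction t = { 0 };
        struct pending_snapshot *ps = calloc(1, sizeof(*ps));
        struct trans_handle h = { &t, ps };

        if (commit_transaction(&h, /*early_error=*/1))
            free(ps);  /* safe: the list never saw it, unlike the old code */
        return 0;
    }

The design point is that add_pending_snapshot() marks the handoff: before it runs, the caller still owns the allocation and may free it on error.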
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -965,6 +965,7 @@ static int check_dev_item(struct extent_buffer *leaf,
                           struct btrfs_key *key, int slot)
 {
     struct btrfs_dev_item *ditem;
+    const u32 item_size = btrfs_item_size(leaf, slot);
 
     if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) {
         dev_item_err(leaf, slot,
@@ -972,6 +973,13 @@ static int check_dev_item(struct extent_buffer *leaf,
                      key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
         return -EUCLEAN;
     }
+
+    if (unlikely(item_size != sizeof(*ditem))) {
+        dev_item_err(leaf, slot, "invalid item size: has %u expect %zu",
+                     item_size, sizeof(*ditem));
+        return -EUCLEAN;
+    }
+
     ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
     if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) {
         dev_item_err(leaf, slot,
@@ -1007,6 +1015,7 @@ static int check_inode_item(struct extent_buffer *leaf,
     struct btrfs_inode_item *iitem;
     u64 super_gen = btrfs_super_generation(fs_info->super_copy);
     u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
+    const u32 item_size = btrfs_item_size(leaf, slot);
     u32 mode;
     int ret;
     u32 flags;
@@ -1016,6 +1025,12 @@
     if (unlikely(ret < 0))
         return ret;
 
+    if (unlikely(item_size != sizeof(*iitem))) {
+        generic_err(leaf, slot, "invalid item size: has %u expect %zu",
+                    item_size, sizeof(*iitem));
+        return -EUCLEAN;
+    }
+
     iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
 
     /* Here we use super block generation + 1 to handle log tree */
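Aside: both tree-checker hunks enforce the same rule: before casting an item's bytes to a fixed-size structure, the recorded item size must match that structure exactly, or later field reads would run past the item's bounds inside the leaf. A compact model, with a made-up 16-byte item standing in for btrfs_dev_item/btrfs_inode_item:

    #include <stdint.h>
    #include <stdio.h>

    #define EUCLEAN 117  /* "structure needs cleaning": corrupt metadata */

    /* Illustrative fixed-size on-disk item, not a real btrfs structure. */
    struct fixed_item { uint64_t id; uint64_t bytes; };

    static int check_item_size(uint32_t item_size)
    {
        if (item_size != sizeof(struct fixed_item)) {
            fprintf(stderr, "invalid item size: has %u expect %zu\n",
                    item_size, sizeof(struct fixed_item));
            return -EUCLEAN;
        }
        return 0;
    }

    int main(void)
    {
        check_item_size(8);  /* short item: rejected before any field access */
        return check_item_size(sizeof(struct fixed_item)) ? 1 : 0;
    }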
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3414,6 +3414,29 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
     if (log->node) {
         ret = walk_log_tree(trans, log, &wc);
         if (ret) {
+            /*
+             * We weren't able to traverse the entire log tree, the
+             * typical scenario is getting an -EIO when reading an
+             * extent buffer of the tree, due to a previous writeback
+             * failure of it.
+             */
+            set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+                    &log->fs_info->fs_state);
+
+            /*
+             * Some extent buffers of the log tree may still be dirty
+             * and not yet written back to storage, because we may
+             * have updates to a log tree without syncing a log tree,
+             * such as during rename and link operations. So flush
+             * them out and wait for their writeback to complete, so
+             * that we properly cleanup their state and pages.
+             */
+            btrfs_write_marked_extents(log->fs_info,
+                                       &log->dirty_log_pages,
+                                       EXTENT_DIRTY | EXTENT_NEW);
+            btrfs_wait_tree_log_extents(log,
+                                        EXTENT_DIRTY | EXTENT_NEW);
+
             if (trans)
                 btrfs_abort_transaction(trans, ret);
             else
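Aside: the free_log_tree() error path gives up only on the cheap unaccounting of reserved space; it still records the new state bit (which silences the unmount-time warnings patched above) and flushes and waits on any dirty log extent buffers so their pages and state are released. A rough model of that sequence, with stand-in helpers and a single buffer in place of an extent-state tree:

    #include <stdbool.h>
    #include <stdio.h>

    struct ebuf { bool dirty; bool writeback; };
    struct fs { unsigned long state; struct ebuf log_buf; };

    #define STATE_LOG_CLEANUP_ERROR (1UL << 0)

    /* Stand-in for btrfs_write_marked_extents(): start writeback of dirty buffers. */
    static void write_marked(struct ebuf *e)
    {
        if (e->dirty) { e->dirty = false; e->writeback = true; }
    }

    /* Stand-in for btrfs_wait_tree_log_extents(): the real call blocks on IO. */
    static void wait_writeback(struct ebuf *e)
    {
        e->writeback = false;
    }

    static void free_log_tree_error_path(struct fs *fs)
    {
        fs->state |= STATE_LOG_CLEANUP_ERROR;  /* silences reserved-bytes warnings */
        write_marked(&fs->log_buf);            /* don't leak dirty pages/state */
        wait_writeback(&fs->log_buf);
    }

    int main(void)
    {
        struct fs fs = { .log_buf = { .dirty = true } };
        free_log_tree_error_path(&fs);
        printf("dirty=%d writeback=%d\n", fs.log_buf.dirty, fs.log_buf.writeback);
        return 0;
    }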