Commit 73beece9 authored by Liu Bo, committed by David Sterba

Btrfs: fix lockdep deadlock warning due to dev_replace

Xfstests btrfs/011 complains about a deadlock warning,

[ 1226.649039] =========================================================
[ 1226.649039] [ INFO: possible irq lock inversion dependency detected ]
[ 1226.649039] 4.1.0+ #270 Not tainted
[ 1226.649039] ---------------------------------------------------------
[ 1226.652955] kswapd0/46 just changed the state of lock:
[ 1226.652955]  (&delayed_node->mutex){+.+.-.}, at: [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0
[ 1226.652955] but this lock took another, RECLAIM_FS-unsafe lock in the past:
[ 1226.652955]  (&fs_info->dev_replace.lock){+.+.+.}

and interrupts could create inverse lock ordering between them.

[ 1226.652955]
other info that might help us debug this:
[ 1226.652955] Chain exists of:
  &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock

[ 1226.652955]  Possible interrupt unsafe locking scenario:

[ 1226.652955]        CPU0                    CPU1
[ 1226.652955]        ----                    ----
[ 1226.652955]   lock(&fs_info->dev_replace.lock);
[ 1226.652955]                                local_irq_disable();
[ 1226.652955]                                lock(&delayed_node->mutex);
[ 1226.652955]                                lock(&found->groups_sem);
[ 1226.652955]   <Interrupt>
[ 1226.652955]     lock(&delayed_node->mutex);
[ 1226.652955]
 *** DEADLOCK ***

Commit 084b6e7c ("btrfs: Fix a lockdep warning when running xfstest.") tried
to fix a similar one that has the exactly same warning, but with that, we still
run to this.

The above lock chain comes from
btrfs_commit_transaction
  ->btrfs_run_delayed_items
    ...
    ->__btrfs_update_delayed_inode
      ...
      ->__btrfs_cow_block
         ...
         ->find_free_extent
            ->cache_block_group
              ->load_free_space_cache
                ->btrfs_readpages
                  ->submit_one_bio
                    ...
                    ->__btrfs_map_block
                      ->btrfs_dev_replace_lock

However, under high memory pressure, tasks holding dev_replace.lock can be
interrupted by kswapd, which then tries to release memory occupied by
superblocks, inodes and dentries; in doing so it may call evict_inode, which
leads to

[ 1226.652955]  [<ffffffff81458735>] __btrfs_release_delayed_node+0x45/0x1d0
[ 1226.652955]  [<ffffffff81459e74>] btrfs_remove_delayed_node+0x24/0x30
[ 1226.652955]  [<ffffffff8140c5fe>] btrfs_evict_inode+0x34e/0x700

delayed_node->mutex may be acquired in __btrfs_release_delayed_node(), which leads
to an ABBA deadlock.

To fix this, we can use the "blocking rwlock" scheme already used for extent_buffer
locking, but things are simpler here since we only need to be able to turn the
read-side spinlock into a blocking lock.

With this, btrfs/011 no longer produces warnings in dmesg.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 18558cae
@@ -1002,8 +1002,10 @@ struct btrfs_dev_replace {
 	pid_t lock_owner;
 	atomic_t nesting_level;
 	struct mutex lock_finishing_cancel_unmount;
-	struct mutex lock_management_lock;
-	struct mutex lock;
+	rwlock_t lock;
+	atomic_t read_locks;
+	atomic_t blocking_readers;
+	wait_queue_head_t read_lock_wq;
 
 	struct btrfs_scrub_progress scrub_progress;
 };
......
This diff is collapsed.
@@ -34,8 +34,11 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
 void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
 int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
 int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw);
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw);
+void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_clear_lock_blocking(
+					struct btrfs_dev_replace *dev_replace);
 
 static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
 {
......
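The bodies of the four helpers declared above are in the collapsed part of the diff (presumably fs/btrfs/dev-replace.c). As a rough sketch of the scheme the commit message describes, built only from the fields added to struct btrfs_dev_replace, the read lock can spin by default and be switched into a blocking section roughly like this (an assumed reconstruction, not the verbatim kernel change):

/* Sketch only -- assumed reconstruction, not the verbatim dev-replace.c code. */
#include <linux/atomic.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
#include "ctree.h"	/* struct btrfs_dev_replace with the new fields */

void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
{
	if (rw == 1) {
		/* write side: wait until no reader is in a blocking section */
again:
		wait_event(dev_replace->read_lock_wq,
			   atomic_read(&dev_replace->blocking_readers) == 0);
		write_lock(&dev_replace->lock);
		if (atomic_read(&dev_replace->blocking_readers)) {
			/* a reader went blocking before we got the lock */
			write_unlock(&dev_replace->lock);
			goto again;
		}
	} else {
		/* read side: plain spinning read lock */
		read_lock(&dev_replace->lock);
		atomic_inc(&dev_replace->read_locks);
	}
}

void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
{
	if (rw == 1) {
		write_unlock(&dev_replace->lock);
	} else {
		atomic_dec(&dev_replace->read_locks);
		read_unlock(&dev_replace->lock);
	}
}

/* drop the spinlock but keep writers out: the read section may now sleep */
void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace)
{
	atomic_inc(&dev_replace->blocking_readers);
	read_unlock(&dev_replace->lock);
}

/* re-take the spinlock and let a waiting writer proceed if we were the last */
void btrfs_dev_replace_clear_lock_blocking(struct btrfs_dev_replace *dev_replace)
{
	read_lock(&dev_replace->lock);
	if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
	    waitqueue_active(&dev_replace->read_lock_wq))
		wake_up(&dev_replace->read_lock_wq);
}

The __btrfs_map_block hunks further down show the intended calling pattern: take the lock with rw == 0, switch it to blocking mode only while a replace is actually ongoing, and pair the clear + unlock at the out: label; the write side (rw == 1) would be taken by the replace operation itself in the collapsed part of the patch.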
@@ -2272,9 +2272,11 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
 	fs_info->dev_replace.lock_owner = 0;
 	atomic_set(&fs_info->dev_replace.nesting_level, 0);
 	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
-	mutex_init(&fs_info->dev_replace.lock_management_lock);
-	mutex_init(&fs_info->dev_replace.lock);
+	rwlock_init(&fs_info->dev_replace.lock);
+	atomic_set(&fs_info->dev_replace.read_locks, 0);
+	atomic_set(&fs_info->dev_replace.blocking_readers, 0);
 	init_waitqueue_head(&fs_info->replace_wait);
+	init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
 }
 
 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
......
@@ -396,7 +396,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	}
 
 	/* insert extent in reada_tree + all per-device trees, all or nothing */
-	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
 	if (ret == -EEXIST) {
@@ -404,12 +404,12 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 		BUG_ON(!re_exist);
 		re_exist->refcnt++;
 		spin_unlock(&fs_info->reada_lock);
-		btrfs_dev_replace_unlock(&fs_info->dev_replace);
+		btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
 		goto error;
 	}
 	if (ret) {
 		spin_unlock(&fs_info->reada_lock);
-		btrfs_dev_replace_unlock(&fs_info->dev_replace);
+		btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
 		goto error;
 	}
 	prev_dev = NULL;
@@ -456,12 +456,12 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 			BUG_ON(fs_info == NULL);
 			radix_tree_delete(&fs_info->reada_tree, index);
 			spin_unlock(&fs_info->reada_lock);
-			btrfs_dev_replace_unlock(&fs_info->dev_replace);
+			btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
 			goto error;
 		}
 	}
 	spin_unlock(&fs_info->reada_lock);
-	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
 
 	btrfs_put_bbio(bbio);
 	return re;
......
@@ -3857,16 +3857,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		return -EIO;
 	}
 
-	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
 	if (dev->scrub_device ||
 	    (!is_dev_replace &&
 	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
-		btrfs_dev_replace_unlock(&fs_info->dev_replace);
+		btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		return -EINPROGRESS;
 	}
-	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
 
 	ret = scrub_workers_get(fs_info, is_dev_replace);
 	if (ret) {
......
@@ -1714,12 +1714,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
 
 	num_devices = root->fs_info->fs_devices->num_devices;
-	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+	btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
 	if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
 		WARN_ON(num_devices < 1);
 		num_devices--;
 	}
-	btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+	btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
 		ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
@@ -3686,12 +3686,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	}
 
 	num_devices = fs_info->fs_devices->num_devices;
-	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
 		BUG_ON(num_devices < 1);
 		num_devices--;
 	}
-	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
 	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
 	if (num_devices == 1)
 		allowed |= BTRFS_BLOCK_GROUP_DUP;
@@ -5062,10 +5062,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		ret = 1;
 	free_extent_map(em);
 
-	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
 	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
 		ret++;
-	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
 
 	return ret;
 }
@@ -5325,10 +5325,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	if (!bbio_ret)
 		goto out;
 
-	btrfs_dev_replace_lock(dev_replace);
+	btrfs_dev_replace_lock(dev_replace, 0);
 	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
 	if (!dev_replace_is_ongoing)
-		btrfs_dev_replace_unlock(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 0);
+	else
+		btrfs_dev_replace_set_lock_blocking(dev_replace);
 
 	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
 	    !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
@@ -5751,8 +5753,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		bbio->mirror_num = map->num_stripes + 1;
 	}
 out:
-	if (dev_replace_is_ongoing)
-		btrfs_dev_replace_unlock(dev_replace);
+	if (dev_replace_is_ongoing) {
+		btrfs_dev_replace_clear_lock_blocking(dev_replace);
+		btrfs_dev_replace_unlock(dev_replace, 0);
+	}
 	free_extent_map(em);
 	return ret;
 }
......