Commit 0966a7b1 authored by Qu Wenruo, committed by David Sterba

btrfs: scrub: Introduce full stripe lock for RAID56

Unlike mirror-based profiles, RAID5/6 recovery needs to read out the
whole full stripe.

Without proper protection, this can easily cause race conditions.

Introduce two new functions, lock_full_stripe() and unlock_full_stripe(),
for RAID5/6. They keep an rb_tree of mutexes, one per full stripe, so
scrub callers can lock a full stripe and avoid races.
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor comment adjustments ]
Signed-off-by: David Sterba <dsterba@suse.com>
parent fa7aede2
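
To show how the pair is intended to be used, here is a minimal caller sketch. It is not part of this patch; the function name scrub_one_full_stripe_sketch and its surrounding error handling are hypothetical, and only lock_full_stripe(), unlock_full_stripe() and struct btrfs_fs_info come from the code below. The bool out-parameter records whether a lock was actually taken, so the same call sequence is a no-op on non-parity profiles.

static int scrub_one_full_stripe_sketch(struct btrfs_fs_info *fs_info,
                                        u64 logical)
{
        bool full_stripe_locked;
        int ret;

        /* No-op for non-RAID5/6 block groups, takes the per-stripe mutex otherwise */
        ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
        if (ret < 0)
                return ret;

        /* ... read, verify and if needed rebuild the full stripe here ... */

        /* Must run in the same context that took the lock */
        return unlock_full_stripe(fs_info, logical, full_stripe_locked);
}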
@@ -539,6 +539,14 @@ struct btrfs_io_ctl {
unsigned check_crcs:1;
};
/*
* Tree to record all locked full stripes of a RAID5/6 block group
*/
struct btrfs_full_stripe_locks_tree {
struct rb_root root;
struct mutex lock;
};
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
@@ -649,6 +657,9 @@ struct btrfs_block_group_cache {
* Protected by free_space_lock.
*/
int needs_free_space;
/* Record locked full stripes for RAID5/6 block group */
struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
};
/* delayed seq elem */
@@ -3653,6 +3664,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
struct btrfs_device *dev);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress);
static inline void btrfs_init_full_stripe_locks_tree(
struct btrfs_full_stripe_locks_tree *locks_root)
{
locks_root->root = RB_ROOT;
mutex_init(&locks_root->lock);
}
/* dev-replace.c */
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
...
@@ -131,6 +131,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
if (atomic_dec_and_test(&cache->count)) {
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
/*
* If not empty, someone is still holding the mutex of
* full_stripe_lock, which can only be released by the caller.
* That will definitely cause a use-after-free when the caller
* tries to release the full stripe lock.
*
* There is no better way to resolve this than to warn.
*/
WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
kfree(cache);
}
@@ -9917,6 +9927,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
mutex_init(&cache->free_space_lock);
btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
return cache;
}
...
@@ -240,6 +240,13 @@ struct scrub_warning {
struct btrfs_device *dev;
};
struct full_stripe_lock {
struct rb_node node;
u64 logical;
u64 refs;
struct mutex mutex;
};
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -348,6 +355,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
scrub_pause_off(fs_info);
}
/*
* Insert new full stripe lock into full stripe locks tree
*
* Return pointer to existing or newly inserted full_stripe_lock structure if
* everything works well.
* Return ERR_PTR(-ENOMEM) if we failed to allocate memory
*
* NOTE: caller must hold full_stripe_locks_root->lock before calling this
* function
*/
static struct full_stripe_lock *insert_full_stripe_lock(
struct btrfs_full_stripe_locks_tree *locks_root,
u64 fstripe_logical)
{
struct rb_node **p;
struct rb_node *parent = NULL;
struct full_stripe_lock *entry;
struct full_stripe_lock *ret;
WARN_ON(!mutex_is_locked(&locks_root->lock));
p = &locks_root->root.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct full_stripe_lock, node);
if (fstripe_logical < entry->logical) {
p = &(*p)->rb_left;
} else if (fstripe_logical > entry->logical) {
p = &(*p)->rb_right;
} else {
entry->refs++;
return entry;
}
}
/* Insert new lock */
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return ERR_PTR(-ENOMEM);
ret->logical = fstripe_logical;
ret->refs = 1;
mutex_init(&ret->mutex);
rb_link_node(&ret->node, parent, p);
rb_insert_color(&ret->node, &locks_root->root);
return ret;
}
/*
* Search for a full stripe lock of a block group
*
* Return pointer to existing full stripe lock if found
* Return NULL if not found
*/
static struct full_stripe_lock *search_full_stripe_lock(
struct btrfs_full_stripe_locks_tree *locks_root,
u64 fstripe_logical)
{
struct rb_node *node;
struct full_stripe_lock *entry;
WARN_ON(!mutex_is_locked(&locks_root->lock));
node = locks_root->root.rb_node;
while (node) {
entry = rb_entry(node, struct full_stripe_lock, node);
if (fstripe_logical < entry->logical)
node = node->rb_left;
else if (fstripe_logical > entry->logical)
node = node->rb_right;
else
return entry;
}
return NULL;
}
/*
* Helper to get full stripe logical from a normal bytenr.
*
* Caller must ensure @cache is a RAID56 block group.
*/
static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
u64 bytenr)
{
u64 ret;
/*
* Due to chunk item size limit, full stripe length should not be
* larger than U32_MAX. Just a sanity check here.
*/
WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
/*
* round_down() can only handle power of 2, while RAID56 full
* stripe length can be 64KiB * n, so we need to manually round down.
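* For example (illustrative numbers only): with full_stripe_len of 192KiB
* (3 data stripes of 64KiB), a bytenr 500KiB into the block group rounds
* down to the full stripe starting 384KiB into the block group.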
*/
ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
cache->full_stripe_len + cache->key.objectid;
return ret;
}
/*
* Lock a full stripe to avoid concurrency of recovery and read
*
* It's only used for profiles with parity (RAID5/6); for other profiles it
* does nothing.
*
* Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
* The caller must then call unlock_full_stripe() in the same context.
*
* Return <0 if an error was encountered.
*/
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool *locked_ret)
{
struct btrfs_block_group_cache *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *existing;
u64 fstripe_start;
int ret = 0;
*locked_ret = false;
bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg_cache) {
ASSERT(0);
return -ENOENT;
}
/* Profiles not based on parity don't need full stripe lock */
if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
goto out;
locks_root = &bg_cache->full_stripe_locks_root;
fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
/* Now insert the full stripe lock */
mutex_lock(&locks_root->lock);
existing = insert_full_stripe_lock(locks_root, fstripe_start);
mutex_unlock(&locks_root->lock);
if (IS_ERR(existing)) {
ret = PTR_ERR(existing);
goto out;
}
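/*
* The per-full-stripe mutex is what actually serializes scrubbers
* working on the same full stripe; it stays held until the paired
* unlock_full_stripe() call.
*/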
mutex_lock(&existing->mutex);
*locked_ret = true;
out:
btrfs_put_block_group(bg_cache);
return ret;
}
/*
* Unlock a full stripe.
*
* NOTE: The caller must call this from the same context as the corresponding
* lock_full_stripe().
*
* Return 0 if we unlocked the full stripe without problem.
* Return <0 on error.
*/
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool locked)
{
struct btrfs_block_group_cache *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *fstripe_lock;
u64 fstripe_start;
bool freeit = false;
int ret = 0;
/* If we didn't acquire full stripe lock, no need to continue */
if (!locked)
return 0;
bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg_cache) {
ASSERT(0);
return -ENOENT;
}
if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
goto out;
locks_root = &bg_cache->full_stripe_locks_root;
fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
mutex_lock(&locks_root->lock);
fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
/* Unpaired unlock_full_stripe() detected */
if (!fstripe_lock) {
WARN_ON(1);
ret = -ENOENT;
mutex_unlock(&locks_root->lock);
goto out;
}
if (fstripe_lock->refs == 0) {
WARN_ON(1);
btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
fstripe_lock->logical);
} else {
fstripe_lock->refs--;
}
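/*
* Last holder: remove it from the tree now, but only free the memory
* after dropping its mutex below.
*/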
if (fstripe_lock->refs == 0) {
rb_erase(&fstripe_lock->node, &locks_root->root);
freeit = true;
}
mutex_unlock(&locks_root->lock);
mutex_unlock(&fstripe_lock->mutex);
if (freeit)
kfree(fstripe_lock);
out:
btrfs_put_block_group(bg_cache);
return ret;
}
/*
* used for workers that require transaction commits (i.e., for the
* NOCOW case)
...