Commit 195a49ea authored by Filipe Manana, committed by David Sterba

btrfs: fix race between writes to swap files and scrub

When we activate a swap file, at btrfs_swap_activate(), we acquire the
exclusive operation lock to prevent the physical location of the swap
file extents from being changed by operations such as balance and device
replace/resize/remove. There we also call can_nocow_extent() which,
among other things, checks if the block group of a swap file extent is
currently RO, and if it is we can not use the extent, since a write
into it would result in COWing the extent.

However, we have no protection against a scrub operation running after we
activate the swap file, which can result in the swap file extents being
COWed while the scrub is running and operating on the respective block
group, because scrub turns a block group into RO before it processes it
and then back again to RW mode after processing it. That means an attempt
to write into a swap file extent while scrub is processing the respective
block group will result in COWing the extent, changing its physical
location on disk.

Fix this by making sure that block groups that have extents that are used
by active swap files can not be turned into RO mode, therefore making it
not possible for a scrub to turn them into RO mode. When a scrub finds a
block group that can not be turned to RO due to the existence of extents
used by swap files, it proceeds to the next block group and logs a warning
message that mentions the block group was skipped due to active swap
files - this is the same approach we currently use for balance.

Fixes: ed46ff3d ("Btrfs: support swap files")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 20903032
...@@ -1162,6 +1162,11 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) ...@@ -1162,6 +1162,11 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
spin_lock(&sinfo->lock); spin_lock(&sinfo->lock);
spin_lock(&cache->lock); spin_lock(&cache->lock);
if (cache->swap_extents) {
ret = -ETXTBSY;
goto out;
}
if (cache->ro) { if (cache->ro) {
cache->ro++; cache->ro++;
ret = 0; ret = 0;
...@@ -2307,7 +2312,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, ...@@ -2307,7 +2312,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
} }
ret = inc_block_group_ro(cache, 0); ret = inc_block_group_ro(cache, 0);
if (!do_chunk_alloc) if (!do_chunk_alloc || ret == -ETXTBSY)
goto unlock_out; goto unlock_out;
if (!ret) if (!ret)
goto out; goto out;
...@@ -2316,6 +2321,8 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, ...@@ -2316,6 +2321,8 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
if (ret < 0) if (ret < 0)
goto out; goto out;
ret = inc_block_group_ro(cache, 0); ret = inc_block_group_ro(cache, 0);
if (ret == -ETXTBSY)
goto unlock_out;
out: out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
...@@ -3406,6 +3413,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) ...@@ -3406,6 +3413,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(list_empty(&block_group->io_list)); ASSERT(list_empty(&block_group->io_list));
ASSERT(list_empty(&block_group->bg_list)); ASSERT(list_empty(&block_group->bg_list));
ASSERT(refcount_read(&block_group->refs) == 1); ASSERT(refcount_read(&block_group->refs) == 1);
ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group); btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock); spin_lock(&info->block_group_cache_lock);
...@@ -3472,3 +3480,26 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) ...@@ -3472,3 +3480,26 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
__btrfs_remove_free_space_cache(block_group->free_space_ctl); __btrfs_remove_free_space_cache(block_group->free_space_ctl);
} }
} }
/*
 * Account one more swap file extent against the block group @bg.
 *
 * The check of the read-only state and the counter update are done together
 * under bg->lock. If the block group is currently read-only the counter is
 * left untouched and false is returned; otherwise the counter is incremented
 * and true is returned.
 */
bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
{
	bool accounted;

	spin_lock(&bg->lock);
	/* Only a block group that is not read-only may take a swap extent. */
	accounted = !bg->ro;
	if (accounted)
		bg->swap_extents++;
	spin_unlock(&bg->lock);

	return accounted;
}
/*
 * Drop @amount swap file extents from the counter of block group @bg.
 *
 * The counter is updated under bg->lock. The caller is expected to pass an
 * amount no larger than what was previously accounted for this block group.
 */
void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
{
spin_lock(&bg->lock);
/* A block group with swap extents accounted is not expected to be read-only. */
ASSERT(!bg->ro);
/* Guard against dropping more extents than were accounted. */
ASSERT(bg->swap_extents >= amount);
bg->swap_extents -= amount;
spin_unlock(&bg->lock);
}
...@@ -186,6 +186,12 @@ struct btrfs_block_group { ...@@ -186,6 +186,12 @@ struct btrfs_block_group {
/* Flag indicating this block group is placed on a sequential zone */ /* Flag indicating this block group is placed on a sequential zone */
bool seq_zone; bool seq_zone;
/*
* Number of extents in this block group used for swap files.
* All accesses protected by the spinlock 'lock'.
*/
int swap_extents;
/* Record locked full stripes for RAID5/6 block group */ /* Record locked full stripes for RAID5/6 block group */
struct btrfs_full_stripe_locks_tree full_stripe_locks_root; struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
...@@ -312,4 +318,7 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache) ...@@ -312,4 +318,7 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
void btrfs_freeze_block_group(struct btrfs_block_group *cache); void btrfs_freeze_block_group(struct btrfs_block_group *cache);
void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); void btrfs_unfreeze_block_group(struct btrfs_block_group *cache);
bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg);
void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount);
#endif /* BTRFS_BLOCK_GROUP_H */ #endif /* BTRFS_BLOCK_GROUP_H */
...@@ -524,6 +524,11 @@ struct btrfs_swapfile_pin { ...@@ -524,6 +524,11 @@ struct btrfs_swapfile_pin {
* points to a struct btrfs_device. * points to a struct btrfs_device.
*/ */
bool is_block_group; bool is_block_group;
/*
* Only used when 'is_block_group' is true and it is the number of
* extents used by a swapfile for this block group ('ptr' field).
*/
int bg_extent_count;
}; };
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
......
...@@ -10192,6 +10192,7 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, ...@@ -10192,6 +10192,7 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
sp->ptr = ptr; sp->ptr = ptr;
sp->inode = inode; sp->inode = inode;
sp->is_block_group = is_block_group; sp->is_block_group = is_block_group;
sp->bg_extent_count = 1;
spin_lock(&fs_info->swapfile_pins_lock); spin_lock(&fs_info->swapfile_pins_lock);
p = &fs_info->swapfile_pins.rb_node; p = &fs_info->swapfile_pins.rb_node;
...@@ -10205,6 +10206,8 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, ...@@ -10205,6 +10206,8 @@ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
(sp->ptr == entry->ptr && sp->inode > entry->inode)) { (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
p = &(*p)->rb_right; p = &(*p)->rb_right;
} else { } else {
if (is_block_group)
entry->bg_extent_count++;
spin_unlock(&fs_info->swapfile_pins_lock); spin_unlock(&fs_info->swapfile_pins_lock);
kfree(sp); kfree(sp);
return 1; return 1;
...@@ -10230,8 +10233,11 @@ static void btrfs_free_swapfile_pins(struct inode *inode) ...@@ -10230,8 +10233,11 @@ static void btrfs_free_swapfile_pins(struct inode *inode)
sp = rb_entry(node, struct btrfs_swapfile_pin, node); sp = rb_entry(node, struct btrfs_swapfile_pin, node);
if (sp->inode == inode) { if (sp->inode == inode) {
rb_erase(&sp->node, &fs_info->swapfile_pins); rb_erase(&sp->node, &fs_info->swapfile_pins);
if (sp->is_block_group) if (sp->is_block_group) {
btrfs_dec_block_group_swap_extents(sp->ptr,
sp->bg_extent_count);
btrfs_put_block_group(sp->ptr); btrfs_put_block_group(sp->ptr);
}
kfree(sp); kfree(sp);
} }
node = next; node = next;
...@@ -10446,6 +10452,17 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ...@@ -10446,6 +10452,17 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out; goto out;
} }
if (!btrfs_inc_block_group_swap_extents(bg)) {
btrfs_warn(fs_info,
"block group for swapfile at %llu is read-only%s",
bg->start,
atomic_read(&fs_info->scrubs_running) ?
" (scrub running)" : "");
btrfs_put_block_group(bg);
ret = -EINVAL;
goto out;
}
ret = btrfs_add_swapfile_pin(inode, bg, true); ret = btrfs_add_swapfile_pin(inode, bg, true);
if (ret) { if (ret) {
btrfs_put_block_group(bg); btrfs_put_block_group(bg);
......
...@@ -3767,6 +3767,13 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ...@@ -3767,6 +3767,13 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* commit_transactions. * commit_transactions.
*/ */
ro_set = 0; ro_set = 0;
} else if (ret == -ETXTBSY) {
btrfs_warn(fs_info,
"skipping scrub of block group %llu due to active swapfile",
cache->start);
scrub_pause_off(fs_info);
ret = 0;
goto skip_unfreeze;
} else { } else {
btrfs_warn(fs_info, btrfs_warn(fs_info,
"failed setting block group ro: %d", ret); "failed setting block group ro: %d", ret);
...@@ -3862,7 +3869,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ...@@ -3862,7 +3869,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
} else { } else {
spin_unlock(&cache->lock); spin_unlock(&cache->lock);
} }
skip_unfreeze:
btrfs_unfreeze_block_group(cache); btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache); btrfs_put_block_group(cache);
if (ret) if (ret)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment