Commit e9255d6c authored by Qu Wenruo, committed by David Sterba

btrfs: scrub: remove the old scrub recheck code

The old scrub code had a different entry point to verify the content,
and since we have removed the writeback path, we can now start removing
the re-check part, including:

- scrub_recover structure
- scrub_sector::recover member
- function scrub_setup_recheck_block()
- function scrub_recheck_block()
- function scrub_recheck_block_checksum()
- function scrub_repair_block_from_good_copy()
- function scrub_repair_sector_from_good_copy()
- function scrub_is_page_on_raid56()

- full_stripe_lock structure
- function search_full_stripe_lock()
- function get_full_stripe_logical()
- function insert_full_stripe_lock()
- function lock_full_stripe()
- function unlock_full_stripe()
- btrfs_block_group::full_stripe_locks_root member
- btrfs_full_stripe_locks_tree structure
  This infrastructure was used to ensure that RAID56 scrub handled
  recovery and P/Q scrub correctly (the removed locking pattern is
  sketched after this list).

  This is no longer needed: before the P/Q scrub we wait for all the
  involved data stripes to be scrubbed first, and the RAID56 code has
  an internal lock to ensure there is no race within the same full
  stripe.

- function scrub_print_warning()
- function scrub_get_recover()
- function scrub_put_recover()
- function scrub_handle_errored_block()
- function scrub_bio_wait_endio()
- function scrub_submit_raid56_bio_wait()
- function scrub_recheck_block_on_raid56()

And two more functions are exported temporarily for a later cleanup:

- alloc_scrub_sector()
- alloc_scrub_block()
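
For context, the removed full stripe lock was used by
scrub_handle_errored_block() roughly as in the sketch below (a
simplified reconstruction from the removed code in this patch;
statistics updates and the NOFS handling are left out):

	bool full_stripe_locked;
	int ret;

	/*
	 * Serialize repair within one RAID56 full stripe (a no-op for
	 * non-RAID56 block groups).
	 */
	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
	if (ret < 0)
		return ret;

	/* ... re-read mirrors, verify checksums, rewrite bad sectors ... */

	return unlock_full_stripe(fs_info, logical, full_stripe_locked);
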
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 16f93993
@@ -160,15 +160,6 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
cache);
/*
* If not empty, someone is still holding mutex of
* full_stripe_lock, which can only be released by caller.
* And it will definitely cause use-after-free when caller
* tries to release full stripe lock.
*
* No better way to resolve, but only to warn.
*/
WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
kfree(cache->free_space_ctl);
kfree(cache->physical_map);
kfree(cache);
@@ -2124,8 +2115,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
atomic_set(&cache->frozen, 0);
mutex_init(&cache->free_space_lock);
cache->full_stripe_locks_root.root = RB_ROOT;
mutex_init(&cache->full_stripe_locks_root.lock);
return cache;
}
......
@@ -91,14 +91,6 @@ struct btrfs_caching_control {
/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M
/*
* Tree to record all locked full stripes of a RAID5/6 block group
*/
struct btrfs_full_stripe_locks_tree {
struct rb_root root;
struct mutex lock;
};
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct inode *inode;
@@ -229,9 +221,6 @@ struct btrfs_block_group {
*/
int swap_extents;
/* Record locked full stripes for RAID5/6 block group */
struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
/*
* Allocation offset for the block group to implement sequential
* allocation. This is used only on a zoned filesystem.
......
@@ -183,12 +183,6 @@ struct scrub_stripe {
struct work_struct work;
};
struct scrub_recover {
refcount_t refs;
struct btrfs_io_context *bioc;
u64 map_length;
};
struct scrub_sector {
struct scrub_block *sblock;
struct list_head list;
@@ -200,8 +194,6 @@ struct scrub_sector {
unsigned int have_csum:1;
unsigned int io_error:1;
u8 csum[BTRFS_CSUM_SIZE];
struct scrub_recover *recover;
};
struct scrub_bio {
@@ -303,13 +295,6 @@ struct scrub_warning {
struct btrfs_device *dev;
};
struct full_stripe_lock {
struct rb_node node;
u64 logical;
u64 refs;
struct mutex mutex;
};
#ifndef CONFIG_64BIT
/* This structure is for architectures whose (void *) is smaller than u64 */
struct scrub_page_private {
@@ -406,11 +391,11 @@ static void wait_scrub_stripe_io(struct scrub_stripe *stripe)
wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0);
}
static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
struct btrfs_device *dev,
u64 logical, u64 physical,
u64 physical_for_dev_replace,
int mirror_num)
struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
struct btrfs_device *dev,
u64 logical, u64 physical,
u64 physical_for_dev_replace,
int mirror_num)
{
struct scrub_block *sblock;
@@ -437,8 +422,7 @@ static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
*
* Will also allocate new pages for @sblock if needed.
*/
static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
u64 logical)
struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, u64 logical)
{
const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
struct scrub_sector *ssector;
@@ -534,17 +518,6 @@ static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector,
scrub_sector_get_page_offset(ssector));
}
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck[]);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int sector_num, int force_write);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
@@ -555,12 +528,6 @@ static void scrub_bio_end_io_worker(struct work_struct *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_put_ctx(struct scrub_ctx *sctx);
static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
{
return sector->recover &&
(sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
refcount_inc(&sctx->refs);
@@ -606,223 +573,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
scrub_pause_off(fs_info);
}
/*
* Insert new full stripe lock into full stripe locks tree
*
* Return pointer to existing or newly inserted full_stripe_lock structure if
* everything works well.
* Return ERR_PTR(-ENOMEM) if we failed to allocate memory
*
* NOTE: caller must hold full_stripe_locks_root->lock before calling this
* function
*/
static struct full_stripe_lock *insert_full_stripe_lock(
struct btrfs_full_stripe_locks_tree *locks_root,
u64 fstripe_logical)
{
struct rb_node **p;
struct rb_node *parent = NULL;
struct full_stripe_lock *entry;
struct full_stripe_lock *ret;
lockdep_assert_held(&locks_root->lock);
p = &locks_root->root.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct full_stripe_lock, node);
if (fstripe_logical < entry->logical) {
p = &(*p)->rb_left;
} else if (fstripe_logical > entry->logical) {
p = &(*p)->rb_right;
} else {
entry->refs++;
return entry;
}
}
/*
* Insert new lock.
*/
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return ERR_PTR(-ENOMEM);
ret->logical = fstripe_logical;
ret->refs = 1;
mutex_init(&ret->mutex);
rb_link_node(&ret->node, parent, p);
rb_insert_color(&ret->node, &locks_root->root);
return ret;
}
/*
* Search for a full stripe lock of a block group
*
* Return pointer to existing full stripe lock if found
* Return NULL if not found
*/
static struct full_stripe_lock *search_full_stripe_lock(
struct btrfs_full_stripe_locks_tree *locks_root,
u64 fstripe_logical)
{
struct rb_node *node;
struct full_stripe_lock *entry;
lockdep_assert_held(&locks_root->lock);
node = locks_root->root.rb_node;
while (node) {
entry = rb_entry(node, struct full_stripe_lock, node);
if (fstripe_logical < entry->logical)
node = node->rb_left;
else if (fstripe_logical > entry->logical)
node = node->rb_right;
else
return entry;
}
return NULL;
}
/*
* Helper to get full stripe logical from a normal bytenr.
*
* Caller must ensure @cache is a RAID56 block group.
*/
static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
{
u64 ret;
/*
* Due to chunk item size limit, full stripe length should not be
* larger than U32_MAX. Just a sanity check here.
*/
WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
/*
* round_down() can only handle power of 2, while RAID56 full
* stripe length can be 64KiB * n, so we need to manually round down.
*/
ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
cache->full_stripe_len + cache->start;
return ret;
}
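/*
 * Worked example (illustrative only, not part of the original code),
 * assuming a RAID5 block group whose full stripe covers three 64KiB
 * data stripes:
 *
 *   cache->start           = 1MiB
 *   cache->full_stripe_len = 3 * 64KiB = 192KiB
 *   bytenr                 = cache->start + 500KiB
 *
 * 192KiB is not a power of two, hence the manual round down above:
 *
 *   div64_u64(500KiB, 192KiB) = 2
 *   full stripe logical       = cache->start + 2 * 192KiB
 *                             = cache->start + 384KiB
 */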
/*
* Lock a full stripe to avoid concurrency of recovery and read
*
* It's only used for profiles with parities (RAID5/6), for other profiles it
* does nothing.
*
* Return 0 if we locked full stripe covering @bytenr, with a mutex held.
* So caller must call unlock_full_stripe() at the same context.
*
* Return <0 if encounters error.
*/
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool *locked_ret)
{
struct btrfs_block_group *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *existing;
u64 fstripe_start;
int ret = 0;
*locked_ret = false;
bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg_cache) {
ASSERT(0);
return -ENOENT;
}
/* Profiles not based on parity don't need full stripe lock */
if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
goto out;
locks_root = &bg_cache->full_stripe_locks_root;
fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
/* Now insert the full stripe lock */
mutex_lock(&locks_root->lock);
existing = insert_full_stripe_lock(locks_root, fstripe_start);
mutex_unlock(&locks_root->lock);
if (IS_ERR(existing)) {
ret = PTR_ERR(existing);
goto out;
}
mutex_lock(&existing->mutex);
*locked_ret = true;
out:
btrfs_put_block_group(bg_cache);
return ret;
}
/*
* Unlock a full stripe.
*
* NOTE: Caller must ensure it's the same context calling corresponding
* lock_full_stripe().
*
* Return 0 if we unlock full stripe without problem.
* Return <0 for error
*/
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
bool locked)
{
struct btrfs_block_group *bg_cache;
struct btrfs_full_stripe_locks_tree *locks_root;
struct full_stripe_lock *fstripe_lock;
u64 fstripe_start;
bool freeit = false;
int ret = 0;
/* If we didn't acquire full stripe lock, no need to continue */
if (!locked)
return 0;
bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg_cache) {
ASSERT(0);
return -ENOENT;
}
if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
goto out;
locks_root = &bg_cache->full_stripe_locks_root;
fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
mutex_lock(&locks_root->lock);
fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
/* Unpaired unlock_full_stripe() detected */
if (!fstripe_lock) {
WARN_ON(1);
ret = -ENOENT;
mutex_unlock(&locks_root->lock);
goto out;
}
if (fstripe_lock->refs == 0) {
WARN_ON(1);
btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
fstripe_lock->logical);
} else {
fstripe_lock->refs--;
}
if (fstripe_lock->refs == 0) {
rb_erase(&fstripe_lock->node, &locks_root->root);
freeit = true;
}
mutex_unlock(&locks_root->lock);
mutex_unlock(&fstripe_lock->mutex);
if (freeit)
kfree(fstripe_lock);
out:
btrfs_put_block_group(bg_cache);
return ret;
}
static void scrub_free_csums(struct scrub_ctx *sctx)
{
while (!list_empty(&sctx->csum_list)) {
@@ -1101,444 +851,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
btrfs_free_path(path);
}
static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
scrub_print_common_warning(errstr, sblock->dev,
sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER,
sblock->logical, sblock->physical);
}
static inline void scrub_get_recover(struct scrub_recover *recover)
{
refcount_inc(&recover->refs);
}
static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
struct scrub_recover *recover)
{
if (refcount_dec_and_test(&recover->refs)) {
btrfs_bio_counter_dec(fs_info);
btrfs_put_bioc(recover->bioc);
kfree(recover);
}
}
/*
* scrub_handle_errored_block gets called when either verification of the
* sectors failed or the bio failed to read, e.g. with EIO. In the latter
* case, this function handles all sectors in the bio, even though only one
* may be bad.
* The goal of this function is to repair the errored block by using the
* contents of one of the mirrors.
*/
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
struct scrub_ctx *sctx = sblock_to_check->sctx;
struct btrfs_device *dev = sblock_to_check->dev;
struct btrfs_fs_info *fs_info;
u64 logical;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
/* One scrub_block for each mirror */
struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 };
struct scrub_block *sblock_bad;
int ret;
int mirror_index;
int sector_num;
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
BUG_ON(sblock_to_check->sector_count < 1);
fs_info = sctx->fs_info;
if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
/*
* If we find an error in a super block, we just report it.
* They will get written with the next transaction commit
* anyway
*/
scrub_print_warning("super block error", sblock_to_check);
spin_lock(&sctx->stat_lock);
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
return 0;
}
logical = sblock_to_check->logical;
ASSERT(sblock_to_check->mirror_num);
failed_mirror_index = sblock_to_check->mirror_num - 1;
is_metadata = !(sblock_to_check->sectors[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
have_csum = sblock_to_check->sectors[0]->have_csum;
if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
return 0;
/*
* We must use GFP_NOFS because the scrub task might be waiting for a
* worker task executing this function and in turn a transaction commit
* might be waiting the scrub task to pause (which needs to wait for all
* the worker tasks to complete before pausing).
* We do allocations in the workers through insert_full_stripe_lock()
* and scrub_add_sector_to_wr_bio(), which happens down the call chain of
* this function.
*/
nofs_flag = memalloc_nofs_save();
/*
* For RAID5/6, race can happen for a different device scrub thread.
* For data corruption, Parity and Data threads will both try
* to recovery the data.
* Race can lead to doubly added csum error, or even unrecoverable
* error.
*/
ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
if (ret < 0) {
memalloc_nofs_restore(nofs_flag);
spin_lock(&sctx->stat_lock);
if (ret == -ENOMEM)
sctx->stat.malloc_errors++;
sctx->stat.read_errors++;
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
return ret;
}
/*
* read all mirrors one after the other. This includes to
* re-read the extent or metadata block that failed (that was
* the cause that this fixup code is called) another time,
* sector by sector this time in order to know which sectors
* caused I/O errors and which ones are good (for all mirrors).
* It is the goal to handle the situation when more than one
* mirror contains I/O errors, but the errors do not
* overlap, i.e. the data can be repaired by selecting the
* sectors from those mirrors without I/O error on the
* particular sectors. One example (with blocks >= 2 * sectorsize)
* would be that mirror #1 has an I/O error on the first sector,
* the second sector is good, and mirror #2 has an I/O error on
* the second sector, but the first sector is good.
* Then the first sector of the first mirror can be repaired by
* taking the first sector of the second mirror, and the
* second sector of the second mirror can be repaired by
* copying the contents of the 2nd sector of the 1st mirror.
* One more note: if the sectors of one mirror contain I/O
* errors, the checksum cannot be verified. In order to get
* the best data for repairing, the first attempt is to find
* a mirror without I/O errors and with a validated checksum.
* Only if this is not possible, the sectors are picked from
* mirrors with I/O errors without considering the checksum.
* If the latter is the case, at the end, the checksum of the
* repaired area is verified in order to correctly maintain
* the statistics.
*/
for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
/*
* Note: the two members refs and outstanding_sectors are not
* used in the blocks that are used for the recheck procedure.
*
* But alloc_scrub_block() will initialize sblock::ref anyway,
* so we can use scrub_block_put() to clean them up.
*
* And here we don't setup the physical/dev for the sblock yet,
* they will be correctly initialized in scrub_setup_recheck_block().
*/
sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL,
logical, 0, 0, mirror_index);
if (!sblocks_for_recheck[mirror_index]) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
sctx->stat.read_errors++;
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
}
/* Setup the context, map the logical blocks and alloc the sectors */
ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
if (ret) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
goto out;
}
BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
sblock_bad = sblocks_for_recheck[failed_mirror_index];
/* build and submit the bios for the failed mirror, check checksums */
scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
/*
* The error disappeared after reading sector by sector, or
* the area was part of a huge bio and other parts of the
* bio caused I/O errors, or the block layer merged several
* read requests into one and the error is caused by a
* different bio (usually one of the two latter cases is
* the cause)
*/
spin_lock(&sctx->stat_lock);
sctx->stat.unverified_errors++;
sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
goto out;
}
if (!sblock_bad->no_io_error_seen) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
if (__ratelimit(&rs))
scrub_print_warning("i/o error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.csum_errors++;
spin_unlock(&sctx->stat_lock);
if (__ratelimit(&rs))
scrub_print_warning("checksum error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
} else if (sblock_bad->header_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.verify_errors++;
spin_unlock(&sctx->stat_lock);
if (__ratelimit(&rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
if (sblock_bad->generation_error)
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
else
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
}
if (sctx->readonly) {
ASSERT(!sctx->is_dev_replace);
goto out;
}
/*
* now build and submit the bios for the other mirrors, check
* checksums.
* First try to pick the mirror which is completely without I/O
* errors and also does not have a checksum error.
* If one is found, and if a checksum is present, the full block
* that is known to contain an error is rewritten. Afterwards
* the block is known to be corrected.
* If a mirror is found which is completely correct, and no
* checksum is present, only those sectors are rewritten that had
* an I/O error in the block to be repaired, since it cannot be
* determined, which copy of the other sectors is better (and it
* could happen otherwise that a correct sector would be
* overwritten by a bad one).
*/
for (mirror_index = 0; ;mirror_index++) {
struct scrub_block *sblock_other;
if (mirror_index == failed_mirror_index)
continue;
/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
if (mirror_index >= BTRFS_MAX_MIRRORS)
break;
if (!sblocks_for_recheck[mirror_index]->sector_count)
break;
sblock_other = sblocks_for_recheck[mirror_index];
} else {
struct scrub_recover *r = sblock_bad->sectors[0]->recover;
int max_allowed = r->bioc->num_stripes - r->bioc->replace_nr_stripes;
if (mirror_index >= max_allowed)
break;
if (!sblocks_for_recheck[1]->sector_count)
break;
ASSERT(failed_mirror_index == 0);
sblock_other = sblocks_for_recheck[1];
sblock_other->mirror_num = 1 + mirror_index;
}
/* build and submit the bios, check checksums */
scrub_recheck_block(fs_info, sblock_other, 0);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
sblock_other->no_io_error_seen) {
if (sctx->is_dev_replace) {
goto corrected_error;
} else {
ret = scrub_repair_block_from_good_copy(
sblock_bad, sblock_other);
if (!ret)
goto corrected_error;
}
}
}
if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
goto did_not_correct_error;
/*
* In case of I/O errors in the area that is supposed to be
* repaired, continue by picking good copies of those sectors.
* Select the good sectors from mirrors to rewrite bad sectors from
* the area to fix. Afterwards verify the checksum of the block
* that is supposed to be repaired. This verification step is
* only done for the purpose of statistic counting and for the
* final scrub report, whether errors remain.
* A perfect algorithm could make use of the checksum and try
* all possible combinations of sectors from the different mirrors
* until the checksum verification succeeds. For example, when
* the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
* of mirror #2 is readable but the final checksum test fails,
* then the 2nd sector of mirror #3 could be tried, whether now
* the final checksum succeeds. But this would be a rare
* exception and is therefore not implemented. At least it is
* avoided that the good copy is overwritten.
* A more useful improvement would be to pick the sectors
* without I/O error based on sector sizes (512 bytes on legacy
* disks) instead of on sectorsize. Then maybe 512 byte of one
* mirror could be repaired by taking 512 byte of a different
* mirror, even if other 512 byte sectors in the same sectorsize
* area are unreadable.
*/
success = 1;
for (sector_num = 0; sector_num < sblock_bad->sector_count;
sector_num++) {
struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
struct scrub_block *sblock_other = NULL;
/* Skip no-io-error sectors in scrub */
if (!sector_bad->io_error && !sctx->is_dev_replace)
continue;
if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
/*
* In case of dev replace, if raid56 rebuild process
* didn't work out correct data, then copy the content
* in sblock_bad to make sure target device is identical
* to source device, instead of writing garbage data in
* sblock_for_recheck array to target device.
*/
sblock_other = NULL;
} else if (sector_bad->io_error) {
/* Try to find no-io-error sector in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
sblocks_for_recheck[mirror_index]->sector_count > 0;
mirror_index++) {
if (!sblocks_for_recheck[mirror_index]->
sectors[sector_num]->io_error) {
sblock_other = sblocks_for_recheck[mirror_index];
break;
}
}
if (!sblock_other)
success = 0;
}
if (sctx->is_dev_replace) {
/*
* Did not find a mirror to fetch the sector from.
* scrub_write_sector_to_dev_replace() handles this
* case (sector->io_error), by filling the block with
* zeros before submitting the write request
*/
if (!sblock_other)
sblock_other = sblock_bad;
} else if (sblock_other) {
ret = scrub_repair_sector_from_good_copy(sblock_bad,
sblock_other,
sector_num, 0);
if (0 == ret)
sector_bad->io_error = 0;
else
success = 0;
}
}
if (success && !sctx->is_dev_replace) {
if (is_metadata || have_csum) {
/*
* need to verify the checksum now that all
* sectors on disk are repaired (the write
* request for data to be repaired is on its way).
* Just be lazy and use scrub_recheck_block()
* which re-reads the data before the checksum
* is verified, but most likely the data comes out
* of the page cache.
*/
scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
goto corrected_error;
else
goto did_not_correct_error;
} else {
corrected_error:
spin_lock(&sctx->stat_lock);
sctx->stat.corrected_errors++;
sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"fixed up error at logical %llu on dev %s",
logical, btrfs_dev_name(dev));
}
} else {
did_not_correct_error:
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"unable to fixup (regular) error at logical %llu on dev %s",
logical, btrfs_dev_name(dev));
}
out:
for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) {
struct scrub_block *sblock = sblocks_for_recheck[mirror_index];
struct scrub_recover *recover;
int sector_index;
/* Not allocated, continue checking the next mirror */
if (!sblock)
continue;
for (sector_index = 0; sector_index < sblock->sector_count;
sector_index++) {
/*
* Here we just cleanup the recover, each sector will be
* properly cleaned up by later scrub_block_put()
*/
recover = sblock->sectors[sector_index]->recover;
if (recover) {
scrub_put_recover(fs_info, recover);
sblock->sectors[sector_index]->recover = NULL;
}
}
scrub_block_put(sblock);
}
ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
memalloc_nofs_restore(nofs_flag);
if (ret < 0)
return ret;
return 0;
}
static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
{
if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
@@ -1581,224 +893,6 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
}
}
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck[])
{
struct scrub_ctx *sctx = original_sblock->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
u64 logical = original_sblock->logical;
u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
u64 generation = original_sblock->sectors[0]->generation;
u64 flags = original_sblock->sectors[0]->flags;
u64 have_csum = original_sblock->sectors[0]->have_csum;
struct scrub_recover *recover;
struct btrfs_io_context *bioc;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
int sector_index = 0;
int mirror_index;
int nmirrors;
int ret;
while (length > 0) {
sublen = min_t(u64, length, fs_info->sectorsize);
mapped_length = sublen;
bioc = NULL;
/*
* With a length of sectorsize, each returned stripe represents
* one mirror
*/
btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
logical, &mapped_length, &bioc);
if (ret || !bioc || mapped_length < sublen) {
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL);
if (!recover) {
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
return -ENOMEM;
}
refcount_set(&recover->refs, 1);
recover->bioc = bioc;
recover->map_length = mapped_length;
ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
struct scrub_sector *sector;
sblock = sblocks_for_recheck[mirror_index];
sblock->sctx = sctx;
sector = alloc_scrub_sector(sblock, logical);
if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
scrub_put_recover(fs_info, recover);
return -ENOMEM;
}
sector->flags = flags;
sector->generation = generation;
sector->have_csum = have_csum;
if (have_csum)
memcpy(sector->csum,
original_sblock->sectors[0]->csum,
sctx->fs_info->csum_size);
scrub_stripe_index_and_offset(logical,
bioc->map_type,
bioc->full_stripe_logical,
bioc->num_stripes -
bioc->replace_nr_stripes,
mirror_index,
&stripe_index,
&stripe_offset);
/*
* We're at the first sector, also populate @sblock
* physical and dev.
*/
if (sector_index == 0) {
sblock->physical =
bioc->stripes[stripe_index].physical +
stripe_offset;
sblock->dev = bioc->stripes[stripe_index].dev;
sblock->physical_for_dev_replace =
original_sblock->physical_for_dev_replace;
}
BUG_ON(sector_index >= original_sblock->sector_count);
scrub_get_recover(recover);
sector->recover = recover;
}
scrub_put_recover(fs_info, recover);
length -= sublen;
logical += sublen;
sector_index++;
}
return 0;
}
static void scrub_bio_wait_endio(struct bio *bio)
{
complete(bio->bi_private);
}
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
struct bio *bio,
struct scrub_sector *sector)
{
DECLARE_COMPLETION_ONSTACK(done);
bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >>
SECTOR_SHIFT;
bio->bi_private = &done;
bio->bi_end_io = scrub_bio_wait_endio;
raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num);
wait_for_completion_io(&done);
return blk_status_to_errno(bio->bi_status);
}
static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock)
{
struct scrub_sector *first_sector = sblock->sectors[0];
struct bio *bio;
int i;
/* All sectors in sblock belong to the same stripe on the same device. */
ASSERT(sblock->dev);
if (!sblock->dev->bdev)
goto out;
bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
for (i = 0; i < sblock->sector_count; i++) {
struct scrub_sector *sector = sblock->sectors[i];
bio_add_scrub_sector(bio, sector, fs_info->sectorsize);
}
if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
bio_put(bio);
goto out;
}
bio_put(bio);
scrub_recheck_block_checksum(sblock);
return;
out:
for (i = 0; i < sblock->sector_count; i++)
sblock->sectors[i]->io_error = 1;
sblock->no_io_error_seen = 0;
}
/*
* This function will check the on disk data for checksum errors, header errors
* and read I/O errors. If any I/O errors happen, the exact sectors which are
* errored are marked as being bad. The goal is to enable scrub to take those
* sectors that are not errored from all the mirrors so that the sectors that
* are errored in the just handled mirror can be repaired.
*/
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock,
int retry_failed_mirror)
{
int i;
sblock->no_io_error_seen = 1;
/* short cut for raid56 */
if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
return scrub_recheck_block_on_raid56(fs_info, sblock);
for (i = 0; i < sblock->sector_count; i++) {
struct scrub_sector *sector = sblock->sectors[i];
struct bio bio;
struct bio_vec bvec;
if (sblock->dev->bdev == NULL) {
sector->io_error = 1;
sblock->no_io_error_seen = 0;
continue;
}
bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ);
bio_add_scrub_sector(&bio, sector, fs_info->sectorsize);
bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >>
SECTOR_SHIFT;
btrfsic_check_bio(&bio);
if (submit_bio_wait(&bio)) {
sector->io_error = 1;
sblock->no_io_error_seen = 0;
}
bio_uninit(&bio);
}
if (sblock->no_io_error_seen)
scrub_recheck_block_checksum(sblock);
}
static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
{
struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices;
@@ -1808,77 +902,6 @@ static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
return !ret;
}
static void scrub_recheck_block_checksum(struct scrub_block *sblock)
{
sblock->header_error = 0;
sblock->checksum_error = 0;
sblock->generation_error = 0;
if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
scrub_checksum_data(sblock);
else
scrub_checksum_tree_block(sblock);
}
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good)
{
int i;
int ret = 0;
for (i = 0; i < sblock_bad->sector_count; i++) {
int ret_sub;
ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
sblock_good, i, 1);
if (ret_sub)
ret = ret_sub;
}
return ret;
}
static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int sector_num, int force_write)
{
struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
const u32 sectorsize = fs_info->sectorsize;
if (force_write || sblock_bad->header_error ||
sblock_bad->checksum_error || sector_bad->io_error) {
struct bio bio;
struct bio_vec bvec;
int ret;
if (!sblock_bad->dev->bdev) {
btrfs_warn_rl(fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
return -EIO;
}
bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
bio.bi_iter.bi_sector = (sblock_bad->physical +
sector_bad->offset) >> SECTOR_SHIFT;
ret = bio_add_scrub_sector(&bio, sector_good, sectorsize);
btrfsic_check_bio(&bio);
ret = submit_bio_wait(&bio);
bio_uninit(&bio);
if (ret) {
btrfs_dev_stat_inc_and_print(sblock_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
atomic64_inc(&fs_info->dev_replace.num_write_errors);
return -EIO;
}
}
return 0;
}
static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
{
int ret = 0;
@@ -1934,9 +957,6 @@ static int scrub_checksum(struct scrub_block *sblock)
ret = scrub_checksum_super(sblock);
else
WARN_ON(1);
if (ret)
scrub_handle_errored_block(sblock);
return ret;
}
@@ -2916,16 +1936,13 @@ static void scrub_bio_end_io_worker(struct work_struct *work)
static void scrub_block_complete(struct scrub_block *sblock)
{
if (!sblock->no_io_error_seen) {
scrub_handle_errored_block(sblock);
} else {
if (sblock->no_io_error_seen)
/*
* if has checksum error, write via repair mechanism in
* dev replace case, otherwise write here in dev replace
* case.
*/
scrub_checksum(sblock);
}
}
static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
......
@@ -16,9 +16,16 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
/* Temporary declaration, would be deleted later. */
struct scrub_ctx;
struct scrub_sector;
struct scrub_block;
int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum);
int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
struct scrub_sector *sector);
void scrub_sector_get(struct scrub_sector *sector);
struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, u64 logical);
struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
struct btrfs_device *dev,
u64 logical, u64 physical,
u64 physical_for_dev_replace,
int mirror_num);
#endif