Commit 75b47033 authored by Qu Wenruo's avatar Qu Wenruo Committed by David Sterba

btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap

Since we have rbio::error_bitmap to indicate exactly where the errors
are (including read error and csum mismatch error), we can make recovery
path more accurate.

For example:

             0        32K       64K
     Data 1  |XXXXXXXX|         |
     Data 2  |        |XXXXXXXXX|
     Parity  |        |         |

1) Get csum mismatch when reading data 1 [0, 32K)

2) Mark corresponding range error
   The old code will mark the whole data 1 stripe as error.
   While the new code will only mark data 1 [0, 32K) as error.

3) Recovery path
   The old code will recover data 1 [0, 64K), all using Data 2 and
   parity.

   This means, Data 1 [32K, 64K) will be corrupted data, as data 2
   [32K, 64K) is already corrupted.

   While the new code will only recover data 1 [0, 32K), as only
   that range has error so far.

This new behavior can avoid populating rbio cache with incorrect data.
Signed-off-by: default avatarQu Wenruo <wqu@suse.com>
Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
parent 2942a50d
...@@ -1013,6 +1013,36 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) ...@@ -1013,6 +1013,36 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
return 0; return 0;
} }
/*
* Return the total numer of errors found in the vertical stripe of @sector_nr.
*
* @faila and @failb will also be updated to the first and second stripe
* number of the errors.
*/
static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
int *faila, int *failb)
{
int stripe_nr;
int found_errors = 0;
ASSERT(faila && failb);
*faila = -1;
*failb = -1;
for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
if (test_bit(total_sector_nr, rbio->error_bitmap)) {
found_errors++;
if (*faila < 0)
*faila = stripe_nr;
else if (*failb < 0)
*failb = stripe_nr;
}
}
return found_errors;
}
/* /*
* Add a single sector @sector into our list of bios for IO. * Add a single sector @sector into our list of bios for IO.
* *
...@@ -1740,14 +1770,15 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) ...@@ -1740,14 +1770,15 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
* @*pointers are the pre-allocated pointers by the caller, so we don't * @*pointers are the pre-allocated pointers by the caller, so we don't
* need to allocate/free the pointers again and again. * need to allocate/free the pointers again and again.
*/ */
static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
void **pointers, void **unmap_array) void **pointers, void **unmap_array)
{ {
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
struct sector_ptr *sector; struct sector_ptr *sector;
const u32 sectorsize = fs_info->sectorsize; const u32 sectorsize = fs_info->sectorsize;
const int faila = rbio->faila; int found_errors;
const int failb = rbio->failb; int faila;
int failb;
int stripe_nr; int stripe_nr;
/* /*
...@@ -1756,7 +1787,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, ...@@ -1756,7 +1787,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
*/ */
if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
!test_bit(sector_nr, &rbio->dbitmap)) !test_bit(sector_nr, &rbio->dbitmap))
return; return 0;
found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
&failb);
/*
* No errors in the veritical stripe, skip it. Can happen for recovery
* which only part of a stripe failed csum check.
*/
if (!found_errors)
return 0;
if (found_errors > rbio->bioc->max_errors)
return -EIO;
/* /*
* Setup our array of pointers with sectors from each stripe * Setup our array of pointers with sectors from each stripe
...@@ -1766,12 +1809,11 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, ...@@ -1766,12 +1809,11 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
*/ */
for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
/* /*
* If we're rebuilding a read, we have to use * If we're rebuilding a read, we have to use pages from the
* pages from the bio list * bio list if possible.
*/ */
if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
(stripe_nr == faila || stripe_nr == failb)) {
sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
} else { } else {
sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
...@@ -1859,18 +1901,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, ...@@ -1859,18 +1901,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
* Especially if we determine to cache the rbio, we need to * Especially if we determine to cache the rbio, we need to
* have at least all data sectors uptodate. * have at least all data sectors uptodate.
*/ */
if (rbio->faila >= 0) { if (faila >= 0) {
sector = rbio_stripe_sector(rbio, rbio->faila, sector_nr); sector = rbio_stripe_sector(rbio, faila, sector_nr);
sector->uptodate = 1; sector->uptodate = 1;
} }
if (rbio->failb >= 0) { if (failb >= 0) {
sector = rbio_stripe_sector(rbio, rbio->failb, sector_nr); sector = rbio_stripe_sector(rbio, failb, sector_nr);
sector->uptodate = 1; sector->uptodate = 1;
} }
cleanup: cleanup:
for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
kunmap_local(unmap_array[stripe_nr]); kunmap_local(unmap_array[stripe_nr]);
return 0;
} }
static int recover_sectors(struct btrfs_raid_bio *rbio) static int recover_sectors(struct btrfs_raid_bio *rbio)
...@@ -1893,10 +1936,6 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) ...@@ -1893,10 +1936,6 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)
goto out; goto out;
} }
/* Make sure faila and fail b are in order. */
if (rbio->faila >= 0 && rbio->failb >= 0 && rbio->faila > rbio->failb)
swap(rbio->faila, rbio->failb);
if (rbio->operation == BTRFS_RBIO_READ_REBUILD || if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
spin_lock_irq(&rbio->bio_list_lock); spin_lock_irq(&rbio->bio_list_lock);
...@@ -1906,8 +1945,11 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) ...@@ -1906,8 +1945,11 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio); index_rbio_pages(rbio);
for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
recover_vertical(rbio, sectornr, pointers, unmap_array); ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
if (ret < 0)
break;
}
out: out:
kfree(pointers); kfree(pointers);
...@@ -1937,13 +1979,21 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, ...@@ -1937,13 +1979,21 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
int sectornr = total_sector_nr % rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors;
struct sector_ptr *sector; struct sector_ptr *sector;
if (rbio->faila == stripe || rbio->failb == stripe) { /*
/* Skip the current stripe. */ * Skip the range which has error. It can be a range which is
ASSERT(sectornr == 0); * marked error (for csum mismatch), or it can be a missing
total_sector_nr += rbio->stripe_nsectors - 1; * device.
atomic_inc(&rbio->error); */
if (!rbio->bioc->stripes[stripe].dev->bdev ||
test_bit(total_sector_nr, rbio->error_bitmap)) {
/*
* Also set the error bit for missing device, which
* may not yet have its error bit set.
*/
set_bit(total_sector_nr, rbio->error_bitmap);
continue; continue;
} }
sector = rbio_stripe_sector(rbio, stripe, sectornr); sector = rbio_stripe_sector(rbio, stripe, sectornr);
ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_READ); sectornr, REQ_OP_READ);
...@@ -1966,9 +2016,8 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) ...@@ -1966,9 +2016,8 @@ static int recover_rbio(struct btrfs_raid_bio *rbio)
/* /*
* Either we're doing recover for a read failure or degraded write, * Either we're doing recover for a read failure or degraded write,
* caller should have set faila/b and error bitmap correctly. * caller should have set error bitmap correctly.
*/ */
ASSERT(rbio->faila >= 0 || rbio->failb >= 0);
ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
bio_list_init(&bio_list); bio_list_init(&bio_list);
...@@ -1992,12 +2041,6 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) ...@@ -1992,12 +2041,6 @@ static int recover_rbio(struct btrfs_raid_bio *rbio)
submit_read_bios(rbio, &bio_list); submit_read_bios(rbio, &bio_list);
wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
/* We have more errors than our tolerance during the read. */
if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
ret = -EIO;
goto out;
}
ret = recover_sectors(rbio); ret = recover_sectors(rbio);
out: out:
...@@ -2032,6 +2075,51 @@ static void recover_rbio_work_locked(struct work_struct *work) ...@@ -2032,6 +2075,51 @@ static void recover_rbio_work_locked(struct work_struct *work)
rbio_orig_end_io(rbio, errno_to_blk_status(ret)); rbio_orig_end_io(rbio, errno_to_blk_status(ret));
} }
static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
{
bool found = false;
int sector_nr;
/*
* This is for RAID6 extra recovery tries, thus mirror number should
* be large than 2.
* Mirror 1 means read from data stripes. Mirror 2 means rebuild using
* RAID5 methods.
*/
ASSERT(mirror_num > 2);
for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
int found_errors;
int faila;
int failb;
found_errors = get_rbio_veritical_errors(rbio, sector_nr,
&faila, &failb);
/* This vertical stripe doesn't have errors. */
if (!found_errors)
continue;
/*
* If we found errors, there should be only one error marked
* by previous set_rbio_range_error().
*/
ASSERT(found_errors == 1);
found = true;
/* Now select another stripe to mark as error. */
failb = rbio->real_stripes - (mirror_num - 1);
if (failb <= faila)
failb--;
/* Set the extra bit in error bitmap. */
if (failb >= 0)
set_bit(failb * rbio->stripe_nsectors + sector_nr,
rbio->error_bitmap);
}
/* We should found at least one vertical stripe with error.*/
ASSERT(found);
}
/* /*
* the main entry point for reads from the higher layers. This * the main entry point for reads from the higher layers. This
* is really only called when the normal read path had a failure, * is really only called when the normal read path had a failure,
...@@ -2074,11 +2162,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, ...@@ -2074,11 +2162,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
* for 'mirror_num > 2', select a stripe to fail on every retry. * for 'mirror_num > 2', select a stripe to fail on every retry.
*/ */
if (mirror_num > 2) { if (mirror_num > 2) {
/* set_rbio_raid6_extra_error(rbio, mirror_num);
* 'mirror == 3' is to fail the p stripe and
* reconstruct from the q stripe. 'mirror > 3' is to
* fail a data stripe and reconstruct from p+q stripe.
*/
rbio->failb = rbio->real_stripes - (mirror_num - 1); rbio->failb = rbio->real_stripes - (mirror_num - 1);
ASSERT(rbio->failb > 0); ASSERT(rbio->failb > 0);
if (rbio->failb <= rbio->faila) if (rbio->failb <= rbio->faila)
...@@ -2507,48 +2591,85 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) ...@@ -2507,48 +2591,85 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{ {
int dfail = 0, failp = -1; void **pointers = NULL;
void **unmap_array = NULL;
int sector_nr;
int ret; int ret;
/* No error case should be already handled by the caller. */ /*
ASSERT(rbio->faila >= 0 || rbio->failb >= 0); * @pointers array stores the pointer for each sector.
*
* @unmap_array stores copy of pointers that does not get reordered
* during reconstruction so that kunmap_local works.
*/
pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
if (!pointers || !unmap_array) {
ret = -ENOMEM;
goto out;
}
for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
int dfail = 0, failp = -1;
int faila;
int failb;
int found_errors;
if (is_data_stripe(rbio, rbio->faila)) found_errors = get_rbio_veritical_errors(rbio, sector_nr,
dfail++; &faila, &failb);
else if (is_parity_stripe(rbio->faila)) if (found_errors > rbio->bioc->max_errors) {
failp = rbio->faila; ret = -EIO;
goto out;
}
if (found_errors == 0)
continue;
/* We should have at least one error here. */
ASSERT(faila >= 0 || failb >= 0);
if (is_data_stripe(rbio, rbio->failb)) if (is_data_stripe(rbio, faila))
dfail++; dfail++;
else if (is_parity_stripe(rbio->failb)) else if (is_parity_stripe(faila))
failp = rbio->failb; failp = faila;
if (is_data_stripe(rbio, failb))
dfail++;
else if (is_parity_stripe(failb))
failp = failb;
/* /*
* Because we can not use a scrubbing parity to repair * Because we can not use a scrubbing parity to repair the
* the data, so the capability of the repair is declined. * data, so the capability of the repair is declined. (In the
* (In the case of RAID5, we can not repair anything) * case of RAID5, we can not repair anything.)
*/ */
if (dfail > rbio->bioc->max_errors - 1) if (dfail > rbio->bioc->max_errors - 1) {
return -EIO; ret = -EIO;
goto out;
}
/* /*
* If all data is good, only parity is correctly, just * If all data is good, only parity is correctly, just repair
* repair the parity. * the parity, no need to recover data stripes.
*/ */
if (dfail == 0) if (dfail == 0)
return 0; continue;
/* /*
* Here means we got one corrupted data stripe and one * Here means we got one corrupted data stripe and one
* corrupted parity on RAID6, if the corrupted parity * corrupted parity on RAID6, if the corrupted parity is
* is scrubbing parity, luckily, use the other one to repair * scrubbing parity, luckily, use the other one to repair the
* the data, or we can not repair the data stripe. * data, or we can not repair the data stripe.
*/ */
if (failp != rbio->scrubp) if (failp != rbio->scrubp) {
return -EIO; ret = -EIO;
goto out;
}
/* We have some corrupted sectors, need to repair them. */ ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
ret = recover_sectors(rbio); if (ret < 0)
goto out;
}
out:
kfree(pointers);
kfree(unmap_array);
return ret; return ret;
} }
...@@ -2624,25 +2745,11 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) ...@@ -2624,25 +2745,11 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio)
submit_read_bios(rbio, &bio_list); submit_read_bios(rbio, &bio_list);
wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { /* We may have some failures, recover the failed sectors first. */
ret = -EIO;
goto cleanup;
}
/*
* No error during read, can finish the scrub and need to verify the
* P/Q sectors;
*/
if (atomic_read(&rbio->error) == 0) {
need_check = true;
goto finish;
}
/* We have some failures, need to recover the failed sectors first. */
ret = recover_scrub_rbio(rbio); ret = recover_scrub_rbio(rbio);
if (ret < 0) if (ret < 0)
goto cleanup; goto cleanup;
finish:
/* /*
* We have every sector properly prepared. Can finish the scrub * We have every sector properly prepared. Can finish the scrub
* and writeback the good content. * and writeback the good content.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment