Commit ca65b73b authored by NeilBrown, committed by Linus Torvalds

[PATCH] md: fix raid6 resync check/repair code

raid6 currently does not check the P/Q syndromes when doing a resync; it just
calculates the correct values and writes them.  Doing the check can reduce writes
(often to 0) for a resync, and it is needed to properly implement the

  echo check > sync_action

operation.
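
For illustration only (stand-alone user-space C; every name below is invented and
this is not the kernel code): the check amounts to recomputing the expected parity,
comparing it with what is already on disk, and only scheduling a write when the two
differ - and even then only when repairing, not when merely checking.

/*
 * Sketch of check-then-repair for the P block.  The real code also maintains
 * the Q syndrome; names (stripe_sketch, BLOCK_BYTES, ...) are made up.
 */
#include <string.h>

#define BLOCK_BYTES 4096
#define NDATA       2                       /* number of data blocks, kept tiny */

struct stripe_sketch {
	unsigned char data[NDATA][BLOCK_BYTES];
	unsigned char p_on_disk[BLOCK_BYTES];   /* parity as read from disk */
	unsigned long mismatches;               /* counts like resync_mismatches */
};

/* P is the XOR of the data blocks. */
static void compute_p(const struct stripe_sketch *s, unsigned char *out)
{
	memset(out, 0, BLOCK_BYTES);
	for (int d = 0; d < NDATA; d++)
		for (int i = 0; i < BLOCK_BYTES; i++)
			out[i] ^= s->data[d][i];
}

/* Returns 1 if a parity write was scheduled, 0 if the stripe needed nothing. */
static int check_or_repair_p(struct stripe_sketch *s, int check_only)
{
	unsigned char expected[BLOCK_BYTES];

	compute_p(s, expected);
	if (memcmp(expected, s->p_on_disk, BLOCK_BYTES) == 0)
		return 0;                       /* already in sync: no write at all */
	s->mismatches++;
	if (check_only)
		return 0;                       /* 'check': report only, don't touch disk */
	memcpy(s->p_on_disk, expected, BLOCK_BYTES); /* 'repair'/resync: write it back */
	return 1;
}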

This patch implements the appropriate checks and tidies up some related code.

It also allows a user-requested raid6 resync to bypass the intent bitmap.
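
The bypass is effectively one extra condition in the skip test: the intent bitmap
may only short-circuit a stripe when the resync was not explicitly requested through
sync_action.  A rough sketch of that decision (the helper and its parameters are
invented for illustration; the real test lives in sync_request() and uses
bitmap_start_sync() and MD_RECOVERY_REQUESTED):

/* Sketch only: may this stripe be skipped during resync? */
static int can_skip_block(int bitmap_clean, int user_requested,
			  int fullsync, int whole_stripe)
{
	/* A user-requested check/repair must look at every stripe,
	 * so the intent bitmap is not allowed to short-circuit it. */
	return bitmap_clean && !user_requested && !fullsync && whole_stripe;
}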
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 6cce3b23
@@ -805,7 +805,7 @@ static void compute_parity(struct stripe_head *sh, int method)
 }
 
 /* Compute one missing block */
-static void compute_block_1(struct stripe_head *sh, int dd_idx)
+static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 {
 	raid6_conf_t *conf = sh->raid_conf;
 	int i, count, disks = conf->raid_disks;
@@ -821,7 +821,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
 		compute_parity(sh, UPDATE_PARITY);
 	} else {
 		ptr[0] = page_address(sh->dev[dd_idx].page);
-		memset(ptr[0], 0, STRIPE_SIZE);
+		if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
 		count = 1;
 		for (i = disks ; i--; ) {
 			if (i == dd_idx || i == qd_idx)
@@ -838,7 +838,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
 		}
 		if (count != 1)
 			xor_block(count, STRIPE_SIZE, ptr);
-		set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
+		if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
+		else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
 	}
 }
@@ -871,7 +872,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 		return;
 	} else {
 		/* We're missing D+Q; recompute D from P */
-		compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1);
+		compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
 		compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
 		return;
 	}
@@ -982,6 +983,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 }
 
+static int page_is_zero(struct page *p)
+{
+	char *a = page_address(p);
+	return ((*(u32*)a) == 0 &&
+		memcmp(a, a+4, STRIPE_SIZE-4)==0);
+}
 
 /*
  * handle_stripe - do things to a stripe.
  *
@@ -1000,7 +1007,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
  *
  */
-static void handle_stripe(struct stripe_head *sh)
+static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
 {
 	raid6_conf_t *conf = sh->raid_conf;
 	int disks = conf->raid_disks;
@@ -1228,7 +1235,7 @@ static void handle_stripe(struct stripe_head *sh)
 			if (uptodate == disks-1) {
 				PRINTK("Computing stripe %llu block %d\n",
 				       (unsigned long long)sh->sector, i);
-				compute_block_1(sh, i);
+				compute_block_1(sh, i, 0);
 				uptodate++;
 			} else if ( uptodate == disks-2 && failed >= 2 ) {
 				/* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
@@ -1323,7 +1330,7 @@ static void handle_stripe(struct stripe_head *sh)
 		/* We have failed blocks and need to compute them */
 		switch ( failed ) {
 		case 0:	BUG();
-		case 1:	compute_block_1(sh, failed_num[0]); break;
+		case 1:	compute_block_1(sh, failed_num[0], 0); break;
 		case 2:	compute_block_2(sh, failed_num[0], failed_num[1]); break;
 		default: BUG();	/* This request should have been failed? */
 		}
@@ -1338,12 +1345,10 @@ static void handle_stripe(struct stripe_head *sh)
 				 (unsigned long long)sh->sector, i);
 			locked++;
 			set_bit(R5_Wantwrite, &sh->dev[i].flags);
-#if 0 /**** FIX: I don't understand the logic here... ****/
-			if (!test_bit(R5_Insync, &sh->dev[i].flags)
-			    || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */
-				set_bit(STRIPE_INSYNC, &sh->state);
-#endif
 		}
+		/* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
+		set_bit(STRIPE_INSYNC, &sh->state);
 		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 			atomic_dec(&conf->preread_active_stripes);
 			if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -1356,79 +1361,97 @@ static void handle_stripe(struct stripe_head *sh)
 	 * Any reads will already have been scheduled, so we just see if enough data
 	 * is available
 	 */
-	if (syncing && locked == 0 &&
-	    !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) {
+	if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
+		int update_p = 0, update_q = 0;
+		struct r5dev *dev;
+
 		set_bit(STRIPE_HANDLE, &sh->state);
-#if 0 /* RAID-6: Don't support CHECK PARITY yet */
-		if (failed == 0) {
-			char *pagea;
-			if (uptodate != disks)
-				BUG();
-			compute_parity(sh, CHECK_PARITY);
-			uptodate--;
-			pagea = page_address(sh->dev[pd_idx].page);
-			if ((*(u32*)pagea) == 0 &&
-			    !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
-				/* parity is correct (on disc, not in buffer any more) */
-				set_bit(STRIPE_INSYNC, &sh->state);
-			}
-		}
-#endif
-		if (!test_bit(STRIPE_INSYNC, &sh->state)) {
-			int failed_needupdate[2];
-			struct r5dev *adev, *bdev;
-
-			if ( failed < 1 )
-				failed_num[0] = pd_idx;
-			if ( failed < 2 )
-				failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx;
-
-			failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags);
-			failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags);
-
-			PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n",
-			       failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]);
-
-#if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */
-			/* should be able to compute the missing block(s) and write to spare */
-			if ( failed_needupdate[0] ^ failed_needupdate[1] ) {
-				if (uptodate+1 != disks)
-					BUG();
-				compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]);
-				uptodate++;
-			} else if ( failed_needupdate[0] & failed_needupdate[1] ) {
-				if (uptodate+2 != disks)
-					BUG();
-				compute_block_2(sh, failed_num[0], failed_num[1]);
-				uptodate += 2;
-			}
-#else
-			compute_block_2(sh, failed_num[0], failed_num[1]);
-			uptodate += failed_needupdate[0] + failed_needupdate[1];
-#endif
-			if (uptodate != disks)
-				BUG();
-
-			PRINTK("Marking for sync stripe %llu blocks %d,%d\n",
-			       (unsigned long long)sh->sector, failed_num[0], failed_num[1]);
-
-			/**** FIX: Should we really do both of these unconditionally? ****/
-			adev = &sh->dev[failed_num[0]];
-			locked += !test_bit(R5_LOCKED, &adev->flags);
-			set_bit(R5_LOCKED, &adev->flags);
-			set_bit(R5_Wantwrite, &adev->flags);
-			bdev = &sh->dev[failed_num[1]];
-			locked += !test_bit(R5_LOCKED, &bdev->flags);
-			set_bit(R5_LOCKED, &bdev->flags);
-			clear_bit(STRIPE_DEGRADED, &sh->state);
-			set_bit(R5_Wantwrite, &bdev->flags);
-
-			set_bit(STRIPE_INSYNC, &sh->state);
-			set_bit(R5_Syncio, &adev->flags);
-			set_bit(R5_Syncio, &bdev->flags);
-		}
+
+		BUG_ON(failed>2);
+		BUG_ON(uptodate < disks);
+		/* Want to check and possibly repair P and Q.
+		 * However there could be one 'failed' device, in which
+		 * case we can only check one of them, possibly using the
+		 * other to generate missing data
+		 */
+
+		/* If !tmp_page, we cannot do the calculations,
+		 * but as we have set STRIPE_HANDLE, we will soon be called
+		 * by stripe_handle with a tmp_page - just wait until then.
+		 */
+		if (tmp_page) {
+			if (failed == q_failed) {
+				/* The only possible failed device holds 'Q', so it makes
+				 * sense to check P (If anything else were failed, we would
+				 * have used P to recreate it).
+				 */
+				compute_block_1(sh, pd_idx, 1);
+				if (!page_is_zero(sh->dev[pd_idx].page)) {
+					compute_block_1(sh,pd_idx,0);
+					update_p = 1;
+				}
+			}
+			if (!q_failed && failed < 2) {
+				/* q is not failed, and we didn't use it to generate
+				 * anything, so it makes sense to check it
+				 */
+				memcpy(page_address(tmp_page),
+				       page_address(sh->dev[qd_idx].page),
+				       STRIPE_SIZE);
+				compute_parity(sh, UPDATE_PARITY);
+				if (memcmp(page_address(tmp_page),
+					   page_address(sh->dev[qd_idx].page),
+					   STRIPE_SIZE)!= 0) {
+					clear_bit(STRIPE_INSYNC, &sh->state);
+					update_q = 1;
+				}
+			}
+			if (update_p || update_q) {
+				conf->mddev->resync_mismatches += STRIPE_SECTORS;
+				if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+					/* don't try to repair!! */
+					update_p = update_q = 0;
+			}
+
+			/* now write out any block on a failed drive,
+			 * or P or Q if they need it
+			 */
+
+			if (failed == 2) {
+				dev = &sh->dev[failed_num[1]];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
+			if (failed >= 1) {
+				dev = &sh->dev[failed_num[0]];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
+
+			if (update_p) {
+				dev = &sh->dev[pd_idx];
+				locked ++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
+			if (update_q) {
+				dev = &sh->dev[qd_idx];
+				locked++;
+				set_bit(R5_LOCKED, &dev->flags);
+				set_bit(R5_Wantwrite, &dev->flags);
+				set_bit(R5_Syncio, &dev->flags);
+			}
+			clear_bit(STRIPE_DEGRADED, &sh->state);
+
+			set_bit(STRIPE_INSYNC, &sh->state);
+		}
 	}
 	if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
 		md_done_sync(conf->mddev, STRIPE_SECTORS,1);
 		clear_bit(STRIPE_SYNCING, &sh->state);
@@ -1664,7 +1687,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
 			}
 			finish_wait(&conf->wait_for_overlap, &w);
 			raid6_plug_device(conf);
-			handle_stripe(sh);
+			handle_stripe(sh, NULL);
 			release_stripe(sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
@@ -1728,6 +1751,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return rv;
 	}
 	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 	    !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
 		/* we can skip this block, and probably more */
 		sync_blocks /= STRIPE_SECTORS;
@@ -1765,7 +1789,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 	clear_bit(STRIPE_INSYNC, &sh->state);
 	spin_unlock(&sh->lock);
 
-	handle_stripe(sh);
+	handle_stripe(sh, NULL);
 	release_stripe(sh);
 
 	return STRIPE_SECTORS;
@@ -1821,7 +1845,7 @@ static void raid6d (mddev_t *mddev)
 		spin_unlock_irq(&conf->device_lock);
 
 		handled++;
-		handle_stripe(sh);
+		handle_stripe(sh, conf->spare_page);
 		release_stripe(sh);
 
 		spin_lock_irq(&conf->device_lock);
@@ -1860,6 +1884,10 @@ static int run(mddev_t *mddev)
 		goto abort;
 	memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
 
+	conf->spare_page = alloc_page(GFP_KERNEL);
+	if (!conf->spare_page)
+		goto abort;
+
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
@@ -1996,6 +2024,8 @@ static int run(mddev_t *mddev)
 abort:
 	if (conf) {
 		print_raid6_conf(conf);
+		if (conf->spare_page)
+			page_cache_release(conf->spare_page);
 		if (conf->stripe_hashtbl)
 			free_pages((unsigned long) conf->stripe_hashtbl,
 						HASH_PAGES_ORDER);
......
@@ -228,6 +228,8 @@ struct raid5_private_data {
 					 * Cleared when a sync completes.
 					 */
+	struct page 		*spare_page; /* Used when checking P/Q in raid6 */
+
 	/*
 	 * Free stripes pool
 	 */
......