Commit dd054fce authored by NeilBrown's avatar NeilBrown

md/raid5: handle activation of replacement device when recovery completes.

When recovery completes - as reported by a call to ->spare_active,
we clear In_sync on the original and set it on the replacement.

Then when the original gets removed we move the replacement from
'replacement' to 'rdev'.

This could race with other code that is looking at these pointers,
so we use memory barriers and careful ordering to ensure that
a reader might see one device twice, but never no devices.
Then the readers guard against using both devices, which could
only happen when writing.
Signed-off-by: default avatarNeilBrown <neilb@suse.de>
parent 9a3e1101
......@@ -532,13 +532,21 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi->bi_end_io = raid5_end_read_request;
rcu_read_lock();
rdev = rcu_dereference(conf->disks[i].rdev);
rrdev = rcu_dereference(conf->disks[i].replacement);
smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
rdev = rcu_dereference(conf->disks[i].rdev);
if (!rdev) {
rdev = rrdev;
rrdev = NULL;
}
if (rw & WRITE) {
if (replace_only)
rdev = NULL;
if (rdev == rrdev)
/* We raced and saw duplicates */
rrdev = NULL;
} else {
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
rdev = rrdev;
rrdev = NULL;
}
......@@ -1640,7 +1648,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
int disks = sh->disks, i;
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
char b[BDEVNAME_SIZE];
struct md_rdev *rdev;
struct md_rdev *rdev = NULL;
for (i=0 ; i<disks; i++)
......@@ -1655,8 +1663,13 @@ static void raid5_end_read_request(struct bio * bi, int error)
return;
}
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
/* If replacement finished while this request was outstanding,
* 'replacement' might be NULL already.
* In that case it moved down to 'rdev'.
* rdev is not removed until all requests are finished.
*/
rdev = conf->disks[i].replacement;
else
if (!rdev)
rdev = conf->disks[i].rdev;
if (uptodate) {
......@@ -1753,7 +1766,14 @@ static void raid5_end_write_request(struct bio *bi, int error)
}
if (bi == &sh->dev[i].rreq) {
rdev = conf->disks[i].replacement;
if (rdev)
replacement = 1;
else
/* rdev was removed and 'replacement'
* replaced it. rdev is not removed
* until all requests are finished.
*/
rdev = conf->disks[i].rdev;
break;
}
}
......@@ -3539,6 +3559,9 @@ static void handle_stripe(struct stripe_head *sh)
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
rdev = conf->disks[i].replacement;
if (!rdev)
/* rdev have been moved down */
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
STRIPE_SECTORS);
rdev_dec_pending(rdev, conf->mddev);
......@@ -5204,7 +5227,25 @@ static int raid5_spare_active(struct mddev *mddev)
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->disks + i;
if (tmp->rdev
if (tmp->replacement
&& tmp->replacement->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->replacement->flags)
&& !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
/* Replacement has just become active. */
if (!tmp->rdev
|| !test_and_clear_bit(In_sync, &tmp->rdev->flags))
count++;
if (tmp->rdev) {
/* Replaced device not technically faulty,
* but we need to be sure it gets removed
* and never re-added.
*/
set_bit(Faulty, &tmp->rdev->flags);
sysfs_notify_dirent_safe(
tmp->rdev->sysfs_state);
}
sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
} else if (tmp->rdev
&& tmp->rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
......@@ -5250,6 +5291,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled &&
!has_failed(conf) &&
(!p->replacement || p->replacement == rdev) &&
number < conf->raid_disks) {
err = -EBUSY;
goto abort;
......@@ -5260,7 +5302,20 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
/* lost the race, try later */
err = -EBUSY;
*rdevp = rdev;
}
} else if (p->replacement) {
/* We must have just cleared 'rdev' */
p->rdev = p->replacement;
clear_bit(Replacement, &p->replacement->flags);
smp_mb(); /* Make sure other CPUs may see both as identical
* but will never see neither - if they are careful
*/
p->replacement = NULL;
clear_bit(WantReplacement, &rdev->flags);
} else
/* We might have just removed the Replacement as faulty-
* clear the bit just in case
*/
clear_bit(WantReplacement, &rdev->flags);
abort:
print_raid5_conf(conf);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment