Commit ad860670 authored by Yu Kuai's avatar Yu Kuai Committed by Song Liu

md/raid5: remove rcu protection to access rdev from conf

Because it's safe to access rdev from conf:
 - If any spinlock is held, because synchronize_rcu() from
   md_kick_rdev_from_array() will prevent 'rdev' from being freed until the
   spinlock is released;
 - If 'reconfig_lock' is held, because rdev can't be added or removed from
   array;
 - If there is normal IO inflight, because mddev_suspend() will prevent
   rdev from being added to or removed from the array;
 - If there is sync IO inflight, because 'MD_RECOVERY_RUNNING' is
   checked in remove_and_add_spares().

And these will cover all the scenarios in raid456.
Signed-off-by: default avatarYu Kuai <yukuai3@huawei.com>
Signed-off-by: default avatarSong Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20231125081604.3939938-5-yukuai1@huaweicloud.com
parent 2d32777d
...@@ -1890,28 +1890,22 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf, ...@@ -1890,28 +1890,22 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf,
continue; continue;
/* in case device is broken */ /* in case device is broken */
rcu_read_lock(); rdev = conf->disks[disk_index].rdev;
rdev = rcu_dereference(conf->disks[disk_index].rdev);
if (rdev) { if (rdev) {
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
sync_page_io(rdev, sh->sector, PAGE_SIZE, sync_page_io(rdev, sh->sector, PAGE_SIZE,
sh->dev[disk_index].page, REQ_OP_WRITE, sh->dev[disk_index].page, REQ_OP_WRITE,
false); false);
rdev_dec_pending(rdev, rdev->mddev); rdev_dec_pending(rdev, rdev->mddev);
rcu_read_lock();
} }
rrdev = rcu_dereference(conf->disks[disk_index].replacement); rrdev = conf->disks[disk_index].replacement;
if (rrdev) { if (rrdev) {
atomic_inc(&rrdev->nr_pending); atomic_inc(&rrdev->nr_pending);
rcu_read_unlock();
sync_page_io(rrdev, sh->sector, PAGE_SIZE, sync_page_io(rrdev, sh->sector, PAGE_SIZE,
sh->dev[disk_index].page, REQ_OP_WRITE, sh->dev[disk_index].page, REQ_OP_WRITE,
false); false);
rdev_dec_pending(rrdev, rrdev->mddev); rdev_dec_pending(rrdev, rrdev->mddev);
rcu_read_lock();
} }
rcu_read_unlock();
} }
ctx->data_parity_stripes++; ctx->data_parity_stripes++;
out: out:
...@@ -2948,7 +2942,6 @@ bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) ...@@ -2948,7 +2942,6 @@ bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
if (!log) if (!log)
return false; return false;
WARN_ON_ONCE(!rcu_read_lock_held());
tree_index = r5c_tree_index(conf, sect); tree_index = r5c_tree_index(conf, sect);
slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
return slot != NULL; return slot != NULL;
......
...@@ -620,11 +620,9 @@ static void ppl_do_flush(struct ppl_io_unit *io) ...@@ -620,11 +620,9 @@ static void ppl_do_flush(struct ppl_io_unit *io)
struct md_rdev *rdev; struct md_rdev *rdev;
struct block_device *bdev = NULL; struct block_device *bdev = NULL;
rcu_read_lock(); rdev = conf->disks[i].rdev;
rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) if (rdev && !test_bit(Faulty, &rdev->flags))
bdev = rdev->bdev; bdev = rdev->bdev;
rcu_read_unlock();
if (bdev) { if (bdev) {
struct bio *bio; struct bio *bio;
...@@ -882,9 +880,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, ...@@ -882,9 +880,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
(unsigned long long)r_sector, dd_idx, (unsigned long long)r_sector, dd_idx,
(unsigned long long)sector); (unsigned long long)sector);
/* Array has not started so rcu dereference is safe */ rdev = conf->disks[dd_idx].rdev;
rdev = rcu_dereference_protected(
conf->disks[dd_idx].rdev, 1);
if (!rdev || (!test_bit(In_sync, &rdev->flags) && if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
sector >= rdev->recovery_offset)) { sector >= rdev->recovery_offset)) {
pr_debug("%s:%*s data member disk %d missing\n", pr_debug("%s:%*s data member disk %d missing\n",
...@@ -936,9 +932,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, ...@@ -936,9 +932,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
0, &disk, &sh); 0, &disk, &sh);
BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk)); BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
/* Array has not started so rcu dereference is safe */ parity_rdev = conf->disks[sh.pd_idx].rdev;
parity_rdev = rcu_dereference_protected(
conf->disks[sh.pd_idx].rdev, 1);
BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev); BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
pr_debug("%s:%*s write parity at sector %llu, disk %pg\n", pr_debug("%s:%*s write parity at sector %llu, disk %pg\n",
...@@ -1404,9 +1398,7 @@ int ppl_init_log(struct r5conf *conf) ...@@ -1404,9 +1398,7 @@ int ppl_init_log(struct r5conf *conf)
for (i = 0; i < ppl_conf->count; i++) { for (i = 0; i < ppl_conf->count; i++) {
struct ppl_log *log = &ppl_conf->child_logs[i]; struct ppl_log *log = &ppl_conf->child_logs[i];
/* Array has not started so rcu dereference is safe */ struct md_rdev *rdev = conf->disks[i].rdev;
struct md_rdev *rdev =
rcu_dereference_protected(conf->disks[i].rdev, 1);
mutex_init(&log->io_mutex); mutex_init(&log->io_mutex);
spin_lock_init(&log->io_list_lock); spin_lock_init(&log->io_list_lock);
......
...@@ -693,12 +693,12 @@ int raid5_calc_degraded(struct r5conf *conf) ...@@ -693,12 +693,12 @@ int raid5_calc_degraded(struct r5conf *conf)
int degraded, degraded2; int degraded, degraded2;
int i; int i;
rcu_read_lock();
degraded = 0; degraded = 0;
for (i = 0; i < conf->previous_raid_disks; i++) { for (i = 0; i < conf->previous_raid_disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
if (rdev && test_bit(Faulty, &rdev->flags)) if (rdev && test_bit(Faulty, &rdev->flags))
rdev = rcu_dereference(conf->disks[i].replacement); rdev = READ_ONCE(conf->disks[i].replacement);
if (!rdev || test_bit(Faulty, &rdev->flags)) if (!rdev || test_bit(Faulty, &rdev->flags))
degraded++; degraded++;
else if (test_bit(In_sync, &rdev->flags)) else if (test_bit(In_sync, &rdev->flags))
...@@ -716,15 +716,14 @@ int raid5_calc_degraded(struct r5conf *conf) ...@@ -716,15 +716,14 @@ int raid5_calc_degraded(struct r5conf *conf)
if (conf->raid_disks >= conf->previous_raid_disks) if (conf->raid_disks >= conf->previous_raid_disks)
degraded++; degraded++;
} }
rcu_read_unlock();
if (conf->raid_disks == conf->previous_raid_disks) if (conf->raid_disks == conf->previous_raid_disks)
return degraded; return degraded;
rcu_read_lock();
degraded2 = 0; degraded2 = 0;
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
if (rdev && test_bit(Faulty, &rdev->flags)) if (rdev && test_bit(Faulty, &rdev->flags))
rdev = rcu_dereference(conf->disks[i].replacement); rdev = READ_ONCE(conf->disks[i].replacement);
if (!rdev || test_bit(Faulty, &rdev->flags)) if (!rdev || test_bit(Faulty, &rdev->flags))
degraded2++; degraded2++;
else if (test_bit(In_sync, &rdev->flags)) else if (test_bit(In_sync, &rdev->flags))
...@@ -738,7 +737,6 @@ int raid5_calc_degraded(struct r5conf *conf) ...@@ -738,7 +737,6 @@ int raid5_calc_degraded(struct r5conf *conf)
if (conf->raid_disks <= conf->previous_raid_disks) if (conf->raid_disks <= conf->previous_raid_disks)
degraded2++; degraded2++;
} }
rcu_read_unlock();
if (degraded2 > degraded) if (degraded2 > degraded)
return degraded2; return degraded2;
return degraded; return degraded;
...@@ -1183,14 +1181,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -1183,14 +1181,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi = &dev->req; bi = &dev->req;
rbi = &dev->rreq; /* For writing to replacement */ rbi = &dev->rreq; /* For writing to replacement */
rcu_read_lock(); rdev = conf->disks[i].rdev;
rrdev = rcu_dereference(conf->disks[i].replacement); rrdev = conf->disks[i].replacement;
smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
rdev = rcu_dereference(conf->disks[i].rdev);
if (!rdev) {
rdev = rrdev;
rrdev = NULL;
}
if (op_is_write(op)) { if (op_is_write(op)) {
if (replace_only) if (replace_only)
rdev = NULL; rdev = NULL;
...@@ -1211,7 +1203,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -1211,7 +1203,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
rrdev = NULL; rrdev = NULL;
if (rrdev) if (rrdev)
atomic_inc(&rrdev->nr_pending); atomic_inc(&rrdev->nr_pending);
rcu_read_unlock();
/* We have already checked bad blocks for reads. Now /* We have already checked bad blocks for reads. Now
* need to check for writes. We never accept write errors * need to check for writes. We never accept write errors
...@@ -2730,28 +2721,6 @@ static void shrink_stripes(struct r5conf *conf) ...@@ -2730,28 +2721,6 @@ static void shrink_stripes(struct r5conf *conf)
conf->slab_cache = NULL; conf->slab_cache = NULL;
} }
/*
* This helper wraps rcu_dereference_protected() and can be used when
* it is known that the nr_pending of the rdev is elevated.
*/
static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
{
return rcu_dereference_protected(rdev,
atomic_read(&rcu_access_pointer(rdev)->nr_pending));
}
/*
* This helper wraps rcu_dereference_protected() and should be used
* when it is known that the mddev_lock() is held. This is safe
* seeing raid5_remove_disk() has the same lock held.
*/
static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev,
struct md_rdev __rcu *rdev)
{
return rcu_dereference_protected(rdev,
lockdep_is_held(&mddev->reconfig_mutex));
}
static void raid5_end_read_request(struct bio * bi) static void raid5_end_read_request(struct bio * bi)
{ {
struct stripe_head *sh = bi->bi_private; struct stripe_head *sh = bi->bi_private;
...@@ -2777,9 +2746,9 @@ static void raid5_end_read_request(struct bio * bi) ...@@ -2777,9 +2746,9 @@ static void raid5_end_read_request(struct bio * bi)
* In that case it moved down to 'rdev'. * In that case it moved down to 'rdev'.
* rdev is not removed until all requests are finished. * rdev is not removed until all requests are finished.
*/ */
rdev = rdev_pend_deref(conf->disks[i].replacement); rdev = conf->disks[i].replacement;
if (!rdev) if (!rdev)
rdev = rdev_pend_deref(conf->disks[i].rdev); rdev = conf->disks[i].rdev;
if (use_new_offset(conf, sh)) if (use_new_offset(conf, sh))
s = sh->sector + rdev->new_data_offset; s = sh->sector + rdev->new_data_offset;
...@@ -2892,11 +2861,11 @@ static void raid5_end_write_request(struct bio *bi) ...@@ -2892,11 +2861,11 @@ static void raid5_end_write_request(struct bio *bi)
for (i = 0 ; i < disks; i++) { for (i = 0 ; i < disks; i++) {
if (bi == &sh->dev[i].req) { if (bi == &sh->dev[i].req) {
rdev = rdev_pend_deref(conf->disks[i].rdev); rdev = conf->disks[i].rdev;
break; break;
} }
if (bi == &sh->dev[i].rreq) { if (bi == &sh->dev[i].rreq) {
rdev = rdev_pend_deref(conf->disks[i].replacement); rdev = conf->disks[i].replacement;
if (rdev) if (rdev)
replacement = 1; replacement = 1;
else else
...@@ -2904,7 +2873,7 @@ static void raid5_end_write_request(struct bio *bi) ...@@ -2904,7 +2873,7 @@ static void raid5_end_write_request(struct bio *bi)
* replaced it. rdev is not removed * replaced it. rdev is not removed
* until all requests are finished. * until all requests are finished.
*/ */
rdev = rdev_pend_deref(conf->disks[i].rdev); rdev = conf->disks[i].rdev;
break; break;
} }
} }
...@@ -3666,15 +3635,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, ...@@ -3666,15 +3635,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
int bitmap_end = 0; int bitmap_end = 0;
if (test_bit(R5_ReadError, &sh->dev[i].flags)) { if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
struct md_rdev *rdev; struct md_rdev *rdev = conf->disks[i].rdev;
rcu_read_lock();
rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev && test_bit(In_sync, &rdev->flags) && if (rdev && test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags)) !test_bit(Faulty, &rdev->flags))
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
else else
rdev = NULL; rdev = NULL;
rcu_read_unlock();
if (rdev) { if (rdev) {
if (!rdev_set_badblocks( if (!rdev_set_badblocks(
rdev, rdev,
...@@ -3792,16 +3759,17 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, ...@@ -3792,16 +3759,17 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
/* During recovery devices cannot be removed, so /* During recovery devices cannot be removed, so
* locking and refcounting of rdevs is not needed * locking and refcounting of rdevs is not needed
*/ */
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); struct md_rdev *rdev = conf->disks[i].rdev;
if (rdev if (rdev
&& !test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector, && !rdev_set_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0)) RAID5_STRIPE_SECTORS(conf), 0))
abort = 1; abort = 1;
rdev = rcu_dereference(conf->disks[i].replacement); rdev = conf->disks[i].replacement;
if (rdev if (rdev
&& !test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags)
...@@ -3809,7 +3777,6 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, ...@@ -3809,7 +3777,6 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
RAID5_STRIPE_SECTORS(conf), 0)) RAID5_STRIPE_SECTORS(conf), 0))
abort = 1; abort = 1;
} }
rcu_read_unlock();
if (abort) if (abort)
conf->recovery_disabled = conf->recovery_disabled =
conf->mddev->recovery_disabled; conf->mddev->recovery_disabled;
...@@ -3822,15 +3789,13 @@ static int want_replace(struct stripe_head *sh, int disk_idx) ...@@ -3822,15 +3789,13 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
struct md_rdev *rdev; struct md_rdev *rdev;
int rv = 0; int rv = 0;
rcu_read_lock(); rdev = sh->raid_conf->disks[disk_idx].replacement;
rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
if (rdev if (rdev
&& !test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags)
&& (rdev->recovery_offset <= sh->sector && (rdev->recovery_offset <= sh->sector
|| rdev->mddev->recovery_cp <= sh->sector)) || rdev->mddev->recovery_cp <= sh->sector))
rv = 1; rv = 1;
rcu_read_unlock();
return rv; return rv;
} }
...@@ -4707,7 +4672,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4707,7 +4672,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
s->log_failed = r5l_log_disk_error(conf); s->log_failed = r5l_log_disk_error(conf);
/* Now to look around and see what can be done */ /* Now to look around and see what can be done */
rcu_read_lock();
for (i=disks; i--; ) { for (i=disks; i--; ) {
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t first_bad; sector_t first_bad;
...@@ -4752,7 +4716,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4752,7 +4716,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
/* Prefer to use the replacement for reads, but only /* Prefer to use the replacement for reads, but only
* if it is recovered enough and has no bad blocks. * if it is recovered enough and has no bad blocks.
*/ */
rdev = rcu_dereference(conf->disks[i].replacement); rdev = conf->disks[i].replacement;
if (rdev && !test_bit(Faulty, &rdev->flags) && if (rdev && !test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
!is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
...@@ -4763,7 +4727,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4763,7 +4727,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
set_bit(R5_NeedReplace, &dev->flags); set_bit(R5_NeedReplace, &dev->flags);
else else
clear_bit(R5_NeedReplace, &dev->flags); clear_bit(R5_NeedReplace, &dev->flags);
rdev = rcu_dereference(conf->disks[i].rdev); rdev = conf->disks[i].rdev;
clear_bit(R5_ReadRepl, &dev->flags); clear_bit(R5_ReadRepl, &dev->flags);
} }
if (rdev && test_bit(Faulty, &rdev->flags)) if (rdev && test_bit(Faulty, &rdev->flags))
...@@ -4810,8 +4774,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4810,8 +4774,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (test_bit(R5_WriteError, &dev->flags)) { if (test_bit(R5_WriteError, &dev->flags)) {
/* This flag does not apply to '.replacement' /* This flag does not apply to '.replacement'
* only to .rdev, so make sure to check that*/ * only to .rdev, so make sure to check that*/
struct md_rdev *rdev2 = rcu_dereference( struct md_rdev *rdev2 = conf->disks[i].rdev;
conf->disks[i].rdev);
if (rdev2 == rdev) if (rdev2 == rdev)
clear_bit(R5_Insync, &dev->flags); clear_bit(R5_Insync, &dev->flags);
if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
...@@ -4823,8 +4787,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4823,8 +4787,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (test_bit(R5_MadeGood, &dev->flags)) { if (test_bit(R5_MadeGood, &dev->flags)) {
/* This flag does not apply to '.replacement' /* This flag does not apply to '.replacement'
* only to .rdev, so make sure to check that*/ * only to .rdev, so make sure to check that*/
struct md_rdev *rdev2 = rcu_dereference( struct md_rdev *rdev2 = conf->disks[i].rdev;
conf->disks[i].rdev);
if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
s->handle_bad_blocks = 1; s->handle_bad_blocks = 1;
atomic_inc(&rdev2->nr_pending); atomic_inc(&rdev2->nr_pending);
...@@ -4832,8 +4796,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4832,8 +4796,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
clear_bit(R5_MadeGood, &dev->flags); clear_bit(R5_MadeGood, &dev->flags);
} }
if (test_bit(R5_MadeGoodRepl, &dev->flags)) { if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
struct md_rdev *rdev2 = rcu_dereference( struct md_rdev *rdev2 = conf->disks[i].replacement;
conf->disks[i].replacement);
if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
s->handle_bad_blocks = 1; s->handle_bad_blocks = 1;
atomic_inc(&rdev2->nr_pending); atomic_inc(&rdev2->nr_pending);
...@@ -4854,8 +4818,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4854,8 +4818,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && !test_bit(Faulty, &rdev->flags)) if (rdev && !test_bit(Faulty, &rdev->flags))
do_recovery = 1; do_recovery = 1;
else if (!rdev) { else if (!rdev) {
rdev = rcu_dereference( rdev = conf->disks[i].replacement;
conf->disks[i].replacement);
if (rdev && !test_bit(Faulty, &rdev->flags)) if (rdev && !test_bit(Faulty, &rdev->flags))
do_recovery = 1; do_recovery = 1;
} }
...@@ -4882,7 +4845,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4882,7 +4845,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
else else
s->replacing = 1; s->replacing = 1;
} }
rcu_read_unlock();
} }
/* /*
...@@ -5339,23 +5301,23 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -5339,23 +5301,23 @@ static void handle_stripe(struct stripe_head *sh)
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_WriteError, &dev->flags)) { if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
/* We own a safe reference to the rdev */ /* We own a safe reference to the rdev */
rdev = rdev_pend_deref(conf->disks[i].rdev); rdev = conf->disks[i].rdev;
if (!rdev_set_badblocks(rdev, sh->sector, if (!rdev_set_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0)) RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev); md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} }
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
rdev = rdev_pend_deref(conf->disks[i].rdev); rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector, rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0); RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} }
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
rdev = rdev_pend_deref(conf->disks[i].replacement); rdev = conf->disks[i].replacement;
if (!rdev) if (!rdev)
/* rdev have been moved down */ /* rdev have been moved down */
rdev = rdev_pend_deref(conf->disks[i].rdev); rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector, rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0); RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
...@@ -5514,24 +5476,22 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) ...@@ -5514,24 +5476,22 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
&dd_idx, NULL); &dd_idx, NULL);
end_sector = sector + bio_sectors(raid_bio); end_sector = sector + bio_sectors(raid_bio);
rcu_read_lock();
if (r5c_big_stripe_cached(conf, sector)) if (r5c_big_stripe_cached(conf, sector))
goto out_rcu_unlock; return 0;
rdev = rcu_dereference(conf->disks[dd_idx].replacement); rdev = conf->disks[dd_idx].replacement;
if (!rdev || test_bit(Faulty, &rdev->flags) || if (!rdev || test_bit(Faulty, &rdev->flags) ||
rdev->recovery_offset < end_sector) { rdev->recovery_offset < end_sector) {
rdev = rcu_dereference(conf->disks[dd_idx].rdev); rdev = conf->disks[dd_idx].rdev;
if (!rdev) if (!rdev)
goto out_rcu_unlock; return 0;
if (test_bit(Faulty, &rdev->flags) || if (test_bit(Faulty, &rdev->flags) ||
!(test_bit(In_sync, &rdev->flags) || !(test_bit(In_sync, &rdev->flags) ||
rdev->recovery_offset >= end_sector)) rdev->recovery_offset >= end_sector))
goto out_rcu_unlock; return 0;
} }
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
&bad_sectors)) { &bad_sectors)) {
...@@ -5575,10 +5535,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) ...@@ -5575,10 +5535,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
raid_bio->bi_iter.bi_sector); raid_bio->bi_iter.bi_sector);
submit_bio_noacct(align_bio); submit_bio_noacct(align_bio);
return 1; return 1;
out_rcu_unlock:
rcu_read_unlock();
return 0;
} }
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
...@@ -6581,14 +6537,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n ...@@ -6581,14 +6537,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
* Note in case of > 1 drive failures it's possible we're rebuilding * Note in case of > 1 drive failures it's possible we're rebuilding
* one drive while leaving another faulty drive in array. * one drive while leaving another faulty drive in array.
*/ */
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); struct md_rdev *rdev = conf->disks[i].rdev;
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1; still_degraded = 1;
} }
rcu_read_unlock();
md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
...@@ -7899,18 +7853,10 @@ static int raid5_run(struct mddev *mddev) ...@@ -7899,18 +7853,10 @@ static int raid5_run(struct mddev *mddev)
for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
i++) { i++) {
rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); rdev = conf->disks[i].rdev;
if (!rdev && conf->disks[i].replacement) {
/* The replacement is all we have yet */
rdev = rdev_mdlock_deref(mddev,
conf->disks[i].replacement);
conf->disks[i].replacement = NULL;
clear_bit(Replacement, &rdev->flags);
rcu_assign_pointer(conf->disks[i].rdev, rdev);
}
if (!rdev) if (!rdev)
continue; continue;
if (rcu_access_pointer(conf->disks[i].replacement) && if (conf->disks[i].replacement &&
conf->reshape_progress != MaxSector) { conf->reshape_progress != MaxSector) {
/* replacements and reshape simply do not mix. */ /* replacements and reshape simply do not mix. */
pr_warn("md: cannot handle concurrent replacement and reshape.\n"); pr_warn("md: cannot handle concurrent replacement and reshape.\n");
...@@ -8094,15 +8040,16 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev) ...@@ -8094,15 +8040,16 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
struct r5conf *conf = mddev->private; struct r5conf *conf = mddev->private;
int i; int i;
lockdep_assert_held(&mddev->lock);
seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
conf->chunk_sectors / 2, mddev->layout); conf->chunk_sectors / 2, mddev->layout);
seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
} }
rcu_read_unlock();
seq_printf (seq, "]"); seq_printf (seq, "]");
} }
...@@ -8140,9 +8087,8 @@ static int raid5_spare_active(struct mddev *mddev) ...@@ -8140,9 +8087,8 @@ static int raid5_spare_active(struct mddev *mddev)
unsigned long flags; unsigned long flags;
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev); rdev = conf->disks[i].rdev;
replacement = rdev_mdlock_deref(mddev, replacement = conf->disks[i].replacement;
conf->disks[i].replacement);
if (replacement if (replacement
&& replacement->recovery_offset == MaxSector && replacement->recovery_offset == MaxSector
&& !test_bit(Faulty, &replacement->flags) && !test_bit(Faulty, &replacement->flags)
...@@ -8181,7 +8127,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -8181,7 +8127,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r5conf *conf = mddev->private; struct r5conf *conf = mddev->private;
int err = 0; int err = 0;
int number = rdev->raid_disk; int number = rdev->raid_disk;
struct md_rdev __rcu **rdevp; struct md_rdev **rdevp;
struct disk_info *p; struct disk_info *p;
struct md_rdev *tmp; struct md_rdev *tmp;
...@@ -8204,9 +8150,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -8204,9 +8150,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (unlikely(number >= conf->pool_size)) if (unlikely(number >= conf->pool_size))
return 0; return 0;
p = conf->disks + number; p = conf->disks + number;
if (rdev == rcu_access_pointer(p->rdev)) if (rdev == p->rdev)
rdevp = &p->rdev; rdevp = &p->rdev;
else if (rdev == rcu_access_pointer(p->replacement)) else if (rdev == p->replacement)
rdevp = &p->replacement; rdevp = &p->replacement;
else else
return 0; return 0;
...@@ -8226,28 +8172,24 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -8226,28 +8172,24 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (!test_bit(Faulty, &rdev->flags) && if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled && mddev->recovery_disabled != conf->recovery_disabled &&
!has_failed(conf) && !has_failed(conf) &&
(!rcu_access_pointer(p->replacement) || (!p->replacement || p->replacement == rdev) &&
rcu_access_pointer(p->replacement) == rdev) &&
number < conf->raid_disks) { number < conf->raid_disks) {
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
} }
*rdevp = NULL; WRITE_ONCE(*rdevp, NULL);
if (!err) { if (!err) {
err = log_modify(conf, rdev, false); err = log_modify(conf, rdev, false);
if (err) if (err)
goto abort; goto abort;
} }
tmp = rcu_access_pointer(p->replacement); tmp = p->replacement;
if (tmp) { if (tmp) {
/* We must have just cleared 'rdev' */ /* We must have just cleared 'rdev' */
rcu_assign_pointer(p->rdev, tmp); WRITE_ONCE(p->rdev, tmp);
clear_bit(Replacement, &tmp->flags); clear_bit(Replacement, &tmp->flags);
smp_mb(); /* Make sure other CPUs may see both as identical WRITE_ONCE(p->replacement, NULL);
* but will never see neither - if they are careful
*/
rcu_assign_pointer(p->replacement, NULL);
if (!err) if (!err)
err = log_modify(conf, tmp, true); err = log_modify(conf, tmp, true);
...@@ -8315,7 +8257,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -8315,7 +8257,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rdev->raid_disk = disk; rdev->raid_disk = disk;
if (rdev->saved_raid_disk != disk) if (rdev->saved_raid_disk != disk)
conf->fullsync = 1; conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev); WRITE_ONCE(p->rdev, rdev);
err = log_modify(conf, rdev, true); err = log_modify(conf, rdev, true);
...@@ -8324,7 +8266,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -8324,7 +8266,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
} }
for (disk = first; disk <= last; disk++) { for (disk = first; disk <= last; disk++) {
p = conf->disks + disk; p = conf->disks + disk;
tmp = rdev_mdlock_deref(mddev, p->rdev); tmp = p->rdev;
if (test_bit(WantReplacement, &tmp->flags) && if (test_bit(WantReplacement, &tmp->flags) &&
mddev->reshape_position == MaxSector && mddev->reshape_position == MaxSector &&
p->replacement == NULL) { p->replacement == NULL) {
...@@ -8333,7 +8275,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -8333,7 +8275,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
rdev->raid_disk = disk; rdev->raid_disk = disk;
err = 0; err = 0;
conf->fullsync = 1; conf->fullsync = 1;
rcu_assign_pointer(p->replacement, rdev); WRITE_ONCE(p->replacement, rdev);
break; break;
} }
} }
...@@ -8466,7 +8408,7 @@ static int raid5_start_reshape(struct mddev *mddev) ...@@ -8466,7 +8408,7 @@ static int raid5_start_reshape(struct mddev *mddev)
if (mddev->recovery_cp < MaxSector) if (mddev->recovery_cp < MaxSector)
return -EBUSY; return -EBUSY;
for (i = 0; i < conf->raid_disks; i++) for (i = 0; i < conf->raid_disks; i++)
if (rdev_mdlock_deref(mddev, conf->disks[i].replacement)) if (conf->disks[i].replacement)
return -EBUSY; return -EBUSY;
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
...@@ -8637,12 +8579,10 @@ static void raid5_finish_reshape(struct mddev *mddev) ...@@ -8637,12 +8579,10 @@ static void raid5_finish_reshape(struct mddev *mddev)
for (d = conf->raid_disks ; for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks; d < conf->raid_disks - mddev->delta_disks;
d++) { d++) {
rdev = rdev_mdlock_deref(mddev, rdev = conf->disks[d].rdev;
conf->disks[d].rdev);
if (rdev) if (rdev)
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
rdev = rdev_mdlock_deref(mddev, rdev = conf->disks[d].replacement;
conf->disks[d].replacement);
if (rdev) if (rdev)
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
} }
......
...@@ -473,8 +473,8 @@ enum { ...@@ -473,8 +473,8 @@ enum {
*/ */
struct disk_info { struct disk_info {
struct md_rdev __rcu *rdev; struct md_rdev *rdev;
struct md_rdev __rcu *replacement; struct md_rdev *replacement;
struct page *extra_page; /* extra page to use in prexor */ struct page *extra_page; /* extra page to use in prexor */
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment