Commit e81faa91 authored by Song Liu

Merge branch 'raid1-read_balance' into md-6.9

From: Yu Kuai <yukuai3@huawei.com>
Co-developed-by: Paul Luse <paul.e.luse@linux.intel.com>

The original idea was that Paul wanted to optimize raid1 read
performance ([1]). However, we think that the original code for
read_balance() is quite complex, and we don't want to add more
complexity. Hence we decided to refactor read_balance() first, to make
the code cleaner and easier to build on in follow-up work.

Before this patchset, read_balance() had many local variables and many
branches, because it tried to consider all the scenarios in one iteration.
The idea of this patchset is to divide them into 4 different steps:

1) If resync is in progress, find the first usable disk, patch 5;
Otherwise:
2) Loop through all disks, skipping slow disks and disks with bad
blocks, and choose the best disk, patch 10. If no disk is found:
3) Look for disks with bad blocks and choose the one with the most
readable sectors, patch 8. If no disk is found:
4) Choose the first slow disk found with no bad blocks, or else the slow
disk with the most readable sectors, patch 7.

Note that steps 3) and 4) are rarely-taken slow paths, so their
performance does not need to be considered.

And after this patchset, we'll continue to optimize read_balance for
step 2), specifically how to choose the best rdev to read.

[1] https://lore.kernel.org/all/20240102125115.129261-1-paul.e.luse@linux.intel.com/

Yu Kuai (11):
  md: add a new helper rdev_has_badblock()
  md/raid1: factor out helpers to add rdev to conf
  md/raid1: record nonrot rdevs while adding/removing rdevs to conf
  md/raid1: fix choose next idle in read_balance()
  md/raid1-10: add a helper raid1_check_read_range()
  md/raid1-10: factor out a new helper raid1_should_read_first()
  md/raid1: factor out read_first_rdev() from read_balance()
  md/raid1: factor out choose_slow_rdev() from read_balance()
  md/raid1: factor out choose_bb_rdev() from read_balance()
  md/raid1: factor out the code to manage sequential IO
  md/raid1: factor out helpers to choose the best rdev from
    read_balance()
parents dfd2bf43 0091c5a2
...@@ -207,6 +207,7 @@ enum flag_bits { ...@@ -207,6 +207,7 @@ enum flag_bits {
* check if there is collision between raid1 * check if there is collision between raid1
* serial bios. * serial bios.
*/ */
Nonrot, /* non-rotational device (SSD) */
}; };
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
...@@ -222,6 +223,16 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, ...@@ -222,6 +223,16 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
} }
return 0; return 0;
} }
/*
 * Convenience wrapper around is_badblock() for callers that do not need to
 * know where the bad range starts or how long it is. The check covers
 * @sectors sectors of @rdev starting at @s, and the return value of
 * is_badblock() is passed through unchanged.
 */
static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
				    int sectors)
{
	/* required by is_badblock(), but the values are not used here */
	sector_t ignored_first_bad;
	int ignored_bad_sectors;

	return is_badblock(rdev, s, sectors, &ignored_first_bad,
			   &ignored_bad_sectors);
}
extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new); int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
......
...@@ -227,3 +227,72 @@ static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev) ...@@ -227,3 +227,72 @@ static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)
return false; return false;
} }
/**
 * raid1_check_read_range() - work out how much of a read range is readable
 * @rdev: the rdev to read from;
 * @this_sector: first sector of the read;
 * @len: in: requested length in sectors; out: may be shortened, see below;
 *
 * Helper for read_balance(). The range is checked against the bad blocks
 * reported by is_badblock() for @rdev, and the result is one of:
 *
 * - no bad blocks overlap the range: *@len is returned;
 * - bad blocks start after @this_sector: the number of good sectors in
 *   front of them is returned;
 * - bad blocks start at or before @this_sector and reach the end of the
 *   range: 0 is returned and *@len is left untouched;
 * - bad blocks start at or before @this_sector but end inside the range:
 *   0 is returned and *@len is set to the number of sectors from
 *   @this_sector up to the first good sector.
 */
static inline int raid1_check_read_range(struct md_rdev *rdev,
					 sector_t this_sector, int *len)
{
	sector_t first_bad;
	sector_t bad_end;
	int bad_sectors;

	/* nothing bad in the range, all of it can be read */
	if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors))
		return *len;

	/* a good region precedes the bad blocks, report its length */
	if (first_bad > this_sector)
		return first_bad - this_sector;

	/*
	 * The read starts inside the bad range, so nothing can be read from
	 * here. If good sectors follow within the requested range, tell the
	 * caller how far away they are.
	 */
	bad_end = first_bad + bad_sectors;
	if (this_sector + *len > bad_end)
		*len = bad_end - this_sector;

	return 0;
}
/*
 * Decide whether a read has to take the first usable rdev instead of being
 * balanced across the mirrors.
 *
 * Returns true when the read range ends beyond mddev->recovery_cp, or when
 * the array is clustered and the cluster operations report the range as
 * being resynced. Returns false otherwise, in which case the caller is free
 * to balance the read.
 */
static inline bool raid1_should_read_first(struct mddev *mddev,
					   sector_t this_sector, int len)
{
	sector_t end = this_sector + len;

	if (mddev->recovery_cp < end)
		return true;

	return mddev_is_clustered(mddev) &&
	       md_cluster_ops->area_resyncing(mddev, READ, this_sector, end);
}
...@@ -498,9 +498,6 @@ static void raid1_end_write_request(struct bio *bio) ...@@ -498,9 +498,6 @@ static void raid1_end_write_request(struct bio *bio)
* to user-side. So if something waits for IO, then it * to user-side. So if something waits for IO, then it
* will wait for the 'master' bio. * will wait for the 'master' bio.
*/ */
sector_t first_bad;
int bad_sectors;
r1_bio->bios[mirror] = NULL; r1_bio->bios[mirror] = NULL;
to_put = bio; to_put = bio;
/* /*
...@@ -516,8 +513,8 @@ static void raid1_end_write_request(struct bio *bio) ...@@ -516,8 +513,8 @@ static void raid1_end_write_request(struct bio *bio)
set_bit(R1BIO_Uptodate, &r1_bio->state); set_bit(R1BIO_Uptodate, &r1_bio->state);
/* Maybe we can clear some bad blocks. */ /* Maybe we can clear some bad blocks. */
if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
&first_bad, &bad_sectors) && !discard_error) { !discard_error) {
r1_bio->bios[mirror] = IO_MADE_GOOD; r1_bio->bios[mirror] = IO_MADE_GOOD;
set_bit(R1BIO_MadeGood, &r1_bio->state); set_bit(R1BIO_MadeGood, &r1_bio->state);
} }
...@@ -582,211 +579,312 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector, ...@@ -582,211 +579,312 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector,
return len; return len;
} }
/* static void update_read_sectors(struct r1conf *conf, int disk,
* This routine returns the disk from which the requested read should sector_t this_sector, int len)
* be done. There is a per-array 'next expected sequential IO' sector
* number - if this matches on the next IO then we use the last disk.
* There is also a per-disk 'last know head position' sector that is
* maintained from IRQ contexts, both the normal and the resync IO
* completion handlers update this position correctly. If there is no
* perfect sequential match then we pick the disk whose head is closest.
*
* If there are 2 mirrors in the same 2 devices, performance degrades
* because position is mirror, not device based.
*
* The rdev for the device selected will have nr_pending incremented.
*/
static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
{ {
const sector_t this_sector = r1_bio->sector; struct raid1_info *info = &conf->mirrors[disk];
int sectors;
int best_good_sectors; atomic_inc(&info->rdev->nr_pending);
int best_disk, best_dist_disk, best_pending_disk; if (info->next_seq_sect != this_sector)
int has_nonrot_disk; info->seq_start = this_sector;
info->next_seq_sect = this_sector + len;
}
static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
sector_t this_sector = r1_bio->sector;
int len = r1_bio->sectors;
int disk; int disk;
sector_t best_dist;
unsigned int min_pending; for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
struct md_rdev *rdev; struct md_rdev *rdev;
int choose_first; int read_len;
int choose_next_idle;
/* if (r1_bio->bios[disk] == IO_BLOCKED)
* Check if we can balance. We can balance on the whole continue;
* device if no resync is going on, or below the resync window.
* We take the first readable disk when above the resync window.
*/
retry:
sectors = r1_bio->sectors;
best_disk = -1;
best_dist_disk = -1;
best_dist = MaxSector;
best_pending_disk = -1;
min_pending = UINT_MAX;
best_good_sectors = 0;
has_nonrot_disk = 0;
choose_next_idle = 0;
clear_bit(R1BIO_FailFast, &r1_bio->state);
if ((conf->mddev->recovery_cp < this_sector + sectors) || rdev = conf->mirrors[disk].rdev;
(mddev_is_clustered(conf->mddev) && if (!rdev || test_bit(Faulty, &rdev->flags))
md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, continue;
this_sector + sectors)))
choose_first = 1; /* choose the first disk even if it has some bad blocks. */
else read_len = raid1_check_read_range(rdev, this_sector, &len);
choose_first = 0; if (read_len > 0) {
update_read_sectors(conf, disk, this_sector, read_len);
*max_sectors = read_len;
return disk;
}
}
return -1;
}
static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
sector_t this_sector = r1_bio->sector;
int best_disk = -1;
int best_len = 0;
int disk;
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
sector_t dist; struct md_rdev *rdev;
sector_t first_bad; int len;
int bad_sectors; int read_len;
unsigned int pending;
bool nonrot; if (r1_bio->bios[disk] == IO_BLOCKED)
continue;
rdev = conf->mirrors[disk].rdev; rdev = conf->mirrors[disk].rdev;
if (r1_bio->bios[disk] == IO_BLOCKED if (!rdev || test_bit(Faulty, &rdev->flags) ||
|| rdev == NULL test_bit(WriteMostly, &rdev->flags))
|| test_bit(Faulty, &rdev->flags))
continue; continue;
if (!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < this_sector + sectors) /* keep track of the disk with the most readable sectors. */
len = r1_bio->sectors;
read_len = raid1_check_read_range(rdev, this_sector, &len);
if (read_len > best_len) {
best_disk = disk;
best_len = read_len;
}
}
if (best_disk != -1) {
*max_sectors = best_len;
update_read_sectors(conf, best_disk, this_sector, best_len);
}
return best_disk;
}
static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
sector_t this_sector = r1_bio->sector;
int bb_disk = -1;
int bb_read_len = 0;
int disk;
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
struct md_rdev *rdev;
int len;
int read_len;
if (r1_bio->bios[disk] == IO_BLOCKED)
continue; continue;
if (test_bit(WriteMostly, &rdev->flags)) {
/* Don't balance among write-mostly, just rdev = conf->mirrors[disk].rdev;
* use the first as a last resort */ if (!rdev || test_bit(Faulty, &rdev->flags) ||
if (best_dist_disk < 0) { !test_bit(WriteMostly, &rdev->flags))
if (is_badblock(rdev, this_sector, sectors,
&first_bad, &bad_sectors)) {
if (first_bad <= this_sector)
/* Cannot use this */
continue; continue;
best_good_sectors = first_bad - this_sector;
} else /* there are no bad blocks, we can use this disk */
best_good_sectors = sectors; len = r1_bio->sectors;
best_dist_disk = disk; read_len = raid1_check_read_range(rdev, this_sector, &len);
best_pending_disk = disk; if (read_len == r1_bio->sectors) {
update_read_sectors(conf, disk, this_sector, read_len);
return disk;
}
/*
* there are partial bad blocks, choose the rdev with largest
* read length.
*/
if (read_len > bb_read_len) {
bb_disk = disk;
bb_read_len = read_len;
} }
continue;
} }
/* This is a reasonable device to use. It might
* even be best. if (bb_disk != -1) {
*max_sectors = bb_read_len;
update_read_sectors(conf, bb_disk, this_sector, bb_read_len);
}
return bb_disk;
}
static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio)
{
/* TODO: address issues with this check and concurrency. */
return conf->mirrors[disk].next_seq_sect == r1_bio->sector ||
conf->mirrors[disk].head_position == r1_bio->sector;
}
/*
* If buffered sequential IO size exceeds optimal iosize, check if there is idle
* disk. If yes, choose the idle disk.
*/ */
if (is_badblock(rdev, this_sector, sectors, static bool should_choose_next(struct r1conf *conf, int disk)
&first_bad, &bad_sectors)) { {
if (best_dist < MaxSector) struct raid1_info *mirror = &conf->mirrors[disk];
/* already have a better device */ int opt_iosize;
if (!test_bit(Nonrot, &mirror->rdev->flags))
return false;
opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9;
return opt_iosize > 0 && mirror->seq_start != MaxSector &&
mirror->next_seq_sect > opt_iosize &&
mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
}
static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
{
if (!rdev || test_bit(Faulty, &rdev->flags))
return false;
/* still in recovery */
if (!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
return false;
/* don't read from slow disk unless have to */
if (test_bit(WriteMostly, &rdev->flags))
return false;
/* don't split IO for bad blocks unless have to */
if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors))
return false;
return true;
}
/*
 * Running state kept while choose_best_rdev() scans the mirrors.
 *
 * The *_disk members hold a mirror index, or -1 while no candidate has been
 * recorded.
 */
struct read_balance_ctl {
	/* smallest distance seen between the read start and a head position */
	sector_t closest_dist;
	/* mirror that produced closest_dist */
	int closest_dist_disk;
	/*
	 * Smallest pending count seen so far. Unsigned because it starts at
	 * UINT_MAX and is compared against unsigned nr_pending values; a
	 * signed field would depend on an implementation-defined conversion
	 * and a mixed-sign comparison.
	 */
	unsigned int min_pending;
	/* mirror that produced min_pending */
	int min_pending_disk;
	/* mirror the read is sequential on but which should ideally be left */
	int sequential_disk;
	/* number of readable mirrors counted so far */
	int readable_disks;
};
static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
{
int disk;
struct read_balance_ctl ctl = {
.closest_dist_disk = -1,
.closest_dist = MaxSector,
.min_pending_disk = -1,
.min_pending = UINT_MAX,
.sequential_disk = -1,
};
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
struct md_rdev *rdev;
sector_t dist;
unsigned int pending;
if (r1_bio->bios[disk] == IO_BLOCKED)
continue; continue;
if (first_bad <= this_sector) {
/* cannot read here. If this is the 'primary'
* device, then we must not read beyond
* bad_sectors from another device..
*/
bad_sectors -= (this_sector - first_bad);
if (choose_first && sectors > bad_sectors)
sectors = bad_sectors;
if (best_good_sectors > sectors)
best_good_sectors = sectors;
} else { rdev = conf->mirrors[disk].rdev;
sector_t good_sectors = first_bad - this_sector; if (!rdev_readable(rdev, r1_bio))
if (good_sectors > best_good_sectors) {
best_good_sectors = good_sectors;
best_disk = disk;
}
if (choose_first)
break;
}
continue; continue;
} else {
if ((sectors > best_good_sectors) && (best_disk >= 0))
best_disk = -1;
best_good_sectors = sectors;
}
if (best_disk >= 0)
/* At least two disks to choose from so failfast is OK */ /* At least two disks to choose from so failfast is OK */
if (ctl.readable_disks++ == 1)
set_bit(R1BIO_FailFast, &r1_bio->state); set_bit(R1BIO_FailFast, &r1_bio->state);
nonrot = bdev_nonrot(rdev->bdev);
has_nonrot_disk |= nonrot;
pending = atomic_read(&rdev->nr_pending); pending = atomic_read(&rdev->nr_pending);
dist = abs(this_sector - conf->mirrors[disk].head_position); dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
if (choose_first) {
best_disk = disk;
break;
}
/* Don't change to another disk for sequential reads */ /* Don't change to another disk for sequential reads */
if (conf->mirrors[disk].next_seq_sect == this_sector if (is_sequential(conf, disk, r1_bio)) {
|| dist == 0) { if (!should_choose_next(conf, disk))
int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; return disk;
struct raid1_info *mirror = &conf->mirrors[disk];
best_disk = disk;
/* /*
* If buffered sequential IO size exceeds optimal * Add 'pending' to avoid choosing this disk if
* iosize, check if there is idle disk. If yes, choose * there is other idle disk.
* the idle disk. read_balance could already choose an */
* idle disk before noticing it's a sequential IO in pending++;
* this disk. This doesn't matter because this disk /*
* will idle, next time it will be utilized after the * If there is no other idle disk, this disk
* first disk has IO size exceeds optimal iosize. In * will be chosen.
* this way, iosize of the first disk will be optimal */
* iosize at least. iosize of the second disk might be ctl.sequential_disk = disk;
* small, but not a big deal since when the second disk
* starts IO, the first disk is likely still busy.
*/
if (nonrot && opt_iosize > 0 &&
mirror->seq_start != MaxSector &&
mirror->next_seq_sect > opt_iosize &&
mirror->next_seq_sect - opt_iosize >=
mirror->seq_start) {
choose_next_idle = 1;
continue;
}
break;
} }
if (choose_next_idle) if (ctl.min_pending > pending) {
continue; ctl.min_pending = pending;
ctl.min_pending_disk = disk;
if (min_pending > pending) {
min_pending = pending;
best_pending_disk = disk;
} }
if (dist < best_dist) { if (ctl.closest_dist > dist) {
best_dist = dist; ctl.closest_dist = dist;
best_dist_disk = disk; ctl.closest_dist_disk = disk;
} }
} }
/*
* sequential IO size exceeds optimal iosize, however, there is no other
* idle disk, so choose the sequential disk.
*/
if (ctl.sequential_disk != -1 && ctl.min_pending != 0)
return ctl.sequential_disk;
/* /*
* If all disks are rotational, choose the closest disk. If any disk is * If all disks are rotational, choose the closest disk. If any disk is
* non-rotational, choose the disk with less pending request even the * non-rotational, choose the disk with less pending request even the
* disk is rotational, which might/might not be optimal for raids with * disk is rotational, which might/might not be optimal for raids with
* mixed rotational/non-rotational disks depending on workload.
*/ */
if (best_disk == -1) { if (ctl.min_pending_disk != -1 &&
if (has_nonrot_disk || min_pending == 0) (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
best_disk = best_pending_disk; return ctl.min_pending_disk;
else else
best_disk = best_dist_disk; return ctl.closest_dist_disk;
} }
if (best_disk >= 0) { /*
rdev = conf->mirrors[best_disk].rdev; * This routine returns the disk from which the requested read should be done.
if (!rdev) *
goto retry; * 1) If resync is in progress, find the first usable disk and use it even if it
atomic_inc(&rdev->nr_pending); * has some bad blocks.
sectors = best_good_sectors; *
* 2) Now that there is no resync, loop through all disks and skipping slow
* disks and disks with bad blocks for now. Only pay attention to key disk
* choice.
*
* 3) If we've made it this far, now look for disks with bad blocks and choose
* the one with most number of sectors.
*
* 4) If we are all the way at the end, we have no choice but to use a disk even
* if it is write mostly.
*
* The rdev for the device selected will have nr_pending incremented.
*/
static int read_balance(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
int disk;
clear_bit(R1BIO_FailFast, &r1_bio->state);
if (conf->mirrors[best_disk].next_seq_sect != this_sector) if (raid1_should_read_first(conf->mddev, r1_bio->sector,
conf->mirrors[best_disk].seq_start = this_sector; r1_bio->sectors))
return choose_first_rdev(conf, r1_bio, max_sectors);
conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; disk = choose_best_rdev(conf, r1_bio);
if (disk >= 0) {
*max_sectors = r1_bio->sectors;
update_read_sectors(conf, disk, r1_bio->sector,
r1_bio->sectors);
return disk;
} }
*max_sectors = sectors;
return best_disk; /*
* If we are here it means we didn't find a perfectly good disk so
* now spend a bit more time trying to find one with the most good
* sectors.
*/
disk = choose_bb_rdev(conf, r1_bio, max_sectors);
if (disk >= 0)
return disk;
return choose_slow_rdev(conf, r1_bio, max_sectors);
} }
static void wake_up_barrier(struct r1conf *conf) static void wake_up_barrier(struct r1conf *conf)
...@@ -1760,6 +1858,52 @@ static int raid1_spare_active(struct mddev *mddev) ...@@ -1760,6 +1858,52 @@ static int raid1_spare_active(struct mddev *mddev)
return count; return count;
} }
/*
 * Install @rdev into the mirror slot for @disk, or into the matching
 * replacement slot (@disk + conf->raid_disks) when @replacement is true.
 *
 * Returns false, without changing anything, if the chosen slot already holds
 * an rdev. Otherwise a non-rotational device is flagged Nonrot and counted
 * in conf->nonrot_disks, rdev->raid_disk is set to @disk, the slot's head
 * position and sequential-read start are reset, and true is returned.
 */
static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
			   bool replacement)
{
	int slot = replacement ? disk + conf->raid_disks : disk;
	struct raid1_info *info = &conf->mirrors[slot];

	/* slot is already occupied */
	if (info->rdev)
		return false;

	if (bdev_nonrot(rdev->bdev)) {
		set_bit(Nonrot, &rdev->flags);
		WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
	}

	rdev->raid_disk = disk;
	info->head_position = 0;
	info->seq_start = MaxSector;

	/* the rdev pointer is stored last, after the slot has been reset */
	WRITE_ONCE(info->rdev, rdev);

	return true;
}
/*
 * Try to detach the rdev stored in mirror slot @disk of @conf.
 *
 * Returns false and leaves the slot alone if it is empty, if the rdev is
 * In_sync or still has pending requests, or if the rdev is not Faulty while
 * recovery is still possible. Otherwise the Nonrot accounting is undone, the
 * slot's rdev pointer is cleared and true is returned.
 */
static bool raid1_remove_conf(struct r1conf *conf, int disk)
{
	struct raid1_info *info = &conf->mirrors[disk];
	struct md_rdev *rdev = info->rdev;

	if (!rdev)
		return false;

	if (test_bit(In_sync, &rdev->flags))
		return false;

	if (atomic_read(&rdev->nr_pending))
		return false;

	/* Only remove non-faulty devices if recovery is not possible. */
	if (!test_bit(Faulty, &rdev->flags) &&
	    rdev->mddev->recovery_disabled != conf->recovery_disabled &&
	    rdev->mddev->degraded < conf->raid_disks)
		return false;

	/* drop the device from the non-rotational count if it was in it */
	if (test_and_clear_bit(Nonrot, &rdev->flags))
		WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);

	WRITE_ONCE(info->rdev, NULL);

	return true;
}
static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
...@@ -1795,15 +1939,13 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1795,15 +1939,13 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
disk_stack_limits(mddev->gendisk, rdev->bdev, disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9); rdev->data_offset << 9);
p->head_position = 0; raid1_add_conf(conf, rdev, mirror, false);
rdev->raid_disk = mirror;
err = 0; err = 0;
/* As all devices are equivalent, we don't need a full recovery /* As all devices are equivalent, we don't need a full recovery
* if this was recently any drive of the array * if this was recently any drive of the array
*/ */
if (rdev->saved_raid_disk < 0) if (rdev->saved_raid_disk < 0)
conf->fullsync = 1; conf->fullsync = 1;
WRITE_ONCE(p->rdev, rdev);
break; break;
} }
if (test_bit(WantReplacement, &p->rdev->flags) && if (test_bit(WantReplacement, &p->rdev->flags) &&
...@@ -1813,13 +1955,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1813,13 +1955,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err && repl_slot >= 0) { if (err && repl_slot >= 0) {
/* Add this device as a replacement */ /* Add this device as a replacement */
p = conf->mirrors + repl_slot;
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags); set_bit(Replacement, &rdev->flags);
rdev->raid_disk = repl_slot; raid1_add_conf(conf, rdev, repl_slot, true);
err = 0; err = 0;
conf->fullsync = 1; conf->fullsync = 1;
WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
} }
print_conf(conf); print_conf(conf);
...@@ -1836,27 +1976,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1836,27 +1976,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (unlikely(number >= conf->raid_disks)) if (unlikely(number >= conf->raid_disks))
goto abort; goto abort;
if (rdev != p->rdev) if (rdev != p->rdev) {
p = conf->mirrors + conf->raid_disks + number; number += conf->raid_disks;
p = conf->mirrors + number;
}
print_conf(conf); print_conf(conf);
if (rdev == p->rdev) { if (rdev == p->rdev) {
if (test_bit(In_sync, &rdev->flags) || if (!raid1_remove_conf(conf, number)) {
atomic_read(&rdev->nr_pending)) {
err = -EBUSY;
goto abort;
}
/* Only remove non-faulty devices if recovery
* is not possible.
*/
if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled &&
mddev->degraded < conf->raid_disks) {
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
} }
WRITE_ONCE(p->rdev, NULL);
if (conf->mirrors[conf->raid_disks + number].rdev) { if (number < conf->raid_disks &&
conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced. /* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before * Move down the replacement. We drain all IO before
* doing this to avoid confusion. * doing this to avoid confusion.
...@@ -1944,8 +2077,6 @@ static void end_sync_write(struct bio *bio) ...@@ -1944,8 +2077,6 @@ static void end_sync_write(struct bio *bio)
struct r1bio *r1_bio = get_resync_r1bio(bio); struct r1bio *r1_bio = get_resync_r1bio(bio);
struct mddev *mddev = r1_bio->mddev; struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
sector_t first_bad;
int bad_sectors;
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
if (!uptodate) { if (!uptodate) {
...@@ -1955,14 +2086,11 @@ static void end_sync_write(struct bio *bio) ...@@ -1955,14 +2086,11 @@ static void end_sync_write(struct bio *bio)
set_bit(MD_RECOVERY_NEEDED, & set_bit(MD_RECOVERY_NEEDED, &
mddev->recovery); mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state); set_bit(R1BIO_WriteError, &r1_bio->state);
} else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
&first_bad, &bad_sectors) && !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev,
!is_badblock(conf->mirrors[r1_bio->read_disk].rdev, r1_bio->sector, r1_bio->sectors)) {
r1_bio->sector,
r1_bio->sectors,
&first_bad, &bad_sectors)
)
set_bit(R1BIO_MadeGood, &r1_bio->state); set_bit(R1BIO_MadeGood, &r1_bio->state);
}
put_sync_write_buf(r1_bio, uptodate); put_sync_write_buf(r1_bio, uptodate);
} }
...@@ -2279,16 +2407,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) ...@@ -2279,16 +2407,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
s = PAGE_SIZE >> 9; s = PAGE_SIZE >> 9;
do { do {
sector_t first_bad;
int bad_sectors;
rdev = conf->mirrors[d].rdev; rdev = conf->mirrors[d].rdev;
if (rdev && if (rdev &&
(test_bit(In_sync, &rdev->flags) || (test_bit(In_sync, &rdev->flags) ||
(!test_bit(Faulty, &rdev->flags) && (!test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sect + s)) && rdev->recovery_offset >= sect + s)) &&
is_badblock(rdev, sect, s, rdev_has_badblock(rdev, sect, s) == 0) {
&first_bad, &bad_sectors) == 0) {
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
if (sync_page_io(rdev, sect, s<<9, if (sync_page_io(rdev, sect, s<<9,
conf->tmppage, REQ_OP_READ, false)) conf->tmppage, REQ_OP_READ, false))
...@@ -3006,23 +3130,17 @@ static struct r1conf *setup_conf(struct mddev *mddev) ...@@ -3006,23 +3130,17 @@ static struct r1conf *setup_conf(struct mddev *mddev)
err = -EINVAL; err = -EINVAL;
spin_lock_init(&conf->device_lock); spin_lock_init(&conf->device_lock);
conf->raid_disks = mddev->raid_disks;
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
int disk_idx = rdev->raid_disk; int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0) if (disk_idx >= conf->raid_disks || disk_idx < 0)
continue; continue;
if (test_bit(Replacement, &rdev->flags))
disk = conf->mirrors + mddev->raid_disks + disk_idx;
else
disk = conf->mirrors + disk_idx;
if (disk->rdev) if (!raid1_add_conf(conf, rdev, disk_idx,
test_bit(Replacement, &rdev->flags)))
goto abort; goto abort;
disk->rdev = rdev;
disk->head_position = 0;
disk->seq_start = MaxSector;
} }
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev; conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list); INIT_LIST_HEAD(&conf->bio_end_io_list);
......
...@@ -71,6 +71,7 @@ struct r1conf { ...@@ -71,6 +71,7 @@ struct r1conf {
* allow for replacements. * allow for replacements.
*/ */
int raid_disks; int raid_disks;
int nonrot_disks;
spinlock_t device_lock; spinlock_t device_lock;
......
...@@ -518,11 +518,7 @@ static void raid10_end_write_request(struct bio *bio) ...@@ -518,11 +518,7 @@ static void raid10_end_write_request(struct bio *bio)
* The 'master' represents the composite IO operation to * The 'master' represents the composite IO operation to
* user-side. So if something waits for IO, then it will * user-side. So if something waits for IO, then it will
* wait for the 'master' bio. * wait for the 'master' bio.
*/ *
sector_t first_bad;
int bad_sectors;
/*
* Do not set R10BIO_Uptodate if the current device is * Do not set R10BIO_Uptodate if the current device is
* rebuilding or Faulty. This is because we cannot use * rebuilding or Faulty. This is because we cannot use
* such device for properly reading the data back (we could * such device for properly reading the data back (we could
...@@ -535,10 +531,9 @@ static void raid10_end_write_request(struct bio *bio) ...@@ -535,10 +531,9 @@ static void raid10_end_write_request(struct bio *bio)
set_bit(R10BIO_Uptodate, &r10_bio->state); set_bit(R10BIO_Uptodate, &r10_bio->state);
/* Maybe we can clear some bad blocks. */ /* Maybe we can clear some bad blocks. */
if (is_badblock(rdev, if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
r10_bio->devs[slot].addr, r10_bio->sectors) &&
r10_bio->sectors, !discard_error) {
&first_bad, &bad_sectors) && !discard_error) {
bio_put(bio); bio_put(bio);
if (repl) if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
...@@ -753,17 +748,8 @@ static struct md_rdev *read_balance(struct r10conf *conf, ...@@ -753,17 +748,8 @@ static struct md_rdev *read_balance(struct r10conf *conf,
best_good_sectors = 0; best_good_sectors = 0;
do_balance = 1; do_balance = 1;
clear_bit(R10BIO_FailFast, &r10_bio->state); clear_bit(R10BIO_FailFast, &r10_bio->state);
/*
* Check if we can balance. We can balance on the whole if (raid1_should_read_first(conf->mddev, this_sector, sectors))
* device if no resync is going on (recovery is ok), or below
* the resync window. We take the first readable disk when
* above the resync window.
*/
if ((conf->mddev->recovery_cp < MaxSector
&& (this_sector + sectors >= conf->next_resync)) ||
(mddev_is_clustered(conf->mddev) &&
md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
this_sector + sectors)))
do_balance = 0; do_balance = 0;
for (slot = 0; slot < conf->copies ; slot++) { for (slot = 0; slot < conf->copies ; slot++) {
...@@ -1330,10 +1316,7 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) ...@@ -1330,10 +1316,7 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
} }
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr; sector_t dev_sector = r10_bio->devs[i].addr;
int bad_sectors;
int is_bad;
/* /*
* Discard request doesn't care the write result * Discard request doesn't care the write result
...@@ -1342,9 +1325,8 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) ...@@ -1342,9 +1325,8 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
if (!r10_bio->sectors) if (!r10_bio->sectors)
continue; continue;
is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, if (rdev_has_badblock(rdev, dev_sector,
&first_bad, &bad_sectors); r10_bio->sectors) < 0) {
if (is_bad < 0) {
/* /*
* Mustn't write here until the bad block * Mustn't write here until the bad block
* is acknowledged * is acknowledged
...@@ -2290,8 +2272,6 @@ static void end_sync_write(struct bio *bio) ...@@ -2290,8 +2272,6 @@ static void end_sync_write(struct bio *bio)
struct mddev *mddev = r10_bio->mddev; struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
int d; int d;
sector_t first_bad;
int bad_sectors;
int slot; int slot;
int repl; int repl;
struct md_rdev *rdev = NULL; struct md_rdev *rdev = NULL;
...@@ -2312,11 +2292,10 @@ static void end_sync_write(struct bio *bio) ...@@ -2312,11 +2292,10 @@ static void end_sync_write(struct bio *bio)
&rdev->mddev->recovery); &rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state); set_bit(R10BIO_WriteError, &r10_bio->state);
} }
} else if (is_badblock(rdev, } else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
r10_bio->devs[slot].addr, r10_bio->sectors)) {
r10_bio->sectors,
&first_bad, &bad_sectors))
set_bit(R10BIO_MadeGood, &r10_bio->state); set_bit(R10BIO_MadeGood, &r10_bio->state);
}
rdev_dec_pending(rdev, mddev); rdev_dec_pending(rdev, mddev);
...@@ -2597,11 +2576,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) ...@@ -2597,11 +2576,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
int sectors, struct page *page, enum req_op op) int sectors, struct page *page, enum req_op op)
{ {
sector_t first_bad; if (rdev_has_badblock(rdev, sector, sectors) &&
int bad_sectors; (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
&& (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
return -1; return -1;
if (sync_page_io(rdev, sector, sectors << 9, page, op, false)) if (sync_page_io(rdev, sector, sectors << 9, page, op, false))
/* success */ /* success */
...@@ -2658,16 +2634,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 ...@@ -2658,16 +2634,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
s = PAGE_SIZE >> 9; s = PAGE_SIZE >> 9;
do { do {
sector_t first_bad;
int bad_sectors;
d = r10_bio->devs[sl].devnum; d = r10_bio->devs[sl].devnum;
rdev = conf->mirrors[d].rdev; rdev = conf->mirrors[d].rdev;
if (rdev && if (rdev &&
test_bit(In_sync, &rdev->flags) && test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags) &&
is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, rdev_has_badblock(rdev,
&first_bad, &bad_sectors) == 0) { r10_bio->devs[sl].addr + sect,
s) == 0) {
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
success = sync_page_io(rdev, success = sync_page_io(rdev,
r10_bio->devs[sl].addr + r10_bio->devs[sl].addr +
......
...@@ -1210,10 +1210,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -1210,10 +1210,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
*/ */
while (op_is_write(op) && rdev && while (op_is_write(op) && rdev &&
test_bit(WriteErrorSeen, &rdev->flags)) { test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad; int bad = rdev_has_badblock(rdev, sh->sector,
int bad_sectors; RAID5_STRIPE_SECTORS(conf));
int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
if (!bad) if (!bad)
break; break;
...@@ -2855,8 +2853,6 @@ static void raid5_end_write_request(struct bio *bi) ...@@ -2855,8 +2853,6 @@ static void raid5_end_write_request(struct bio *bi)
struct r5conf *conf = sh->raid_conf; struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i; int disks = sh->disks, i;
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t first_bad;
int bad_sectors;
int replacement = 0; int replacement = 0;
for (i = 0 ; i < disks; i++) { for (i = 0 ; i < disks; i++) {
...@@ -2888,9 +2884,8 @@ static void raid5_end_write_request(struct bio *bi) ...@@ -2888,9 +2884,8 @@ static void raid5_end_write_request(struct bio *bi)
if (replacement) { if (replacement) {
if (bi->bi_status) if (bi->bi_status)
md_error(conf->mddev, rdev); md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector, else if (rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), RAID5_STRIPE_SECTORS(conf)))
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else { } else {
if (bi->bi_status) { if (bi->bi_status) {
...@@ -2900,9 +2895,8 @@ static void raid5_end_write_request(struct bio *bi) ...@@ -2900,9 +2895,8 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_set_bit(WantReplacement, &rdev->flags)) if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED, set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery); &rdev->mddev->recovery);
} else if (is_badblock(rdev, sh->sector, } else if (rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), RAID5_STRIPE_SECTORS(conf))) {
&first_bad, &bad_sectors)) {
set_bit(R5_MadeGood, &sh->dev[i].flags); set_bit(R5_MadeGood, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) if (test_bit(R5_ReadError, &sh->dev[i].flags))
/* That was a successful write so make /* That was a successful write so make
...@@ -4674,8 +4668,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4674,8 +4668,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
/* Now to look around and see what can be done */ /* Now to look around and see what can be done */
for (i=disks; i--; ) { for (i=disks; i--; ) {
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t first_bad;
int bad_sectors;
int is_bad = 0; int is_bad = 0;
dev = &sh->dev[i]; dev = &sh->dev[i];
...@@ -4719,8 +4711,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4719,8 +4711,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
rdev = conf->disks[i].replacement; rdev = conf->disks[i].replacement;
if (rdev && !test_bit(Faulty, &rdev->flags) && if (rdev && !test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
!is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), !rdev_has_badblock(rdev, sh->sector,
&first_bad, &bad_sectors)) RAID5_STRIPE_SECTORS(conf)))
set_bit(R5_ReadRepl, &dev->flags); set_bit(R5_ReadRepl, &dev->flags);
else { else {
if (rdev && !test_bit(Faulty, &rdev->flags)) if (rdev && !test_bit(Faulty, &rdev->flags))
...@@ -4733,8 +4725,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4733,8 +4725,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && test_bit(Faulty, &rdev->flags)) if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL; rdev = NULL;
if (rdev) { if (rdev) {
is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), is_bad = rdev_has_badblock(rdev, sh->sector,
&first_bad, &bad_sectors); RAID5_STRIPE_SECTORS(conf));
if (s->blocked_rdev == NULL if (s->blocked_rdev == NULL
&& (test_bit(Blocked, &rdev->flags) && (test_bit(Blocked, &rdev->flags)
|| is_bad < 0)) { || is_bad < 0)) {
...@@ -5463,8 +5455,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) ...@@ -5463,8 +5455,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct r5conf *conf = mddev->private; struct r5conf *conf = mddev->private;
struct bio *align_bio; struct bio *align_bio;
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t sector, end_sector, first_bad; sector_t sector, end_sector;
int bad_sectors, dd_idx; int dd_idx;
bool did_inc; bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) { if (!in_chunk_boundary(mddev, raid_bio)) {
...@@ -5493,8 +5485,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) ...@@ -5493,8 +5485,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) {
&bad_sectors)) {
rdev_dec_pending(rdev, mddev); rdev_dec_pending(rdev, mddev);
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment