Commit 310e9c85 authored by Jens Axboe

Merge branch 'md-next' of...

Merge branch 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.4/block

Pull MD updates from Song:

"- md/bitmap: Optimal last page size, by Jon Derrick
 - Various raid10 fixes, by Yu Kuai and Li Nan
 - md: add error_handlers for raid0 and linear, by Mariusz Tkaczyk"

* 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md:
  md/raid5: remove unused working_disks variable
  md/raid10: don't call bio_start_io_acct twice for bio which experienced read error
  md/raid10: fix memleak of md thread
  md/raid10: fix memleak for 'conf->bio_split'
  md/raid10: fix leak of 'r10bio->remaining' for recovery
  md/raid10: don't BUG_ON() in raise_barrier()
  md: fix soft lockup in status_resync
  md: add error_handlers for raid0 and linear
  md: Use optimal I/O size for last bitmap page
  md: Fix types in sb writer
  md: Move sb writer loop to its own function
  md/raid10: Fix typo in comment (replacment -> replacement)
  md: make kobj_type structures constant
  md/raid10: fix null-ptr-deref in raid10_sync_request
  md/raid10: fix task hung in raid10d
parents d2a1d45c 7bc43612
......@@ -209,76 +209,99 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
return NULL;
}
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
/*
 * Pick the preferred write size for a partially used bitmap page.
 *
 * @bdev:           device the page will be written to
 * @last_page_size: number of bytes actually used in the page
 * @io_size:        fallback size supplied by the caller
 *
 * When the device reports an optimal I/O size (bdev_io_opt) larger than its
 * logical block size, the result is @last_page_size rounded up to a multiple
 * of that optimal size.  Otherwise @io_size is returned unchanged.
 */
static unsigned int optimal_io_size(struct block_device *bdev,
				    unsigned int last_page_size,
				    unsigned int io_size)
{
	/* No larger optimal size reported: keep the caller's size. */
	if (bdev_io_opt(bdev) <= bdev_logical_block_size(bdev))
		return io_size;

	return roundup(last_page_size, bdev_io_opt(bdev));
}
static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
sector_t start, sector_t boundary)
{
if (io_size != opt_size &&
start + opt_size / SECTOR_SIZE <= boundary)
return opt_size;
if (start + io_size / SECTOR_SIZE <= boundary)
return io_size;
/* Overflows boundary */
return 0;
}
static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
struct page *page)
{
struct md_rdev *rdev;
struct block_device *bdev;
struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage;
restart:
rdev = NULL;
while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
int size = PAGE_SIZE;
loff_t offset = mddev->bitmap_info.offset;
sector_t offset = mddev->bitmap_info.offset;
sector_t ps, sboff, doff;
unsigned int size = PAGE_SIZE;
unsigned int opt_size = PAGE_SIZE;
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
if (page->index == store->file_pages - 1) {
unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
if (page->index == store->file_pages-1) {
int last_page_size = store->bytes & (PAGE_SIZE-1);
if (last_page_size == 0)
last_page_size = PAGE_SIZE;
size = roundup(last_page_size,
bdev_logical_block_size(bdev));
size = roundup(last_page_size, bdev_logical_block_size(bdev));
opt_size = optimal_io_size(bdev, last_page_size, size);
}
/* Just make sure we aren't corrupting data or
* metadata
*/
ps = page->index * PAGE_SIZE / SECTOR_SIZE;
sboff = rdev->sb_start + offset;
doff = rdev->data_offset;
/* Just make sure we aren't corrupting data or metadata */
if (mddev->external) {
/* Bitmap could be anywhere. */
if (rdev->sb_start + offset + (page->index
* (PAGE_SIZE/512))
> rdev->data_offset
&&
rdev->sb_start + offset
< (rdev->data_offset + mddev->dev_sectors
+ (PAGE_SIZE/512)))
goto bad_alignment;
if (sboff + ps > doff &&
sboff < (doff + mddev->dev_sectors + PAGE_SIZE / SECTOR_SIZE))
return -EINVAL;
} else if (offset < 0) {
/* DATA BITMAP METADATA */
if (offset
+ (long)(page->index * (PAGE_SIZE/512))
+ size/512 > 0)
size = bitmap_io_size(size, opt_size, offset + ps, 0);
if (size == 0)
/* bitmap runs in to metadata */
goto bad_alignment;
if (rdev->data_offset + mddev->dev_sectors
> rdev->sb_start + offset)
return -EINVAL;
if (doff + mddev->dev_sectors > sboff)
/* data runs in to bitmap */
goto bad_alignment;
return -EINVAL;
} else if (rdev->sb_start < rdev->data_offset) {
/* METADATA BITMAP DATA */
if (rdev->sb_start
+ offset
+ page->index*(PAGE_SIZE/512) + size/512
> rdev->data_offset)
size = bitmap_io_size(size, opt_size, sboff + ps, doff);
if (size == 0)
/* bitmap runs in to data */
goto bad_alignment;
return -EINVAL;
} else {
/* DATA METADATA BITMAP - no problems */
}
md_super_write(mddev, rdev,
rdev->sb_start + offset
+ page->index * (PAGE_SIZE/512),
size,
page);
}
if (wait && md_super_wait(mddev) < 0)
goto restart;
md_super_write(mddev, rdev, sboff + ps, (int) size, page);
return 0;
}
bad_alignment:
return -EINVAL;
/*
 * Write a bitmap page to every device yielded by next_active_rdev().
 *
 * @bitmap: bitmap the page belongs to
 * @page:   page to write
 * @wait:   when non-zero, call md_super_wait() after each pass and repeat
 *          the whole pass for as long as it returns a negative value
 *
 * Returns 0 on success, or the first non-zero value returned by
 * __write_sb_page(), in which case the remaining devices are not written.
 */
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
{
	struct mddev *mddev = bitmap->mddev;

	for (;;) {
		struct md_rdev *rdev;

		for (rdev = next_active_rdev(NULL, mddev); rdev != NULL;
		     rdev = next_active_rdev(rdev, mddev)) {
			int err = __write_sb_page(rdev, bitmap, page);

			if (err)
				return err;
		}

		/* Done unless the caller asked to wait and the wait failed. */
		if (!wait || md_super_wait(mddev) >= 0)
			return 0;
	}
}
static void md_bitmap_file_kick(struct bitmap *bitmap);
......
......@@ -223,7 +223,8 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
bio_sector < start_sector))
goto out_of_bounds;
if (unlikely(is_mddev_broken(tmp_dev->rdev, "linear"))) {
if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
md_error(mddev, tmp_dev->rdev);
bio_io_error(bio);
return true;
}
......@@ -270,6 +271,16 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
}
/*
 * Error handler for the linear personality.
 *
 * Sets MD_BROKEN in mddev->flags and, only on the transition from clear to
 * set, logs a critical message naming the array and the failed device.
 */
static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
{
	/* Flag was already set: the failure has been reported before. */
	if (test_and_set_bit(MD_BROKEN, &mddev->flags))
		return;

	pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
		mdname(mddev), rdev->bdev);
}
/*
 * Quiesce hook for the linear personality.  The body is intentionally empty:
 * nothing is done for any value of @state.
 */
static void linear_quiesce(struct mddev *mddev, int state)
{
}
......@@ -286,6 +297,7 @@ static struct md_personality linear_personality =
.hot_add_disk = linear_add,
.size = linear_size,
.quiesce = linear_quiesce,
.error_handler = linear_error,
};
static int __init linear_init (void)
......
......@@ -78,7 +78,7 @@
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);
static struct kobj_type md_ktype;
static const struct kobj_type md_ktype;
struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
......@@ -3597,7 +3597,7 @@ static const struct sysfs_ops rdev_sysfs_ops = {
.show = rdev_attr_show,
.store = rdev_attr_store,
};
static struct kobj_type rdev_ktype = {
static const struct kobj_type rdev_ktype = {
.release = rdev_free,
.sysfs_ops = &rdev_sysfs_ops,
.default_groups = rdev_default_groups,
......@@ -5555,7 +5555,7 @@ static const struct sysfs_ops md_sysfs_ops = {
.show = md_attr_show,
.store = md_attr_store,
};
static struct kobj_type md_ktype = {
static const struct kobj_type md_ktype = {
.release = md_kobj_release,
.sysfs_ops = &md_sysfs_ops,
.default_groups = md_attr_groups,
......@@ -7974,6 +7974,9 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
return;
mddev->pers->error_handler(mddev, rdev);
if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
return;
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
sysfs_notify_dirent_safe(rdev->sysfs_state);
......@@ -8029,16 +8032,16 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
} else if (resync > max_sectors) {
resync = max_sectors;
} else {
resync -= atomic_read(&mddev->recovery_active);
if (resync < MD_RESYNC_ACTIVE) {
res = atomic_read(&mddev->recovery_active);
/*
* Resync has started, but the subtraction has
* yielded one of the special values. Force it
* to active to ensure the status reports an
* active resync.
* Resync has started, but the subtraction has overflowed or
* yielded one of the special values. Force it to active to
* ensure the status reports an active resync.
*/
if (resync < res || resync - res < MD_RESYNC_ACTIVE)
resync = MD_RESYNC_ACTIVE;
}
else
resync -= res;
}
if (resync == MD_RESYNC_NONE) {
......
......@@ -790,15 +790,9 @@ extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
static inline bool is_mddev_broken(struct md_rdev *rdev, const char *md_type)
static inline bool is_rdev_broken(struct md_rdev *rdev)
{
if (!disk_live(rdev->bdev->bd_disk)) {
if (!test_and_set_bit(MD_BROKEN, &rdev->mddev->flags))
pr_warn("md: %s: %s array has a missing/failed member\n",
mdname(rdev->mddev), md_type);
return true;
}
return false;
return !disk_live(rdev->bdev->bd_disk);
}
static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
......
......@@ -569,8 +569,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
return true;
}
if (unlikely(is_mddev_broken(tmp_dev, "raid0"))) {
if (unlikely(is_rdev_broken(tmp_dev))) {
bio_io_error(bio);
md_error(mddev, tmp_dev);
return true;
}
......@@ -592,6 +593,16 @@ static void raid0_status(struct seq_file *seq, struct mddev *mddev)
return;
}
/*
 * Error handler for the raid0 personality.
 *
 * Sets MD_BROKEN in mddev->flags and, only on the transition from clear to
 * set, logs a critical message naming the array and the failed device.
 */
static void raid0_error(struct mddev *mddev, struct md_rdev *rdev)
{
	/* Flag was already set: the failure has been reported before. */
	if (test_and_set_bit(MD_BROKEN, &mddev->flags))
		return;

	pr_crit("md/raid0%s: Disk failure on %pg detected, failing array.\n",
		mdname(mddev), rdev->bdev);
}
static void *raid0_takeover_raid45(struct mddev *mddev)
{
struct md_rdev *rdev;
......@@ -767,6 +778,7 @@ static struct md_personality raid0_personality=
.size = raid0_size,
.takeover = raid0_takeover,
.quiesce = raid0_quiesce,
.error_handler = raid0_error,
};
static int __init raid0_init (void)
......
......@@ -952,7 +952,9 @@ static void flush_pending_writes(struct r10conf *conf)
static void raise_barrier(struct r10conf *conf, int force)
{
write_seqlock_irq(&conf->resync_lock);
BUG_ON(force && !conf->barrier);
if (WARN_ON_ONCE(force && !conf->barrier))
force = false;
/* Wait until no block IO is waiting (unless 'force') */
wait_event_barrier(conf, force || !conf->nr_waiting);
......@@ -995,11 +997,15 @@ static bool stop_waiting_barrier(struct r10conf *conf)
(!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
return true;
/* move on if recovery thread is blocked by us */
if (conf->mddev->thread->tsk == current &&
test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
conf->nr_queued > 0)
/*
* move on if io is issued from raid10d(), nr_pending is not released
* from original io(see handle_read_error()). All raise barrier is
* blocked until this io is done.
*/
if (conf->mddev->thread->tsk == current) {
WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0);
return true;
}
return false;
}
......@@ -1244,7 +1250,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
slot = r10_bio->read_slot;
if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
if (!r10_bio->start_time &&
blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
r10_bio->start_time = bio_start_io_acct(bio);
read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
......@@ -1574,6 +1581,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
r10_bio->read_slot = -1;
r10_bio->start_time = 0;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
conf->geo.raid_disks);
......@@ -1626,7 +1634,7 @@ static void raid10_end_discard_request(struct bio *bio)
/*
* raid10_remove_disk uses smp_mb to make sure rdev is set to
* replacement before setting replacement to NULL. It can read
* rdev first without barrier protect even replacment is NULL
* rdev first without barrier protect even replacement is NULL
*/
smp_rmb();
rdev = conf->mirrors[dev].rdev;
......@@ -2609,10 +2617,21 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
int d;
struct bio *wbio, *wbio2;
struct bio *wbio = r10_bio->devs[1].bio;
struct bio *wbio2 = r10_bio->devs[1].repl_bio;
/* Need to test wbio2->bi_end_io before we call
* submit_bio_noacct as if the former is NULL,
* the latter is free to free wbio2.
*/
if (wbio2 && !wbio2->bi_end_io)
wbio2 = NULL;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
fix_recovery_read_error(r10_bio);
if (wbio->bi_end_io)
end_sync_request(r10_bio);
if (wbio2)
end_sync_request(r10_bio);
return;
}
......@@ -2622,14 +2641,6 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* and submit the write request
*/
d = r10_bio->devs[1].devnum;
wbio = r10_bio->devs[1].bio;
wbio2 = r10_bio->devs[1].repl_bio;
/* Need to test wbio2->bi_end_io before we call
* submit_bio_noacct as if the former is NULL,
* the latter is free to free wbio2.
*/
if (wbio2 && !wbio2->bi_end_io)
wbio2 = NULL;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
......@@ -2978,9 +2989,13 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
md_error(mddev, rdev);
rdev_dec_pending(rdev, mddev);
allow_barrier(conf);
r10_bio->state = 0;
raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
/*
* allow_barrier after re-submit to ensure no sync io
* can be issued while regular io pending.
*/
allow_barrier(conf);
}
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
......@@ -3289,10 +3304,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t chunk_mask = conf->geo.chunk_mask;
int page_idx = 0;
if (!mempool_initialized(&conf->r10buf_pool))
if (init_resync(conf))
return 0;
/*
* Allow skipping a full rebuild for incremental assembly
* of a clean array, like RAID1 does.
......@@ -3308,6 +3319,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
return mddev->dev_sectors - sector_nr;
}
if (!mempool_initialized(&conf->r10buf_pool))
if (init_resync(conf))
return 0;
skipped:
max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
......@@ -4004,6 +4019,20 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
return nc*fc;
}
/*
 * Release an r10conf and everything it owns.
 *
 * Tears down the r10bio mempool and the bio_split bioset, drops the
 * temporary page, frees the mirrors arrays (current, old and new) and
 * finally the conf structure itself.  A NULL @conf is accepted and ignored.
 */
static void raid10_free_conf(struct r10conf *conf)
{
	if (conf) {
		/* Pools first. */
		mempool_exit(&conf->r10bio_pool);
		bioset_exit(&conf->bio_split);

		/* Then the plain allocations hanging off the conf. */
		safe_put_page(conf->tmppage);
		kfree(conf->mirrors_new);
		kfree(conf->mirrors_old);
		kfree(conf->mirrors);

		kfree(conf);
	}
}
static struct r10conf *setup_conf(struct mddev *mddev)
{
struct r10conf *conf = NULL;
......@@ -4086,13 +4115,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
return conf;
out:
if (conf) {
mempool_exit(&conf->r10bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
bioset_exit(&conf->bio_split);
kfree(conf);
}
raid10_free_conf(conf);
return ERR_PTR(err);
}
......@@ -4129,6 +4152,9 @@ static int raid10_run(struct mddev *mddev)
if (!conf)
goto out;
mddev->thread = conf->thread;
conf->thread = NULL;
if (mddev_is_clustered(conf->mddev)) {
int fc, fo;
......@@ -4141,9 +4167,6 @@ static int raid10_run(struct mddev *mddev)
}
}
mddev->thread = conf->thread;
conf->thread = NULL;
if (mddev->queue) {
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
......@@ -4283,10 +4306,7 @@ static int raid10_run(struct mddev *mddev)
out_free_conf:
md_unregister_thread(&mddev->thread);
mempool_exit(&conf->r10bio_pool);
safe_put_page(conf->tmppage);
kfree(conf->mirrors);
kfree(conf);
raid10_free_conf(conf);
mddev->private = NULL;
out:
return -EIO;
......@@ -4294,15 +4314,7 @@ static int raid10_run(struct mddev *mddev)
static void raid10_free(struct mddev *mddev, void *priv)
{
struct r10conf *conf = priv;
mempool_exit(&conf->r10bio_pool);
safe_put_page(conf->tmppage);
kfree(conf->mirrors);
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
bioset_exit(&conf->bio_split);
kfree(conf);
raid10_free_conf(priv);
}
static void raid10_quiesce(struct mddev *mddev, int quiesce)
......
......@@ -7716,7 +7716,6 @@ static void raid5_set_io_opt(struct r5conf *conf)
static int raid5_run(struct mddev *mddev)
{
struct r5conf *conf;
int working_disks = 0;
int dirty_parity_disks = 0;
struct md_rdev *rdev;
struct md_rdev *journal_dev = NULL;
......@@ -7912,10 +7911,8 @@ static int raid5_run(struct mddev *mddev)
pr_warn("md: cannot handle concurrent replacement and reshape.\n");
goto abort;
}
if (test_bit(In_sync, &rdev->flags)) {
working_disks++;
if (test_bit(In_sync, &rdev->flags))
continue;
}
/* This disc is not fully in-sync. However if it
* just stored parity (beyond the recovery_offset),
* when we don't need to be concerned about the
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment