Commit 03f7b57a authored by Jens Axboe

Merge tag 'md-next-20230927' of...

Merge tag 'md-next-20230927' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.7/block

Pull MD updates from Song:

"1. Make rdev add/remove independent from daemon thread, by Yu Kuai;
 2. Refactor code around quiesce() and mddev_suspend(), by Yu Kuai."

* tag 'md-next-20230927' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md:
  md: replace deprecated strncpy with memcpy
  md/md-linear: Annotate struct linear_conf with __counted_by
  md: don't check 'mddev->pers' and 'pers->quiesce' from suspend_lo_store()
  md: don't check 'mddev->pers' from suspend_hi_store()
  md-bitmap: suspend array earlier in location_store()
  md-bitmap: remove the checking of 'pers->quiesce' from location_store()
  md: don't rely on 'mddev->pers' to be set in mddev_suspend()
  md: initialize 'writes_pending' while allocating mddev
  md: initialize 'active_io' while allocating mddev
  md: delay remove_and_add_spares() for read only array to md_start_sync()
  md: factor out a helper rdev_addable() from remove_and_add_spares()
  md: factor out a helper rdev_is_spare() from remove_and_add_spares()
  md: factor out a helper rdev_removeable() from remove_and_add_spares()
  md: delay choosing sync action to md_start_sync()
  md: factor out a helper to choose sync action from md_check_recovery()
  md: use separate work_struct for md_start_sync()
parents aa511ff8 ceb04163
...@@ -749,7 +749,11 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r ...@@ -749,7 +749,11 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
mddev_init(&rs->md); if (mddev_init(&rs->md)) {
kfree(rs);
ti->error = "Cannot initialize raid context";
return ERR_PTR(-ENOMEM);
}
rs->raid_disks = raid_devs; rs->raid_disks = raid_devs;
rs->delta_disks = 0; rs->delta_disks = 0;
...@@ -798,6 +802,7 @@ static void raid_set_free(struct raid_set *rs) ...@@ -798,6 +802,7 @@ static void raid_set_free(struct raid_set *rs)
dm_put_device(rs->ti, rs->dev[i].data_dev); dm_put_device(rs->ti, rs->dev[i].data_dev);
} }
mddev_destroy(&rs->md);
kfree(rs); kfree(rs);
} }
......
...@@ -2351,11 +2351,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -2351,11 +2351,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
rv = mddev_lock(mddev); rv = mddev_lock(mddev);
if (rv) if (rv)
return rv; return rv;
mddev_suspend(mddev);
if (mddev->pers) { if (mddev->pers) {
if (!mddev->pers->quiesce) {
rv = -EBUSY;
goto out;
}
if (mddev->recovery || mddev->sync_thread) { if (mddev->recovery || mddev->sync_thread) {
rv = -EBUSY; rv = -EBUSY;
goto out; goto out;
...@@ -2369,11 +2367,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -2369,11 +2367,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
rv = -EBUSY; rv = -EBUSY;
goto out; goto out;
} }
if (mddev->pers) {
mddev_suspend(mddev); md_bitmap_destroy(mddev);
md_bitmap_destroy(mddev);
mddev_resume(mddev);
}
mddev->bitmap_info.offset = 0; mddev->bitmap_info.offset = 0;
if (mddev->bitmap_info.file) { if (mddev->bitmap_info.file) {
struct file *f = mddev->bitmap_info.file; struct file *f = mddev->bitmap_info.file;
...@@ -2383,6 +2378,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -2383,6 +2378,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
} else { } else {
/* No bitmap, OK to set a location */ /* No bitmap, OK to set a location */
long long offset; long long offset;
struct bitmap *bitmap;
if (strncmp(buf, "none", 4) == 0) if (strncmp(buf, "none", 4) == 0)
/* nothing to be done */; /* nothing to be done */;
else if (strncmp(buf, "file:", 5) == 0) { else if (strncmp(buf, "file:", 5) == 0) {
...@@ -2406,25 +2403,20 @@ location_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -2406,25 +2403,20 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
rv = -EINVAL; rv = -EINVAL;
goto out; goto out;
} }
mddev->bitmap_info.offset = offset; mddev->bitmap_info.offset = offset;
if (mddev->pers) { bitmap = md_bitmap_create(mddev, -1);
struct bitmap *bitmap; if (IS_ERR(bitmap)) {
bitmap = md_bitmap_create(mddev, -1); rv = PTR_ERR(bitmap);
mddev_suspend(mddev); goto out;
if (IS_ERR(bitmap)) }
rv = PTR_ERR(bitmap);
else { mddev->bitmap = bitmap;
mddev->bitmap = bitmap; rv = md_bitmap_load(mddev);
rv = md_bitmap_load(mddev); if (rv) {
if (rv) mddev->bitmap_info.offset = 0;
mddev->bitmap_info.offset = 0; md_bitmap_destroy(mddev);
} goto out;
if (rv) {
md_bitmap_destroy(mddev);
mddev_resume(mddev);
goto out;
}
mddev_resume(mddev);
} }
} }
} }
...@@ -2437,6 +2429,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -2437,6 +2429,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
} }
rv = 0; rv = 0;
out: out:
mddev_resume(mddev);
mddev_unlock(mddev); mddev_unlock(mddev);
if (rv) if (rv)
return rv; return rv;
......
...@@ -69,6 +69,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) ...@@ -69,6 +69,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
if (!conf) if (!conf)
return NULL; return NULL;
/*
 * conf->raid_disks is a copy of mddev->raid_disks. The reason to
 * keep a copy of mddev->raid_disks in struct linear_conf is that
 * mddev->raid_disks may not be consistent with the number of pointers
 * in conf->disks[] when it is updated in linear_add() and used to
 * iterate the old conf->disks[] array in linear_congested().
 * Here conf->raid_disks is always consistent with the number of
 * pointers in the conf->disks[] array, and mddev->private is updated
 * with rcu_assign_pointer() in linear_add(), so such a race can be
 * avoided.
 */
conf->raid_disks = raid_disks;
cnt = 0; cnt = 0;
conf->array_sectors = 0; conf->array_sectors = 0;
...@@ -112,19 +125,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) ...@@ -112,19 +125,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
conf->disks[i-1].end_sector + conf->disks[i-1].end_sector +
conf->disks[i].rdev->sectors; conf->disks[i].rdev->sectors;
/*
* conf->raid_disks is copy of mddev->raid_disks. The reason to
* keep a copy of mddev->raid_disks in struct linear_conf is,
* mddev->raid_disks may not be consistent with pointers number of
* conf->disks[] when it is updated in linear_add() and used to
* iterate old conf->disks[] earray in linear_congested().
* Here conf->raid_disks is always consitent with number of
* pointers in conf->disks[] array, and mddev->private is updated
* with rcu_assign_pointer() in linear_addr(), such race can be
* avoided.
*/
conf->raid_disks = raid_disks;
return conf; return conf;
out: out:
......
...@@ -12,6 +12,6 @@ struct linear_conf ...@@ -12,6 +12,6 @@ struct linear_conf
struct rcu_head rcu; struct rcu_head rcu;
sector_t array_sectors; sector_t array_sectors;
int raid_disks; /* a copy of mddev->raid_disks */ int raid_disks; /* a copy of mddev->raid_disks */
struct dev_info disks[]; struct dev_info disks[] __counted_by(raid_disks);
}; };
#endif #endif
...@@ -449,7 +449,7 @@ void mddev_suspend(struct mddev *mddev) ...@@ -449,7 +449,7 @@ void mddev_suspend(struct mddev *mddev)
set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
percpu_ref_kill(&mddev->active_io); percpu_ref_kill(&mddev->active_io);
if (mddev->pers->prepare_suspend) if (mddev->pers && mddev->pers->prepare_suspend)
mddev->pers->prepare_suspend(mddev); mddev->pers->prepare_suspend(mddev);
wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
...@@ -631,16 +631,39 @@ void mddev_put(struct mddev *mddev) ...@@ -631,16 +631,39 @@ void mddev_put(struct mddev *mddev)
* flush_workqueue() after mddev_find will succeed in waiting * flush_workqueue() after mddev_find will succeed in waiting
* for the work to be done. * for the work to be done.
*/ */
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
queue_work(md_misc_wq, &mddev->del_work); queue_work(md_misc_wq, &mddev->del_work);
} }
spin_unlock(&all_mddevs_lock); spin_unlock(&all_mddevs_lock);
} }
static void md_safemode_timeout(struct timer_list *t); static void md_safemode_timeout(struct timer_list *t);
static void md_start_sync(struct work_struct *ws);
void mddev_init(struct mddev *mddev) static void active_io_release(struct percpu_ref *ref)
{
struct mddev *mddev = container_of(ref, struct mddev, active_io);
wake_up(&mddev->sb_wait);
}
static void no_op(struct percpu_ref *r) {}
int mddev_init(struct mddev *mddev)
{ {
if (percpu_ref_init(&mddev->active_io, active_io_release,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
return -ENOMEM;
if (percpu_ref_init(&mddev->writes_pending, no_op,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
percpu_ref_exit(&mddev->active_io);
return -ENOMEM;
}
/* We want to start with the refcount at zero */
percpu_ref_put(&mddev->writes_pending);
mutex_init(&mddev->open_mutex); mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->sync_mutex); mutex_init(&mddev->sync_mutex);
...@@ -662,9 +685,21 @@ void mddev_init(struct mddev *mddev) ...@@ -662,9 +685,21 @@ void mddev_init(struct mddev *mddev)
mddev->resync_min = 0; mddev->resync_min = 0;
mddev->resync_max = MaxSector; mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE; mddev->level = LEVEL_NONE;
INIT_WORK(&mddev->sync_work, md_start_sync);
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
return 0;
} }
EXPORT_SYMBOL_GPL(mddev_init); EXPORT_SYMBOL_GPL(mddev_init);
/*
 * Tear down the per-mddev percpu reference counters by calling
 * percpu_ref_exit() on both 'active_io' and 'writes_pending'.
 * This does not free @mddev itself; the callers visible here follow it
 * with kfree().
 */
void mddev_destroy(struct mddev *mddev)
{
percpu_ref_exit(&mddev->active_io);
percpu_ref_exit(&mddev->writes_pending);
}
EXPORT_SYMBOL_GPL(mddev_destroy);
static struct mddev *mddev_find_locked(dev_t unit) static struct mddev *mddev_find_locked(dev_t unit)
{ {
struct mddev *mddev; struct mddev *mddev;
...@@ -708,13 +743,16 @@ static struct mddev *mddev_alloc(dev_t unit) ...@@ -708,13 +743,16 @@ static struct mddev *mddev_alloc(dev_t unit)
new = kzalloc(sizeof(*new), GFP_KERNEL); new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new) if (!new)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
mddev_init(new);
error = mddev_init(new);
if (error)
goto out_free_new;
spin_lock(&all_mddevs_lock); spin_lock(&all_mddevs_lock);
if (unit) { if (unit) {
error = -EEXIST; error = -EEXIST;
if (mddev_find_locked(unit)) if (mddev_find_locked(unit))
goto out_free_new; goto out_destroy_new;
new->unit = unit; new->unit = unit;
if (MAJOR(unit) == MD_MAJOR) if (MAJOR(unit) == MD_MAJOR)
new->md_minor = MINOR(unit); new->md_minor = MINOR(unit);
...@@ -725,7 +763,7 @@ static struct mddev *mddev_alloc(dev_t unit) ...@@ -725,7 +763,7 @@ static struct mddev *mddev_alloc(dev_t unit)
error = -ENODEV; error = -ENODEV;
new->unit = mddev_alloc_unit(); new->unit = mddev_alloc_unit();
if (!new->unit) if (!new->unit)
goto out_free_new; goto out_destroy_new;
new->md_minor = MINOR(new->unit); new->md_minor = MINOR(new->unit);
new->hold_active = UNTIL_STOP; new->hold_active = UNTIL_STOP;
} }
...@@ -733,8 +771,11 @@ static struct mddev *mddev_alloc(dev_t unit) ...@@ -733,8 +771,11 @@ static struct mddev *mddev_alloc(dev_t unit)
list_add(&new->all_mddevs, &all_mddevs); list_add(&new->all_mddevs, &all_mddevs);
spin_unlock(&all_mddevs_lock); spin_unlock(&all_mddevs_lock);
return new; return new;
out_free_new:
out_destroy_new:
spin_unlock(&all_mddevs_lock); spin_unlock(&all_mddevs_lock);
mddev_destroy(new);
out_free_new:
kfree(new); kfree(new);
return ERR_PTR(error); return ERR_PTR(error);
} }
...@@ -745,6 +786,7 @@ static void mddev_free(struct mddev *mddev) ...@@ -745,6 +786,7 @@ static void mddev_free(struct mddev *mddev)
list_del(&mddev->all_mddevs); list_del(&mddev->all_mddevs);
spin_unlock(&all_mddevs_lock); spin_unlock(&all_mddevs_lock);
mddev_destroy(mddev);
kfree(mddev); kfree(mddev);
} }
...@@ -3879,7 +3921,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -3879,7 +3921,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
return rv; return rv;
if (mddev->pers == NULL) { if (mddev->pers == NULL) {
strncpy(mddev->clevel, buf, slen); memcpy(mddev->clevel, buf, slen);
if (mddev->clevel[slen-1] == '\n') if (mddev->clevel[slen-1] == '\n')
slen--; slen--;
mddev->clevel[slen] = 0; mddev->clevel[slen] = 0;
...@@ -3912,7 +3954,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -3912,7 +3954,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
} }
/* Now find the new personality */ /* Now find the new personality */
strncpy(clevel, buf, slen); memcpy(clevel, buf, slen);
if (clevel[slen-1] == '\n') if (clevel[slen-1] == '\n')
slen--; slen--;
clevel[slen] = 0; clevel[slen] = 0;
...@@ -4698,7 +4740,7 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -4698,7 +4740,7 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len)
size_t namelen = len-9; size_t namelen = len-9;
if (namelen >= sizeof(mddev->metadata_type)) if (namelen >= sizeof(mddev->metadata_type))
namelen = sizeof(mddev->metadata_type)-1; namelen = sizeof(mddev->metadata_type)-1;
strncpy(mddev->metadata_type, buf+9, namelen); memcpy(mddev->metadata_type, buf+9, namelen);
mddev->metadata_type[namelen] = 0; mddev->metadata_type[namelen] = 0;
if (namelen && mddev->metadata_type[namelen-1] == '\n') if (namelen && mddev->metadata_type[namelen-1] == '\n')
mddev->metadata_type[--namelen] = 0; mddev->metadata_type[--namelen] = 0;
...@@ -4872,6 +4914,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) ...@@ -4872,6 +4914,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
/* A write to sync_action is enough to justify /* A write to sync_action is enough to justify
* canceling read-auto mode * canceling read-auto mode
*/ */
flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR; mddev->ro = MD_RDWR;
md_wakeup_thread(mddev->sync_thread); md_wakeup_thread(mddev->sync_thread);
} }
...@@ -5146,18 +5189,13 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -5146,18 +5189,13 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
err = mddev_lock(mddev); err = mddev_lock(mddev);
if (err) if (err)
return err; return err;
err = -EINVAL;
if (mddev->pers == NULL ||
mddev->pers->quiesce == NULL)
goto unlock;
mddev_suspend(mddev); mddev_suspend(mddev);
mddev->suspend_lo = new; mddev->suspend_lo = new;
mddev_resume(mddev); mddev_resume(mddev);
err = 0;
unlock:
mddev_unlock(mddev); mddev_unlock(mddev);
return err ?: len; return len;
} }
static struct md_sysfs_entry md_suspend_lo = static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
...@@ -5183,18 +5221,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -5183,18 +5221,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
err = mddev_lock(mddev); err = mddev_lock(mddev);
if (err) if (err)
return err; return err;
err = -EINVAL;
if (mddev->pers == NULL)
goto unlock;
mddev_suspend(mddev); mddev_suspend(mddev);
mddev->suspend_hi = new; mddev->suspend_hi = new;
mddev_resume(mddev); mddev_resume(mddev);
err = 0;
unlock:
mddev_unlock(mddev); mddev_unlock(mddev);
return err ?: len; return len;
} }
static struct md_sysfs_entry md_suspend_hi = static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
...@@ -5597,21 +5630,6 @@ static void mddev_delayed_delete(struct work_struct *ws) ...@@ -5597,21 +5630,6 @@ static void mddev_delayed_delete(struct work_struct *ws)
kobject_put(&mddev->kobj); kobject_put(&mddev->kobj);
} }
static void no_op(struct percpu_ref *r) {}
int mddev_init_writes_pending(struct mddev *mddev)
{
if (mddev->writes_pending.percpu_count_ptr)
return 0;
if (percpu_ref_init(&mddev->writes_pending, no_op,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
return -ENOMEM;
/* We want to start with the refcount at zero */
percpu_ref_put(&mddev->writes_pending);
return 0;
}
EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
struct mddev *md_alloc(dev_t dev, char *name) struct mddev *md_alloc(dev_t dev, char *name)
{ {
/* /*
...@@ -5783,12 +5801,6 @@ static void md_safemode_timeout(struct timer_list *t) ...@@ -5783,12 +5801,6 @@ static void md_safemode_timeout(struct timer_list *t)
} }
static int start_dirty_degraded; static int start_dirty_degraded;
static void active_io_release(struct percpu_ref *ref)
{
struct mddev *mddev = container_of(ref, struct mddev, active_io);
wake_up(&mddev->sb_wait);
}
int md_run(struct mddev *mddev) int md_run(struct mddev *mddev)
{ {
...@@ -5869,15 +5881,10 @@ int md_run(struct mddev *mddev) ...@@ -5869,15 +5881,10 @@ int md_run(struct mddev *mddev)
nowait = nowait && bdev_nowait(rdev->bdev); nowait = nowait && bdev_nowait(rdev->bdev);
} }
err = percpu_ref_init(&mddev->active_io, active_io_release,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
if (err)
return err;
if (!bioset_initialized(&mddev->bio_set)) { if (!bioset_initialized(&mddev->bio_set)) {
err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
if (err) if (err)
goto exit_active_io; return err;
} }
if (!bioset_initialized(&mddev->sync_set)) { if (!bioset_initialized(&mddev->sync_set)) {
err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
...@@ -6074,8 +6081,6 @@ int md_run(struct mddev *mddev) ...@@ -6074,8 +6081,6 @@ int md_run(struct mddev *mddev)
bioset_exit(&mddev->sync_set); bioset_exit(&mddev->sync_set);
exit_bio_set: exit_bio_set:
bioset_exit(&mddev->bio_set); bioset_exit(&mddev->bio_set);
exit_active_io:
percpu_ref_exit(&mddev->active_io);
return err; return err;
} }
EXPORT_SYMBOL_GPL(md_run); EXPORT_SYMBOL_GPL(md_run);
...@@ -6291,7 +6296,6 @@ static void __md_stop(struct mddev *mddev) ...@@ -6291,7 +6296,6 @@ static void __md_stop(struct mddev *mddev)
module_put(pers->owner); module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
percpu_ref_exit(&mddev->active_io);
bioset_exit(&mddev->bio_set); bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set); bioset_exit(&mddev->sync_set);
bioset_exit(&mddev->io_clone_set); bioset_exit(&mddev->io_clone_set);
...@@ -6306,7 +6310,6 @@ void md_stop(struct mddev *mddev) ...@@ -6306,7 +6310,6 @@ void md_stop(struct mddev *mddev)
*/ */
__md_stop_writes(mddev); __md_stop_writes(mddev);
__md_stop(mddev); __md_stop(mddev);
percpu_ref_exit(&mddev->writes_pending);
} }
EXPORT_SYMBOL_GPL(md_stop); EXPORT_SYMBOL_GPL(md_stop);
...@@ -7646,6 +7649,10 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, ...@@ -7646,6 +7649,10 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
mutex_unlock(&mddev->open_mutex); mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev); sync_blockdev(bdev);
} }
if (!md_is_rdwr(mddev))
flush_work(&mddev->sync_work);
err = mddev_lock(mddev); err = mddev_lock(mddev);
if (err) { if (err) {
pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
...@@ -7886,7 +7893,6 @@ static void md_free_disk(struct gendisk *disk) ...@@ -7886,7 +7893,6 @@ static void md_free_disk(struct gendisk *disk)
{ {
struct mddev *mddev = disk->private_data; struct mddev *mddev = disk->private_data;
percpu_ref_exit(&mddev->writes_pending);
mddev_free(mddev); mddev_free(mddev);
} }
...@@ -8570,6 +8576,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) ...@@ -8570,6 +8576,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
BUG_ON(mddev->ro == MD_RDONLY); BUG_ON(mddev->ro == MD_RDONLY);
if (mddev->ro == MD_AUTO_READ) { if (mddev->ro == MD_AUTO_READ) {
/* need to switch to read/write */ /* need to switch to read/write */
flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR; mddev->ro = MD_RDWR;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
...@@ -9161,6 +9168,85 @@ void md_do_sync(struct md_thread *thread) ...@@ -9161,6 +9168,85 @@ void md_do_sync(struct md_thread *thread)
} }
EXPORT_SYMBOL_GPL(md_do_sync); EXPORT_SYMBOL_GPL(md_do_sync);
/*
 * Report whether @rdev may be taken out of the array right now.
 *
 * Returns true for a removable rdev, false otherwise.
 */
static bool rdev_removeable(struct md_rdev *rdev)
{
	/*
	 * Keep the rdev if it is not used by the array, if it still has
	 * inflight io, or if an error occurred that the metadata handler
	 * has not acknowledged yet.
	 */
	if (rdev->raid_disk < 0 ||
	    atomic_read(&rdev->nr_pending) ||
	    test_bit(Blocked, &rdev->flags))
		return false;

	/* A faulty rdev is no longer used, so removing it is safe. */
	if (test_bit(Faulty, &rdev->flags))
		return true;

	/*
	 * A journal disk can only be removed when it is faulty, which was
	 * ruled out above.
	 *
	 * For any other rdev, 'In_sync' being clear while 'raid_disk' is
	 * valid means a replacement has just become active from
	 * pers->spare_active(); pers->hot_remove_disk() will then replace
	 * this rdev with the replacement.
	 */
	return !test_bit(Journal, &rdev->flags) &&
	       !test_bit(In_sync, &rdev->flags);
}
/*
 * Report whether @rdev counts as a spare: it is not a Candidate, it has a
 * valid raid_disk, and none of In_sync, Journal or Faulty is set.
 */
static bool rdev_is_spare(struct md_rdev *rdev)
{
	/* Candidates and rdevs without a valid raid_disk never count. */
	if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk < 0)
		return false;

	return !(test_bit(In_sync, &rdev->flags) ||
		 test_bit(Journal, &rdev->flags) ||
		 test_bit(Faulty, &rdev->flags));
}
/*
 * Report whether @rdev may be added to the array.
 *
 * Returns true when the rdev can be added, false otherwise.
 */
static bool rdev_addable(struct md_rdev *rdev)
{
	/* Already used (or faulty / still a candidate): do not add it again. */
	bool unavailable = test_bit(Candidate, &rdev->flags) ||
			   rdev->raid_disk >= 0 ||
			   test_bit(Faulty, &rdev->flags);

	if (unavailable)
		return false;

	/* A journal disk may always be added, as may any disk of a read-write array. */
	if (test_bit(Journal, &rdev->flags) || md_is_rdwr(rdev->mddev))
		return true;

	/*
	 * The array is read-only here: only a re-add is permitted, and when
	 * a bitmap is used the rdev must not be too old.
	 */
	return rdev->saved_raid_disk >= 0 &&
	       !test_bit(Bitmap_sync, &rdev->flags);
}
static bool md_spares_need_change(struct mddev *mddev)
{
struct md_rdev *rdev;
rdev_for_each(rdev, mddev)
if (rdev_removeable(rdev) || rdev_addable(rdev))
return true;
return false;
}
static int remove_and_add_spares(struct mddev *mddev, static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this) struct md_rdev *this)
{ {
...@@ -9193,12 +9279,8 @@ static int remove_and_add_spares(struct mddev *mddev, ...@@ -9193,12 +9279,8 @@ static int remove_and_add_spares(struct mddev *mddev,
synchronize_rcu(); synchronize_rcu();
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
if ((this == NULL || rdev == this) && if ((this == NULL || rdev == this) &&
rdev->raid_disk >= 0 && (test_bit(RemoveSynchronized, &rdev->flags) ||
!test_bit(Blocked, &rdev->flags) && rdev_removeable(rdev))) {
((test_bit(RemoveSynchronized, &rdev->flags) ||
(!test_bit(In_sync, &rdev->flags) &&
!test_bit(Journal, &rdev->flags))) &&
atomic_read(&rdev->nr_pending)==0)) {
if (mddev->pers->hot_remove_disk( if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) { mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev); sysfs_unlink_rdev(mddev, rdev);
...@@ -9220,25 +9302,12 @@ static int remove_and_add_spares(struct mddev *mddev, ...@@ -9220,25 +9302,12 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
if (this && this != rdev) if (this && this != rdev)
continue; continue;
if (test_bit(Candidate, &rdev->flags)) if (rdev_is_spare(rdev))
continue;
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
spares++; spares++;
if (rdev->raid_disk >= 0) if (!rdev_addable(rdev))
continue;
if (test_bit(Faulty, &rdev->flags))
continue; continue;
if (!test_bit(Journal, &rdev->flags)) { if (!test_bit(Journal, &rdev->flags))
if (!md_is_rdwr(mddev) &&
!(rdev->saved_raid_disk >= 0 &&
!test_bit(Bitmap_sync, &rdev->flags)))
continue;
rdev->recovery_offset = 0; rdev->recovery_offset = 0;
}
if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
/* failure here is OK */ /* failure here is OK */
sysfs_link_rdev(mddev, rdev); sysfs_link_rdev(mddev, rdev);
...@@ -9254,9 +9323,81 @@ static int remove_and_add_spares(struct mddev *mddev, ...@@ -9254,9 +9323,81 @@ static int remove_and_add_spares(struct mddev *mddev,
return spares; return spares;
} }
/*
 * Select the sync action to run next and record it in mddev->recovery.
 *
 * @spares receives the result of remove_and_add_spares(); it is only
 * written when no reshape is pending.
 *
 * Returns true when there is something to do, false when nothing needs to
 * be done or a pending reshape cannot proceed.
 */
static bool md_choose_sync_action(struct mddev *mddev, int *spares)
{
	/* A reshape in progress is considered before anything else. */
	if (mddev->reshape_position != MaxSector) {
		if (mddev->pers->check_reshape &&
		    mddev->pers->check_reshape(mddev) == 0) {
			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
			return true;
		}

		/* No check_reshape hook, or the hook reported failure. */
		return false;
	}

	/*
	 * Drop failed drives and then try to add spares. Spares are removed
	 * and re-added as well, which lets the personality fail the re-add.
	 */
	*spares = remove_and_add_spares(mddev, NULL);
	if (*spares != 0) {
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);

		/* Kick off a new recovery. */
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		return true;
	}

	/* A recovery is already in progress. */
	if (mddev->recovery_cp < MaxSector) {
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		return true;
	}

	/*
	 * If MD_RECOVERY_SYNC is set, the choice between resync, check and
	 * repair is deferred to md_do_sync(); otherwise nothing is to be
	 * done.
	 */
	return test_bit(MD_RECOVERY_SYNC, &mddev->recovery);
}
static void md_start_sync(struct work_struct *ws) static void md_start_sync(struct work_struct *ws)
{ {
struct mddev *mddev = container_of(ws, struct mddev, del_work); struct mddev *mddev = container_of(ws, struct mddev, sync_work);
int spares = 0;
mddev_lock_nointr(mddev);
if (!md_is_rdwr(mddev)) {
/*
* On a read-only array we can:
* - remove failed devices
* - add already-in_sync devices if the array itself is in-sync.
* As we only add devices that are already in-sync, we can
* activate the spares immediately.
*/
remove_and_add_spares(mddev, NULL);
goto not_running;
}
if (!md_choose_sync_action(mddev, &spares))
goto not_running;
if (!mddev->pers->sync_request)
goto not_running;
/*
* We are adding a device or devices to an array which has the bitmap
* stored on all devices. So make sure all bitmap pages get written.
*/
if (spares)
md_bitmap_write_all(mddev->bitmap);
rcu_assign_pointer(mddev->sync_thread, rcu_assign_pointer(mddev->sync_thread,
md_register_thread(md_do_sync, mddev, "resync")); md_register_thread(md_do_sync, mddev, "resync"));
...@@ -9264,20 +9405,27 @@ static void md_start_sync(struct work_struct *ws) ...@@ -9264,20 +9405,27 @@ static void md_start_sync(struct work_struct *ws)
pr_warn("%s: could not start resync thread...\n", pr_warn("%s: could not start resync thread...\n",
mdname(mddev)); mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */ /* leave the spares where they are, it shouldn't hurt */
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); goto not_running;
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); }
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); mddev_unlock(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); md_wakeup_thread(mddev->sync_thread);
wake_up(&resync_wait);
if (test_and_clear_bit(MD_RECOVERY_RECOVER,
&mddev->recovery))
if (mddev->sysfs_action)
sysfs_notify_dirent_safe(mddev->sysfs_action);
} else
md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_action); sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event(); md_new_event();
return;
not_running:
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev_unlock(mddev);
wake_up(&resync_wait);
if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
mddev->sysfs_action)
sysfs_notify_dirent_safe(mddev->sysfs_action);
} }
/* /*
...@@ -9345,7 +9493,6 @@ void md_check_recovery(struct mddev *mddev) ...@@ -9345,7 +9493,6 @@ void md_check_recovery(struct mddev *mddev)
return; return;
if (mddev_trylock(mddev)) { if (mddev_trylock(mddev)) {
int spares = 0;
bool try_set_sync = mddev->safemode != 0; bool try_set_sync = mddev->safemode != 0;
if (!mddev->external && mddev->safemode == 1) if (!mddev->external && mddev->safemode == 1)
...@@ -9353,30 +9500,43 @@ void md_check_recovery(struct mddev *mddev) ...@@ -9353,30 +9500,43 @@ void md_check_recovery(struct mddev *mddev)
if (!md_is_rdwr(mddev)) { if (!md_is_rdwr(mddev)) {
struct md_rdev *rdev; struct md_rdev *rdev;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
/* sync_work already queued. */
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
if (!mddev->external && mddev->in_sync) if (!mddev->external && mddev->in_sync)
/* 'Blocked' flag not needed as failed devices /*
* 'Blocked' flag not needed as failed devices
* will be recorded if array switched to read/write. * will be recorded if array switched to read/write.
* Leaving it set will prevent the device * Leaving it set will prevent the device
* from being removed. * from being removed.
*/ */
rdev_for_each(rdev, mddev) rdev_for_each(rdev, mddev)
clear_bit(Blocked, &rdev->flags); clear_bit(Blocked, &rdev->flags);
/* On a read-only array we can:
* - remove failed devices /*
* - add already-in_sync devices if the array itself * There is no thread, but we need to call
* is in-sync.
* As we only add devices that are already in-sync,
* we can activate the spares immediately.
*/
remove_and_add_spares(mddev, NULL);
/* There is no thread, but we need to call
* ->spare_active and clear saved_raid_disk * ->spare_active and clear saved_raid_disk
*/ */
set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev); md_reap_sync_thread(mddev);
/*
 * Let md_start_sync() remove and add rdevs to the
 * array.
 */
if (md_spares_need_change(mddev)) {
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
queue_work(md_misc_wq, &mddev->sync_work);
}
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
goto unlock; goto unlock;
} }
...@@ -9432,56 +9592,14 @@ void md_check_recovery(struct mddev *mddev) ...@@ -9432,56 +9592,14 @@ void md_check_recovery(struct mddev *mddev)
clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
goto not_running; queue_work(md_misc_wq, &mddev->sync_work);
/* no recovery is running. } else {
* remove any failed drives, then
* add spares if possible.
* Spares are also removed and re-added, to allow
* the personality to fail the re-add.
*/
if (mddev->reshape_position != MaxSector) {
if (mddev->pers->check_reshape == NULL ||
mddev->pers->check_reshape(mddev) != 0)
/* Cannot proceed */
goto not_running;
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if ((spares = remove_and_add_spares(mddev, NULL))) {
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if (mddev->recovery_cp < MaxSector) {
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
/* nothing to be done ... */
goto not_running;
if (mddev->pers->sync_request) {
if (spares) {
/* We are adding a device or devices to an array
* which has the bitmap stored on all devices.
* So make sure all bitmap pages get written
*/
md_bitmap_write_all(mddev->bitmap);
}
INIT_WORK(&mddev->del_work, md_start_sync);
queue_work(md_misc_wq, &mddev->del_work);
goto unlock;
}
not_running:
if (!mddev->sync_thread) {
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
wake_up(&resync_wait); wake_up(&resync_wait);
if (test_and_clear_bit(MD_RECOVERY_RECOVER,
&mddev->recovery))
if (mddev->sysfs_action)
sysfs_notify_dirent_safe(mddev->sysfs_action);
} }
unlock: unlock:
wake_up(&mddev->sb_wait); wake_up(&mddev->sb_wait);
mddev_unlock(mddev); mddev_unlock(mddev);
......
...@@ -453,7 +453,10 @@ struct mddev { ...@@ -453,7 +453,10 @@ struct mddev {
struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */ struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */
struct kernfs_node *sysfs_level; /*handle for 'level' */ struct kernfs_node *sysfs_level; /*handle for 'level' */
struct work_struct del_work; /* used for delayed sysfs removal */ /* used for delayed sysfs removal */
struct work_struct del_work;
/* used for register new sync thread */
struct work_struct sync_work;
/* "lock" protects: /* "lock" protects:
* flush_bio transition from NULL to !NULL * flush_bio transition from NULL to !NULL
...@@ -768,7 +771,6 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t ...@@ -768,7 +771,6 @@ extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **t
extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_wakeup_thread(struct md_thread __rcu *thread);
extern void md_check_recovery(struct mddev *mddev); extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev);
extern int mddev_init_writes_pending(struct mddev *mddev);
extern bool md_write_start(struct mddev *mddev, struct bio *bi); extern bool md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev); extern void md_write_end(struct mddev *mddev);
...@@ -795,7 +797,8 @@ extern int md_integrity_register(struct mddev *mddev); ...@@ -795,7 +797,8 @@ extern int md_integrity_register(struct mddev *mddev);
extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern void mddev_init(struct mddev *mddev); extern int mddev_init(struct mddev *mddev);
extern void mddev_destroy(struct mddev *mddev);
struct mddev *md_alloc(dev_t dev, char *name); struct mddev *md_alloc(dev_t dev, char *name);
void mddev_put(struct mddev *mddev); void mddev_put(struct mddev *mddev);
extern int md_run(struct mddev *mddev); extern int md_run(struct mddev *mddev);
......
...@@ -3122,8 +3122,7 @@ static int raid1_run(struct mddev *mddev) ...@@ -3122,8 +3122,7 @@ static int raid1_run(struct mddev *mddev)
mdname(mddev)); mdname(mddev));
return -EIO; return -EIO;
} }
if (mddev_init_writes_pending(mddev) < 0)
return -ENOMEM;
/* /*
* copy the already verified devices into our private RAID1 * copy the already verified devices into our private RAID1
* bookkeeping area. [whatever we allocate in run(), * bookkeeping area. [whatever we allocate in run(),
......
...@@ -4154,9 +4154,6 @@ static int raid10_run(struct mddev *mddev) ...@@ -4154,9 +4154,6 @@ static int raid10_run(struct mddev *mddev)
sector_t min_offset_diff = 0; sector_t min_offset_diff = 0;
int first = 1; int first = 1;
if (mddev_init_writes_pending(mddev) < 0)
return -ENOMEM;
if (mddev->private == NULL) { if (mddev->private == NULL) {
conf = setup_conf(mddev); conf = setup_conf(mddev);
if (IS_ERR(conf)) if (IS_ERR(conf))
......
...@@ -7778,9 +7778,6 @@ static int raid5_run(struct mddev *mddev) ...@@ -7778,9 +7778,6 @@ static int raid5_run(struct mddev *mddev)
long long min_offset_diff = 0; long long min_offset_diff = 0;
int first = 1; int first = 1;
if (mddev_init_writes_pending(mddev) < 0)
return -ENOMEM;
if (mddev->recovery_cp != MaxSector) if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
mdname(mddev)); mdname(mddev));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment