Commit f01e49fb authored by Jens Axboe's avatar Jens Axboe

Merge branch 'md-next' of...

Merge branch 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-5.19/drivers

Pull MD updates from Song:

"1. Improve annotation in raid5 code, by Logan Gunthorpe.
 2. Support MD_BROKEN flag in raid-1/5/10, by Mariusz Tkaczyk.
 3. Other small fixes/cleanups."

* 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md:
  md: Replace role magic numbers with defined constants
  md/raid0: Ignore RAID0 layout if the second zone has only one device
  md/raid5: Annotate functions that hold device_lock with __must_hold
  md/raid5-ppl: Annotate with rcu_dereference_protected()
  md/raid5: Annotate rdev/replacement access when mddev_lock is held
  md/raid5: Annotate rdev/replacement accesses when nr_pending is elevated
  md/raid5: Add __rcu annotation to struct disk_info
  md/raid5: Un-nest struct raid5_percpu definition
  md/raid5: Cleanup setup_conf() error returns
  md: replace deprecated strlcpy & remove duplicated line
  md/bitmap: don't set sb values if can't pass sanity check
  md: fix an incorrect NULL check in md_reload_sb
  md: fix an incorrect NULL check in does_sb_need_changing
  raid5: introduce MD_BROKEN
  md: Set MD_BROKEN for RAID1 and RAID10
parents 8ba816b2 9151ad5d
......@@ -639,14 +639,6 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
write_behind = le32_to_cpu(sb->write_behind);
sectors_reserved = le32_to_cpu(sb->sectors_reserved);
/* Setup nodes/clustername only if bitmap version is
* cluster-compatible
*/
if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
nodes = le32_to_cpu(sb->nodes);
strlcpy(bitmap->mddev->bitmap_info.cluster_name,
sb->cluster_name, 64);
}
/* verify that the bitmap-specific fields are valid */
if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
......@@ -668,6 +660,16 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
goto out;
}
/*
* Setup nodes/clustername only if bitmap version is
* cluster-compatible
*/
if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
nodes = le32_to_cpu(sb->nodes);
strscpy(bitmap->mddev->bitmap_info.cluster_name,
sb->cluster_name, 64);
}
/* keep the array size field of the bitmap superblock up to date */
sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
......@@ -695,14 +697,13 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
err = 0;
out:
kunmap_atomic(sb);
if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
/* Assigning chunksize is required for "re_read" */
bitmap->mddev->bitmap_info.chunksize = chunksize;
if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
err = md_setup_cluster(bitmap->mddev, nodes);
if (err) {
pr_warn("%s: Could not setup cluster service (%d)\n",
......@@ -713,8 +714,8 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
goto re_read;
}
out_no_sb:
if (err == 0) {
if (test_bit(BITMAP_STALE, &bitmap->flags))
bitmap->events_cleared = bitmap->mddev->events;
bitmap->mddev->bitmap_info.chunksize = chunksize;
......@@ -724,7 +725,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
if (bitmap->mddev->bitmap_info.space == 0 ||
bitmap->mddev->bitmap_info.space > sectors_reserved)
bitmap->mddev->bitmap_info.space = sectors_reserved;
if (err) {
} else {
md_bitmap_print_sb(bitmap);
if (bitmap->cluster_slot < 0)
md_cluster_stop(bitmap->mddev);
......
......@@ -201,7 +201,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
goto out_err;
}
strlcpy(res->name, name, namelen + 1);
strscpy(res->name, name, namelen + 1);
if (with_lvb) {
res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
if (!res->lksb.sb_lvbptr) {
......
......@@ -2627,14 +2627,16 @@ static void sync_sbs(struct mddev *mddev, int nospares)
static bool does_sb_need_changing(struct mddev *mddev)
{
struct md_rdev *rdev;
struct md_rdev *rdev = NULL, *iter;
struct mdp_superblock_1 *sb;
int role;
/* Find a good rdev */
rdev_for_each(rdev, mddev)
if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
rdev_for_each(iter, mddev)
if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) {
rdev = iter;
break;
}
/* No good device found. */
if (!rdev)
......@@ -2645,11 +2647,11 @@ static bool does_sb_need_changing(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
/* Device activated? */
if (role == 0xffff && rdev->raid_disk >=0 &&
if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 &&
!test_bit(Faulty, &rdev->flags))
return true;
/* Device turned faulty? */
if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
return true;
}
......@@ -2984,10 +2986,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
md_error(rdev->mddev, rdev);
if (test_bit(Faulty, &rdev->flags))
err = 0;
else
if (test_bit(MD_BROKEN, &rdev->mddev->flags))
err = -EBUSY;
else
err = 0;
} else if (cmd_match(buf, "remove")) {
if (rdev->mddev->pers) {
clear_bit(Blocked, &rdev->flags);
......@@ -4028,7 +4031,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
oldpriv = mddev->private;
mddev->pers = pers;
mddev->private = priv;
strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
mddev->level = mddev->new_level;
mddev->layout = mddev->new_layout;
mddev->chunk_sectors = mddev->new_chunk_sectors;
......@@ -4353,10 +4356,9 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
* like active, but no writes have been seen for a while (100msec).
*
* broken
* RAID0/LINEAR-only: same as clean, but array is missing a member.
* It's useful because RAID0/LINEAR mounted-arrays aren't stopped
* when a member is gone, so this state will at least alert the
* user that something is wrong.
* Array is failed. It's useful because mounted-arrays aren't stopped
* when array is failed, so this state will at least alert the user that
* something is wrong.
*/
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
write_pending, active_idle, broken, bad_word};
......@@ -5763,7 +5765,7 @@ static int add_named_array(const char *val, const struct kernel_param *kp)
len--;
if (len >= DISK_NAME_LEN)
return -E2BIG;
strlcpy(buf, val, len+1);
strscpy(buf, val, len+1);
if (strncmp(buf, "md_", 3) == 0)
return md_alloc(0, buf);
if (strncmp(buf, "md", 2) == 0 &&
......@@ -5896,7 +5898,7 @@ int md_run(struct mddev *mddev)
mddev->level = pers->level;
mddev->new_level = pers->level;
}
strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
......@@ -7443,7 +7445,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
err = -ENODEV;
else {
md_error(mddev, rdev);
if (!test_bit(Faulty, &rdev->flags))
if (test_bit(MD_BROKEN, &mddev->flags))
err = -EBUSY;
}
rcu_read_unlock();
......@@ -7984,13 +7986,16 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
if (!mddev->pers || !mddev->pers->error_handler)
return;
mddev->pers->error_handler(mddev,rdev);
if (mddev->degraded)
mddev->pers->error_handler(mddev, rdev);
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
if (!test_bit(MD_BROKEN, &mddev->flags)) {
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
if (mddev->event_work.func)
queue_work(md_misc_wq, &mddev->event_work);
md_new_event();
......@@ -9670,7 +9675,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
if (test_bit(Candidate, &rdev2->flags)) {
if (role == 0xfffe) {
if (role == MD_DISK_ROLE_FAULTY) {
pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
md_kick_rdev_from_array(rdev2);
continue;
......@@ -9683,7 +9688,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
/*
* got activated except reshape is happening.
*/
if (rdev2->raid_disk == -1 && role != 0xffff &&
if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
!(le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE)) {
rdev2->saved_raid_disk = role;
......@@ -9700,7 +9705,8 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
* as faulty. The recovery is performed by the
* one who initiated the error.
*/
if ((role == 0xfffe) || (role == 0xfffd)) {
if (role == MD_DISK_ROLE_FAULTY ||
role == MD_DISK_ROLE_JOURNAL) {
md_error(mddev, rdev2);
clear_bit(Blocked, &rdev2->flags);
}
......@@ -9790,16 +9796,18 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
void md_reload_sb(struct mddev *mddev, int nr)
{
struct md_rdev *rdev;
struct md_rdev *rdev = NULL, *iter;
int err;
/* Find the rdev */
rdev_for_each_rcu(rdev, mddev) {
if (rdev->desc_nr == nr)
rdev_for_each_rcu(iter, mddev) {
if (iter->desc_nr == nr) {
rdev = iter;
break;
}
}
if (!rdev || rdev->desc_nr != nr) {
if (!rdev) {
pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
return;
}
......
......@@ -234,34 +234,42 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
struct md_cluster_info;
/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */
enum mddev_flags {
MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */
MD_CLOSING, /* If set, we are closing the array, do not open
* it then */
MD_JOURNAL_CLEAN, /* A raid with journal is already clean */
MD_HAS_JOURNAL, /* The raid array has journal feature set */
MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
* already took resync lock, need to
* release the lock */
MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is
* supported as calls to md_error() will
* never cause the array to become failed.
*/
MD_HAS_PPL, /* The raid array has PPL feature set */
MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */
MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update
* the metadata without taking reconfig_mutex.
*/
MD_UPDATING_SB, /* md_check_recovery is updating the metadata
* without explicitly holding reconfig_mutex.
*/
MD_NOT_READY, /* do_md_run() is active, so 'array_state'
* must not report that array is ready yet
*/
MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop
* I/O in case an array member is gone/failed.
/**
* enum mddev_flags - md device flags.
* @MD_ARRAY_FIRST_USE: First use of array, needs initialization.
* @MD_CLOSING: If set, we are closing the array, do not open it then.
* @MD_JOURNAL_CLEAN: A raid with journal is already clean.
* @MD_HAS_JOURNAL: The raid array has journal feature set.
* @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node, already took
* resync lock, need to release the lock.
* @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as
* calls to md_error() will never cause the array to
* become failed.
* @MD_HAS_PPL: The raid array has PPL feature set.
* @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
* @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata
* without taking reconfig_mutex.
* @MD_UPDATING_SB: md_check_recovery is updating the metadata without
* explicitly holding reconfig_mutex.
* @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that
* array is ready yet.
* @MD_BROKEN: This is used to stop writes and mark array as failed.
*
* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added
*/
enum mddev_flags {
MD_ARRAY_FIRST_USE,
MD_CLOSING,
MD_JOURNAL_CLEAN,
MD_HAS_JOURNAL,
MD_CLUSTER_RESYNC_LOCKED,
MD_FAILFAST_SUPPORTED,
MD_HAS_PPL,
MD_HAS_MULTIPLE_PPLS,
MD_ALLOW_SB_UPDATE,
MD_UPDATING_SB,
MD_NOT_READY,
MD_BROKEN,
};
enum mddev_sb_flags {
......
......@@ -128,21 +128,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
pr_debug("md/raid0:%s: FINAL %d zones\n",
mdname(mddev), conf->nr_strip_zones);
if (conf->nr_strip_zones == 1) {
conf->layout = RAID0_ORIG_LAYOUT;
} else if (mddev->layout == RAID0_ORIG_LAYOUT ||
mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) {
conf->layout = mddev->layout;
} else if (default_layout == RAID0_ORIG_LAYOUT ||
default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
conf->layout = default_layout;
} else {
pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
mdname(mddev));
pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n");
err = -ENOTSUPP;
goto abort;
}
/*
* now since we have the hard sector sizes, we can make sure
* chunk size is a multiple of that sector size
......@@ -273,6 +258,22 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
(unsigned long long)smallest->sectors);
}
if (conf->nr_strip_zones == 1 || conf->strip_zone[1].nb_dev == 1) {
conf->layout = RAID0_ORIG_LAYOUT;
} else if (mddev->layout == RAID0_ORIG_LAYOUT ||
mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) {
conf->layout = mddev->layout;
} else if (default_layout == RAID0_ORIG_LAYOUT ||
default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
conf->layout = default_layout;
} else {
pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
mdname(mddev));
pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n");
err = -EOPNOTSUPP;
goto abort;
}
pr_debug("md/raid0:%s: done.\n", mdname(mddev));
*private_conf = conf;
......
......@@ -1641,31 +1641,40 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, "]");
}
/**
* raid1_error() - RAID1 error handler.
* @mddev: affected md device.
* @rdev: member device to fail.
*
* The routine acknowledges &rdev failure and determines new @mddev state.
* If it failed, then:
* - &MD_BROKEN flag is set in &mddev->flags.
* - recovery is disabled.
* Otherwise, it must be degraded:
* - recovery is interrupted.
* - &mddev->degraded is bumped.
*
* @rdev is marked as &Faulty excluding case when array is failed and
* &mddev->fail_last_dev is off.
*/
static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
{
char b[BDEVNAME_SIZE];
struct r1conf *conf = mddev->private;
unsigned long flags;
/*
* If it is not operational, then we have already marked it as dead
* else if it is the last working disks with "fail_last_dev == false",
* ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& (conf->raid_disks - mddev->degraded) == 1) {
/*
* Don't fail the drive, act as though we were just a
* normal single drive.
* However don't try a recovery from this drive as
* it is very likely to fail.
*/
if (test_bit(In_sync, &rdev->flags) &&
(conf->raid_disks - mddev->degraded) == 1) {
set_bit(MD_BROKEN, &mddev->flags);
if (!mddev->fail_last_dev) {
conf->recovery_disabled = mddev->recovery_disabled;
spin_unlock_irqrestore(&conf->device_lock, flags);
return;
}
}
set_bit(Blocked, &rdev->flags);
if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
......
......@@ -1970,32 +1970,40 @@ static int enough(struct r10conf *conf, int ignore)
_enough(conf, 1, ignore);
}
/**
* raid10_error() - RAID10 error handler.
* @mddev: affected md device.
* @rdev: member device to fail.
*
* The routine acknowledges &rdev failure and determines new @mddev state.
* If it failed, then:
* - &MD_BROKEN flag is set in &mddev->flags.
* Otherwise, it must be degraded:
* - recovery is interrupted.
* - &mddev->degraded is bumped.
* @rdev is marked as &Faulty excluding case when array is failed and
* &mddev->fail_last_dev is off.
*/
static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{
char b[BDEVNAME_SIZE];
struct r10conf *conf = mddev->private;
unsigned long flags;
/*
* If it is not operational, then we have already marked it as dead
* else if it is the last working disks with "fail_last_dev == false",
* ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags);
if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& !enough(conf, rdev->raid_disk)) {
/*
* Don't fail the drive, just return an IO error.
*/
if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
set_bit(MD_BROKEN, &mddev->flags);
if (!mddev->fail_last_dev) {
spin_unlock_irqrestore(&conf->device_lock, flags);
return;
}
}
if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
/*
* If recovery is running, make sure it aborts.
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
......
......@@ -883,7 +883,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
(unsigned long long)r_sector, dd_idx,
(unsigned long long)sector);
rdev = conf->disks[dd_idx].rdev;
/* Array has not started so rcu dereference is safe */
rdev = rcu_dereference_protected(
conf->disks[dd_idx].rdev, 1);
if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
sector >= rdev->recovery_offset)) {
pr_debug("%s:%*s data member disk %d missing\n",
......@@ -934,7 +936,10 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
parity_sector = raid5_compute_sector(conf, r_sector_first + i,
0, &disk, &sh);
BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
parity_rdev = conf->disks[sh.pd_idx].rdev;
/* Array has not started so rcu dereference is safe */
parity_rdev = rcu_dereference_protected(
conf->disks[sh.pd_idx].rdev, 1);
BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
......@@ -1404,7 +1409,9 @@ int ppl_init_log(struct r5conf *conf)
for (i = 0; i < ppl_conf->count; i++) {
struct ppl_log *log = &ppl_conf->child_logs[i];
struct md_rdev *rdev = conf->disks[i].rdev;
/* Array has not started so rcu dereference is safe */
struct md_rdev *rdev =
rcu_dereference_protected(conf->disks[i].rdev, 1);
mutex_init(&log->io_mutex);
spin_lock_init(&log->io_list_lock);
......
......@@ -79,18 +79,21 @@ static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect)
}
static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
__acquires(&conf->device_lock)
{
spin_lock_irq(conf->hash_locks + hash);
spin_lock(&conf->device_lock);
}
static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
__releases(&conf->device_lock)
{
spin_unlock(&conf->device_lock);
spin_unlock_irq(conf->hash_locks + hash);
}
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
__acquires(&conf->device_lock)
{
int i;
spin_lock_irq(conf->hash_locks);
......@@ -100,6 +103,7 @@ static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
}
static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
__releases(&conf->device_lock)
{
int i;
spin_unlock(&conf->device_lock);
......@@ -164,6 +168,7 @@ static bool stripe_is_lowprio(struct stripe_head *sh)
}
static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
__must_hold(&sh->raid_conf->device_lock)
{
struct r5conf *conf = sh->raid_conf;
struct r5worker_group *group;
......@@ -211,6 +216,7 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
struct list_head *temp_inactive_list)
__must_hold(&conf->device_lock)
{
int i;
int injournal = 0; /* number of date pages with R5_InJournal */
......@@ -296,6 +302,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
struct list_head *temp_inactive_list)
__must_hold(&conf->device_lock)
{
if (atomic_dec_and_test(&sh->count))
do_release_stripe(conf, sh, temp_inactive_list);
......@@ -350,9 +357,9 @@ static void release_inactive_stripe_list(struct r5conf *conf,
}
}
/* should hold conf->device_lock already */
static int release_stripe_list(struct r5conf *conf,
struct list_head *temp_inactive_list)
__must_hold(&conf->device_lock)
{
struct stripe_head *sh, *t;
int count = 0;
......@@ -629,6 +636,10 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
* This is because some failed devices may only affect one
* of the two sections, and some non-in_sync devices may
* be insync in the section most affected by failed devices.
*
* Most calls to this function hold &conf->device_lock. Calls
* in raid5_run() do not require the lock as no other threads
* have been started yet.
*/
int raid5_calc_degraded(struct r5conf *conf)
{
......@@ -686,17 +697,17 @@ int raid5_calc_degraded(struct r5conf *conf)
return degraded;
}
static int has_failed(struct r5conf *conf)
static bool has_failed(struct r5conf *conf)
{
int degraded;
int degraded = conf->mddev->degraded;
if (conf->mddev->reshape_position == MaxSector)
return conf->mddev->degraded > conf->max_degraded;
if (test_bit(MD_BROKEN, &conf->mddev->flags))
return true;
if (conf->mddev->reshape_position != MaxSector)
degraded = raid5_calc_degraded(conf);
if (degraded > conf->max_degraded)
return 1;
return 0;
return degraded > conf->max_degraded;
}
struct stripe_head *
......@@ -2648,6 +2659,28 @@ static void shrink_stripes(struct r5conf *conf)
conf->slab_cache = NULL;
}
/*
* This helper wraps rcu_dereference_protected() and can be used when
* it is known that the nr_pending of the rdev is elevated.
*/
static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
{
return rcu_dereference_protected(rdev,
atomic_read(&rcu_access_pointer(rdev)->nr_pending));
}
/*
* This helper wraps rcu_dereference_protected() and should be used
* when it is known that the mddev_lock() is held. This is safe
* seeing raid5_remove_disk() has the same lock held.
*/
static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev,
struct md_rdev __rcu *rdev)
{
return rcu_dereference_protected(rdev,
lockdep_is_held(&mddev->reconfig_mutex));
}
static void raid5_end_read_request(struct bio * bi)
{
struct stripe_head *sh = bi->bi_private;
......@@ -2674,9 +2707,9 @@ static void raid5_end_read_request(struct bio * bi)
* In that case it moved down to 'rdev'.
* rdev is not removed until all requests are finished.
*/
rdev = conf->disks[i].replacement;
rdev = rdev_pend_deref(conf->disks[i].replacement);
if (!rdev)
rdev = conf->disks[i].rdev;
rdev = rdev_pend_deref(conf->disks[i].rdev);
if (use_new_offset(conf, sh))
s = sh->sector + rdev->new_data_offset;
......@@ -2790,11 +2823,11 @@ static void raid5_end_write_request(struct bio *bi)
for (i = 0 ; i < disks; i++) {
if (bi == &sh->dev[i].req) {
rdev = conf->disks[i].rdev;
rdev = rdev_pend_deref(conf->disks[i].rdev);
break;
}
if (bi == &sh->dev[i].rreq) {
rdev = conf->disks[i].replacement;
rdev = rdev_pend_deref(conf->disks[i].replacement);
if (rdev)
replacement = 1;
else
......@@ -2802,7 +2835,7 @@ static void raid5_end_write_request(struct bio *bi)
* replaced it. rdev is not removed
* until all requests are finished.
*/
rdev = conf->disks[i].rdev;
rdev = rdev_pend_deref(conf->disks[i].rdev);
break;
}
}
......@@ -2863,34 +2896,31 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
unsigned long flags;
pr_debug("raid456: error called\n");
pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n",
mdname(mddev), bdevname(rdev->bdev, b));
spin_lock_irqsave(&conf->device_lock, flags);
set_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
mddev->degraded = raid5_calc_degraded(conf);
if (test_bit(In_sync, &rdev->flags) &&
mddev->degraded == conf->max_degraded) {
/*
* Don't allow to achieve failed state
* Don't try to recover this device
*/
if (has_failed(conf)) {
set_bit(MD_BROKEN, &conf->mddev->flags);
conf->recovery_disabled = mddev->recovery_disabled;
spin_unlock_irqrestore(&conf->device_lock, flags);
return;
pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
mdname(mddev), mddev->degraded, conf->raid_disks);
} else {
pr_crit("md/raid:%s: Operation continuing on %d devices.\n",
mdname(mddev), conf->raid_disks - mddev->degraded);
}
set_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
mddev->degraded = raid5_calc_degraded(conf);
spin_unlock_irqrestore(&conf->device_lock, flags);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags);
set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
"md/raid:%s: Operation continuing on %d devices.\n",
mdname(mddev),
bdevname(rdev->bdev, b),
mdname(mddev),
conf->raid_disks - mddev->degraded);
r5c_update_on_rdev_error(mddev, rdev);
}
......@@ -5213,23 +5243,23 @@ static void handle_stripe(struct stripe_head *sh)
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
/* We own a safe reference to the rdev */
rdev = conf->disks[i].rdev;
rdev = rdev_pend_deref(conf->disks[i].rdev);
if (!rdev_set_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0))
md_error(conf->mddev, rdev);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
rdev = conf->disks[i].rdev;
rdev = rdev_pend_deref(conf->disks[i].rdev);
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
rdev = conf->disks[i].replacement;
rdev = rdev_pend_deref(conf->disks[i].replacement);
if (!rdev)
/* rdev have been moved down */
rdev = conf->disks[i].rdev;
rdev = rdev_pend_deref(conf->disks[i].rdev);
rdev_clear_badblocks(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf), 0);
rdev_dec_pending(rdev, conf->mddev);
......@@ -5256,6 +5286,7 @@ static void handle_stripe(struct stripe_head *sh)
}
static void raid5_activate_delayed(struct r5conf *conf)
__must_hold(&conf->device_lock)
{
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
while (!list_empty(&conf->delayed_list)) {
......@@ -5274,8 +5305,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
static void activate_bit_delay(struct r5conf *conf,
struct list_head *temp_inactive_list)
__must_hold(&conf->device_lock)
{
/* device_lock is held */
struct list_head head;
list_add(&head, &conf->bitmap_list);
list_del_init(&conf->bitmap_list);
......@@ -5500,6 +5531,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
* handle_list.
*/
static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
__must_hold(&conf->device_lock)
{
struct stripe_head *sh, *tmp;
struct list_head *handle_list = NULL;
......@@ -6288,7 +6320,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
*/
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
still_degraded = 1;
......@@ -6371,8 +6403,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
static int handle_active_stripes(struct r5conf *conf, int group,
struct r5worker *worker,
struct list_head *temp_inactive_list)
__releases(&conf->device_lock)
__acquires(&conf->device_lock)
__must_hold(&conf->device_lock)
{
struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
int i, batch_size = 0, hash;
......@@ -7166,7 +7197,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
int i;
int group_cnt;
struct r5worker_group *new_group;
int ret;
int ret = -ENOMEM;
if (mddev->new_level != 5
&& mddev->new_level != 4
......@@ -7225,6 +7256,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
spin_lock_init(&conf->device_lock);
seqcount_spinlock_init(&conf->gen_lock, &conf->device_lock);
mutex_init(&conf->cache_size_mutex);
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
......@@ -7302,11 +7334,13 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->level = mddev->new_level;
conf->chunk_sectors = mddev->new_chunk_sectors;
if (raid5_alloc_percpu(conf) != 0)
ret = raid5_alloc_percpu(conf);
if (ret)
goto abort;
pr_debug("raid456: run(%s) called.\n", mdname(mddev));
ret = -EIO;
rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks
......@@ -7317,11 +7351,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (test_bit(Replacement, &rdev->flags)) {
if (disk->replacement)
goto abort;
disk->replacement = rdev;
RCU_INIT_POINTER(disk->replacement, rdev);
} else {
if (disk->rdev)
goto abort;
disk->rdev = rdev;
RCU_INIT_POINTER(disk->rdev, rdev);
}
if (test_bit(In_sync, &rdev->flags)) {
......@@ -7370,6 +7404,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (grow_stripes(conf, conf->min_nr_stripes)) {
pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
mdname(mddev), memory);
ret = -ENOMEM;
goto abort;
} else
pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
......@@ -7383,7 +7418,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
conf->shrinker.count_objects = raid5_cache_count;
conf->shrinker.batch = 128;
conf->shrinker.flags = 0;
if (register_shrinker(&conf->shrinker)) {
ret = register_shrinker(&conf->shrinker);
if (ret) {
pr_warn("md/raid:%s: couldn't register shrinker.\n",
mdname(mddev));
goto abort;
......@@ -7394,17 +7430,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (!conf->thread) {
pr_warn("md/raid:%s: couldn't allocate thread.\n",
mdname(mddev));
ret = -ENOMEM;
goto abort;
}
return conf;
abort:
if (conf) {
if (conf)
free_conf(conf);
return ERR_PTR(-EIO);
} else
return ERR_PTR(-ENOMEM);
return ERR_PTR(ret);
}
static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
......@@ -7621,17 +7656,18 @@ static int raid5_run(struct mddev *mddev)
for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
i++) {
rdev = conf->disks[i].rdev;
rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
if (!rdev && conf->disks[i].replacement) {
/* The replacement is all we have yet */
rdev = conf->disks[i].replacement;
rdev = rdev_mdlock_deref(mddev,
conf->disks[i].replacement);
conf->disks[i].replacement = NULL;
clear_bit(Replacement, &rdev->flags);
conf->disks[i].rdev = rdev;
rcu_assign_pointer(conf->disks[i].rdev, rdev);
}
if (!rdev)
continue;
if (conf->disks[i].replacement &&
if (rcu_access_pointer(conf->disks[i].replacement) &&
conf->reshape_progress != MaxSector) {
/* replacements and reshape simply do not mix. */
pr_warn("md: cannot handle concurrent replacement and reshape.\n");
......@@ -7828,8 +7864,8 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
static void print_raid5_conf (struct r5conf *conf)
{
struct md_rdev *rdev;
int i;
struct disk_info *tmp;
pr_debug("RAID conf printout:\n");
if (!conf) {
......@@ -7840,50 +7876,54 @@ static void print_raid5_conf (struct r5conf *conf)
conf->raid_disks,
conf->raid_disks - conf->mddev->degraded);
rcu_read_lock();
for (i = 0; i < conf->raid_disks; i++) {
char b[BDEVNAME_SIZE];
tmp = conf->disks + i;
if (tmp->rdev)
rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev)
pr_debug(" disk %d, o:%d, dev:%s\n",
i, !test_bit(Faulty, &tmp->rdev->flags),
bdevname(tmp->rdev->bdev, b));
i, !test_bit(Faulty, &rdev->flags),
bdevname(rdev->bdev, b));
}
rcu_read_unlock();
}
static int raid5_spare_active(struct mddev *mddev)
{
int i;
struct r5conf *conf = mddev->private;
struct disk_info *tmp;
struct md_rdev *rdev, *replacement;
int count = 0;
unsigned long flags;
for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->disks + i;
if (tmp->replacement
&& tmp->replacement->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->replacement->flags)
&& !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
replacement = rdev_mdlock_deref(mddev,
conf->disks[i].replacement);
if (replacement
&& replacement->recovery_offset == MaxSector
&& !test_bit(Faulty, &replacement->flags)
&& !test_and_set_bit(In_sync, &replacement->flags)) {
/* Replacement has just become active. */
if (!tmp->rdev
|| !test_and_clear_bit(In_sync, &tmp->rdev->flags))
if (!rdev
|| !test_and_clear_bit(In_sync, &rdev->flags))
count++;
if (tmp->rdev) {
if (rdev) {
/* Replaced device not technically faulty,
* but we need to be sure it gets removed
* and never re-added.
*/
set_bit(Faulty, &tmp->rdev->flags);
set_bit(Faulty, &rdev->flags);
sysfs_notify_dirent_safe(
tmp->rdev->sysfs_state);
rdev->sysfs_state);
}
sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
} else if (tmp->rdev
&& tmp->rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
sysfs_notify_dirent_safe(replacement->sysfs_state);
} else if (rdev
&& rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &rdev->flags)
&& !test_and_set_bit(In_sync, &rdev->flags)) {
count++;
sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
sysfs_notify_dirent_safe(rdev->sysfs_state);
}
}
spin_lock_irqsave(&conf->device_lock, flags);
......@@ -7898,8 +7938,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r5conf *conf = mddev->private;
int err = 0;
int number = rdev->raid_disk;
struct md_rdev **rdevp;
struct md_rdev __rcu **rdevp;
struct disk_info *p = conf->disks + number;
struct md_rdev *tmp;
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags) && conf->log) {
......@@ -7917,9 +7958,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
log_exit(conf);
return 0;
}
if (rdev == p->rdev)
if (rdev == rcu_access_pointer(p->rdev))
rdevp = &p->rdev;
else if (rdev == p->replacement)
else if (rdev == rcu_access_pointer(p->replacement))
rdevp = &p->replacement;
else
return 0;
......@@ -7939,18 +7980,20 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled &&
!has_failed(conf) &&
(!p->replacement || p->replacement == rdev) &&
(!rcu_access_pointer(p->replacement) ||
rcu_access_pointer(p->replacement) == rdev) &&
number < conf->raid_disks) {
err = -EBUSY;
goto abort;
}
*rdevp = NULL;
if (!test_bit(RemoveSynchronized, &rdev->flags)) {
lockdep_assert_held(&mddev->reconfig_mutex);
synchronize_rcu();
if (atomic_read(&rdev->nr_pending)) {
/* lost the race, try later */
err = -EBUSY;
*rdevp = rdev;
rcu_assign_pointer(*rdevp, rdev);
}
}
if (!err) {
......@@ -7958,17 +8001,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err)
goto abort;
}
if (p->replacement) {
tmp = rcu_access_pointer(p->replacement);
if (tmp) {
/* We must have just cleared 'rdev' */
p->rdev = p->replacement;
clear_bit(Replacement, &p->replacement->flags);
rcu_assign_pointer(p->rdev, tmp);
clear_bit(Replacement, &tmp->flags);
smp_mb(); /* Make sure other CPUs may see both as identical
* but will never see neither - if they are careful
*/
p->replacement = NULL;
rcu_assign_pointer(p->replacement, NULL);
if (!err)
err = log_modify(conf, p->rdev, true);
err = log_modify(conf, tmp, true);
}
clear_bit(WantReplacement, &rdev->flags);
......@@ -7984,6 +8029,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int ret, err = -EEXIST;
int disk;
struct disk_info *p;
struct md_rdev *tmp;
int first = 0;
int last = conf->raid_disks - 1;
......@@ -8041,7 +8087,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
}
for (disk = first; disk <= last; disk++) {
p = conf->disks + disk;
if (test_bit(WantReplacement, &p->rdev->flags) &&
tmp = rdev_mdlock_deref(mddev, p->rdev);
if (test_bit(WantReplacement, &tmp->flags) &&
p->replacement == NULL) {
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
......@@ -8332,6 +8379,7 @@ static void end_reshape(struct r5conf *conf)
static void raid5_finish_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
struct md_rdev *rdev;
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
......@@ -8343,10 +8391,12 @@ static void raid5_finish_reshape(struct mddev *mddev)
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
d++) {
struct md_rdev *rdev = conf->disks[d].rdev;
rdev = rdev_mdlock_deref(mddev,
conf->disks[d].rdev);
if (rdev)
clear_bit(In_sync, &rdev->flags);
rdev = conf->disks[d].replacement;
rdev = rdev_mdlock_deref(mddev,
conf->disks[d].replacement);
if (rdev)
clear_bit(In_sync, &rdev->flags);
}
......
......@@ -473,7 +473,8 @@ enum {
*/
struct disk_info {
struct md_rdev *rdev, *replacement;
struct md_rdev __rcu *rdev;
struct md_rdev __rcu *replacement;
struct page *extra_page; /* extra page to use in prexor */
};
......@@ -560,6 +561,16 @@ struct r5pending_data {
struct bio_list bios;
};
struct raid5_percpu {
struct page *spare_page; /* Used when checking P/Q in raid6 */
void *scribble; /* space for constructing buffer
* lists and performing address
* conversions
*/
int scribble_obj_size;
local_lock_t lock;
};
struct r5conf {
struct hlist_head *stripe_hashtbl;
/* only protect corresponding hash list and inactive_list */
......@@ -635,15 +646,7 @@ struct r5conf {
*/
int recovery_disabled;
/* per cpu variables */
struct raid5_percpu {
struct page *spare_page; /* Used when checking P/Q in raid6 */
void *scribble; /* space for constructing buffer
* lists and performing address
* conversions
*/
int scribble_obj_size;
local_lock_t lock;
} __percpu *percpu;
struct raid5_percpu __percpu *percpu;
int scribble_disks;
int scribble_sectors;
struct hlist_node node;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment