Commit 9631abdb authored by Mariusz Tkaczyk, committed by Song Liu

md: Set MD_BROKEN for RAID1 and RAID10

There is no direct mechanism to determine RAID failure outside the
personality. It is done by checking rdev->flags after executing
md_error(). If the "faulty" flag is not set, then -EBUSY is returned to
userspace. -EBUSY means that the array would fail after drive removal.

Mdadm has special routine to handle the array failure and it is executed
if -EBUSY is returned by md.

There are at least two known reasons not to consider this mechanism
correct:
1. A drive can be removed even if the array will fail as a result[1].
2. -EBUSY seems to be the wrong status. The array is not busy, but the
   removal process cannot proceed safely.

-EBUSY expectation cannot be removed without breaking compatibility
with userspace. In this patch first issue is resolved by adding support
for MD_BROKEN flag for RAID1 and RAID10. Support for RAID456 is added in
next commit.

The idea is to set MD_BROKEN if we are sure that the raid is in a failed
state now. This is done in each error_handler(). In md_error() the
MD_BROKEN flag is checked. If it is set, then -EBUSY is returned to
userspace.

As in the previous commit, this makes "mdadm --set-faulty" able to
fail the array. The previously proposed workaround is valid if the
optional functionality[1] is disabled.

[1] commit 9a567843("md: allow last device to be forcibly removed from
    RAID1/RAID10.")
Reviewed-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
Signed-off-by: Song Liu <song@kernel.org>
parent 5ea7c133
...@@ -2984,10 +2984,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2984,10 +2984,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
if (cmd_match(buf, "faulty") && rdev->mddev->pers) { if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
md_error(rdev->mddev, rdev); md_error(rdev->mddev, rdev);
if (test_bit(Faulty, &rdev->flags))
err = 0; if (test_bit(MD_BROKEN, &rdev->mddev->flags))
else
err = -EBUSY; err = -EBUSY;
else
err = 0;
} else if (cmd_match(buf, "remove")) { } else if (cmd_match(buf, "remove")) {
if (rdev->mddev->pers) { if (rdev->mddev->pers) {
clear_bit(Blocked, &rdev->flags); clear_bit(Blocked, &rdev->flags);
...@@ -4353,10 +4354,9 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, ...@@ -4353,10 +4354,9 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
* like active, but no writes have been seen for a while (100msec). * like active, but no writes have been seen for a while (100msec).
* *
* broken * broken
* RAID0/LINEAR-only: same as clean, but array is missing a member. * Array is failed. It's useful because mounted-arrays aren't stopped
* It's useful because RAID0/LINEAR mounted-arrays aren't stopped * when array is failed, so this state will at least alert the user that
* when a member is gone, so this state will at least alert the * something is wrong.
* user that something is wrong.
*/ */
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
write_pending, active_idle, broken, bad_word}; write_pending, active_idle, broken, bad_word};
...@@ -7443,7 +7443,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) ...@@ -7443,7 +7443,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
err = -ENODEV; err = -ENODEV;
else { else {
md_error(mddev, rdev); md_error(mddev, rdev);
if (!test_bit(Faulty, &rdev->flags)) if (test_bit(MD_BROKEN, &mddev->flags))
err = -EBUSY; err = -EBUSY;
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -7984,13 +7984,16 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) ...@@ -7984,13 +7984,16 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
if (!mddev->pers || !mddev->pers->error_handler) if (!mddev->pers || !mddev->pers->error_handler)
return; return;
mddev->pers->error_handler(mddev,rdev); mddev->pers->error_handler(mddev, rdev);
if (mddev->degraded)
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
sysfs_notify_dirent_safe(rdev->sysfs_state); sysfs_notify_dirent_safe(rdev->sysfs_state);
set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); if (!test_bit(MD_BROKEN, &mddev->flags)) {
md_wakeup_thread(mddev->thread); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
if (mddev->event_work.func) if (mddev->event_work.func)
queue_work(md_misc_wq, &mddev->event_work); queue_work(md_misc_wq, &mddev->event_work);
md_new_event(); md_new_event();
......
...@@ -234,34 +234,42 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, ...@@ -234,34 +234,42 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new); int is_new);
struct md_cluster_info; struct md_cluster_info;
/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ /**
* enum mddev_flags - md device flags.
* @MD_ARRAY_FIRST_USE: First use of array, needs initialization.
* @MD_CLOSING: If set, we are closing the array, do not open it then.
* @MD_JOURNAL_CLEAN: A raid with journal is already clean.
* @MD_HAS_JOURNAL: The raid array has journal feature set.
* @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node, already took
* resync lock, need to release the lock.
* @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as
* calls to md_error() will never cause the array to
* become failed.
* @MD_HAS_PPL: The raid array has PPL feature set.
* @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
* @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata
* without taking reconfig_mutex.
* @MD_UPDATING_SB: md_check_recovery is updating the metadata without
* explicitly holding reconfig_mutex.
* @MD_NOT_READY: do_md_run() is active, so 'array_state' must not report that
* array is ready yet.
* @MD_BROKEN: This is used to stop writes and mark array as failed.
*
* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added
*/
enum mddev_flags { enum mddev_flags {
MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */ MD_ARRAY_FIRST_USE,
MD_CLOSING, /* If set, we are closing the array, do not open MD_CLOSING,
* it then */ MD_JOURNAL_CLEAN,
MD_JOURNAL_CLEAN, /* A raid with journal is already clean */ MD_HAS_JOURNAL,
MD_HAS_JOURNAL, /* The raid array has journal feature set */ MD_CLUSTER_RESYNC_LOCKED,
MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node MD_FAILFAST_SUPPORTED,
* already took resync lock, need to MD_HAS_PPL,
* release the lock */ MD_HAS_MULTIPLE_PPLS,
MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is MD_ALLOW_SB_UPDATE,
* supported as calls to md_error() will MD_UPDATING_SB,
* never cause the array to become failed. MD_NOT_READY,
*/ MD_BROKEN,
MD_HAS_PPL, /* The raid array has PPL feature set */
MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */
MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update
* the metadata without taking reconfig_mutex.
*/
MD_UPDATING_SB, /* md_check_recovery is updating the metadata
* without explicitly holding reconfig_mutex.
*/
MD_NOT_READY, /* do_md_run() is active, so 'array_state'
* must not report that array is ready yet
*/
MD_BROKEN, /* This is used in RAID-0/LINEAR only, to stop
* I/O in case an array member is gone/failed.
*/
}; };
enum mddev_sb_flags { enum mddev_sb_flags {
......
...@@ -1641,30 +1641,39 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) ...@@ -1641,30 +1641,39 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, "]"); seq_printf(seq, "]");
} }
/**
* raid1_error() - RAID1 error handler.
* @mddev: affected md device.
* @rdev: member device to fail.
*
* The routine acknowledges &rdev failure and determines new @mddev state.
* If it failed, then:
* - &MD_BROKEN flag is set in &mddev->flags.
* - recovery is disabled.
* Otherwise, it must be degraded:
* - recovery is interrupted.
* - &mddev->degraded is bumped.
*
* @rdev is marked as &Faulty excluding case when array is failed and
* &mddev->fail_last_dev is off.
*/
static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
{ {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
unsigned long flags; unsigned long flags;
/*
* If it is not operational, then we have already marked it as dead
* else if it is the last working disks with "fail_last_dev == false",
* ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags); spin_lock_irqsave(&conf->device_lock, flags);
if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& (conf->raid_disks - mddev->degraded) == 1) { if (test_bit(In_sync, &rdev->flags) &&
/* (conf->raid_disks - mddev->degraded) == 1) {
* Don't fail the drive, act as though we were just a set_bit(MD_BROKEN, &mddev->flags);
* normal single drive.
* However don't try a recovery from this drive as if (!mddev->fail_last_dev) {
* it is very likely to fail. conf->recovery_disabled = mddev->recovery_disabled;
*/ spin_unlock_irqrestore(&conf->device_lock, flags);
conf->recovery_disabled = mddev->recovery_disabled; return;
spin_unlock_irqrestore(&conf->device_lock, flags); }
return;
} }
set_bit(Blocked, &rdev->flags); set_bit(Blocked, &rdev->flags);
if (test_and_clear_bit(In_sync, &rdev->flags)) if (test_and_clear_bit(In_sync, &rdev->flags))
......
...@@ -1970,32 +1970,40 @@ static int enough(struct r10conf *conf, int ignore) ...@@ -1970,32 +1970,40 @@ static int enough(struct r10conf *conf, int ignore)
_enough(conf, 1, ignore); _enough(conf, 1, ignore);
} }
/**
* raid10_error() - RAID10 error handler.
* @mddev: affected md device.
* @rdev: member device to fail.
*
* The routine acknowledges &rdev failure and determines new @mddev state.
* If it failed, then:
* - &MD_BROKEN flag is set in &mddev->flags.
* Otherwise, it must be degraded:
* - recovery is interrupted.
* - &mddev->degraded is bumped.
* @rdev is marked as &Faulty excluding case when array is failed and
* &mddev->fail_last_dev is off.
*/
static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{ {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
unsigned long flags; unsigned long flags;
/*
* If it is not operational, then we have already marked it as dead
* else if it is the last working disks with "fail_last_dev == false",
* ignore the error, let the next level up know.
* else mark the drive as failed
*/
spin_lock_irqsave(&conf->device_lock, flags); spin_lock_irqsave(&conf->device_lock, flags);
if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
&& !enough(conf, rdev->raid_disk)) { if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
/* set_bit(MD_BROKEN, &mddev->flags);
* Don't fail the drive, just return an IO error.
*/ if (!mddev->fail_last_dev) {
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
return; return;
}
} }
if (test_and_clear_bit(In_sync, &rdev->flags)) if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++; mddev->degraded++;
/*
* If recovery is running, make sure it aborts.
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags); set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags); set_bit(Faulty, &rdev->flags);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment