Commit c492e2d4 authored by Linus Torvalds

Merge tag 'md/4.1-rc5-fixes' of git://neil.brown.name/md

Pull more md bugfixes from Neil Brown:
 "Assorted fixes for new RAID5 stripe-batching functionality.

  Unfortunately this functionality was merged a little prematurely.  The
  necessary testing and code review is now complete (or as complete as
  it can be) and the code passes a variety of tests and looks quite
  sensible.

  Also a fix for some recent locking changes - a race was introduced
  which causes a reshape request to sometimes fail.  No data safety
  issues"

* tag 'md/4.1-rc5-fixes' of git://neil.brown.name/md:
  md: fix race when unfreezing sync_action
  md/raid5: break stripe-batches when the array has failed.
  md/raid5: call break_stripe_batch_list from handle_stripe_clean_event
  md/raid5: be more selective about distributing flags across batch.
  md/raid5: add handle_flags arg to break_stripe_batch_list.
  md/raid5: duplicate some more handle_stripe_clean_event code in break_stripe_batch_list
  md/raid5: remove condition test from check_break_stripe_batch_list.
  md/raid5: Ensure a batch member is not handled prematurely.
  md/raid5: close race between STRIPE_BIT_DELAY and batching.
  md/raid5: ensure whole batch is delayed for all required bitmap updates.
parents ace6a22a 56ccc112
@@ -4211,12 +4211,12 @@ action_store(struct mddev *mddev, const char *page, size_t len)
         if (!mddev->pers || !mddev->pers->sync_request)
                 return -EINVAL;
-        if (cmd_match(page, "frozen"))
-                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-        else
-                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
         if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
+                if (cmd_match(page, "frozen"))
+                        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+                else
+                        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                 flush_workqueue(md_misc_wq);
                 if (mddev->sync_thread) {
                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -4229,16 +4229,17 @@ action_store(struct mddev *mddev, const char *page, size_t len)
             test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
                 return -EBUSY;
         else if (cmd_match(page, "resync"))
-                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
         else if (cmd_match(page, "recover")) {
+                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
         } else if (cmd_match(page, "reshape")) {
                 int err;
                 if (mddev->pers->start_reshape == NULL)
                         return -EINVAL;
                 err = mddev_lock(mddev);
                 if (!err) {
+                        clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                         err = mddev->pers->start_reshape(mddev);
                         mddev_unlock(mddev);
                 }
@@ -4250,6 +4251,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
                         set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
                 else if (!cmd_match(page, "repair"))
                         return -EINVAL;
+                clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
         }
......
@@ -749,6 +749,7 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 static bool stripe_can_batch(struct stripe_head *sh)
 {
         return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+                !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
                 is_full_stripe_write(sh);
 }
@@ -837,6 +838,15 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
                     < IO_THRESHOLD)
                         md_wakeup_thread(conf->mddev->thread);
 
+        if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
+                int seq = sh->bm_seq;
+                if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
+                    sh->batch_head->bm_seq > seq)
+                        seq = sh->batch_head->bm_seq;
+                set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
+                sh->batch_head->bm_seq = seq;
+        }
+
         atomic_inc(&sh->count);
 unlock_out:
         unlock_two_stripes(head, sh);
@@ -2987,14 +2997,32 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
         pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
                 (unsigned long long)(*bip)->bi_iter.bi_sector,
                 (unsigned long long)sh->sector, dd_idx);
-        spin_unlock_irq(&sh->stripe_lock);
 
         if (conf->mddev->bitmap && firstwrite) {
+                /* Cannot hold spinlock over bitmap_startwrite,
+                 * but must ensure this isn't added to a batch until
+                 * we have added to the bitmap and set bm_seq.
+                 * So set STRIPE_BITMAP_PENDING to prevent
+                 * batching.
+                 * If multiple add_stripe_bio() calls race here they
+                 * must all set STRIPE_BITMAP_PENDING.  So only the first one
+                 * to complete "bitmap_startwrite" gets to set
+                 * STRIPE_BIT_DELAY.  This is important as once a stripe
+                 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
+                 * any more.
+                 */
+                set_bit(STRIPE_BITMAP_PENDING, &sh->state);
+                spin_unlock_irq(&sh->stripe_lock);
                 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
                                   STRIPE_SECTORS, 0);
-                sh->bm_seq = conf->seq_flush+1;
-                set_bit(STRIPE_BIT_DELAY, &sh->state);
+                spin_lock_irq(&sh->stripe_lock);
+                clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
+                if (!sh->batch_head) {
+                        sh->bm_seq = conf->seq_flush+1;
+                        set_bit(STRIPE_BIT_DELAY, &sh->state);
+                }
         }
+        spin_unlock_irq(&sh->stripe_lock);
 
         if (stripe_can_batch(sh))
                 stripe_add_to_batch_list(conf, sh);
@@ -3392,6 +3420,8 @@ static void handle_stripe_fill(struct stripe_head *sh,
         set_bit(STRIPE_HANDLE, &sh->state);
 }
 
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+                                    unsigned long handle_flags);
 /* handle_stripe_clean_event
  * any written block on an uptodate or failed drive can be returned.
  * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
@@ -3405,7 +3435,6 @@ static void handle_stripe_clean_event(struct r5conf *conf,
         int discard_pending = 0;
         struct stripe_head *head_sh = sh;
         bool do_endio = false;
-        int wakeup_nr = 0;
 
         for (i = disks; i--; )
                 if (sh->dev[i].written) {
@@ -3494,44 +3523,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
                 if (atomic_dec_and_test(&conf->pending_full_writes))
                         md_wakeup_thread(conf->mddev->thread);
 
-        if (!head_sh->batch_head || !do_endio)
-                return;
-        for (i = 0; i < head_sh->disks; i++) {
-                if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
-                        wakeup_nr++;
-        }
-        while (!list_empty(&head_sh->batch_list)) {
-                int i;
-                sh = list_first_entry(&head_sh->batch_list,
-                                      struct stripe_head, batch_list);
-                list_del_init(&sh->batch_list);
-                set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
-                              head_sh->state & ~((1 << STRIPE_ACTIVE) |
-                                                 (1 << STRIPE_PREREAD_ACTIVE) |
-                                                 STRIPE_EXPAND_SYNC_FLAG));
-                sh->check_state = head_sh->check_state;
-                sh->reconstruct_state = head_sh->reconstruct_state;
-                for (i = 0; i < sh->disks; i++) {
-                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-                                wakeup_nr++;
-                        sh->dev[i].flags = head_sh->dev[i].flags;
-                }
-                spin_lock_irq(&sh->stripe_lock);
-                sh->batch_head = NULL;
-                spin_unlock_irq(&sh->stripe_lock);
-                if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
-                        set_bit(STRIPE_HANDLE, &sh->state);
-                release_stripe(sh);
-        }
-        spin_lock_irq(&head_sh->stripe_lock);
-        head_sh->batch_head = NULL;
-        spin_unlock_irq(&head_sh->stripe_lock);
-        wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
-        if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
-                set_bit(STRIPE_HANDLE, &head_sh->state);
+        if (head_sh->batch_head && do_endio)
+                break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
 static void handle_stripe_dirtying(struct r5conf *conf,
@@ -4172,9 +4165,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
 static int clear_batch_ready(struct stripe_head *sh)
 {
+        /* Return '1' if this is a member of batch, or
+         * '0' if it is a lone stripe or a head which can now be
+         * handled.
+         */
         struct stripe_head *tmp;
         if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
-                return 0;
+                return (sh->batch_head && sh->batch_head != sh);
         spin_lock(&sh->stripe_lock);
         if (!sh->batch_head) {
                 spin_unlock(&sh->stripe_lock);
@@ -4202,38 +4199,65 @@ static int clear_batch_ready(struct stripe_head *sh)
         return 0;
 }
 
-static void check_break_stripe_batch_list(struct stripe_head *sh)
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+                                    unsigned long handle_flags)
 {
-        struct stripe_head *head_sh, *next;
+        struct stripe_head *sh, *next;
         int i;
+        int do_wakeup = 0;
-        if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
-                return;
-        head_sh = sh;
 
         list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
                 list_del_init(&sh->batch_list);
-                set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
-                              head_sh->state & ~((1 << STRIPE_ACTIVE) |
-                                                 (1 << STRIPE_PREREAD_ACTIVE) |
-                                                 (1 << STRIPE_DEGRADED) |
-                                                 STRIPE_EXPAND_SYNC_FLAG));
+                WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
+                                          (1 << STRIPE_SYNCING) |
+                                          (1 << STRIPE_REPLACED) |
+                                          (1 << STRIPE_PREREAD_ACTIVE) |
+                                          (1 << STRIPE_DELAYED) |
+                                          (1 << STRIPE_BIT_DELAY) |
+                                          (1 << STRIPE_FULL_WRITE) |
+                                          (1 << STRIPE_BIOFILL_RUN) |
+                                          (1 << STRIPE_COMPUTE_RUN) |
+                                          (1 << STRIPE_OPS_REQ_PENDING) |
+                                          (1 << STRIPE_DISCARD) |
+                                          (1 << STRIPE_BATCH_READY) |
+                                          (1 << STRIPE_BATCH_ERR) |
+                                          (1 << STRIPE_BITMAP_PENDING)));
+                WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
+                                               (1 << STRIPE_REPLACED)));
+                set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
+                                            (1 << STRIPE_DEGRADED)),
+                              head_sh->state & (1 << STRIPE_INSYNC));
                 sh->check_state = head_sh->check_state;
                 sh->reconstruct_state = head_sh->reconstruct_state;
-                for (i = 0; i < sh->disks; i++)
+                for (i = 0; i < sh->disks; i++) {
+                        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+                                do_wakeup = 1;
                         sh->dev[i].flags = head_sh->dev[i].flags &
                                 (~((1 << R5_WriteError) | (1 << R5_Overlap)));
+                }
                 spin_lock_irq(&sh->stripe_lock);
                 sh->batch_head = NULL;
                 spin_unlock_irq(&sh->stripe_lock);
-                set_bit(STRIPE_HANDLE, &sh->state);
+                if (handle_flags == 0 ||
+                    sh->state & handle_flags)
+                        set_bit(STRIPE_HANDLE, &sh->state);
                 release_stripe(sh);
         }
+        spin_lock_irq(&head_sh->stripe_lock);
+        head_sh->batch_head = NULL;
+        spin_unlock_irq(&head_sh->stripe_lock);
+        for (i = 0; i < head_sh->disks; i++)
+                if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+                        do_wakeup = 1;
+        if (head_sh->state & handle_flags)
+                set_bit(STRIPE_HANDLE, &head_sh->state);
+
+        if (do_wakeup)
+                wake_up(&head_sh->raid_conf->wait_for_overlap);
 }
 
 static void handle_stripe(struct stripe_head *sh)
@@ -4258,7 +4282,8 @@ static void handle_stripe(struct stripe_head *sh)
                 return;
         }
 
-        check_break_stripe_batch_list(sh);
+        if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+                break_stripe_batch_list(sh, 0);
 
         if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
                 spin_lock(&sh->stripe_lock);
@@ -4312,6 +4337,7 @@ static void handle_stripe(struct stripe_head *sh)
         if (s.failed > conf->max_degraded) {
                 sh->check_state = 0;
                 sh->reconstruct_state = 0;
+                break_stripe_batch_list(sh, 0);
                 if (s.to_read+s.to_write+s.written)
                         handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
                 if (s.syncing + s.replacing)
......
@@ -337,9 +337,12 @@ enum {
         STRIPE_ON_RELEASE_LIST,
         STRIPE_BATCH_READY,
         STRIPE_BATCH_ERR,
+        STRIPE_BITMAP_PENDING,  /* Being added to bitmap, don't add
+                                 * to batch yet.
+                                 */
 };
 
-#define STRIPE_EXPAND_SYNC_FLAG \
+#define STRIPE_EXPAND_SYNC_FLAGS \
         ((1 << STRIPE_EXPAND_SOURCE) |\
         (1 << STRIPE_EXPAND_READY) |\
         (1 << STRIPE_EXPANDING) |\
......