Commit 1d838d70 authored by Linus Torvalds

Merge tag 'md-3.7-fixes' of git://neil.brown.name/md

Pull md fixes from NeilBrown:
 "Several bug fixes for md in 3.7:

   - raid5 discard has problems
   - raid10 replacement devices have problems
   - bad block lock seqlock usage has problems
   - dm-raid doesn't free everything"

* tag 'md-3.7-fixes' of git://neil.brown.name/md:
  md/raid10: decrement correct pending counter when writing to replacement.
  md/raid10: close race that loses writes when replacement completes.
  md/raid5: Make sure we clear R5_Discard when discard is finished.
  md/raid5: move resolving of reconstruct_state earlier in stripe_handle.
  md/raid5: round discard alignment up to power of 2.
  md: make sure everything is freed when dm-raid stops an array.
  md: Avoid writing to an invalid address if read_seqretry returned true.
  md: Reassign the search parameters if read_seqretry returned true in md_is_badblock.
parents a8946afe 884162df
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1817,10 +1817,10 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 		memset(bbp, 0xff, PAGE_SIZE);
 		for (i = 0 ; i < bb->count ; i++) {
-			u64 internal_bb = *p++;
+			u64 internal_bb = p[i];
 			u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
 					| BB_LEN(internal_bb));
-			*bbp++ = cpu_to_le64(store_bb);
+			bbp[i] = cpu_to_le64(store_bb);
 		}
 		bb->changed = 0;
 		if (read_seqretry(&bb->lock, seq))
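
The super_1_sync change above encodes a general seqlock rule: a read-side section can execute more than once, so it must not mutate anything that outlives a single pass. The old *p++ and *bbp++ advanced both cursors on every pass, so once read_seqretry() forced a retry the loop kept writing past the end of the page; indexed stores make each pass idempotent. A minimal userspace sketch of that discipline follows (stubbed seqlock, illustrative names, not the kernel code):

    /* Userspace sketch only: the seqlock is stubbed and the names are
     * illustrative.  It shows why indexed stores survive a retry while
     * the old post-incremented pointers marched past the buffer.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define NBLOCKS 4

    static uint64_t internal[NBLOCKS] = { 1, 2, 3, 4 };
    static uint64_t store[NBLOCKS];

    /* Pretend the first pass raced with a writer and must run again. */
    static int read_seqretry_stub(int pass)
    {
            return pass == 0;
    }

    int main(void)
    {
            int pass = 0;
    retry:
            /* Idempotent: each pass writes the same slots, so a retry
             * is harmless.  With *p++ / *bbp++ the second pass would
             * start where the first left off, beyond the array.
             */
            for (int i = 0; i < NBLOCKS; i++)
                    store[i] = internal[i] << 10;
            if (read_seqretry_stub(pass++))
                    goto retry;
            for (int i = 0; i < NBLOCKS; i++)
                    printf("%llu\n", (unsigned long long)store[i]);
            return 0;
    }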
@@ -5294,7 +5294,7 @@ void md_stop_writes(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(md_stop_writes);
 
-void md_stop(struct mddev *mddev)
+static void __md_stop(struct mddev *mddev)
 {
 	mddev->ready = 0;
 	mddev->pers->stop(mddev);
@@ -5304,6 +5304,18 @@ void md_stop(struct mddev *mddev)
 	mddev->pers = NULL;
 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 }
+
+void md_stop(struct mddev *mddev)
+{
+	/* stop the array and free an attached data structures.
+	 * This is called from dm-raid
+	 */
+	__md_stop(mddev);
+	bitmap_destroy(mddev);
+	if (mddev->bio_set)
+		bioset_free(mddev->bio_set);
+}
+
 EXPORT_SYMBOL_GPL(md_stop);
 
 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
@@ -5364,7 +5376,7 @@ static int do_md_stop(struct mddev * mddev, int mode,
 		set_disk_ro(disk, 0);
 
 	__md_stop_writes(mddev);
-	md_stop(mddev);
+	__md_stop(mddev);
 	mddev->queue->merge_bvec_fn = NULL;
 	mddev->queue->backing_dev_info.congested_fn = NULL;
@@ -7936,9 +7948,9 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
 		   sector_t *first_bad, int *bad_sectors)
 {
 	int hi;
-	int lo = 0;
+	int lo;
 	u64 *p = bb->page;
-	int rv = 0;
+	int rv;
 	sector_t target = s + sectors;
 	unsigned seq;
@@ -7953,7 +7965,8 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
 
 retry:
 	seq = read_seqbegin(&bb->lock);
-
+	lo = 0;
+	rv = 0;
 	hi = bb->count;
 	/* Binary search between lo and hi for 'target'
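
The md_is_badblock change applies the same retry discipline to derived state: lo and rv are now reinitialized inside the retry loop rather than at declaration, so a torn read cannot leak a half-narrowed binary search into the next pass. A standalone sketch under the same caveat (simulated retry, illustrative data, not the kernel function):

    /* Standalone sketch, not the kernel code: everything the search
     * derives (lo, rv) is reset on every pass through retry:.
     */
    #include <stdio.h>

    static int table[] = { 10, 20, 30, 40, 50 };
    static int count = 5;
    static int stale = 1;   /* simulate one torn read forcing a retry */

    int main(void)
    {
            int lo, hi, rv, target = 40;
    retry:
            lo = 0;         /* reset per pass; the old code carried the */
            rv = -1;        /* narrowed lo and a stale rv across retries */
            hi = count;
            while (lo < hi) {
                    int mid = (lo + hi) / 2;
                    if (table[mid] == target) {
                            rv = mid;
                            break;
                    }
                    if (table[mid] < target)
                            lo = mid + 1;
                    else
                            hi = mid;
            }
            if (stale--)
                    goto retry;
            printf("target %d found at index %d\n", target, rv);
            return 0;
    }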
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -499,7 +499,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	 */
 	one_write_done(r10_bio);
 	if (dec_rdev)
-		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
+		rdev_dec_pending(rdev, conf->mddev);
 }
 
 /*
@@ -1334,18 +1334,21 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			blocked_rdev = rrdev;
 			break;
 		}
+		if (rdev && (test_bit(Faulty, &rdev->flags)
+			     || test_bit(Unmerged, &rdev->flags)))
+			rdev = NULL;
 		if (rrdev && (test_bit(Faulty, &rrdev->flags)
 			      || test_bit(Unmerged, &rrdev->flags)))
 			rrdev = NULL;
 
 		r10_bio->devs[i].bio = NULL;
 		r10_bio->devs[i].repl_bio = NULL;
-		if (!rdev || test_bit(Faulty, &rdev->flags) ||
-		    test_bit(Unmerged, &rdev->flags)) {
+
+		if (!rdev && !rrdev) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
 			continue;
 		}
-		if (test_bit(WriteErrorSeen, &rdev->flags)) {
+		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
 			sector_t first_bad;
 			sector_t dev_sector = r10_bio->devs[i].addr;
 			int bad_sectors;
@@ -1387,8 +1390,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 				max_sectors = good_sectors;
 			}
 		}
-		r10_bio->devs[i].bio = bio;
-		atomic_inc(&rdev->nr_pending);
+		if (rdev) {
+			r10_bio->devs[i].bio = bio;
+			atomic_inc(&rdev->nr_pending);
+		}
 		if (rrdev) {
 			r10_bio->devs[i].repl_bio = bio;
 			atomic_inc(&rrdev->nr_pending);
@@ -1444,9 +1449,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
-		if (!r10_bio->devs[i].bio)
-			continue;
+		if (r10_bio->devs[i].bio) {
+			struct md_rdev *rdev = conf->mirrors[d].rdev;
 			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
 				    max_sectors);
@@ -1454,17 +1458,19 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			mbio->bi_sector	= (r10_bio->devs[i].addr+
 					   choose_data_offset(r10_bio,
-							      conf->mirrors[d].rdev));
-			mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
+							      rdev));
+			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
 			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
-			cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
+			cb = blk_check_plugged(raid10_unplug, mddev,
+					       sizeof(*plug));
 			if (cb)
-				plug = container_of(cb, struct raid10_plug_cb, cb);
+				plug = container_of(cb, struct raid10_plug_cb,
+						    cb);
 			else
 				plug = NULL;
 			spin_lock_irqsave(&conf->device_lock, flags);
@@ -1478,24 +1484,24 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			spin_unlock_irqrestore(&conf->device_lock, flags);
 			if (!plug)
 				md_wakeup_thread(mddev->thread);
+		}
 
-		if (!r10_bio->devs[i].repl_bio)
-			continue;
+		if (r10_bio->devs[i].repl_bio) {
+			struct md_rdev *rdev = conf->mirrors[d].replacement;
+			if (rdev == NULL) {
+				/* Replacement just got moved to main 'rdev' */
+				smp_mb();
+				rdev = conf->mirrors[d].rdev;
+			}
 			mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 			md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
 				    max_sectors);
 			r10_bio->devs[i].repl_bio = mbio;
-			/* We are actively writing to the original device
-			 * so it cannot disappear, so the replacement cannot
-			 * become NULL here
-			 */
 			mbio->bi_sector	= (r10_bio->devs[i].addr +
 					   choose_data_offset(
-						r10_bio,
-						conf->mirrors[d].replacement));
-			mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
+						r10_bio, rdev));
+			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
 			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
 			mbio->bi_private = r10_bio;
@@ -1508,6 +1514,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			if (!mddev_check_plugged(mddev))
 				md_wakeup_thread(mddev->thread);
 		}
+	}
 
 	/* Don't remove the bias on 'remaining' (one_write_done) until
 	 * after checking if we need to go around again.
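
The heart of the raid10 race fix is the NULL fallback: once recovery completes, the replacement device can be promoted into the main rdev slot between the time the write was queued and the time it is issued, so the issuing path re-reads conf->mirrors[d].replacement and, if it has become NULL, follows the promotion to ->rdev instead of dereferencing a stale pointer. A userspace model of that fallback (illustrative types; the kernel pairs the re-read with smp_mb(), approximated here with atomics):

    /* Illustrative userspace model of the fallback, not the md code.
     * The "promotion" that normally races from another thread is done
     * inline here just to exercise the NULL path.
     */
    #include <stdatomic.h>
    #include <stdio.h>

    struct dev { const char *name; };

    static struct dev main_slot_dev = { "rdev" };
    static struct dev repl_slot_dev = { "replacement" };

    static _Atomic(struct dev *) mirror_rdev = &main_slot_dev;
    static _Atomic(struct dev *) mirror_replacement = &repl_slot_dev;

    static void promote_replacement(void)
    {
            /* Recovery finished: replacement becomes the main device. */
            atomic_store(&mirror_rdev, &repl_slot_dev);
            atomic_store(&mirror_replacement, NULL);
    }

    int main(void)
    {
            promote_replacement();  /* simulate losing the race */

            struct dev *rdev = atomic_load(&mirror_replacement);
            if (rdev == NULL) {
                    /* Replacement just got moved to the main 'rdev';
                     * the kernel issues smp_mb() before this re-read. */
                    rdev = atomic_load(&mirror_rdev);
            }
            printf("issuing write via %s\n", rdev->name);
            return 0;
    }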
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2774,10 +2774,12 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
 			    (test_bit(R5_UPTODATE, &dev->flags) ||
-			     test_and_clear_bit(R5_Discard, &dev->flags))) {
+			     test_bit(R5_Discard, &dev->flags))) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
 				pr_debug("Return write for disc %d\n", i);
+				if (test_and_clear_bit(R5_Discard, &dev->flags))
+					clear_bit(R5_UPTODATE, &dev->flags);
 				wbi = dev->written;
 				dev->written = NULL;
 				while (wbi && wbi->bi_sector <
@@ -2795,7 +2797,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 					!test_bit(STRIPE_DEGRADED, &sh->state),
 						0);
 			}
-		}
+		} else if (test_bit(R5_Discard, &sh->dev[i].flags))
+			clear_bit(R5_Discard, &sh->dev[i].flags);
 
 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
 		if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -3490,40 +3493,6 @@ static void handle_stripe(struct stripe_head *sh)
 		handle_failed_sync(conf, sh, &s);
 	}
 
-	/*
-	 * might be able to return some write requests if the parity blocks
-	 * are safe, or on a failed drive
-	 */
-	pdev = &sh->dev[sh->pd_idx];
-	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
-		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
-	qdev = &sh->dev[sh->qd_idx];
-	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
-		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
-		|| conf->level < 6;
-
-	if (s.written &&
-	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
-			     && !test_bit(R5_LOCKED, &pdev->flags)
-			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
-				 test_bit(R5_Discard, &pdev->flags))))) &&
-	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
-			     && !test_bit(R5_LOCKED, &qdev->flags)
-			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
-				 test_bit(R5_Discard, &qdev->flags))))))
-		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
-
-	/* Now we might consider reading some blocks, either to check/generate
-	 * parity, or to satisfy requests
-	 * or to load a block that is being partially written.
-	 */
-	if (s.to_read || s.non_overwrite
-	    || (conf->level == 6 && s.to_write && s.failed)
-	    || (s.syncing && (s.uptodate + s.compute < disks))
-	    || s.replacing
-	    || s.expanding)
-		handle_stripe_fill(sh, &s, disks);
-
 	/* Now we check to see if any write operations have recently
 	 * completed
 	 */
@@ -3561,6 +3530,40 @@ static void handle_stripe(struct stripe_head *sh)
 			s.dec_preread_active = 1;
 	}
 
+	/*
+	 * might be able to return some write requests if the parity blocks
+	 * are safe, or on a failed drive
+	 */
+	pdev = &sh->dev[sh->pd_idx];
+	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
+	qdev = &sh->dev[sh->qd_idx];
+	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
+		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
+		|| conf->level < 6;
+
+	if (s.written &&
+	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
+			     && !test_bit(R5_LOCKED, &pdev->flags)
+			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
+				 test_bit(R5_Discard, &pdev->flags))))) &&
+	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
+			     && !test_bit(R5_LOCKED, &qdev->flags)
+			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
+				 test_bit(R5_Discard, &qdev->flags))))))
+		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+
+	/* Now we might consider reading some blocks, either to check/generate
+	 * parity, or to satisfy requests
+	 * or to load a block that is being partially written.
+	 */
+	if (s.to_read || s.non_overwrite
+	    || (conf->level == 6 && s.to_write && s.failed)
+	    || (s.syncing && (s.uptodate + s.compute < disks))
+	    || s.replacing
+	    || s.expanding)
+		handle_stripe_fill(sh, &s, disks);
+
 	/* Now to consider new write requests and what else, if anything
 	 * should be read. We do not handle new writes when:
 	 * 1/ A 'write' operation (copy+xor) is already in flight.
@@ -5529,6 +5532,10 @@ static int run(struct mddev *mddev)
 		 * discard data disk but write parity disk
 		 */
 		stripe = stripe * PAGE_SIZE;
+		/* Round up to power of 2, as discard handling
+		 * currently assumes that */
+		while ((stripe-1) & stripe)
+			stripe = (stripe | (stripe-1)) + 1;
 		mddev->queue->limits.discard_alignment = stripe;
 		mddev->queue->limits.discard_granularity = stripe;
 
 		/*
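
The alignment fix in run() leans on a compact bit trick: (stripe - 1) & stripe is nonzero exactly while stripe has more than one bit set, and (stripe | (stripe - 1)) + 1 fills every bit below the top one and then carries into the next power of two, so the loop converges in a single step for any nonzero input. A standalone demonstration (sample values are arbitrary, not taken from the commit):

    /* Standalone demo of the round-up-to-power-of-2 idiom added in
     * run().  For a power of two the loop body never executes; for
     * anything else it executes exactly once.
     */
    #include <stdio.h>

    static unsigned long roundup_pow2(unsigned long stripe)
    {
            while ((stripe - 1) & stripe)   /* more than one bit set? */
                    stripe = (stripe | (stripe - 1)) + 1;
            return stripe;
    }

    int main(void)
    {
            unsigned long samples[] = { 4096, 3 * 4096, 5 * 4096, 65536 };
            for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                    printf("%lu -> %lu\n", samples[i], roundup_pow2(samples[i]));
            return 0;
    }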