Commit 46533ff7 authored by NeilBrown, committed by Shaohua Li

md: Use REQ_FAILFAST_* on metadata writes where appropriate

This can only be supported on personalities which ensure
that md_error() never causes an array to enter the 'failed'
state; i.e. if marking a device Faulty would make some
data inaccessible, the device's status is left as
non-Faulty.  This is true for RAID1 and RAID10.

If we get a failure writing metadata but the device doesn't
fail, it must be the last device, so we re-write without
FAILFAST to improve the chance of success.  We also flag the
device as LastDev so that future metadata updates don't
waste time on failfast writes.
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
parent 688834e6
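In short, the pattern introduced by this commit is: issue the metadata write with REQ_FAILFAST_* when it is safe, have the completion handler notice a failfast failure on a still-working (presumably last) device, and have md_super_wait() report -EAGAIN so callers loop and rewrite without failfast. The following is a minimal userspace sketch of that control flow only; the struct sim_* types and the write_metadata()/wait_metadata() helpers are hypothetical stand-ins for md_super_write()/md_super_wait(), not kernel APIs.

/*
 * Minimal userspace sketch (not kernel code) of the retry pattern in
 * this commit.  All names here are illustrative stand-ins; only the
 * control flow mirrors the patch.
 */
#include <stdbool.h>
#include <stdio.h>

struct sim_dev {
	bool faulty;	/* analogous to the Faulty rdev flag */
	bool last_dev;	/* analogous to LastDev: don't use failfast again */
};

struct sim_array {
	struct sim_dev dev;
	bool need_rewrite;	/* analogous to MD_NEED_REWRITE */
};

/* Pretend that failfast writes fail once and plain writes succeed. */
static bool do_write(bool failfast)
{
	static int failfast_attempts;
	return failfast ? (failfast_attempts++ > 0) : true;
}

/* Mirrors super_written(): on a failfast error of a still-working device,
 * flag the array so the caller re-issues the write without failfast. */
static void write_metadata(struct sim_array *a)
{
	bool failfast = !a->dev.last_dev;

	if (!do_write(failfast) && !a->dev.faulty && failfast) {
		a->need_rewrite = true;
		a->dev.last_dev = true;
	}
}

/* Mirrors the new int md_super_wait(): nonzero result when a rewrite
 * is needed (like returning -EAGAIN). */
static int wait_metadata(struct sim_array *a)
{
	if (a->need_rewrite) {
		a->need_rewrite = false;
		return -1;
	}
	return 0;
}

int main(void)
{
	struct sim_array a = { { false, false }, false };
	int attempts = 0;

	/* The do { write } while (wait < 0) loop used at the
	 * *_rdev_size_change() and md_update_sb() call sites below. */
	do {
		write_metadata(&a);
		attempts++;
	} while (wait_metadata(&a) < 0);

	printf("metadata written after %d attempt(s), last_dev=%d\n",
	       attempts, a.dev.last_dev);
	return 0;
}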
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -209,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	struct md_rdev *rdev = NULL;
+	struct md_rdev *rdev;
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
 
+restart:
+	rdev = NULL;
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
 		loff_t offset = mddev->bitmap_info.offset;
@@ -269,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 			       page);
 	}
 
-	if (wait)
-		md_super_wait(mddev);
+	if (wait && md_super_wait(mddev) < 0)
+		goto restart;
 	return 0;
 
  bad_alignment:
@@ -428,6 +430,13 @@ static void bitmap_wait_writes(struct bitmap *bitmap)
 		wait_event(bitmap->write_wait,
 			   atomic_read(&bitmap->pending_writes)==0);
 	else
+		/* Note that we ignore the return value.  The writes
+		 * might have failed, but that would just mean that
+		 * some bits which should be cleared haven't been,
+		 * which is safe.  The relevant bitmap blocks will
+		 * probably get written again, but there is no great
+		 * loss if they aren't.
+		 */
 		md_super_wait(bitmap->mddev);
 }
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -727,7 +727,13 @@ static void super_written(struct bio *bio)
 	if (bio->bi_error) {
 		pr_err("md: super_written gets error=%d\n", bio->bi_error);
 		md_error(mddev, rdev);
-	}
+		if (!test_bit(Faulty, &rdev->flags)
+		    && (bio->bi_opf & MD_FAILFAST)) {
+			set_bit(MD_NEED_REWRITE, &mddev->flags);
+			set_bit(LastDev, &rdev->flags);
+		}
+	} else
+		clear_bit(LastDev, &rdev->flags);
 
 	if (atomic_dec_and_test(&mddev->pending_writes))
 		wake_up(&mddev->sb_wait);
@@ -744,7 +750,13 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	 * if zero is reached.
 	 * If an error occurred, call md_error
 	 */
-	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
+	struct bio *bio;
+	int ff = 0;
+
+	if (test_bit(Faulty, &rdev->flags))
+		return;
+
+	bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
 	atomic_inc(&rdev->nr_pending);
@@ -753,16 +765,24 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA);
+
+	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
+	    test_bit(FailFast, &rdev->flags) &&
+	    !test_bit(LastDev, &rdev->flags))
+		ff = MD_FAILFAST;
+	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA | ff);
 
 	atomic_inc(&mddev->pending_writes);
 	submit_bio(bio);
 }
 
-void md_super_wait(struct mddev *mddev)
+int md_super_wait(struct mddev *mddev)
 {
 	/* wait for all superblock writes that were scheduled to complete */
 	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+	if (test_and_clear_bit(MD_NEED_REWRITE, &mddev->flags))
+		return -EAGAIN;
+	return 0;
 }
 
 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
@@ -1334,9 +1354,10 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
 	    rdev->mddev->level >= 1)
 		num_sectors = (sector_t)(2ULL << 32) - 2;
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
-		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	do {
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+			       rdev->sb_page);
+	} while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 }
@@ -1877,9 +1898,10 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	sb->data_size = cpu_to_le64(num_sectors);
 	sb->super_offset = rdev->sb_start;
 	sb->sb_csum = calc_sb_1_csum(sb);
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
-		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	do {
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+			       rdev->sb_page);
+	} while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 }
@@ -2416,6 +2438,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
 	if (mddev->queue)
 		blk_add_trace_msg(mddev->queue, "md md_update_sb");
+rewrite:
 	bitmap_update_sb(mddev->bitmap);
 	rdev_for_each(rdev, mddev) {
 		char b[BDEVNAME_SIZE];
@@ -2447,7 +2470,8 @@ void md_update_sb(struct mddev *mddev, int force_change)
 			/* only need to write one superblock... */
 			break;
 	}
-	md_super_wait(mddev);
+	if (md_super_wait(mddev) < 0)
+		goto rewrite;
 	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
 
 	if (mddev_is_clustered(mddev) && ret == 0)
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,6 +29,16 @@
 
 #define MaxSector (~(sector_t)0)
 
+/*
+ * These flags should really be called "NO_RETRY" rather than
+ * "FAILFAST" because they don't make any promise about time lapse,
+ * only about the number of retries, which will be zero.
+ * REQ_FAILFAST_DRIVER is not included because
+ * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
+ * seems to suggest that the errors it avoids retrying should usually
+ * be retried.
+ */
+#define MD_FAILFAST	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
 /*
  * MD's 'extended' device
  */
@@ -177,6 +187,10 @@ enum flag_bits {
 			 * It is expects that no bad block log
 			 * is present.
 			 */
+	LastDev,		/* Seems to be the last working dev as
+				 * it didn't fail, so don't use FailFast
+				 * any more for metadata
+				 */
 };
 
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -213,6 +227,11 @@ enum mddev_flags {
 	MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
 				   * already took resync lock, need to
 				   * release the lock */
+	MD_FAILFAST_SUPPORTED,	/* Using MD_FAILFAST on metadata writes is
+				 * supported as calls to md_error() will
+				 * never cause the array to become failed.
+				 */
+	MD_NEED_REWRITE,	/* metadata write needs to be repeated */
 };
 
 #define MD_UPDATE_SB_FLAGS (BIT(MD_CHANGE_DEVS) | \
 			    BIT(MD_CHANGE_CLEAN) | \
@@ -628,7 +647,7 @@ extern int mddev_congested(struct mddev *mddev, int bits);
 extern void md_flush_request(struct mddev *mddev, struct bio *bio);
 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 			   sector_t sector, int size, struct page *page);
-extern void md_super_wait(struct mddev *mddev);
+extern int md_super_wait(struct mddev *mddev);
 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 			struct page *page, int op, int op_flags,
 			bool metadata_op);
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2988,6 +2988,7 @@ static int raid1_run(struct mddev *mddev)
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 	mddev->private = conf;
+	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
 
 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3729,6 +3729,7 @@ static int raid10_run(struct mddev *mddev)
 	size = raid10_size(mddev, 0, 0);
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
+	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
 
 	if (mddev->queue) {
 		int stripe = conf->geo.raid_disks *