Commit 4b382d06 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md:
  md: allow resync_start to be set while an array is active.
  md/raid10:  reformat some loops with less indenting.
  md/raid10: remove unused variable.
  md/raid10: make more use of 'slot' in raid10d.
  md/raid10: some tidying up in fix_read_error
  md/raid1: improve handling of pages allocated for write-behind.
  md/raid1: try fix_sync_read_error before process_checks.
  md/raid1: tidy up new functions: process_checks and fix_sync_read_error.
  md/raid1: split out two sub-functions from sync_request_write
  md: make error_handler functions more uniform and correct.
  md/multipath: discard ->working_disks in favour of ->degraded
  md/raid1: clean up read_balance.
  md: simplify raid10 read_balance
  md/bitmap: fix saving of events_cleared and other state.
  md: reject a re-add request that cannot be honoured.
  md: Fix race when creating a new md device.
parents bdfbe804 b098636c
......@@ -493,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
spin_unlock_irqrestore(&bitmap->lock, flags);
sb = kmap_atomic(bitmap->sb_page, KM_USER0);
sb->events = cpu_to_le64(bitmap->mddev->events);
if (bitmap->mddev->events < bitmap->events_cleared) {
if (bitmap->mddev->events < bitmap->events_cleared)
/* rocking back to read-only */
bitmap->events_cleared = bitmap->mddev->events;
sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
}
sb->state = cpu_to_le32(bitmap->flags);
/* Just in case these have been changed via sysfs: */
sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
......@@ -618,7 +618,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
bitmap->flags |= BITMAP_HOSTENDIAN;
bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
if (sb->state & cpu_to_le32(BITMAP_STALE))
if (bitmap->flags & BITMAP_STALE)
bitmap->events_cleared = bitmap->mddev->events;
err = 0;
out:
......@@ -652,9 +652,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
switch (op) {
case MASK_SET:
sb->state |= cpu_to_le32(bits);
bitmap->flags |= bits;
break;
case MASK_UNSET:
sb->state &= cpu_to_le32(~bits);
bitmap->flags &= ~bits;
break;
default:
BUG();
......
......@@ -3324,7 +3324,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
char *e;
unsigned long long n = simple_strtoull(buf, &e, 10);
if (mddev->pers)
if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
return -EBUSY;
if (cmd_match(buf, "none"))
n = MaxSector;
......@@ -4347,13 +4347,19 @@ static int md_alloc(dev_t dev, char *name)
disk->fops = &md_fops;
disk->private_data = mddev;
disk->queue = mddev->queue;
blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
/* Allow extended partitions. This makes the
* 'mdp' device redundant, but we can't really
* remove it now.
*/
disk->flags |= GENHD_FL_EXT_DEVT;
add_disk(disk);
mddev->gendisk = disk;
/* As soon as we call add_disk(), another thread could get
* through to md_open, so make sure it doesn't get too far
*/
mutex_lock(&mddev->open_mutex);
add_disk(disk);
error = kobject_init_and_add(&mddev->kobj, &md_ktype,
&disk_to_dev(disk)->kobj, "%s", "md");
if (error) {
......@@ -4367,8 +4373,7 @@ static int md_alloc(dev_t dev, char *name)
if (mddev->kobj.sd &&
sysfs_create_group(&mddev->kobj, &md_bitmap_group))
printk(KERN_DEBUG "pointless warning\n");
blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
mutex_unlock(&mddev->open_mutex);
abort:
mutex_unlock(&disks_mutex);
if (!error && mddev->kobj.sd) {
......@@ -5211,6 +5216,16 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
} else
super_types[mddev->major_version].
validate_super(mddev, rdev);
if ((info->state & (1<<MD_DISK_SYNC)) &&
(!test_bit(In_sync, &rdev->flags) ||
rdev->raid_disk != info->raid_disk)) {
/* This was a hot-add request, but events doesn't
* match, so reject it.
*/
export_rdev(rdev);
return -EINVAL;
}
if (test_bit(In_sync, &rdev->flags))
rdev->saved_raid_disk = rdev->raid_disk;
else
......
......@@ -146,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
int i;
seq_printf (seq, " [%d/%d] [", conf->raid_disks,
conf->working_disks);
conf->raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++)
seq_printf (seq, "%s",
conf->multipaths[i].rdev &&
......@@ -186,8 +186,9 @@ static int multipath_congested(void *data, int bits)
static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
{
multipath_conf_t *conf = mddev->private;
char b[BDEVNAME_SIZE];
if (conf->working_disks <= 1) {
if (conf->raid_disks - mddev->degraded <= 1) {
/*
* Uh oh, we can do nothing if this is our last path, but
* first check if this is a queued request for a device
......@@ -196,25 +197,25 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
printk(KERN_ALERT
"multipath: only one IO path left and IO error.\n");
/* leave it active... it's all we have */
} else {
return;
}
/*
* Mark disk as unusable
*/
if (!test_bit(Faulty, &rdev->flags)) {
char b[BDEVNAME_SIZE];
clear_bit(In_sync, &rdev->flags);
if (test_and_clear_bit(In_sync, &rdev->flags)) {
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
mddev->degraded++;
spin_unlock_irqrestore(&conf->device_lock, flags);
}
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
conf->working_disks--;
mddev->degraded++;
printk(KERN_ALERT "multipath: IO failure on %s,"
" disabling IO path.\n"
"multipath: Operation continuing"
" on %d IO paths.\n",
bdevname (rdev->bdev,b),
conf->working_disks);
}
}
bdevname(rdev->bdev, b),
conf->raid_disks - mddev->degraded);
}
static void print_multipath_conf (multipath_conf_t *conf)
......@@ -227,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf)
printk("(conf==NULL)\n");
return;
}
printk(" --- wd:%d rd:%d\n", conf->working_disks,
printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
conf->raid_disks);
for (i = 0; i < conf->raid_disks; i++) {
......@@ -274,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
PAGE_CACHE_SIZE - 1);
}
conf->working_disks++;
spin_lock_irq(&conf->device_lock);
mddev->degraded--;
rdev->raid_disk = path;
set_bit(In_sync, &rdev->flags);
spin_unlock_irq(&conf->device_lock);
rcu_assign_pointer(p->rdev, rdev);
err = 0;
md_integrity_add_rdev(rdev, mddev);
......@@ -391,6 +393,7 @@ static int multipath_run (mddev_t *mddev)
int disk_idx;
struct multipath_info *disk;
mdk_rdev_t *rdev;
int working_disks;
if (md_check_no_bitmap(mddev))
return -EINVAL;
......@@ -424,7 +427,7 @@ static int multipath_run (mddev_t *mddev)
goto out_free_conf;
}
conf->working_disks = 0;
working_disks = 0;
list_for_each_entry(rdev, &mddev->disks, same_set) {
disk_idx = rdev->raid_disk;
if (disk_idx < 0 ||
......@@ -446,7 +449,7 @@ static int multipath_run (mddev_t *mddev)
}
if (!test_bit(Faulty, &rdev->flags))
conf->working_disks++;
working_disks++;
}
conf->raid_disks = mddev->raid_disks;
......@@ -454,12 +457,12 @@ static int multipath_run (mddev_t *mddev)
spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list);
if (!conf->working_disks) {
if (!working_disks) {
printk(KERN_ERR "multipath: no operational IO paths for %s\n",
mdname(mddev));
goto out_free_conf;
}
mddev->degraded = conf->raid_disks - conf->working_disks;
mddev->degraded = conf->raid_disks - working_disks;
conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
sizeof(struct multipath_bh));
......@@ -481,7 +484,8 @@ static int multipath_run (mddev_t *mddev)
printk(KERN_INFO
"multipath: array %s active with %d out of %d IO paths\n",
mdname(mddev), conf->working_disks, mddev->raid_disks);
mdname(mddev), conf->raid_disks - mddev->degraded,
mddev->raid_disks);
/*
* Ok, everything is just fine now
*/
......
......@@ -9,7 +9,6 @@ struct multipath_private_data {
mddev_t *mddev;
struct multipath_info *multipaths;
int raid_disks;
int working_disks;
spinlock_t device_lock;
struct list_head retry_list;
......
......@@ -297,23 +297,24 @@ static void raid1_end_read_request(struct bio *bio, int error)
rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
}
static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv,
int behind)
static void r1_bio_write_done(r1bio_t *r1_bio)
{
if (atomic_dec_and_test(&r1_bio->remaining))
{
/* it really is the end of this request */
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
/* free extra copy of the data pages */
int i = vcnt;
int i = r1_bio->behind_page_count;
while (i--)
safe_put_page(bv[i].bv_page);
safe_put_page(r1_bio->behind_pages[i]);
kfree(r1_bio->behind_pages);
r1_bio->behind_pages = NULL;
}
/* clear the bitmap if all writes complete successfully */
bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
r1_bio->sectors,
!test_bit(R1BIO_Degraded, &r1_bio->state),
behind);
test_bit(R1BIO_BehindIO, &r1_bio->state));
md_write_end(r1_bio->mddev);
raid_end_bio_io(r1_bio);
}
......@@ -386,7 +387,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
* Let's see if all mirrored write operations have finished
* already.
*/
r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind);
r1_bio_write_done(r1_bio);
if (to_put)
bio_put(to_put);
......@@ -411,10 +412,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
{
const sector_t this_sector = r1_bio->sector;
const int sectors = r1_bio->sectors;
int new_disk = -1;
int start_disk;
int best_disk;
int i;
sector_t new_distance, current_distance;
sector_t best_dist;
mdk_rdev_t *rdev;
int choose_first;
......@@ -425,6 +426,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
* We take the first readable disk when above the resync window.
*/
retry:
best_disk = -1;
best_dist = MaxSector;
if (conf->mddev->recovery_cp < MaxSector &&
(this_sector + sectors >= conf->next_resync)) {
choose_first = 1;
......@@ -434,8 +437,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
start_disk = conf->last_used;
}
/* make sure the disk is operational */
for (i = 0 ; i < conf->raid_disks ; i++) {
sector_t dist;
int disk = start_disk + i;
if (disk >= conf->raid_disks)
disk -= conf->raid_disks;
......@@ -443,60 +446,43 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
|| !test_bit(In_sync, &rdev->flags))
|| test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < this_sector + sectors)
continue;
if (test_bit(WriteMostly, &rdev->flags)) {
/* Don't balance among write-mostly, just
* use the first as a last resort */
if (best_disk < 0)
best_disk = disk;
continue;
new_disk = disk;
if (!test_bit(WriteMostly, &rdev->flags))
break;
}
if (new_disk < 0 || choose_first)
goto rb_out;
/*
* Don't change to another disk for sequential reads:
/* This is a reasonable device to use. It might
* even be best.
*/
if (conf->next_seq_sect == this_sector)
goto rb_out;
if (this_sector == conf->mirrors[new_disk].head_position)
goto rb_out;
current_distance = abs(this_sector
- conf->mirrors[new_disk].head_position);
/* look for a better disk - i.e. head is closer */
start_disk = new_disk;
for (i = 1; i < conf->raid_disks; i++) {
int disk = start_disk + 1;
if (disk >= conf->raid_disks)
disk -= conf->raid_disks;
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
|| !test_bit(In_sync, &rdev->flags)
|| test_bit(WriteMostly, &rdev->flags))
continue;
if (!atomic_read(&rdev->nr_pending)) {
new_disk = disk;
dist = abs(this_sector - conf->mirrors[disk].head_position);
if (choose_first
/* Don't change to another disk for sequential reads */
|| conf->next_seq_sect == this_sector
|| dist == 0
/* If device is idle, use it */
|| atomic_read(&rdev->nr_pending) == 0) {
best_disk = disk;
break;
}
new_distance = abs(this_sector - conf->mirrors[disk].head_position);
if (new_distance < current_distance) {
current_distance = new_distance;
new_disk = disk;
if (dist < best_dist) {
best_dist = dist;
best_disk = disk;
}
}
rb_out:
if (new_disk >= 0) {
rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
if (best_disk >= 0) {
rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
if (!rdev)
goto retry;
atomic_inc(&rdev->nr_pending);
if (!test_bit(In_sync, &rdev->flags)) {
if (test_bit(Faulty, &rdev->flags)) {
/* cannot risk returning a device that failed
* before we inc'ed nr_pending
*/
......@@ -504,11 +490,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
goto retry;
}
conf->next_seq_sect = this_sector + sectors;
conf->last_used = new_disk;
conf->last_used = best_disk;
}
rcu_read_unlock();
return new_disk;
return best_disk;
}
static int raid1_congested(void *data, int bits)
......@@ -675,37 +661,36 @@ static void unfreeze_array(conf_t *conf)
/* duplicate the data pages for behind I/O
* We return a list of bio_vec rather than just page pointers
* as it makes freeing easier
*/
static struct bio_vec *alloc_behind_pages(struct bio *bio)
static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
{
int i;
struct bio_vec *bvec;
struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
GFP_NOIO);
if (unlikely(!pages))
goto do_sync_io;
return;
bio_for_each_segment(bvec, bio, i) {
pages[i].bv_page = alloc_page(GFP_NOIO);
if (unlikely(!pages[i].bv_page))
pages[i] = alloc_page(GFP_NOIO);
if (unlikely(!pages[i]))
goto do_sync_io;
memcpy(kmap(pages[i].bv_page) + bvec->bv_offset,
memcpy(kmap(pages[i]) + bvec->bv_offset,
kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
kunmap(pages[i].bv_page);
kunmap(pages[i]);
kunmap(bvec->bv_page);
}
return pages;
r1_bio->behind_pages = pages;
r1_bio->behind_page_count = bio->bi_vcnt;
set_bit(R1BIO_BehindIO, &r1_bio->state);
return;
do_sync_io:
if (pages)
for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++)
put_page(pages[i].bv_page);
for (i = 0; i < bio->bi_vcnt; i++)
if (pages[i])
put_page(pages[i]);
kfree(pages);
PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
return NULL;
}
static int make_request(mddev_t *mddev, struct bio * bio)
......@@ -717,7 +702,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
int i, targets = 0, disks;
struct bitmap *bitmap;
unsigned long flags;
struct bio_vec *behind_pages = NULL;
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
......@@ -870,9 +854,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
if (bitmap &&
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
!waitqueue_active(&bitmap->behind_wait) &&
(behind_pages = alloc_behind_pages(bio)) != NULL)
set_bit(R1BIO_BehindIO, &r1_bio->state);
!waitqueue_active(&bitmap->behind_wait))
alloc_behind_pages(bio, r1_bio);
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
......@@ -893,7 +876,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
mbio->bi_rw = WRITE | do_flush_fua | do_sync;
mbio->bi_private = r1_bio;
if (behind_pages) {
if (r1_bio->behind_pages) {
struct bio_vec *bvec;
int j;
......@@ -905,7 +888,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
* them all
*/
__bio_for_each_segment(bvec, mbio, j, 0)
bvec->bv_page = behind_pages[j].bv_page;
bvec->bv_page = r1_bio->behind_pages[j];
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
......@@ -915,8 +898,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
bio_list_add(&conf->pending_bio_list, mbio);
spin_unlock_irqrestore(&conf->device_lock, flags);
}
r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
kfree(behind_pages); /* the behind pages are attached to the bios now */
r1_bio_write_done(r1_bio);
/* In case raid1d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
......@@ -1196,101 +1178,9 @@ static void end_sync_write(struct bio *bio, int error)
}
}
static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
static int fix_sync_read_error(r1bio_t *r1_bio)
{
conf_t *conf = mddev->private;
int i;
int disks = conf->raid_disks;
struct bio *bio, *wbio;
bio = r1_bio->bios[r1_bio->read_disk];
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* We have read all readable devices. If we haven't
* got the block, then there is no hope left.
* If we have, then we want to do a comparison
* and skip the write if everything is the same.
* If any blocks failed to read, then we need to
* attempt an over-write
*/
int primary;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
for (i=0; i<mddev->raid_disks; i++)
if (r1_bio->bios[i]->bi_end_io == end_sync_read)
md_error(mddev, conf->mirrors[i].rdev);
md_done_sync(mddev, r1_bio->sectors, 1);
put_buf(r1_bio);
return;
}
for (primary=0; primary<mddev->raid_disks; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
r1_bio->bios[primary]->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
break;
}
r1_bio->read_disk = primary;
for (i=0; i<mddev->raid_disks; i++)
if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
int j;
int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
for (j = vcnt; j-- ; ) {
struct page *p, *s;
p = pbio->bi_io_vec[j].bv_page;
s = sbio->bi_io_vec[j].bv_page;
if (memcmp(page_address(p),
page_address(s),
PAGE_SIZE))
break;
}
} else
j = 0;
if (j >= 0)
mddev->resync_mismatches += r1_bio->sectors;
if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
&& test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
sbio->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[i].rdev, mddev);
} else {
/* fixup the bio for reuse */
int size;
sbio->bi_vcnt = vcnt;
sbio->bi_size = r1_bio->sectors << 9;
sbio->bi_idx = 0;
sbio->bi_phys_segments = 0;
sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
sbio->bi_flags |= 1 << BIO_UPTODATE;
sbio->bi_next = NULL;
sbio->bi_sector = r1_bio->sector +
conf->mirrors[i].rdev->data_offset;
sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
size = sbio->bi_size;
for (j = 0; j < vcnt ; j++) {
struct bio_vec *bi;
bi = &sbio->bi_io_vec[j];
bi->bv_offset = 0;
if (size > PAGE_SIZE)
bi->bv_len = PAGE_SIZE;
else
bi->bv_len = size;
size -= PAGE_SIZE;
memcpy(page_address(bi->bv_page),
page_address(pbio->bi_io_vec[j].bv_page),
PAGE_SIZE);
}
}
}
}
if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
/* ouch - failed to read all of that.
* Try some synchronous reads of other devices to get
/* Try some synchronous reads of other devices to get
* good data, much like with normal read errors. Only
* read into the pages we already have so we don't
* need to re-issue the read request.
......@@ -1298,6 +1188,9 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
* active sync request, there is no normal IO, and
* no overlapping syncs.
*/
mddev_t *mddev = r1_bio->mddev;
conf_t *conf = mddev->private;
struct bio *bio = r1_bio->bios[r1_bio->read_disk];
sector_t sect = r1_bio->sector;
int sectors = r1_bio->sectors;
int idx = 0;
......@@ -1307,6 +1200,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
int d = r1_bio->read_disk;
int success = 0;
mdk_rdev_t *rdev;
int start;
if (s > (PAGE_SIZE>>9))
s = PAGE_SIZE >> 9;
......@@ -1331,10 +1225,22 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
d = 0;
} while (!success && d != r1_bio->read_disk);
if (success) {
int start = d;
if (!success) {
char b[BDEVNAME_SIZE];
/* Cannot read from anywhere, array is toast */
md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
" for block %llu\n",
mdname(mddev),
bdevname(bio->bi_bdev, b),
(unsigned long long)r1_bio->sector);
md_done_sync(mddev, r1_bio->sectors, 0);
put_buf(r1_bio);
return 0;
}
start = d;
/* write it back and re-read */
set_bit(R1BIO_Uptodate, &r1_bio->state);
while (d != r1_bio->read_disk) {
if (d == 0)
d = conf->raid_disks;
......@@ -1342,13 +1248,16 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
if (r1_bio->bios[d]->bi_end_io != end_sync_read)
continue;
rdev = conf->mirrors[d].rdev;
atomic_add(s, &rdev->corrected_errors);
if (sync_page_io(rdev,
sect,
s<<9,
bio->bi_io_vec[idx].bv_page,
WRITE, false) == 0)
WRITE, false) == 0) {
r1_bio->bios[d]->bi_end_io = NULL;
rdev_dec_pending(rdev, mddev);
md_error(mddev, rdev);
} else
atomic_add(s, &rdev->corrected_errors);
}
d = start;
while (d != r1_bio->read_disk) {
......@@ -1365,25 +1274,114 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
READ, false) == 0)
md_error(mddev, rdev);
}
} else {
char b[BDEVNAME_SIZE];
/* Cannot read from anywhere, array is toast */
md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
" for block %llu\n",
mdname(mddev),
bdevname(bio->bi_bdev, b),
(unsigned long long)r1_bio->sector);
md_done_sync(mddev, r1_bio->sectors, 0);
put_buf(r1_bio);
return;
}
sectors -= s;
sect += s;
idx ++;
}
set_bit(R1BIO_Uptodate, &r1_bio->state);
set_bit(BIO_UPTODATE, &bio->bi_flags);
return 1;
}
static int process_checks(r1bio_t *r1_bio)
{
/* We have read all readable devices. If we haven't
* got the block, then there is no hope left.
* If we have, then we want to do a comparison
* and skip the write if everything is the same.
* If any blocks failed to read, then we need to
* attempt an over-write
*/
mddev_t *mddev = r1_bio->mddev;
conf_t *conf = mddev->private;
int primary;
int i;
for (primary = 0; primary < conf->raid_disks; primary++)
if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
r1_bio->bios[primary]->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
break;
}
r1_bio->read_disk = primary;
for (i = 0; i < conf->raid_disks; i++) {
int j;
int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
int size;
if (r1_bio->bios[i]->bi_end_io != end_sync_read)
continue;
if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
for (j = vcnt; j-- ; ) {
struct page *p, *s;
p = pbio->bi_io_vec[j].bv_page;
s = sbio->bi_io_vec[j].bv_page;
if (memcmp(page_address(p),
page_address(s),
PAGE_SIZE))
break;
}
} else
j = 0;
if (j >= 0)
mddev->resync_mismatches += r1_bio->sectors;
if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
&& test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
/* No need to write to this device. */
sbio->bi_end_io = NULL;
rdev_dec_pending(conf->mirrors[i].rdev, mddev);
continue;
}
/* fixup the bio for reuse */
sbio->bi_vcnt = vcnt;
sbio->bi_size = r1_bio->sectors << 9;
sbio->bi_idx = 0;
sbio->bi_phys_segments = 0;
sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
sbio->bi_flags |= 1 << BIO_UPTODATE;
sbio->bi_next = NULL;
sbio->bi_sector = r1_bio->sector +
conf->mirrors[i].rdev->data_offset;
sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
size = sbio->bi_size;
for (j = 0; j < vcnt ; j++) {
struct bio_vec *bi;
bi = &sbio->bi_io_vec[j];
bi->bv_offset = 0;
if (size > PAGE_SIZE)
bi->bv_len = PAGE_SIZE;
else
bi->bv_len = size;
size -= PAGE_SIZE;
memcpy(page_address(bi->bv_page),
page_address(pbio->bi_io_vec[j].bv_page),
PAGE_SIZE);
}
}
return 0;
}
static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
{
conf_t *conf = mddev->private;
int i;
int disks = conf->raid_disks;
struct bio *bio, *wbio;
bio = r1_bio->bios[r1_bio->read_disk];
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
/* ouch - failed to read all of that. */
if (!fix_sync_read_error(r1_bio))
return;
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
if (process_checks(r1_bio) < 0)
return;
/*
* schedule writes
*/
......@@ -2063,7 +2061,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp == MaxSector) {
mddev->recovery_cp > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
......
......@@ -94,7 +94,9 @@ struct r1bio_s {
int read_disk;
struct list_head retry_list;
struct bitmap_update *bitmap_update;
/* Next two are only valid when R1BIO_BehindIO is set */
struct page **behind_pages;
int behind_page_count;
/*
* if the IO is in WRITE direction, then multiple bios are used.
* We choose the number when they are allocated.
......
......@@ -271,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
*/
set_bit(R10BIO_Uptodate, &r10_bio->state);
raid_end_bio_io(r10_bio);
rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
} else {
/*
* oops, read error:
* oops, read error - keep the refcount on the rdev
*/
char b[BDEVNAME_SIZE];
if (printk_ratelimit())
......@@ -282,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
reschedule_retry(r10_bio);
}
rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
}
static void raid10_end_write_request(struct bio *bio, int error)
......@@ -488,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
static int read_balance(conf_t *conf, r10bio_t *r10_bio)
{
const sector_t this_sector = r10_bio->sector;
int disk, slot, nslot;
int disk, slot;
const int sectors = r10_bio->sectors;
sector_t new_distance, current_distance;
sector_t new_distance, best_dist;
mdk_rdev_t *rdev;
int do_balance;
int best_slot;
raid10_find_phys(conf, r10_bio);
rcu_read_lock();
retry:
best_slot = -1;
best_dist = MaxSector;
do_balance = 1;
/*
* Check if we can balance. We can balance on the whole
* device if no resync is going on (recovery is ok), or below
......@@ -502,86 +507,58 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
* above the resync window.
*/
if (conf->mddev->recovery_cp < MaxSector
&& (this_sector + sectors >= conf->next_resync)) {
/* make sure that disk is operational */
slot = 0;
disk = r10_bio->devs[slot].devnum;
while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
r10_bio->devs[slot].bio == IO_BLOCKED ||
!test_bit(In_sync, &rdev->flags)) {
slot++;
if (slot == conf->copies) {
slot = 0;
disk = -1;
break;
}
disk = r10_bio->devs[slot].devnum;
}
goto rb_out;
}
&& (this_sector + sectors >= conf->next_resync))
do_balance = 0;
/* make sure the disk is operational */
slot = 0;
disk = r10_bio->devs[slot].devnum;
while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
r10_bio->devs[slot].bio == IO_BLOCKED ||
!test_bit(In_sync, &rdev->flags)) {
slot ++;
if (slot == conf->copies) {
disk = -1;
goto rb_out;
}
for (slot = 0; slot < conf->copies ; slot++) {
if (r10_bio->devs[slot].bio == IO_BLOCKED)
continue;
disk = r10_bio->devs[slot].devnum;
}
current_distance = abs(r10_bio->devs[slot].addr -
conf->mirrors[disk].head_position);
/* Find the disk whose head is closest,
* or - for far > 1 - find the closest to partition beginning */
for (nslot = slot; nslot < conf->copies; nslot++) {
int ndisk = r10_bio->devs[nslot].devnum;
if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
r10_bio->devs[nslot].bio == IO_BLOCKED ||
!test_bit(In_sync, &rdev->flags))
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (rdev == NULL)
continue;
if (!test_bit(In_sync, &rdev->flags))
continue;
if (!do_balance)
break;
/* This optimisation is debatable, and completely destroys
* sequential read speed for 'far copies' arrays. So only
* keep it for 'near' arrays, and review those later.
*/
if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
disk = ndisk;
slot = nslot;
if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
break;
}
/* for far > 1 always use the lowest address */
if (conf->far_copies > 1)
new_distance = r10_bio->devs[nslot].addr;
new_distance = r10_bio->devs[slot].addr;
else
new_distance = abs(r10_bio->devs[nslot].addr -
conf->mirrors[ndisk].head_position);
if (new_distance < current_distance) {
current_distance = new_distance;
disk = ndisk;
slot = nslot;
new_distance = abs(r10_bio->devs[slot].addr -
conf->mirrors[disk].head_position);
if (new_distance < best_dist) {
best_dist = new_distance;
best_slot = slot;
}
}
if (slot == conf->copies)
slot = best_slot;
rb_out:
if (slot >= 0) {
disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (!rdev)
goto retry;
atomic_inc(&rdev->nr_pending);
if (test_bit(Faulty, &rdev->flags)) {
/* Cannot risk returning a device that failed
* before we inc'ed nr_pending
*/
rdev_dec_pending(rdev, conf->mddev);
goto retry;
}
r10_bio->read_slot = slot;
/* conf->next_seq_sect = this_sector + sectors;*/
if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
else
} else
disk = -1;
rcu_read_unlock();
......@@ -1460,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[r10_bio->read_slot].devnum;
rcu_read_lock();
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev) { /* If rdev is not NULL */
char b[BDEVNAME_SIZE];
int cur_read_error_count = 0;
bdevname(rdev->bdev, b);
/* still own a reference to this rdev, so it cannot
* have been cleared recently.
*/
rdev = conf->mirrors[d].rdev;
if (test_bit(Faulty, &rdev->flags)) {
rcu_read_unlock();
if (test_bit(Faulty, &rdev->flags))
/* drive has already been failed, just ignore any
more fix_read_error() attempts */
return;
}
check_decay_read_errors(mddev, rdev);
atomic_inc(&rdev->read_errors);
cur_read_error_count = atomic_read(&rdev->read_errors);
if (cur_read_error_count > max_read_errors) {
rcu_read_unlock();
if (atomic_read(&rdev->read_errors) > max_read_errors) {
char b[BDEVNAME_SIZE];
bdevname(rdev->bdev, b);
printk(KERN_NOTICE
"md/raid10:%s: %s: Raid device exceeded "
"read_error threshold "
"[cur %d:max %d]\n",
mdname(mddev),
b, cur_read_error_count, max_read_errors);
"read_error threshold [cur %d:max %d]\n",
mdname(mddev), b,
atomic_read(&rdev->read_errors), max_read_errors);
printk(KERN_NOTICE
"md/raid10:%s: %s: Failing raid "
"device\n", mdname(mddev), b);
"md/raid10:%s: %s: Failing raid device\n",
mdname(mddev), b);
md_error(mddev, conf->mirrors[d].rdev);
return;
}
}
rcu_read_unlock();
while(sectors) {
int s = sectors;
......@@ -1562,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"write failed"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
(unsigned long long)(sect+
rdev->data_offset),
(unsigned long long)(
sect + rdev->data_offset),
bdevname(rdev->bdev, b));
printk(KERN_NOTICE "md/raid10:%s: %s: failing "
"drive\n",
......@@ -1599,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"corrected sectors"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
(unsigned long long)(sect+
rdev->data_offset),
(unsigned long long)(
sect + rdev->data_offset),
bdevname(rdev->bdev, b));
printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
mdname(mddev),
......@@ -1612,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
"md/raid10:%s: read error corrected"
" (%d sectors at %llu on %s)\n",
mdname(mddev), s,
(unsigned long long)(sect+
rdev->data_offset),
(unsigned long long)(
sect + rdev->data_offset),
bdevname(rdev->bdev, b));
}
......@@ -1663,7 +1633,8 @@ static void raid10d(mddev_t *mddev)
else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
recovery_request_write(mddev, r10_bio);
else {
int mirror;
int slot = r10_bio->read_slot;
int mirror = r10_bio->devs[slot].devnum;
/* we got a read error. Maybe the drive is bad. Maybe just
* the block and we can fix it.
* We freeze all other IO, and try reading the block from
......@@ -1677,9 +1648,10 @@ static void raid10d(mddev_t *mddev)
fix_read_error(conf, mddev, r10_bio);
unfreeze_array(conf);
}
rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
bio = r10_bio->devs[r10_bio->read_slot].bio;
r10_bio->devs[r10_bio->read_slot].bio =
bio = r10_bio->devs[slot].bio;
r10_bio->devs[slot].bio =
mddev->ro ? IO_BLOCKED : NULL;
mirror = read_balance(conf, r10_bio);
if (mirror == -1) {
......@@ -1693,6 +1665,7 @@ static void raid10d(mddev_t *mddev)
} else {
const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
bio_put(bio);
slot = r10_bio->read_slot;
rdev = conf->mirrors[mirror].rdev;
if (printk_ratelimit())
printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to"
......@@ -1702,8 +1675,8 @@ static void raid10d(mddev_t *mddev)
(unsigned long long)r10_bio->sector);
bio = bio_clone_mddev(r10_bio->master_bio,
GFP_NOIO, mddev);
r10_bio->devs[r10_bio->read_slot].bio = bio;
bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
r10_bio->devs[slot].bio = bio;
bio->bi_sector = r10_bio->devs[slot].addr
+ rdev->data_offset;
bio->bi_bdev = rdev->bdev;
bio->bi_rw = READ | do_sync;
......@@ -1763,13 +1736,13 @@ static int init_resync(conf_t *conf)
*
*/
static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
int *skipped, int go_faster)
{
conf_t *conf = mddev->private;
r10bio_t *r10_bio;
struct bio *biolist = NULL, *bio;
sector_t max_sector, nr_sectors;
int disk;
int i;
int max_sync;
sector_t sync_blocks;
......@@ -1858,14 +1831,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
int j, k;
r10_bio = NULL;
for (i=0 ; i<conf->raid_disks; i++)
if (conf->mirrors[i].rdev &&
!test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
int still_degraded = 0;
/* want to reconstruct this device */
r10bio_t *rb2 = r10_bio;
sector_t sect = raid10_find_virt(conf, sector_nr, i);
for (i=0 ; i<conf->raid_disks; i++) {
int still_degraded;
r10bio_t *rb2;
sector_t sect;
int must_sync;
if (conf->mirrors[i].rdev == NULL ||
test_bit(In_sync, &conf->mirrors[i].rdev->flags))
continue;
still_degraded = 0;
/* want to reconstruct this device */
rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i);
/* Unless we are doing a full sync, we only need
* to recover the block if it is set in the bitmap
*/
......@@ -1910,8 +1889,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
for (j=0; j<conf->copies;j++) {
int d = r10_bio->devs[j].devnum;
if (conf->mirrors[d].rdev &&
test_bit(In_sync, &conf->mirrors[d].rdev->flags)) {
if (!conf->mirrors[d].rdev ||
!test_bit(In_sync, &conf->mirrors[d].rdev->flags))
continue;
/* This is where we read from */
bio = r10_bio->devs[0].bio;
bio->bi_next = biolist;
......@@ -1945,7 +1925,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
break;
}
}
if (j == conf->copies) {
/* Cannot recover, so abort the recovery */
put_buf(r10_bio);
......@@ -1977,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
&mddev->recovery)) {
/* We can skip this block */
*skipped = 1;
return sync_blocks + sectors_skipped;
......@@ -2022,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
for (i=0; i<conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
if (r10_bio->devs[i].bio->bi_end_io)
rdev_dec_pending(conf->mirrors[d].rdev, mddev);
rdev_dec_pending(conf->mirrors[d].rdev,
mddev);
}
put_buf(r10_bio);
biolist = NULL;
......@@ -2047,18 +2028,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
do {
struct page *page;
int len = PAGE_SIZE;
disk = 0;
if (sector_nr + (len>>9) > max_sector)
len = (max_sector - sector_nr) << 9;
if (len == 0)
break;
for (bio= biolist ; bio ; bio=bio->bi_next) {
struct bio *bio2;
page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
if (bio_add_page(bio, page, len, 0) == 0) {
if (bio_add_page(bio, page, len, 0))
continue;
/* stop here */
struct bio *bio2;
bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
for (bio2 = biolist;
bio2 && bio2 != bio;
bio2 = bio2->bi_next) {
/* remove last page from this bio */
bio2->bi_vcnt--;
bio2->bi_size -= len;
......@@ -2066,8 +2050,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
}
goto bio_full;
}
disk = i;
}
nr_sectors += len>>9;
sector_nr += len>>9;
} while (biolist->bi_vcnt < RESYNC_PAGES);
......
......@@ -1700,8 +1700,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
raid5_conf_t *conf = mddev->private;
pr_debug("raid456: error called\n");
if (!test_bit(Faulty, &rdev->flags)) {
set_bit(MD_CHANGE_DEVS, &mddev->flags);
if (test_and_clear_bit(In_sync, &rdev->flags)) {
unsigned long flags;
spin_lock_irqsave(&conf->device_lock, flags);
......@@ -1713,6 +1711,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
}
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT
"md/raid:%s: Disk failure on %s, disabling device.\n"
"md/raid:%s: Operation continuing on %d devices.\n",
......@@ -1720,7 +1719,6 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
bdevname(rdev->bdev, b),
mdname(mddev),
conf->raid_disks - mddev->degraded);
}
}
/*
......@@ -5391,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment