Commit 93416253 authored by Linus Torvalds

Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md:
  md/raid5: don't include 'spare' drives when reshaping to fewer devices.
  md/raid5: add a missing 'continue' in a loop.
  md/raid5: Allow recovered part of partially recovered devices to be in-sync
  md/raid5: More careful check for "has array failed".
  md: Don't update ->recovery_offset when reshaping an array to fewer devices.
  md/raid5: avoid oops when number of devices is reduced then increased.
  md: enable raid4->raid0 takeover
  md: clear layout after ->raid0 takeover
  md: fix raid10 takeover: use new_layout for setup_conf
  md: fix handling of array level takeover that re-arranges devices.
  md: raid10: Fix null pointer dereference in fix_read_error()
  Restore partition detection of newly created md arrays.
parents b4322e70 3424bf6a
...@@ -2087,6 +2087,7 @@ static void sync_sbs(mddev_t * mddev, int nospares) ...@@ -2087,6 +2087,7 @@ static void sync_sbs(mddev_t * mddev, int nospares)
/* First make sure individual recovery_offsets are correct */ /* First make sure individual recovery_offsets are correct */
list_for_each_entry(rdev, &mddev->disks, same_set) { list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags) &&
mddev->curr_resync_completed > rdev->recovery_offset) mddev->curr_resync_completed > rdev->recovery_offset)
rdev->recovery_offset = mddev->curr_resync_completed; rdev->recovery_offset = mddev->curr_resync_completed;
...@@ -3001,6 +3002,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len) ...@@ -3001,6 +3002,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
return -EINVAL; return -EINVAL;
} }
list_for_each_entry(rdev, &mddev->disks, same_set)
rdev->new_raid_disk = rdev->raid_disk;
/* ->takeover must set new_* and/or delta_disks /* ->takeover must set new_* and/or delta_disks
* if it succeeds, and may set them when it fails. * if it succeeds, and may set them when it fails.
*/ */
...@@ -3051,13 +3055,35 @@ level_store(mddev_t *mddev, const char *buf, size_t len) ...@@ -3051,13 +3055,35 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
mddev->safemode = 0; mddev->safemode = 0;
} }
module_put(mddev->pers->owner); list_for_each_entry(rdev, &mddev->disks, same_set) {
/* Invalidate devices that are now superfluous */ char nm[20];
list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk < 0)
if (rdev->raid_disk >= mddev->raid_disks) { continue;
rdev->raid_disk = -1; if (rdev->new_raid_disk > mddev->raid_disks)
rdev->new_raid_disk = -1;
if (rdev->new_raid_disk == rdev->raid_disk)
continue;
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
}
list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk < 0)
continue;
if (rdev->new_raid_disk == rdev->raid_disk)
continue;
rdev->raid_disk = rdev->new_raid_disk;
if (rdev->raid_disk < 0)
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
else {
char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk);
if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
printk("md: cannot register %s for %s after level change\n",
nm, mdname(mddev));
} }
}
module_put(mddev->pers->owner);
mddev->pers = pers; mddev->pers = pers;
mddev->private = priv; mddev->private = priv;
strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
...@@ -5895,6 +5921,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) ...@@ -5895,6 +5921,7 @@ static int md_open(struct block_device *bdev, fmode_t mode)
atomic_inc(&mddev->openers); atomic_inc(&mddev->openers);
mutex_unlock(&mddev->open_mutex); mutex_unlock(&mddev->open_mutex);
check_disk_size_change(mddev->gendisk, bdev);
out: out:
return err; return err;
} }
...@@ -6846,6 +6873,7 @@ void md_do_sync(mddev_t *mddev) ...@@ -6846,6 +6873,7 @@ void md_do_sync(mddev_t *mddev)
rcu_read_lock(); rcu_read_lock();
list_for_each_entry_rcu(rdev, &mddev->disks, same_set) list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync) rdev->recovery_offset < mddev->curr_resync)
......
...@@ -78,6 +78,9 @@ struct mdk_rdev_s ...@@ -78,6 +78,9 @@ struct mdk_rdev_s
int desc_nr; /* descriptor index in the superblock */ int desc_nr; /* descriptor index in the superblock */
int raid_disk; /* role of device in array */ int raid_disk; /* role of device in array */
int new_raid_disk; /* role that the device will have in
* the array after a level-change completes.
*/
int saved_raid_disk; /* role that device used to have in the int saved_raid_disk; /* role that device used to have in the
* array and could again if we did a partial * array and could again if we did a partial
* resync from the bitmap * resync from the bitmap
......
...@@ -173,9 +173,11 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) ...@@ -173,9 +173,11 @@ static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf)
list_for_each_entry(rdev1, &mddev->disks, same_set) { list_for_each_entry(rdev1, &mddev->disks, same_set) {
int j = rdev1->raid_disk; int j = rdev1->raid_disk;
if (mddev->level == 10) if (mddev->level == 10) {
/* taking over a raid10-n2 array */ /* taking over a raid10-n2 array */
j /= 2; j /= 2;
rdev1->new_raid_disk = j;
}
if (j < 0 || j >= mddev->raid_disks) { if (j < 0 || j >= mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: bad disk number %d - " printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
...@@ -361,12 +363,6 @@ static int raid0_run(mddev_t *mddev) ...@@ -361,12 +363,6 @@ static int raid0_run(mddev_t *mddev)
mddev->private = conf; mddev->private = conf;
} }
conf = mddev->private; conf = mddev->private;
if (conf->scale_raid_disks) {
int i;
for (i=0; i < conf->strip_zone[0].nb_dev; i++)
conf->devlist[i]->raid_disk /= conf->scale_raid_disks;
/* FIXME update sysfs rd links */
}
/* calculate array device size */ /* calculate array device size */
md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
...@@ -573,7 +569,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev) ...@@ -573,7 +569,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
return; return;
} }
static void *raid0_takeover_raid5(mddev_t *mddev) static void *raid0_takeover_raid45(mddev_t *mddev)
{ {
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
raid0_conf_t *priv_conf; raid0_conf_t *priv_conf;
...@@ -596,6 +592,7 @@ static void *raid0_takeover_raid5(mddev_t *mddev) ...@@ -596,6 +592,7 @@ static void *raid0_takeover_raid5(mddev_t *mddev)
/* Set new parameters */ /* Set new parameters */
mddev->new_level = 0; mddev->new_level = 0;
mddev->new_layout = 0;
mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_chunk_sectors = mddev->chunk_sectors;
mddev->raid_disks--; mddev->raid_disks--;
mddev->delta_disks = -1; mddev->delta_disks = -1;
...@@ -635,6 +632,7 @@ static void *raid0_takeover_raid10(mddev_t *mddev) ...@@ -635,6 +632,7 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
/* Set new parameters */ /* Set new parameters */
mddev->new_level = 0; mddev->new_level = 0;
mddev->new_layout = 0;
mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_chunk_sectors = mddev->chunk_sectors;
mddev->delta_disks = - mddev->raid_disks / 2; mddev->delta_disks = - mddev->raid_disks / 2;
mddev->raid_disks += mddev->delta_disks; mddev->raid_disks += mddev->delta_disks;
...@@ -643,19 +641,22 @@ static void *raid0_takeover_raid10(mddev_t *mddev) ...@@ -643,19 +641,22 @@ static void *raid0_takeover_raid10(mddev_t *mddev)
mddev->recovery_cp = MaxSector; mddev->recovery_cp = MaxSector;
create_strip_zones(mddev, &priv_conf); create_strip_zones(mddev, &priv_conf);
priv_conf->scale_raid_disks = 2;
return priv_conf; return priv_conf;
} }
static void *raid0_takeover(mddev_t *mddev) static void *raid0_takeover(mddev_t *mddev)
{ {
/* raid0 can take over: /* raid0 can take over:
* raid4 - if all data disks are active.
* raid5 - providing it is Raid4 layout and one disk is faulty * raid5 - providing it is Raid4 layout and one disk is faulty
* raid10 - assuming we have all necessary active disks * raid10 - assuming we have all necessary active disks
*/ */
if (mddev->level == 4)
return raid0_takeover_raid45(mddev);
if (mddev->level == 5) { if (mddev->level == 5) {
if (mddev->layout == ALGORITHM_PARITY_N) if (mddev->layout == ALGORITHM_PARITY_N)
return raid0_takeover_raid5(mddev); return raid0_takeover_raid45(mddev);
printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
mdname(mddev), ALGORITHM_PARITY_N); mdname(mddev), ALGORITHM_PARITY_N);
......
...@@ -13,9 +13,6 @@ struct raid0_private_data ...@@ -13,9 +13,6 @@ struct raid0_private_data
struct strip_zone *strip_zone; struct strip_zone *strip_zone;
mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
int nr_strip_zones; int nr_strip_zones;
int scale_raid_disks; /* divide rdev->raid_disks by this in run()
* to handle conversion from raid10
*/
}; };
typedef struct raid0_private_data raid0_conf_t; typedef struct raid0_private_data raid0_conf_t;
......
...@@ -1482,14 +1482,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) ...@@ -1482,14 +1482,14 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
int sectors = r10_bio->sectors; int sectors = r10_bio->sectors;
mdk_rdev_t*rdev; mdk_rdev_t*rdev;
int max_read_errors = atomic_read(&mddev->max_corr_read_errors); int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[r10_bio->read_slot].devnum;
rcu_read_lock(); rcu_read_lock();
{ rdev = rcu_dereference(conf->mirrors[d].rdev);
int d = r10_bio->devs[r10_bio->read_slot].devnum; if (rdev) { /* If rdev is not NULL */
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
int cur_read_error_count = 0; int cur_read_error_count = 0;
rdev = rcu_dereference(conf->mirrors[d].rdev);
bdevname(rdev->bdev, b); bdevname(rdev->bdev, b);
if (test_bit(Faulty, &rdev->flags)) { if (test_bit(Faulty, &rdev->flags)) {
...@@ -1530,7 +1530,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) ...@@ -1530,7 +1530,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
rcu_read_lock(); rcu_read_lock();
do { do {
int d = r10_bio->devs[sl].devnum; d = r10_bio->devs[sl].devnum;
rdev = rcu_dereference(conf->mirrors[d].rdev); rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev && if (rdev &&
test_bit(In_sync, &rdev->flags)) { test_bit(In_sync, &rdev->flags)) {
...@@ -1564,7 +1564,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) ...@@ -1564,7 +1564,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
rcu_read_lock(); rcu_read_lock();
while (sl != r10_bio->read_slot) { while (sl != r10_bio->read_slot) {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
int d;
if (sl==0) if (sl==0)
sl = conf->copies; sl = conf->copies;
sl--; sl--;
...@@ -1601,7 +1601,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) ...@@ -1601,7 +1601,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
} }
sl = start; sl = start;
while (sl != r10_bio->read_slot) { while (sl != r10_bio->read_slot) {
int d;
if (sl==0) if (sl==0)
sl = conf->copies; sl = conf->copies;
sl--; sl--;
...@@ -2161,22 +2161,22 @@ static conf_t *setup_conf(mddev_t *mddev) ...@@ -2161,22 +2161,22 @@ static conf_t *setup_conf(mddev_t *mddev)
sector_t stride, size; sector_t stride, size;
int err = -EINVAL; int err = -EINVAL;
if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
!is_power_of_2(mddev->chunk_sectors)) { !is_power_of_2(mddev->new_chunk_sectors)) {
printk(KERN_ERR "md/raid10:%s: chunk size must be " printk(KERN_ERR "md/raid10:%s: chunk size must be "
"at least PAGE_SIZE(%ld) and be a power of 2.\n", "at least PAGE_SIZE(%ld) and be a power of 2.\n",
mdname(mddev), PAGE_SIZE); mdname(mddev), PAGE_SIZE);
goto out; goto out;
} }
nc = mddev->layout & 255; nc = mddev->new_layout & 255;
fc = (mddev->layout >> 8) & 255; fc = (mddev->new_layout >> 8) & 255;
fo = mddev->layout & (1<<16); fo = mddev->new_layout & (1<<16);
if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
(mddev->layout >> 17)) { (mddev->new_layout >> 17)) {
printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
mdname(mddev), mddev->layout); mdname(mddev), mddev->new_layout);
goto out; goto out;
} }
...@@ -2241,7 +2241,6 @@ static conf_t *setup_conf(mddev_t *mddev) ...@@ -2241,7 +2241,6 @@ static conf_t *setup_conf(mddev_t *mddev)
if (!conf->thread) if (!conf->thread)
goto out; goto out;
conf->scale_disks = 0;
conf->mddev = mddev; conf->mddev = mddev;
return conf; return conf;
...@@ -2300,11 +2299,6 @@ static int run(mddev_t *mddev) ...@@ -2300,11 +2299,6 @@ static int run(mddev_t *mddev)
if (disk_idx >= conf->raid_disks if (disk_idx >= conf->raid_disks
|| disk_idx < 0) || disk_idx < 0)
continue; continue;
if (conf->scale_disks) {
disk_idx *= conf->scale_disks;
rdev->raid_disk = disk_idx;
/* MOVE 'rd%d' link !! */
}
disk = conf->mirrors + disk_idx; disk = conf->mirrors + disk_idx;
disk->rdev = rdev; disk->rdev = rdev;
...@@ -2435,26 +2429,22 @@ static void *raid10_takeover_raid0(mddev_t *mddev) ...@@ -2435,26 +2429,22 @@ static void *raid10_takeover_raid0(mddev_t *mddev)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
/* Update slot numbers to obtain
* degraded raid10 with missing mirrors
*/
list_for_each_entry(rdev, &mddev->disks, same_set) {
rdev->raid_disk *= 2;
}
/* Set new parameters */ /* Set new parameters */
mddev->new_level = 10; mddev->new_level = 10;
/* new layout: far_copies = 1, near_copies = 2 */ /* new layout: far_copies = 1, near_copies = 2 */
mddev->new_layout = (1<<8) + 2; mddev->new_layout = (1<<8) + 2;
mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_chunk_sectors = mddev->chunk_sectors;
mddev->delta_disks = mddev->raid_disks; mddev->delta_disks = mddev->raid_disks;
mddev->degraded = mddev->raid_disks;
mddev->raid_disks *= 2; mddev->raid_disks *= 2;
/* make sure it will be not marked as dirty */ /* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector; mddev->recovery_cp = MaxSector;
conf = setup_conf(mddev); conf = setup_conf(mddev);
conf->scale_disks = 2; if (!IS_ERR(conf))
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0)
rdev->new_raid_disk = rdev->raid_disk * 2;
return conf; return conf;
} }
......
...@@ -38,11 +38,6 @@ struct r10_private_data_s { ...@@ -38,11 +38,6 @@ struct r10_private_data_s {
int chunk_shift; /* shift from chunks to sectors */ int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask; sector_t chunk_mask;
int scale_disks; /* When starting array, multiply
* each ->raid_disk by this.
* Need for raid0->raid10 migration
*/
struct list_head retry_list; struct list_head retry_list;
/* queue pending writes and submit them on unplug */ /* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list; struct bio_list pending_bio_list;
......
...@@ -277,12 +277,13 @@ static struct stripe_head *get_free_stripe(raid5_conf_t *conf) ...@@ -277,12 +277,13 @@ static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
return sh; return sh;
} }
static void shrink_buffers(struct stripe_head *sh, int num) static void shrink_buffers(struct stripe_head *sh)
{ {
struct page *p; struct page *p;
int i; int i;
int num = sh->raid_conf->pool_size;
for (i=0; i<num ; i++) { for (i = 0; i < num ; i++) {
p = sh->dev[i].page; p = sh->dev[i].page;
if (!p) if (!p)
continue; continue;
...@@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num) ...@@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num)
} }
} }
static int grow_buffers(struct stripe_head *sh, int num) static int grow_buffers(struct stripe_head *sh)
{ {
int i; int i;
int num = sh->raid_conf->pool_size;
for (i=0; i<num; i++) { for (i = 0; i < num; i++) {
struct page *page; struct page *page;
if (!(page = alloc_page(GFP_KERNEL))) { if (!(page = alloc_page(GFP_KERNEL))) {
...@@ -364,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, ...@@ -364,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
return NULL; return NULL;
} }
/*
 * has_failed - report whether the array has lost more devices than it
 * can tolerate.
 *
 * The answer is consulted before:
 *   - starting an array
 *   - removing a non-faulty device
 *   - adding a spare
 *   - permitting a reshape
 *
 * When no reshape is pending (reshape_position == MaxSector) the
 * recorded mddev->degraded count is simply compared against
 * conf->max_degraded.
 *
 * When a reshape is pending, the "before" geometry
 * (previous_raid_disks slots) and the "after" geometry (raid_disks
 * slots) are counted independently: a failed device may affect only
 * one of the two sections, and a device that is not yet In_sync may
 * nevertheless be usable in the section hit hardest by failures.
 *
 * Returns 1 if either section has more than max_degraded unusable
 * devices, 0 otherwise.
 */
static int has_failed(raid5_conf_t *conf)
{
	int unusable;
	int slot;

	/* No reshape pending: the recorded count is authoritative. */
	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	/* Pass 1: the section still laid out with the previous geometry. */
	rcu_read_lock();
	unusable = 0;
	for (slot = 0; slot < conf->previous_raid_disks; slot++) {
		mdk_rdev_t *dev = rcu_dereference(conf->disks[slot].rdev);

		if (!dev || test_bit(Faulty, &dev->flags)) {
			unusable++;
			continue;
		}
		if (test_bit(In_sync, &dev->flags))
			continue;
		/*
		 * Present, but neither faulty nor in-sync.
		 * If the reshape is adding devices, this one is being
		 * recovered by the reshape, so it does not count as
		 * in-sync for the 'previous' section.
		 * If the number of devices is being reduced, such a
		 * device can only be part of the array because a
		 * reshape is being reverted, so this section is
		 * in-sync on it.
		 */
		if (conf->raid_disks >= conf->previous_raid_disks)
			unusable++;
	}
	rcu_read_unlock();
	if (unusable > conf->max_degraded)
		return 1;

	/* Pass 2: the section already laid out with the new geometry. */
	rcu_read_lock();
	unusable = 0;
	for (slot = 0; slot < conf->raid_disks; slot++) {
		mdk_rdev_t *dev = rcu_dereference(conf->disks[slot].rdev);

		if (!dev || test_bit(Faulty, &dev->flags)) {
			unusable++;
			continue;
		}
		if (test_bit(In_sync, &dev->flags))
			continue;
		/*
		 * Present, but neither faulty nor in-sync.
		 * If the reshape is adding devices, this section has
		 * already been recovered; otherwise it almost
		 * certainly has not.
		 */
		if (conf->raid_disks <= conf->previous_raid_disks)
			unusable++;
	}
	rcu_read_unlock();

	return unusable > conf->max_degraded;
}
static void unplug_slaves(mddev_t *mddev); static void unplug_slaves(mddev_t *mddev);
static void raid5_unplug_device(struct request_queue *q); static void raid5_unplug_device(struct request_queue *q);
...@@ -1240,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) ...@@ -1240,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
static int grow_one_stripe(raid5_conf_t *conf) static int grow_one_stripe(raid5_conf_t *conf)
{ {
struct stripe_head *sh; struct stripe_head *sh;
int disks = max(conf->raid_disks, conf->previous_raid_disks);
sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
if (!sh) if (!sh)
return 0; return 0;
memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev)); memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
sh->raid_conf = conf; sh->raid_conf = conf;
spin_lock_init(&sh->lock); spin_lock_init(&sh->lock);
#ifdef CONFIG_MULTICORE_RAID456 #ifdef CONFIG_MULTICORE_RAID456
init_waitqueue_head(&sh->ops.wait_for_ops); init_waitqueue_head(&sh->ops.wait_for_ops);
#endif #endif
if (grow_buffers(sh, disks)) { if (grow_buffers(sh)) {
shrink_buffers(sh, disks); shrink_buffers(sh);
kmem_cache_free(conf->slab_cache, sh); kmem_cache_free(conf->slab_cache, sh);
return 0; return 0;
} }
...@@ -1468,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf) ...@@ -1468,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
if (!sh) if (!sh)
return 0; return 0;
BUG_ON(atomic_read(&sh->count)); BUG_ON(atomic_read(&sh->count));
shrink_buffers(sh, conf->pool_size); shrink_buffers(sh);
kmem_cache_free(conf->slab_cache, sh); kmem_cache_free(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes); atomic_dec(&conf->active_stripes);
return 1; return 1;
...@@ -2963,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2963,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh)
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
dev = &sh->dev[i]; dev = &sh->dev[i];
clear_bit(R5_Insync, &dev->flags);
pr_debug("check %d: state 0x%lx toread %p read %p write %p " pr_debug("check %d: state 0x%lx toread %p read %p write %p "
"written %p\n", i, dev->flags, dev->toread, dev->read, "written %p\n", i, dev->flags, dev->toread, dev->read,
...@@ -3000,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -3000,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh)
blocked_rdev = rdev; blocked_rdev = rdev;
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
} }
if (!rdev || !test_bit(In_sync, &rdev->flags)) { clear_bit(R5_Insync, &dev->flags);
if (!rdev)
/* Not in-sync */;
else if (test_bit(In_sync, &rdev->flags))
set_bit(R5_Insync, &dev->flags);
else {
/* could be in-sync depending on recovery/reshape status */
if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
set_bit(R5_Insync, &dev->flags);
}
if (!test_bit(R5_Insync, &dev->flags)) {
/* The ReadError flag will just be confusing now */ /* The ReadError flag will just be confusing now */
clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReadError, &dev->flags);
clear_bit(R5_ReWrite, &dev->flags); clear_bit(R5_ReWrite, &dev->flags);
} }
if (!rdev || !test_bit(In_sync, &rdev->flags) if (test_bit(R5_ReadError, &dev->flags))
|| test_bit(R5_ReadError, &dev->flags)) { clear_bit(R5_Insync, &dev->flags);
if (!test_bit(R5_Insync, &dev->flags)) {
s.failed++; s.failed++;
s.failed_num = i; s.failed_num = i;
} else }
set_bit(R5_Insync, &dev->flags);
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -3244,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh) ...@@ -3244,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh)
for (i=disks; i--; ) { for (i=disks; i--; ) {
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
dev = &sh->dev[i]; dev = &sh->dev[i];
clear_bit(R5_Insync, &dev->flags);
pr_debug("check %d: state 0x%lx read %p write %p written %p\n", pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
i, dev->flags, dev->toread, dev->towrite, dev->written); i, dev->flags, dev->toread, dev->towrite, dev->written);
...@@ -3282,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh) ...@@ -3282,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh)
blocked_rdev = rdev; blocked_rdev = rdev;
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
} }
if (!rdev || !test_bit(In_sync, &rdev->flags)) { clear_bit(R5_Insync, &dev->flags);
if (!rdev)
/* Not in-sync */;
else if (test_bit(In_sync, &rdev->flags))
set_bit(R5_Insync, &dev->flags);
else {
/* in sync if before recovery_offset */
if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
set_bit(R5_Insync, &dev->flags);
}
if (!test_bit(R5_Insync, &dev->flags)) {
/* The ReadError flag will just be confusing now */ /* The ReadError flag will just be confusing now */
clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReadError, &dev->flags);
clear_bit(R5_ReWrite, &dev->flags); clear_bit(R5_ReWrite, &dev->flags);
} }
if (!rdev || !test_bit(In_sync, &rdev->flags) if (test_bit(R5_ReadError, &dev->flags))
|| test_bit(R5_ReadError, &dev->flags)) { clear_bit(R5_Insync, &dev->flags);
if (!test_bit(R5_Insync, &dev->flags)) {
if (s.failed < 2) if (s.failed < 2)
r6s.failed_num[s.failed] = i; r6s.failed_num[s.failed] = i;
s.failed++; s.failed++;
} else }
set_bit(R5_Insync, &dev->flags);
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -4971,8 +5057,10 @@ static int run(mddev_t *mddev) ...@@ -4971,8 +5057,10 @@ static int run(mddev_t *mddev)
list_for_each_entry(rdev, &mddev->disks, same_set) { list_for_each_entry(rdev, &mddev->disks, same_set) {
if (rdev->raid_disk < 0) if (rdev->raid_disk < 0)
continue; continue;
if (test_bit(In_sync, &rdev->flags)) if (test_bit(In_sync, &rdev->flags)) {
working_disks++; working_disks++;
continue;
}
/* This disc is not fully in-sync. However if it /* This disc is not fully in-sync. However if it
* just stored parity (beyond the recovery_offset), * just stored parity (beyond the recovery_offset),
* when we don't need to be concerned about the * when we don't need to be concerned about the
...@@ -5005,7 +5093,7 @@ static int run(mddev_t *mddev) ...@@ -5005,7 +5093,7 @@ static int run(mddev_t *mddev)
mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks) mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
- working_disks); - working_disks);
if (mddev->degraded > conf->max_degraded) { if (has_failed(conf)) {
printk(KERN_ERR "md/raid:%s: not enough operational devices" printk(KERN_ERR "md/raid:%s: not enough operational devices"
" (%d/%d failed)\n", " (%d/%d failed)\n",
mdname(mddev), mddev->degraded, conf->raid_disks); mdname(mddev), mddev->degraded, conf->raid_disks);
...@@ -5207,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev) ...@@ -5207,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev)
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->disks + i; tmp = conf->disks + i;
if (tmp->rdev if (tmp->rdev
&& tmp->rdev->recovery_offset == MaxSector
&& !test_bit(Faulty, &tmp->rdev->flags) && !test_bit(Faulty, &tmp->rdev->flags)
&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) { && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
unsigned long flags; unsigned long flags;
...@@ -5242,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number) ...@@ -5242,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
* isn't possible. * isn't possible.
*/ */
if (!test_bit(Faulty, &rdev->flags) && if (!test_bit(Faulty, &rdev->flags) &&
mddev->degraded <= conf->max_degraded && !has_failed(conf) &&
number < conf->raid_disks) { number < conf->raid_disks) {
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
...@@ -5270,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -5270,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
int first = 0; int first = 0;
int last = conf->raid_disks - 1; int last = conf->raid_disks - 1;
if (mddev->degraded > conf->max_degraded) if (has_failed(conf))
/* no point adding a device */ /* no point adding a device */
return -EINVAL; return -EINVAL;
...@@ -5362,7 +5451,7 @@ static int check_reshape(mddev_t *mddev) ...@@ -5362,7 +5451,7 @@ static int check_reshape(mddev_t *mddev)
if (mddev->bitmap) if (mddev->bitmap)
/* Cannot grow a bitmap yet */ /* Cannot grow a bitmap yet */
return -EBUSY; return -EBUSY;
if (mddev->degraded > conf->max_degraded) if (has_failed(conf))
return -EINVAL; return -EINVAL;
if (mddev->delta_disks < 0) { if (mddev->delta_disks < 0) {
/* We might be able to shrink, but the devices must /* We might be able to shrink, but the devices must
...@@ -5437,7 +5526,12 @@ static int raid5_start_reshape(mddev_t *mddev) ...@@ -5437,7 +5526,12 @@ static int raid5_start_reshape(mddev_t *mddev)
/* Add some new drives, as many as will fit. /* Add some new drives, as many as will fit.
* We know there are enough to make the newly sized array work. * We know there are enough to make the newly sized array work.
* Don't add devices if we are reducing the number of
* devices in the array. This is because it is not possible
* to correctly record the "partially reconstructed" state of
* such devices during the reshape and confusion could result.
*/ */
if (mddev->delta_disks >= 0)
list_for_each_entry(rdev, &mddev->disks, same_set) list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk < 0 && if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) { !test_bit(Faulty, &rdev->flags)) {
...@@ -5460,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev) ...@@ -5460,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev)
} }
/* When a reshape changes the number of devices, ->degraded /* When a reshape changes the number of devices, ->degraded
* is measured against the large of the pre and post number of * is measured against the larger of the pre and post number of
* devices.*/ * devices.*/
if (mddev->delta_disks > 0) { if (mddev->delta_disks > 0) {
spin_lock_irqsave(&conf->device_lock, flags); spin_lock_irqsave(&conf->device_lock, flags);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment