Commit 8a392625 authored by Linus Torvalds

Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md: (52 commits)
  md: Protect access to mddev->disks list using RCU
  md: only count actual openers as access which prevent a 'stop'
  md: linear: Make array_size sector-based and rename it to array_sectors.
  md: Make mddev->array_size sector-based.
  md: Make super_type->rdev_size_change() take sector-based sizes.
  md: Fix check for overlapping devices.
  md: Tidy up rdev_size_store a bit:
  md: Remove some unused macros.
  md: Turn rdev->sb_offset into a sector-based quantity.
  md: Make calc_dev_sboffset() return a sector count.
  md: Replace calc_dev_size() by calc_num_sectors().
  md: Make update_size() take the number of sectors.
  md: Better control of when do_md_stop is allowed to stop the array.
  md: get_disk_info(): Don't convert between signed and unsigned and back.
  md: Simplify restart_array().
  md: alloc_disk_sb(): Return proper error value.
  md: Simplify sb_equal().
  md: Simplify uuid_equal().
  md: sb_equal(): Fix misleading printk.
  md: Fix a typo in the comment to cmd_match().
  ...
parents 519f0141 4b80991c
Documentation/md.txt
@@ -236,6 +236,11 @@ All md devices contain:
     writing the word for the desired state, however some states
     cannot be explicitly set, and some transitions are not allowed.
+
+    Select/poll works on this file.  All changes except between
+    active_idle and active (which can be frequent and are not
+    very interesting) are notified.  active->active_idle is
+    reported if the metadata is externally managed.
 
    clear
        No devices, no size, no level
        Writing is equivalent to STOP_ARRAY ioctl
@@ -292,6 +297,10 @@ Each directory contains:
        writemostly - device will only be subject to read
                 requests if there are no other options.
                 This applies only to raid1 arrays.
+       blocked  - device has failed, metadata is "external",
+                and the failure hasn't been acknowledged yet.
+                Writes that would write to this device if
+                it were not faulty are blocked.
        spare    - device is working, but not a full member.
                 This includes spares that are in the process
                 of being recovered to
@@ -301,6 +310,12 @@ Each directory contains:
     Writing "remove" removes the device from the array.
     Writing "writemostly" sets the writemostly flag.
     Writing "-writemostly" clears the writemostly flag.
+    Writing "blocked" sets the "blocked" flag.
+    Writing "-blocked" clears the "blocked" flag and allows writes
+        to complete.
+
+    This file responds to select/poll. Any change to 'faulty'
+    or 'blocked' causes an event.
 
   errors
     An approximate count of read errors that have been detected on
@@ -332,7 +347,7 @@ Each directory contains:
     for storage of data.  This will normally be the same as the
     component_size.  This can be written while assembling an
     array.  If a value less than the current component_size is
-    written, component_size will be reduced to this value.
+    written, it will be rejected.
 
 An active md device will also contain and entry for each active device
@@ -381,6 +396,19 @@ also have
     'check' and 'repair' will start the appropriate process
     providing the current state is 'idle'.
 
+    This file responds to select/poll.  Any important change in the value
+    triggers a poll event.  Sometimes the value will briefly be
+    "recover" if a recovery seems to be needed, but cannot be
+    achieved. In that case, the transition to "recover" isn't
+    notified, but the transition away is.
+
+   degraded
+    This contains a count of the number of devices by which the
+    array is degraded.  So an optimal array will show '0'.  A
+    single failed/missing drive will show '1', etc.
+    This file responds to select/poll, any increase or decrease
+    in the count of missing devices will trigger an event.
+
   mismatch_count
     When performing 'check' and 'repair', and possibly when
     performing 'resync', md will count the number of errors that are
...
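Aside: several attributes above are documented as responding to select/poll. Consuming those events from userspace follows the standard sysfs pattern (read the attribute, poll for POLLPRI|POLLERR, seek back, re-read). A minimal sketch, assuming an array named md0; error handling abbreviated:

```c
/* Watch an md sysfs attribute for change notifications.
 * Assumes /sys/block/md0/md/array_state exists.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	int fd = open("/sys/block/md0/md/array_state", O_RDONLY);
	if (fd < 0)
		return 1;
	for (;;) {
		struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLERR };
		ssize_t n;

		/* A sysfs attribute must be read before poll() will sleep
		 * on it; sysfs_notify() in the kernel then wakes us up. */
		lseek(fd, 0, SEEK_SET);
		n = read(fd, buf, sizeof(buf) - 1);
		if (n < 0)
			break;
		buf[n] = '\0';
		printf("array_state: %s", buf);
		if (poll(&pfd, 1, -1) < 0)
			break;
	}
	close(fd);
	return 0;
}
```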
drivers/md/bitmap.c
@@ -225,7 +225,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long index
         || test_bit(Faulty, &rdev->flags))
            continue;
 
-       target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
+       target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
 
        if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
            page->index = index;
@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long index
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
    mdk_rdev_t *rdev;
-   struct list_head *tmp;
    mddev_t *mddev = bitmap->mddev;
 
-   rdev_for_each(rdev, tmp, mddev)
+   rcu_read_lock();
+   rdev_for_each_rcu(rdev, mddev)
        if (test_bit(In_sync, &rdev->flags)
            && !test_bit(Faulty, &rdev->flags)) {
            int size = PAGE_SIZE;
@@ -260,32 +260,37 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
                    + (long)(page->index * (PAGE_SIZE/512))
                    + size/512 > 0)
                    /* bitmap runs in to metadata */
-                   return -EINVAL;
+                   goto bad_alignment;
                if (rdev->data_offset + mddev->size*2
-                   > rdev->sb_offset*2 + bitmap->offset)
+                   > rdev->sb_start + bitmap->offset)
                    /* data runs in to bitmap */
-                   return -EINVAL;
-           } else if (rdev->sb_offset*2 < rdev->data_offset) {
+                   goto bad_alignment;
+           } else if (rdev->sb_start < rdev->data_offset) {
                /* METADATA BITMAP DATA */
-               if (rdev->sb_offset*2
+               if (rdev->sb_start
                    + bitmap->offset
                    + page->index*(PAGE_SIZE/512) + size/512
                    > rdev->data_offset)
                    /* bitmap runs in to data */
-                   return -EINVAL;
+                   goto bad_alignment;
            } else {
                /* DATA METADATA BITMAP - no problems */
            }
            md_super_write(mddev, rdev,
-                      (rdev->sb_offset<<1) + bitmap->offset
+                      rdev->sb_start + bitmap->offset
                       + page->index * (PAGE_SIZE/512),
                       size,
                       page);
        }
+   rcu_read_unlock();
 
    if (wait)
        md_super_wait(mddev);
    return 0;
+
+ bad_alignment:
+   rcu_read_unlock();
+   return -EINVAL;
 }
 
 static void bitmap_file_kick(struct bitmap *bitmap);
@@ -454,8 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
    spin_unlock_irqrestore(&bitmap->lock, flags);
    sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
    sb->events = cpu_to_le64(bitmap->mddev->events);
-   if (!bitmap->mddev->degraded)
-       sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
+   if (bitmap->mddev->events < bitmap->events_cleared) {
+       /* rocking back to read-only */
+       bitmap->events_cleared = bitmap->mddev->events;
+       sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+   }
    kunmap_atomic(sb, KM_USER0);
    write_page(bitmap, bitmap->sb_page, 1);
 }
@@ -1085,9 +1093,19 @@ void bitmap_daemon_work(struct bitmap *bitmap)
            } else
                spin_unlock_irqrestore(&bitmap->lock, flags);
            lastpage = page;
-/*
-           printk("bitmap clean at page %lu\n", j);
-*/
+
+           /* We are possibly going to clear some bits, so make
+            * sure that events_cleared is up-to-date.
+            */
+           if (bitmap->need_sync) {
+               bitmap_super_t *sb;
+               bitmap->need_sync = 0;
+               sb = kmap_atomic(bitmap->sb_page, KM_USER0);
+               sb->events_cleared =
+                   cpu_to_le64(bitmap->events_cleared);
+               kunmap_atomic(sb, KM_USER0);
+               write_page(bitmap, bitmap->sb_page, 1);
+           }
            spin_lock_irqsave(&bitmap->lock, flags);
            clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
        }
@@ -1257,6 +1275,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors
        return;
    }
 
+   if (success &&
+       bitmap->events_cleared < bitmap->mddev->events) {
+       bitmap->events_cleared = bitmap->mddev->events;
+       bitmap->need_sync = 1;
+   }
+
    if (!success && ! (*bmc & NEEDED_MASK))
        *bmc |= NEEDED_MASK;
...
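Aside: the need_sync handshake added above enforces a write ordering: events_cleared must reach the on-disk superblock before any bit is cleared on disk. A schematic sketch of that ordering, with a hypothetical stand-in struct and helpers (not kernel code):

```c
/* Schematic of the ordering enforced by the need_sync flag above.
 * The struct and helper names are hypothetical, not kernel APIs.
 */
struct bitmap_state {
	unsigned long long events_cleared;
	int need_sync;
};

void write_superblock(struct bitmap_state *b);	/* persist sb to disk */
void clear_idle_bits(struct bitmap_state *b);	/* clear clean bits on disk */

/* bitmap_endwrite(): a successful write lets bits be cleared up to the
 * current event count, but the superblock write is owed first. */
static void note_write_success(struct bitmap_state *b,
			       unsigned long long events)
{
	if (b->events_cleared < events) {
		b->events_cleared = events;
		b->need_sync = 1;
	}
}

/* bitmap_daemon_work(): persist events_cleared, only then clear bits. */
static void daemon_clean_pass(struct bitmap_state *b)
{
	if (b->need_sync) {
		b->need_sync = 0;
		write_superblock(b);
	}
	clear_idle_bits(b);
}
```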
drivers/md/faulty.c
@@ -297,7 +297,7 @@ static int run(mddev_t *mddev)
    rdev_for_each(rdev, tmp, mddev)
        conf->rdev = rdev;
 
-   mddev->array_size = mddev->size;
+   mddev->array_sectors = mddev->size * 2;
    mddev->private = conf;
 
    reconfig(mddev, mddev->layout, -1);
...
drivers/md/linear.c
@@ -122,13 +122,13 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
        return NULL;
 
    cnt = 0;
-   conf->array_size = 0;
+   conf->array_sectors = 0;
 
    rdev_for_each(rdev, tmp, mddev) {
        int j = rdev->raid_disk;
        dev_info_t *disk = conf->disks + j;
 
-       if (j < 0 || j > raid_disks || disk->rdev) {
+       if (j < 0 || j >= raid_disks || disk->rdev) {
            printk("linear: disk numbering problem. Aborting!\n");
            goto out;
        }
@@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
            blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
        disk->size = rdev->size;
-       conf->array_size += rdev->size;
+       conf->array_sectors += rdev->size * 2;
 
        cnt++;
    }
@@ -155,7 +155,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
        goto out;
    }
 
-   min_spacing = conf->array_size;
+   min_spacing = conf->array_sectors / 2;
    sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
 
    /* min_spacing is the minimum spacing that will fit the hash
@@ -164,7 +164,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
     * that is larger than min_spacing as use the size of that as
     * the actual spacing
     */
-   conf->hash_spacing = conf->array_size;
+   conf->hash_spacing = conf->array_sectors / 2;
    for (i=0; i < cnt-1 ; i++) {
        sector_t sz = 0;
        int j;
@@ -194,7 +194,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
        unsigned round;
        unsigned long base;
 
-       sz = conf->array_size >> conf->preshift;
+       sz = conf->array_sectors >> (conf->preshift + 1);
        sz += 1; /* force round-up */
        base = conf->hash_spacing >> conf->preshift;
        round = sector_div(sz, base);
@@ -221,7 +221,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
    curr_offset = 0;
    i = 0;
    for (curr_offset = 0;
-        curr_offset < conf->array_size;
+        curr_offset < conf->array_sectors / 2;
         curr_offset += conf->hash_spacing) {
        while (i < raid_disks-1 &&
@@ -258,7 +258,7 @@ static int linear_run (mddev_t *mddev)
    if (!conf)
        return 1;
    mddev->private = conf;
-   mddev->array_size = conf->array_size;
+   mddev->array_sectors = conf->array_sectors;
 
    blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
    mddev->queue->unplug_fn = linear_unplug;
@@ -292,8 +292,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
    newconf->prev = mddev_to_conf(mddev);
    mddev->private = newconf;
    mddev->raid_disks++;
-   mddev->array_size = newconf->array_size;
-   set_capacity(mddev->gendisk, mddev->array_size << 1);
+   mddev->array_sectors = newconf->array_sectors;
+   set_capacity(mddev->gendisk, mddev->array_sectors);
    return 0;
 }
...
drivers/md/md.c
@@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev)
 {
    atomic_inc(&md_event_count);
    wake_up(&md_event_waiters);
-   sysfs_notify(&mddev->kobj, NULL, "sync_action");
 }
 EXPORT_SYMBOL_GPL(md_new_event);
@@ -274,10 +273,12 @@ static mddev_t * mddev_find(dev_t unit)
    INIT_LIST_HEAD(&new->all_mddevs);
    init_timer(&new->safemode_timer);
    atomic_set(&new->active, 1);
+   atomic_set(&new->openers, 0);
    spin_lock_init(&new->write_lock);
    init_waitqueue_head(&new->sb_wait);
    init_waitqueue_head(&new->recovery_wait);
    new->reshape_position = MaxSector;
+   new->resync_min = 0;
    new->resync_max = MaxSector;
    new->level = LEVEL_NONE;
@@ -347,21 +348,20 @@ static struct mdk_personality *find_pers(int level, char *clevel)
    return NULL;
 }
 
+/* return the offset of the super block in 512byte sectors */
 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 {
-   sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
-   return MD_NEW_SIZE_BLOCKS(size);
+   sector_t num_sectors = bdev->bd_inode->i_size / 512;
+   return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
-static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
 {
-   sector_t size;
-
-   size = rdev->sb_offset;
-
+   sector_t num_sectors = rdev->sb_start;
    if (chunk_size)
-       size &= ~((sector_t)chunk_size/1024 - 1);
-   return size;
+       num_sectors &= ~((sector_t)chunk_size/512 - 1);
+   return num_sectors;
 }
 
 static int alloc_disk_sb(mdk_rdev_t * rdev)
@@ -372,7 +372,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
    rdev->sb_page = alloc_page(GFP_KERNEL);
    if (!rdev->sb_page) {
        printk(KERN_ALERT "md: out of memory.\n");
-       return -EINVAL;
+       return -ENOMEM;
    }
 
    return 0;
@@ -384,7 +384,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
        put_page(rdev->sb_page);
        rdev->sb_loaded = 0;
        rdev->sb_page = NULL;
-       rdev->sb_offset = 0;
+       rdev->sb_start = 0;
        rdev->size = 0;
    }
 }
@@ -530,7 +530,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
        return 0;
 
-   if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
+   if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
        goto fail;
    rdev->sb_loaded = 1;
    return 0;
@@ -543,17 +543,12 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 {
-   if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
-        (sb1->set_uuid1 == sb2->set_uuid1) &&
-        (sb1->set_uuid2 == sb2->set_uuid2) &&
-        (sb1->set_uuid3 == sb2->set_uuid3))
-       return 1;
-   return 0;
+   return  sb1->set_uuid0 == sb2->set_uuid0 &&
+       sb1->set_uuid1 == sb2->set_uuid1 &&
+       sb1->set_uuid2 == sb2->set_uuid2 &&
+       sb1->set_uuid3 == sb2->set_uuid3;
 }
 
 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 {
    int ret;
@@ -564,7 +559,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
    if (!tmp1 || !tmp2) {
        ret = 0;
-       printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+       printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
        goto abort;
    }
@@ -577,11 +572,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
    tmp1->nr_disks = 0;
    tmp2->nr_disks = 0;
 
-   if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
-       ret = 0;
-   else
-       ret = 1;
+   ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
 abort:
    kfree(tmp1);
    kfree(tmp2);
@@ -658,11 +649,14 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
  */
 
 struct super_type  {
    char            *name;
    struct module       *owner;
-   int         (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
-   int         (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
-   void            (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+   int         (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
+                     int minor_version);
+   int         (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+   void            (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+   unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
+                       sector_t num_sectors);
 };
 
 /*
@@ -673,16 +667,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
    char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
    mdp_super_t *sb;
    int ret;
-   sector_t sb_offset;
 
    /*
-    * Calculate the position of the superblock,
+    * Calculate the position of the superblock (512byte sectors),
     * it's at the end of the disk.
     *
     * It also happens to be a multiple of 4Kb.
     */
-   sb_offset = calc_dev_sboffset(rdev->bdev);
-   rdev->sb_offset = sb_offset;
+   rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 
    ret = read_disk_sb(rdev, MD_SB_BYTES);
    if (ret) return ret;
@@ -759,7 +751,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
        else
            ret = 0;
    }
-   rdev->size = calc_dev_size(rdev, sb->chunk_size);
+   rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
 
    if (rdev->size < sb->size && sb->level > 1)
        /* "this cannot possibly happen" ... */
@@ -1003,6 +995,26 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
    sb->sb_csum = calc_sb_csum(sb);
 }
 
+/*
+ * rdev_size_change for 0.90.0
+ */
+static unsigned long long
+super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
+{
+   if (num_sectors && num_sectors < rdev->mddev->size * 2)
+       return 0; /* component must fit device */
+   if (rdev->mddev->bitmap_offset)
+       return 0; /* can't move bitmap */
+   rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+   if (!num_sectors || num_sectors > rdev->sb_start)
+       num_sectors = rdev->sb_start;
+   md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+              rdev->sb_page);
+   md_super_wait(rdev->mddev);
+   return num_sectors / 2; /* kB for sysfs */
+}
+
 /*
  * version 1 superblock
  */
@@ -1034,12 +1046,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 {
    struct mdp_superblock_1 *sb;
    int ret;
-   sector_t sb_offset;
+   sector_t sb_start;
    char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
    int bmask;
 
    /*
-    * Calculate the position of the superblock.
+    * Calculate the position of the superblock in 512byte sectors.
     * It is always aligned to a 4K boundary and
     * depeding on minor_version, it can be:
     * 0: At least 8K, but less than 12K, from end of device
@@ -1048,22 +1060,20 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
     */
    switch(minor_version) {
    case 0:
-       sb_offset = rdev->bdev->bd_inode->i_size >> 9;
-       sb_offset -= 8*2;
-       sb_offset &= ~(sector_t)(4*2-1);
-       /* convert from sectors to K */
-       sb_offset /= 2;
+       sb_start = rdev->bdev->bd_inode->i_size >> 9;
+       sb_start -= 8*2;
+       sb_start &= ~(sector_t)(4*2-1);
        break;
    case 1:
-       sb_offset = 0;
+       sb_start = 0;
        break;
    case 2:
-       sb_offset = 4;
+       sb_start = 8;
        break;
    default:
        return -EINVAL;
    }
-   rdev->sb_offset = sb_offset;
+   rdev->sb_start = sb_start;
 
    /* superblock is rarely larger than 1K, but it can be larger,
     * and it is safe to read 4k, so we do that
@@ -1077,7 +1087,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
    if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
        sb->major_version != cpu_to_le32(1) ||
        le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
-       le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+       le64_to_cpu(sb->super_offset) != rdev->sb_start ||
        (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
        return -EINVAL;
@@ -1113,7 +1123,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
        rdev->sb_size = (rdev->sb_size | bmask) + 1;
 
    if (minor_version
-       && rdev->data_offset < sb_offset + (rdev->sb_size/512))
+       && rdev->data_offset < sb_start + (rdev->sb_size/512))
        return -EINVAL;
 
    if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
@@ -1149,7 +1159,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
    if (minor_version)
        rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
    else
-       rdev->size = rdev->sb_offset;
+       rdev->size = rdev->sb_start / 2;
    if (rdev->size < le64_to_cpu(sb->data_size)/2)
        return -EINVAL;
    rdev->size = le64_to_cpu(sb->data_size)/2;
@@ -1328,35 +1338,74 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
    sb->sb_csum = calc_sb_1_csum(sb);
 }
+static unsigned long long
+super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
+{
+   struct mdp_superblock_1 *sb;
+   sector_t max_sectors;
+   if (num_sectors && num_sectors < rdev->mddev->size * 2)
+       return 0; /* component must fit device */
+   if (rdev->sb_start < rdev->data_offset) {
+       /* minor versions 1 and 2; superblock before data */
+       max_sectors = rdev->bdev->bd_inode->i_size >> 9;
+       max_sectors -= rdev->data_offset;
+       if (!num_sectors || num_sectors > max_sectors)
+           num_sectors = max_sectors;
+   } else if (rdev->mddev->bitmap_offset) {
+       /* minor version 0 with bitmap we can't move */
+       return 0;
+   } else {
+       /* minor version 0; superblock after data */
+       sector_t sb_start;
+       sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+       sb_start &= ~(sector_t)(4*2 - 1);
+       max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+       if (!num_sectors || num_sectors > max_sectors)
+           num_sectors = max_sectors;
+       rdev->sb_start = sb_start;
+   }
+   sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
+   sb->data_size = cpu_to_le64(num_sectors);
+   sb->super_offset = rdev->sb_start;
+   sb->sb_csum = calc_sb_1_csum(sb);
+   md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+              rdev->sb_page);
+   md_super_wait(rdev->mddev);
+   return num_sectors / 2; /* kB for sysfs */
+}
 
 static struct super_type super_types[] = {
    [0] = {
        .name   = "0.90.0",
        .owner  = THIS_MODULE,
        .load_super = super_90_load,
        .validate_super = super_90_validate,
        .sync_super = super_90_sync,
+       .rdev_size_change = super_90_rdev_size_change,
    },
    [1] = {
        .name   = "md-1",
        .owner  = THIS_MODULE,
        .load_super = super_1_load,
        .validate_super = super_1_validate,
        .sync_super = super_1_sync,
+       .rdev_size_change = super_1_rdev_size_change,
    },
 };
 
 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 {
-   struct list_head *tmp, *tmp2;
    mdk_rdev_t *rdev, *rdev2;
 
-   rdev_for_each(rdev, tmp, mddev1)
-       rdev_for_each(rdev2, tmp2, mddev2)
+   rcu_read_lock();
+   rdev_for_each_rcu(rdev, mddev1)
+       rdev_for_each_rcu(rdev2, mddev2)
            if (rdev->bdev->bd_contains ==
-               rdev2->bdev->bd_contains)
+               rdev2->bdev->bd_contains) {
+               rcu_read_unlock();
                return 1;
+           }
+   rcu_read_unlock();
    return 0;
 }
@@ -1423,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
            kobject_del(&rdev->kobj);
            goto fail;
        }
-   list_add(&rdev->same_set, &mddev->disks);
+   list_add_rcu(&rdev->same_set, &mddev->disks);
    bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
    return 0;
@@ -1448,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
        return;
    }
    bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
-   list_del_init(&rdev->same_set);
+   list_del_rcu(&rdev->same_set);
    printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
    rdev->mddev = NULL;
    sysfs_remove_link(&rdev->kobj, "block");
 
    /* We need to delay this, otherwise we can deadlock when
-    * writing to 'remove' to "dev/state"
+    * writing to 'remove' to "dev/state".  We also need
+    * to delay it due to rcu usage.
     */
+   synchronize_rcu();
    INIT_WORK(&rdev->del_work, md_delayed_delete);
    kobject_get(&rdev->kobj);
    schedule_work(&rdev->del_work);
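Aside: the hunks above all apply the same RCU discipline to mddev->disks: readers bracket traversal with rcu_read_lock()/rcu_read_unlock(), writers use list_add_rcu()/list_del_rcu(), and an rdev may only be torn down after synchronize_rcu() guarantees no reader still holds a reference. A compressed, kernel-style sketch of the pattern (illustrative fragments, not standalone compilable code):

```c
/* Reader side: may run concurrently with add/remove. */
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
	if (test_bit(Faulty, &rdev->flags))
		continue;
	/* rdev cannot be freed while we are inside this section */
}
rcu_read_unlock();

/* Writer side, removal: unlink, wait for readers, then free. */
list_del_rcu(&rdev->same_set);	/* new readers can no longer find it */
synchronize_rcu();		/* wait for existing readers to drain */
/* only now is teardown safe (md defers it to a work item) */
```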
@@ -1511,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
    if (rdev->mddev)
        MD_BUG();
    free_disk_sb(rdev);
-   list_del_init(&rdev->same_set);
 #ifndef MODULE
    if (test_bit(AutoDetected, &rdev->flags))
        md_autodetect_dev(rdev->bdev->bd_dev);
@@ -1758,11 +1808,11 @@ static void md_update_sb(mddev_t * mddev, int force_change)
        dprintk("%s ", bdevname(rdev->bdev,b));
        if (!test_bit(Faulty, &rdev->flags)) {
            md_super_write(mddev,rdev,
-                      rdev->sb_offset<<1, rdev->sb_size,
+                      rdev->sb_start, rdev->sb_size,
                       rdev->sb_page);
            dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
                bdevname(rdev->bdev,b),
-               (unsigned long long)rdev->sb_offset);
+               (unsigned long long)rdev->sb_start);
            rdev->sb_events = mddev->events;
 
        } else
@@ -1787,7 +1837,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
 }
 
-/* words written to sysfs files may, or my not, be \n terminated.
+/* words written to sysfs files may, or may not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
  */
 static int cmd_match(const char *cmd, const char *str)
@@ -1886,6 +1936,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
        err = 0;
    }
+   if (!err)
+       sysfs_notify(&rdev->kobj, NULL, "state");
    return err ? err : len;
 }
 static struct rdev_sysfs_entry rdev_state =
@@ -1931,7 +1983,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
        slot = -1;
    else if (e==buf || (*e && *e!= '\n'))
        return -EINVAL;
-   if (rdev->mddev->pers) {
+   if (rdev->mddev->pers && slot == -1) {
        /* Setting 'slot' on an active array requires also
         * updating the 'rd%d' link, and communicating
         * with the personality with ->hot_*_disk.
@@ -1939,8 +1991,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
         * failed/spare devices.  This normally happens automatically,
         * but not when the metadata is externally managed.
         */
-       if (slot != -1)
-           return -EBUSY;
        if (rdev->raid_disk == -1)
            return -EEXIST;
        /* personality does all needed checks */
@@ -1954,6 +2004,43 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
        sysfs_remove_link(&rdev->mddev->kobj, nm);
        set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
        md_wakeup_thread(rdev->mddev->thread);
+   } else if (rdev->mddev->pers) {
+       mdk_rdev_t *rdev2;
+       struct list_head *tmp;
+       /* Activating a spare .. or possibly reactivating
+        * if we ever get bitmaps working here.
+        */
+
+       if (rdev->raid_disk != -1)
+           return -EBUSY;
+
+       if (rdev->mddev->pers->hot_add_disk == NULL)
+           return -EINVAL;
+
+       rdev_for_each(rdev2, tmp, rdev->mddev)
+           if (rdev2->raid_disk == slot)
+               return -EEXIST;
+
+       rdev->raid_disk = slot;
+       if (test_bit(In_sync, &rdev->flags))
+           rdev->saved_raid_disk = slot;
+       else
+           rdev->saved_raid_disk = -1;
+       err = rdev->mddev->pers->
+           hot_add_disk(rdev->mddev, rdev);
+       if (err) {
+           rdev->raid_disk = -1;
+           return err;
+       } else
+           sysfs_notify(&rdev->kobj, NULL, "state");
+       sprintf(nm, "rd%d", rdev->raid_disk);
+       if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
+           printk(KERN_WARNING
+                  "md: cannot register "
+                  "%s for %s\n",
+                  nm, mdname(rdev->mddev));
+
+       /* don't wakeup anyone, leave that to userspace. */
    } else {
        if (slot >= rdev->mddev->raid_disks)
            return -ENOSPC;
@@ -1962,6 +2049,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
        clear_bit(Faulty, &rdev->flags);
        clear_bit(WriteMostly, &rdev->flags);
        set_bit(In_sync, &rdev->flags);
+       sysfs_notify(&rdev->kobj, NULL, "state");
    }
    return len;
 }
@@ -1983,7 +2071,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
    unsigned long long offset = simple_strtoull(buf, &e, 10);
    if (e==buf || (*e && *e != '\n'))
        return -EINVAL;
-   if (rdev->mddev->pers)
+   if (rdev->mddev->pers && rdev->raid_disk >= 0)
        return -EBUSY;
    if (rdev->size && rdev->mddev->external)
        /* Must set offset before size, so overlap checks
@@ -2015,17 +2103,30 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
-   char *e;
-   unsigned long long size = simple_strtoull(buf, &e, 10);
+   unsigned long long size;
    unsigned long long oldsize = rdev->size;
    mddev_t *my_mddev = rdev->mddev;
 
-   if (e==buf || (*e && *e != '\n'))
+   if (strict_strtoull(buf, 10, &size) < 0)
        return -EINVAL;
-   if (my_mddev->pers)
-       return -EBUSY;
+   if (size < my_mddev->size)
+       return -EINVAL;
+   if (my_mddev->pers && rdev->raid_disk >= 0) {
+       if (my_mddev->persistent) {
+           size = super_types[my_mddev->major_version].
+               rdev_size_change(rdev, size * 2);
+           if (!size)
+               return -EBUSY;
+       } else if (!size) {
+           size = (rdev->bdev->bd_inode->i_size >> 10);
+           size -= rdev->data_offset/2;
+       }
+       if (size < my_mddev->size)
+           return -EINVAL; /* component must fit device */
+   }
+
    rdev->size = size;
-   if (size > oldsize && rdev->mddev->external) {
+   if (size > oldsize && my_mddev->external) {
        /* need to check that all other rdevs with the same ->bdev
         * do not overlap.  We need to unlock the mddev to avoid
         * a deadlock.  We have already changed rdev->size, and if
@@ -2044,8 +2145,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
            if (test_bit(AllReserved, &rdev2->flags) ||
                (rdev->bdev == rdev2->bdev &&
                 rdev != rdev2 &&
-                overlaps(rdev->data_offset, rdev->size,
-                     rdev2->data_offset, rdev2->size))) {
+                overlaps(rdev->data_offset, rdev->size * 2,
+                     rdev2->data_offset,
+                     rdev2->size * 2))) {
                overlap = 1;
                break;
            }
@@ -2067,8 +2169,6 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
            return -EBUSY;
        }
    }
-   if (size < my_mddev->size || my_mddev->size == 0)
-       my_mddev->size = size;
    return len;
 }
@@ -2512,7 +2612,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
  *     When written, doesn't tear down array, but just stops it
  * suspended (not supported yet)
  *     All IO requests will block. The array can be reconfigured.
- *     Writing this, if accepted, will block until array is quiessent
+ *     Writing this, if accepted, will block until array is quiescent
  * readonly
  *     no resync can happen.  no superblocks get written.
  *     write requests fail
@@ -2585,7 +2685,7 @@ array_state_show(mddev_t *mddev, char *page)
    return sprintf(page, "%s\n", array_states[st]);
 }
 
-static int do_md_stop(mddev_t * mddev, int ro);
+static int do_md_stop(mddev_t * mddev, int ro, int is_open);
 static int do_md_run(mddev_t * mddev);
 static int restart_array(mddev_t *mddev);
@@ -2599,16 +2699,16 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
        break;
    case clear:
        /* stopping an active array */
-       if (atomic_read(&mddev->active) > 1)
+       if (atomic_read(&mddev->openers) > 0)
            return -EBUSY;
-       err = do_md_stop(mddev, 0);
+       err = do_md_stop(mddev, 0, 0);
        break;
    case inactive:
        /* stopping an active array */
        if (mddev->pers) {
-           if (atomic_read(&mddev->active) > 1)
+           if (atomic_read(&mddev->openers) > 0)
                return -EBUSY;
-           err = do_md_stop(mddev, 2);
+           err = do_md_stop(mddev, 2, 0);
        } else
            err = 0; /* already inactive */
        break;
@@ -2616,7 +2716,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
        break; /* not supported yet */
    case readonly:
        if (mddev->pers)
-           err = do_md_stop(mddev, 1);
+           err = do_md_stop(mddev, 1, 0);
        else {
            mddev->ro = 1;
            set_disk_ro(mddev->gendisk, 1);
@@ -2626,7 +2726,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
    case read_auto:
        if (mddev->pers) {
            if (mddev->ro != 1)
-               err = do_md_stop(mddev, 1);
+               err = do_md_stop(mddev, 1, 0);
            else
                err = restart_array(mddev);
            if (err == 0) {
@@ -2681,8 +2781,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
    }
    if (err)
        return err;
-   else
+   else {
+       sysfs_notify(&mddev->kobj, NULL, "array_state");
        return len;
+   }
 }
 static struct md_sysfs_entry md_array_state =
 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
@@ -2785,7 +2887,7 @@ size_show(mddev_t *mddev, char *page)
    return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
 }
 
-static int update_size(mddev_t *mddev, unsigned long size);
+static int update_size(mddev_t *mddev, sector_t num_sectors);
 
 static ssize_t
 size_store(mddev_t *mddev, const char *buf, size_t len)
@@ -2802,7 +2904,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
        return -EINVAL;
 
    if (mddev->pers) {
-       err = update_size(mddev, size);
+       err = update_size(mddev, size * 2);
        md_update_sb(mddev, 1);
    } else {
        if (mddev->size == 0 ||
@@ -2899,7 +3001,7 @@ action_show(mddev_t *mddev, char *page)
            type = "check";
        else
            type = "repair";
-   } else
+   } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
        type = "recover";
    }
    return sprintf(page, "%s\n", type);
@@ -2921,15 +3023,19 @@ action_store(mddev_t *mddev, const char *page, size_t len)
    } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
           test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
        return -EBUSY;
-   else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
+   else if (cmd_match(page, "resync"))
+       set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+   else if (cmd_match(page, "recover")) {
+       set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-   else if (cmd_match(page, "reshape")) {
+   } else if (cmd_match(page, "reshape")) {
        int err;
        if (mddev->pers->start_reshape == NULL)
            return -EINVAL;
        err = mddev->pers->start_reshape(mddev);
        if (err)
            return err;
+       sysfs_notify(&mddev->kobj, NULL, "degraded");
    } else {
        if (cmd_match(page, "check"))
            set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -2940,6 +3046,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
    }
    set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
    md_wakeup_thread(mddev->thread);
+   sysfs_notify(&mddev->kobj, NULL, "sync_action");
    return len;
 }
@@ -3049,11 +3156,11 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
    unsigned long resync, dt, db;
-   resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
-   dt = ((jiffies - mddev->resync_mark) / HZ);
+   resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
+   dt = (jiffies - mddev->resync_mark) / HZ;
    if (!dt) dt++;
-   db = resync - (mddev->resync_mark_cnt);
-   return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+   db = resync - mddev->resync_mark_cnt;
+   return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
 }
 
 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
@@ -3074,6 +3181,36 @@ sync_completed_show(mddev_t *mddev, char *page)
 
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
+static ssize_t
+min_sync_show(mddev_t *mddev, char *page)
+{
+   return sprintf(page, "%llu\n",
+              (unsigned long long)mddev->resync_min);
+}
+static ssize_t
+min_sync_store(mddev_t *mddev, const char *buf, size_t len)
+{
+   unsigned long long min;
+   if (strict_strtoull(buf, 10, &min))
+       return -EINVAL;
+   if (min > mddev->resync_max)
+       return -EINVAL;
+   if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+       return -EBUSY;
+
+   /* Must be a multiple of chunk_size */
+   if (mddev->chunk_size) {
+       if (min & (sector_t)((mddev->chunk_size>>9)-1))
+           return -EINVAL;
+   }
+   mddev->resync_min = min;
+
+   return len;
+}
+
+static struct md_sysfs_entry md_min_sync =
+__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
+
 static ssize_t
 max_sync_show(mddev_t *mddev, char *page)
 {
@@ -3089,9 +3226,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
    if (strncmp(buf, "max", 3) == 0)
        mddev->resync_max = MaxSector;
    else {
-       char *ep;
-       unsigned long long max = simple_strtoull(buf, &ep, 10);
-       if (ep == buf || (*ep != 0 && *ep != '\n'))
+       unsigned long long max;
+       if (strict_strtoull(buf, 10, &max))
+           return -EINVAL;
+       if (max < mddev->resync_min)
            return -EINVAL;
        if (max < mddev->resync_max &&
            test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
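Aside: min_sync_store(), max_sync_store() and rdev_size_store() all switch from simple_strtoull() to strict_strtoull(), which rejects trailing garbage (a single terminating newline excepted) instead of silently ignoring it. A hedged userspace approximation of that contract, useful for seeing what inputs these sysfs files now refuse (the kernel helper was later renamed kstrtoull):

```c
/* Userspace approximation of the strict_strtoull() parsing contract
 * assumed above: the whole string must be one base-10 number, an
 * optional trailing '\n' is allowed, anything else is rejected.
 */
#include <errno.h>
#include <stdlib.h>

static int strict_parse_ull(const char *s, unsigned long long *res)
{
	char *end;

	errno = 0;
	*res = strtoull(s, &end, 10);
	if (end == s || errno)
		return -EINVAL;	/* no digits at all, or overflow */
	if (*end == '\n')
		end++;		/* sysfs writes are often \n-terminated */
	if (*end != '\0')
		return -EINVAL;	/* trailing garbage */
	return 0;
}
```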
@@ -3222,6 +3360,7 @@ static struct attribute *md_redundancy_attrs[] = {
    &md_sync_speed.attr,
    &md_sync_force_parallel.attr,
    &md_sync_completed.attr,
+   &md_min_sync.attr,
    &md_max_sync.attr,
    &md_suspend_lo.attr,
    &md_suspend_hi.attr,
@@ -3326,9 +3465,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
    disk->queue = mddev->queue;
    add_disk(disk);
    mddev->gendisk = disk;
+   mutex_unlock(&disks_mutex);
    error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
                     "%s", "md");
-   mutex_unlock(&disks_mutex);
    if (error)
        printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
               disk->disk_name);
@@ -3341,7 +3480,11 @@ static void md_safemode_timeout(unsigned long data)
 {
    mddev_t *mddev = (mddev_t *) data;
 
-   mddev->safemode = 1;
+   if (!atomic_read(&mddev->writes_pending)) {
+       mddev->safemode = 1;
+       if (mddev->external)
+           sysfs_notify(&mddev->kobj, NULL, "array_state");
+   }
    md_wakeup_thread(mddev->thread);
 }
@@ -3432,22 +3575,23 @@ static int do_md_run(mddev_t * mddev)
         * We don't want the data to overlap the metadata,
         * Internal Bitmap issues has handled elsewhere.
         */
-       if (rdev->data_offset < rdev->sb_offset) {
+       if (rdev->data_offset < rdev->sb_start) {
            if (mddev->size &&
                rdev->data_offset + mddev->size*2
-               > rdev->sb_offset*2) {
+               > rdev->sb_start) {
                printk("md: %s: data overlaps metadata\n",
                       mdname(mddev));
                return -EINVAL;
            }
        } else {
-           if (rdev->sb_offset*2 + rdev->sb_size/512
+           if (rdev->sb_start + rdev->sb_size/512
                > rdev->data_offset) {
                printk("md: %s: metadata overlaps data\n",
                       mdname(mddev));
                return -EINVAL;
            }
        }
+       sysfs_notify(&rdev->kobj, NULL, "state");
    }
 
    md_probe(mddev->unit, NULL, NULL);
@@ -3519,7 +3663,9 @@ static int do_md_run(mddev_t * mddev)
        mddev->ro = 2; /* read-only, but switch on first write */
 
    err = mddev->pers->run(mddev);
-   if (!err && mddev->pers->sync_request) {
+   if (err)
+       printk(KERN_ERR "md: pers->run() failed ...\n");
+   else if (mddev->pers->sync_request) {
        err = bitmap_create(mddev);
        if (err) {
            printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -3528,7 +3674,6 @@ static int do_md_run(mddev_t * mddev)
        }
    }
    if (err) {
-       printk(KERN_ERR "md: pers->run() failed ...\n");
        module_put(mddev->pers->owner);
        mddev->pers = NULL;
        bitmap_destroy(mddev);
@@ -3563,7 +3708,7 @@ static int do_md_run(mddev_t * mddev)
    if (mddev->flags)
        md_update_sb(mddev, 0);
 
-   set_capacity(disk, mddev->array_size<<1);
+   set_capacity(disk, mddev->array_sectors);
 
    /* If we call blk_queue_make_request here, it will
     * re-initialise max_sectors etc which may have been
@@ -3608,6 +3753,9 @@ static int do_md_run(mddev_t * mddev)
 
    mddev->changed = 1;
    md_new_event(mddev);
+   sysfs_notify(&mddev->kobj, NULL, "array_state");
+   sysfs_notify(&mddev->kobj, NULL, "sync_action");
+   sysfs_notify(&mddev->kobj, NULL, "degraded");
    kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
    return 0;
 }
@@ -3615,38 +3763,25 @@ static int do_md_run(mddev_t * mddev)
 static int restart_array(mddev_t *mddev)
 {
    struct gendisk *disk = mddev->gendisk;
-   int err;
 
-   /*
-    * Complain if it has no devices
-    */
-   err = -ENXIO;
+   /* Complain if it has no devices */
    if (list_empty(&mddev->disks))
-       goto out;
-
-   if (mddev->pers) {
-       err = -EBUSY;
-       if (!mddev->ro)
-           goto out;
-
-       mddev->safemode = 0;
-       mddev->ro = 0;
-       set_disk_ro(disk, 0);
-
-       printk(KERN_INFO "md: %s switched to read-write mode.\n",
-           mdname(mddev));
-       /*
-        * Kick recovery or resync if necessary
-        */
-       set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-       md_wakeup_thread(mddev->thread);
-       md_wakeup_thread(mddev->sync_thread);
-       err = 0;
-   } else
-       err = -EINVAL;
-
-out:
-   return err;
+       return -ENXIO;
+   if (!mddev->pers)
+       return -EINVAL;
+   if (!mddev->ro)
+       return -EBUSY;
+   mddev->safemode = 0;
+   mddev->ro = 0;
+   set_disk_ro(disk, 0);
+   printk(KERN_INFO "md: %s switched to read-write mode.\n",
+       mdname(mddev));
+   /* Kick recovery or resync if necessary */
+   set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+   md_wakeup_thread(mddev->thread);
+   md_wakeup_thread(mddev->sync_thread);
+   sysfs_notify(&mddev->kobj, NULL, "array_state");
+   return 0;
 }
/* similar to deny_write_access, but accounts for our holding a reference /* similar to deny_write_access, but accounts for our holding a reference
...@@ -3680,16 +3815,17 @@ static void restore_bitmap_write_access(struct file *file) ...@@ -3680,16 +3815,17 @@ static void restore_bitmap_write_access(struct file *file)
* 1 - switch to readonly * 1 - switch to readonly
* 2 - stop but do not disassemble array * 2 - stop but do not disassemble array
*/ */
static int do_md_stop(mddev_t * mddev, int mode) static int do_md_stop(mddev_t * mddev, int mode, int is_open)
{ {
int err = 0; int err = 0;
struct gendisk *disk = mddev->gendisk; struct gendisk *disk = mddev->gendisk;
if (atomic_read(&mddev->openers) > is_open) {
printk("md: %s still in use.\n",mdname(mddev));
return -EBUSY;
}
if (mddev->pers) { if (mddev->pers) {
if (atomic_read(&mddev->active)>2) {
printk("md: %s still in use.\n",mdname(mddev));
return -EBUSY;
}
if (mddev->sync_thread) { if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
...@@ -3773,10 +3909,11 @@ static int do_md_stop(mddev_t * mddev, int mode) ...@@ -3773,10 +3909,11 @@ static int do_md_stop(mddev_t * mddev, int mode)
export_array(mddev); export_array(mddev);
mddev->array_size = 0; mddev->array_sectors = 0;
mddev->size = 0; mddev->size = 0;
mddev->raid_disks = 0; mddev->raid_disks = 0;
mddev->recovery_cp = 0; mddev->recovery_cp = 0;
mddev->resync_min = 0;
mddev->resync_max = MaxSector; mddev->resync_max = MaxSector;
mddev->reshape_position = MaxSector; mddev->reshape_position = MaxSector;
mddev->external = 0; mddev->external = 0;
...@@ -3811,6 +3948,7 @@ static int do_md_stop(mddev_t * mddev, int mode) ...@@ -3811,6 +3948,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
mdname(mddev)); mdname(mddev));
err = 0; err = 0;
md_new_event(mddev); md_new_event(mddev);
sysfs_notify(&mddev->kobj, NULL, "array_state");
out: out:
return err; return err;
} }
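Editor's note: the stop logic no longer guesses from the mddev->active refcount (the old ">2" heuristic counted internal references as well as opens); it compares a dedicated openers count against is_open, which the caller sets to 1 when the request arrives through an open file descriptor on the array itself. Reduced to a sketch (may_stop() is a hypothetical helper name, not in the patch):

	/* Sketch: only opens other than the caller's own block a stop. */
	static int may_stop(mddev_t *mddev, int is_open)
	{
		if (atomic_read(&mddev->openers) > is_open)
			return -EBUSY;	/* someone else still holds the device open */
		return 0;
	}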
...@@ -3836,7 +3974,7 @@ static void autorun_array(mddev_t *mddev) ...@@ -3836,7 +3974,7 @@ static void autorun_array(mddev_t *mddev)
err = do_md_run (mddev); err = do_md_run (mddev);
if (err) { if (err) {
printk(KERN_WARNING "md: do_md_run() returned %d\n", err); printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
do_md_stop (mddev, 0); do_md_stop (mddev, 0, 0);
} }
} }
...@@ -3927,8 +4065,10 @@ static void autorun_devices(int part) ...@@ -3927,8 +4065,10 @@ static void autorun_devices(int part)
/* on success, candidates will be empty, on error /* on success, candidates will be empty, on error
* it won't... * it won't...
*/ */
rdev_for_each_list(rdev, tmp, candidates) rdev_for_each_list(rdev, tmp, candidates) {
list_del_init(&rdev->same_set);
export_rdev(rdev); export_rdev(rdev);
}
mddev_put(mddev); mddev_put(mddev);
} }
printk(KERN_INFO "md: ... autorun DONE.\n"); printk(KERN_INFO "md: ... autorun DONE.\n");
...@@ -4009,9 +4149,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) ...@@ -4009,9 +4149,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
char *ptr, *buf = NULL; char *ptr, *buf = NULL;
int err = -ENOMEM; int err = -ENOMEM;
-	md_allow_write(mddev);
-	file = kmalloc(sizeof(*file), GFP_KERNEL);
+	if (md_allow_write(mddev))
+		file = kmalloc(sizeof(*file), GFP_NOIO);
+	else
+		file = kmalloc(sizeof(*file), GFP_KERNEL);
if (!file) if (!file)
goto out; goto out;
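Editor's note: the allocation flag now depends on md_allow_write(). When the array could not yet be marked write-active (md_allow_write() returned non-zero; see its new return contract later in this commit), a GFP_KERNEL allocation could recurse into writeback aimed at this same array and deadlock, so GFP_NOIO is used instead. The pattern, with the intent spelled out (a condensed restatement of the hunk above, not new code):

	if (md_allow_write(mddev))
		/* array not yet writable: reclaim must not issue I/O to it */
		file = kmalloc(sizeof(*file), GFP_NOIO);
	else
		file = kmalloc(sizeof(*file), GFP_KERNEL);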
...@@ -4044,15 +4186,12 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) ...@@ -4044,15 +4186,12 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
static int get_disk_info(mddev_t * mddev, void __user * arg) static int get_disk_info(mddev_t * mddev, void __user * arg)
{ {
mdu_disk_info_t info; mdu_disk_info_t info;
-	unsigned int nr;
 	mdk_rdev_t *rdev;

 	if (copy_from_user(&info, arg, sizeof(info)))
 		return -EFAULT;

-	nr = info.number;
-	rdev = find_rdev_nr(mddev, nr);
+	rdev = find_rdev_nr(mddev, info.number);
if (rdev) { if (rdev) {
info.major = MAJOR(rdev->bdev->bd_dev); info.major = MAJOR(rdev->bdev->bd_dev);
info.minor = MINOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev);
...@@ -4172,8 +4311,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) ...@@ -4172,8 +4311,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
} }
if (err) if (err)
export_rdev(rdev); export_rdev(rdev);
else
sysfs_notify(&rdev->kobj, NULL, "state");
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
if (mddev->degraded)
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
return err; return err;
...@@ -4212,10 +4355,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) ...@@ -4212,10 +4355,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
if (!mddev->persistent) { if (!mddev->persistent) {
printk(KERN_INFO "md: nonpersistent superblock ...\n"); printk(KERN_INFO "md: nonpersistent superblock ...\n");
rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
} else } else
rdev->sb_offset = calc_dev_sboffset(rdev->bdev); rdev->sb_start = calc_dev_sboffset(rdev->bdev);
rdev->size = calc_dev_size(rdev, mddev->chunk_size); rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
err = bind_rdev_to_array(rdev, mddev); err = bind_rdev_to_array(rdev, mddev);
if (err) { if (err) {
...@@ -4232,9 +4375,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev) ...@@ -4232,9 +4375,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
-	if (!mddev->pers)
-		return -ENODEV;
rdev = find_rdev(mddev, dev); rdev = find_rdev(mddev, dev);
if (!rdev) if (!rdev)
return -ENXIO; return -ENXIO;
...@@ -4257,7 +4397,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) ...@@ -4257,7 +4397,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
{ {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
int err; int err;
-	unsigned int size;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
if (!mddev->pers) if (!mddev->pers)
...@@ -4285,13 +4424,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) ...@@ -4285,13 +4424,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
} }
 	if (mddev->persistent)
-		rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+		rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 	else
-		rdev->sb_offset =
-			rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;

-	size = calc_dev_size(rdev, mddev->chunk_size);
-	rdev->size = size;
+	rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
if (test_bit(Faulty, &rdev->flags)) { if (test_bit(Faulty, &rdev->flags)) {
printk(KERN_WARNING printk(KERN_WARNING
...@@ -4476,24 +4613,24 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) ...@@ -4476,24 +4613,24 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
return 0; return 0;
} }
static int update_size(mddev_t *mddev, unsigned long size) static int update_size(mddev_t *mddev, sector_t num_sectors)
{ {
mdk_rdev_t * rdev; mdk_rdev_t * rdev;
int rv; int rv;
struct list_head *tmp; struct list_head *tmp;
int fit = (size == 0); int fit = (num_sectors == 0);
if (mddev->pers->resize == NULL) if (mddev->pers->resize == NULL)
return -EINVAL; return -EINVAL;
/* The "size" is the amount of each device that is used. /* The "num_sectors" is the number of sectors of each device that
* This can only make sense for arrays with redundancy. * is used. This can only make sense for arrays with redundancy.
* linear and raid0 always use whatever space is available * linear and raid0 always use whatever space is available. We can only
* We can only consider changing the size if no resync * consider changing this number if no resync or reconstruction is
* or reconstruction is happening, and if the new size * happening, and if the new size is acceptable. It must fit before the
* is acceptable. It must fit before the sb_offset or, * sb_start or, if that is <data_offset, it must fit before the size
* if that is <data_offset, it must fit before the * of each device. If num_sectors is zero, we find the largest size
* size of each device. * that fits.
* If size is zero, we find the largest size that fits.
*/ */
if (mddev->sync_thread) if (mddev->sync_thread)
return -EBUSY; return -EBUSY;
...@@ -4501,19 +4638,20 @@ static int update_size(mddev_t *mddev, unsigned long size) ...@@ -4501,19 +4638,20 @@ static int update_size(mddev_t *mddev, unsigned long size)
sector_t avail; sector_t avail;
avail = rdev->size * 2; avail = rdev->size * 2;
if (fit && (size == 0 || size > avail/2)) if (fit && (num_sectors == 0 || num_sectors > avail))
size = avail/2; num_sectors = avail;
if (avail < ((sector_t)size << 1)) if (avail < num_sectors)
return -ENOSPC; return -ENOSPC;
} }
rv = mddev->pers->resize(mddev, (sector_t)size *2); rv = mddev->pers->resize(mddev, num_sectors);
if (!rv) { if (!rv) {
struct block_device *bdev; struct block_device *bdev;
bdev = bdget_disk(mddev->gendisk, 0); bdev = bdget_disk(mddev->gendisk, 0);
if (bdev) { if (bdev) {
mutex_lock(&bdev->bd_inode->i_mutex); mutex_lock(&bdev->bd_inode->i_mutex);
-			i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
+			i_size_write(bdev->bd_inode,
+				     (loff_t)mddev->array_sectors << 9);
mutex_unlock(&bdev->bd_inode->i_mutex); mutex_unlock(&bdev->bd_inode->i_mutex);
bdput(bdev); bdput(bdev);
} }
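Editor's note: most of this series moves KB-based fields (mddev->size, the old array_size) to sector-based ones (array_sectors, num_sectors), so fixed conversion factors appear throughout: 1 sector = 512 bytes, 1 KB = 2 sectors. A self-contained check of the arithmetic used above, where << 9 replaces the old << 10:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t kb = 1024;		/* unit of the old array_size field */
		uint64_t sectors = kb * 2;	/* KB -> 512-byte sectors */
		uint64_t bytes = sectors << 9;	/* sectors -> bytes, as in i_size_write() */

		assert(sectors == 2048);
		assert(bytes == kb << 10);	/* equals the old "array_size << 10" */
		assert(sectors / 2 == kb);	/* back to KB, as used for /proc/mdstat */
		return 0;
	}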
...@@ -4588,7 +4726,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) ...@@ -4588,7 +4726,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
return mddev->pers->reconfig(mddev, info->layout, -1); return mddev->pers->reconfig(mddev, info->layout, -1);
} }
 	if (info->size >= 0 && mddev->size != info->size)
-		rv = update_size(mddev, info->size);
+		rv = update_size(mddev, (sector_t)info->size * 2);
if (mddev->raid_disks != info->raid_disks) if (mddev->raid_disks != info->raid_disks)
rv = update_raid_disks(mddev, info->raid_disks); rv = update_raid_disks(mddev, info->raid_disks);
...@@ -4641,6 +4779,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) ...@@ -4641,6 +4779,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
return 0; return 0;
} }
+/*
+ * We have a problem here : there is no easy way to give a CHS
+ * virtual geometry. We currently pretend that we have a 2 heads
+ * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{ {
mddev_t *mddev = bdev->bd_disk->private_data; mddev_t *mddev = bdev->bd_disk->private_data;
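Editor's note: the CHS comment was moved here because md_getgeo() is the function it describes (it previously sat inside the md_ioctl() switch; see the removal below). The rest of md_getgeo() lies outside this hunk; a sketch of the fake geometry the comment implies, inferred rather than quoted from the patch:

	/* Sketch only: 2 heads x 4 sectors, cylinders derived from capacity. */
	geo->heads = 2;
	geo->sectors = 4;
	geo->cylinders = get_capacity(mddev->gendisk) / 8;	/* total sectors / (2*4) */
	return 0;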
...@@ -4785,19 +4929,13 @@ static int md_ioctl(struct inode *inode, struct file *file, ...@@ -4785,19 +4929,13 @@ static int md_ioctl(struct inode *inode, struct file *file,
goto done_unlock; goto done_unlock;
case STOP_ARRAY: case STOP_ARRAY:
err = do_md_stop (mddev, 0); err = do_md_stop (mddev, 0, 1);
goto done_unlock; goto done_unlock;
case STOP_ARRAY_RO: case STOP_ARRAY_RO:
err = do_md_stop (mddev, 1); err = do_md_stop (mddev, 1, 1);
goto done_unlock; goto done_unlock;
-	/*
-	 * We have a problem here : there is no easy way to give a CHS
-	 * virtual geometry. We currently pretend that we have a 2 heads
-	 * 4 sectors (with a BIG number of cylinders...). This drives
-	 * dosfs just mad... ;-)
-	 */
} }
/* /*
...@@ -4807,13 +4945,12 @@ static int md_ioctl(struct inode *inode, struct file *file, ...@@ -4807,13 +4945,12 @@ static int md_ioctl(struct inode *inode, struct file *file,
* here and hit the 'default' below, so only disallow * here and hit the 'default' below, so only disallow
* 'md' ioctls, and switch to rw mode if started auto-readonly. * 'md' ioctls, and switch to rw mode if started auto-readonly.
*/ */
-	if (_IOC_TYPE(cmd) == MD_MAJOR &&
-	    mddev->ro && mddev->pers) {
+	if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
 		if (mddev->ro == 2) {
 			mddev->ro = 0;
-			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			md_wakeup_thread(mddev->thread);
+			sysfs_notify(&mddev->kobj, NULL, "array_state");
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			md_wakeup_thread(mddev->thread);
} else { } else {
err = -EROFS; err = -EROFS;
goto abort_unlock; goto abort_unlock;
...@@ -4883,6 +5020,7 @@ static int md_open(struct inode *inode, struct file *file) ...@@ -4883,6 +5020,7 @@ static int md_open(struct inode *inode, struct file *file)
err = 0; err = 0;
mddev_get(mddev); mddev_get(mddev);
atomic_inc(&mddev->openers);
mddev_unlock(mddev); mddev_unlock(mddev);
check_disk_change(inode->i_bdev); check_disk_change(inode->i_bdev);
...@@ -4895,6 +5033,7 @@ static int md_release(struct inode *inode, struct file * file) ...@@ -4895,6 +5033,7 @@ static int md_release(struct inode *inode, struct file * file)
mddev_t *mddev = inode->i_bdev->bd_disk->private_data; mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
BUG_ON(!mddev); BUG_ON(!mddev);
atomic_dec(&mddev->openers);
mddev_put(mddev); mddev_put(mddev);
return 0; return 0;
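Editor's note: md_open() and md_release() now bracket every open of the block device with atomic_inc()/atomic_dec() on mddev->openers; that exact count is what do_md_stop() compares with is_open above. The pairing in skeleton form (locking and error paths omitted, illustrative only):

	static int md_open(struct inode *inode, struct file *file)
	{
		mddev_t *mddev = inode->i_bdev->bd_disk->private_data;

		mddev_get(mddev);
		atomic_inc(&mddev->openers);	/* counted by do_md_stop() */
		check_disk_change(inode->i_bdev);
		return 0;
	}

	static int md_release(struct inode *inode, struct file *file)
	{
		mddev_t *mddev = inode->i_bdev->bd_disk->private_data;

		atomic_dec(&mddev->openers);	/* undo md_open()'s count */
		mddev_put(mddev);
		return 0;
	}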
...@@ -5029,6 +5168,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -5029,6 +5168,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
if (!mddev->pers->error_handler) if (!mddev->pers->error_handler)
return; return;
mddev->pers->error_handler(mddev,rdev); mddev->pers->error_handler(mddev,rdev);
if (mddev->degraded)
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(StateChanged, &rdev->flags);
set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
...@@ -5258,10 +5400,11 @@ static int md_seq_show(struct seq_file *seq, void *v) ...@@ -5258,10 +5400,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (!list_empty(&mddev->disks)) { if (!list_empty(&mddev->disks)) {
if (mddev->pers) if (mddev->pers)
seq_printf(seq, "\n %llu blocks", seq_printf(seq, "\n %llu blocks",
(unsigned long long)mddev->array_size); (unsigned long long)
mddev->array_sectors / 2);
else else
seq_printf(seq, "\n %llu blocks", seq_printf(seq, "\n %llu blocks",
(unsigned long long)size); (unsigned long long)size);
} }
if (mddev->persistent) { if (mddev->persistent) {
if (mddev->major_version != 0 || if (mddev->major_version != 0 ||
...@@ -5391,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p) ...@@ -5391,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p)
static int is_mddev_idle(mddev_t *mddev) static int is_mddev_idle(mddev_t *mddev)
{ {
mdk_rdev_t * rdev; mdk_rdev_t * rdev;
-	struct list_head *tmp;
int idle; int idle;
long curr_events; long curr_events;
idle = 1; idle = 1;
-	rdev_for_each(rdev, tmp, mddev) {
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
curr_events = disk_stat_read(disk, sectors[0]) + curr_events = disk_stat_read(disk, sectors[0]) +
disk_stat_read(disk, sectors[1]) - disk_stat_read(disk, sectors[1]) -
...@@ -5428,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev) ...@@ -5428,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev)
idle = 0; idle = 0;
} }
} }
rcu_read_unlock();
return idle; return idle;
} }
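Editor's note: is_mddev_idle() now walks the rdev list under rcu_read_lock() instead of needing a stable list, matching the RCU protection of mddev->disks added elsewhere in this merge. The heuristic itself is unchanged: a member counts as busy if its I/O sector counters advanced by more than a small slack since the last sample. A self-contained model of that heuristic (the 4096-sector slack mirrors what md uses, but treat it as illustrative):

	#include <stdio.h>

	#define SLACK 4096	/* sectors of background noise tolerated */

	struct disk_sample { long last_events; };

	static int sample_is_idle(struct disk_sample *d, long curr_events)
	{
		if (curr_events - d->last_events > SLACK) {
			d->last_events = curr_events;	/* remember for next time */
			return 0;			/* disk was busy */
		}
		return 1;
	}

	int main(void)
	{
		struct disk_sample d = { .last_events = 0 };

		printf("%d\n", sample_is_idle(&d, 100));	/* 1: within slack */
		printf("%d\n", sample_is_idle(&d, 10000));	/* 0: real I/O happened */
		return 0;
	}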
...@@ -5451,6 +5595,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) ...@@ -5451,6 +5595,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
*/ */
void md_write_start(mddev_t *mddev, struct bio *bi) void md_write_start(mddev_t *mddev, struct bio *bi)
{ {
int did_change = 0;
if (bio_data_dir(bi) != WRITE) if (bio_data_dir(bi) != WRITE)
return; return;
...@@ -5461,6 +5606,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi) ...@@ -5461,6 +5606,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); md_wakeup_thread(mddev->sync_thread);
did_change = 1;
} }
atomic_inc(&mddev->writes_pending); atomic_inc(&mddev->writes_pending);
if (mddev->safemode == 1) if (mddev->safemode == 1)
...@@ -5471,10 +5617,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi) ...@@ -5471,10 +5617,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
mddev->in_sync = 0; mddev->in_sync = 0;
set_bit(MD_CHANGE_CLEAN, &mddev->flags); set_bit(MD_CHANGE_CLEAN, &mddev->flags);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
did_change = 1;
} }
 		spin_unlock_irq(&mddev->write_lock);
-		sysfs_notify(&mddev->kobj, NULL, "array_state");
 	}
+	if (did_change)
+		sysfs_notify(&mddev->kobj, NULL, "array_state");
wait_event(mddev->sb_wait, wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_CLEAN, &mddev->flags) && !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags)); !test_bit(MD_CHANGE_PENDING, &mddev->flags));
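Editor's note: md_write_start() now records under write_lock whether it actually changed anything and calls sysfs_notify() only afterwards, and only when did_change is set. sysfs_notify() can sleep, so it must not run inside spin_lock_irq(), and notifying on every write would flood pollers. The shape of the pattern (sketch, not quoted code):

	/* Mutate state under a spinlock, notify after dropping it. */
	int did_change = 0;

	spin_lock_irq(&mddev->write_lock);
	if (mddev->in_sync) {
		mddev->in_sync = 0;	/* state really changed */
		did_change = 1;
	}
	spin_unlock_irq(&mddev->write_lock);

	if (did_change)			/* may sleep, so outside the lock */
		sysfs_notify(&mddev->kobj, NULL, "array_state");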
...@@ -5495,13 +5643,18 @@ void md_write_end(mddev_t *mddev) ...@@ -5495,13 +5643,18 @@ void md_write_end(mddev_t *mddev)
* may proceed without blocking. It is important to call this before * may proceed without blocking. It is important to call this before
* attempting a GFP_KERNEL allocation while holding the mddev lock. * attempting a GFP_KERNEL allocation while holding the mddev lock.
* Must be called with mddev_lock held. * Must be called with mddev_lock held.
*
* In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
* is dropped, so return -EAGAIN after notifying userspace.
*/ */
void md_allow_write(mddev_t *mddev) int md_allow_write(mddev_t *mddev)
{ {
if (!mddev->pers) if (!mddev->pers)
return; return 0;
if (mddev->ro) if (mddev->ro)
return; return 0;
if (!mddev->pers->sync_request)
return 0;
spin_lock_irq(&mddev->write_lock); spin_lock_irq(&mddev->write_lock);
if (mddev->in_sync) { if (mddev->in_sync) {
...@@ -5512,14 +5665,14 @@ void md_allow_write(mddev_t *mddev) ...@@ -5512,14 +5665,14 @@ void md_allow_write(mddev_t *mddev)
mddev->safemode = 1; mddev->safemode = 1;
spin_unlock_irq(&mddev->write_lock); spin_unlock_irq(&mddev->write_lock);
md_update_sb(mddev, 0); md_update_sb(mddev, 0);
sysfs_notify(&mddev->kobj, NULL, "array_state"); sysfs_notify(&mddev->kobj, NULL, "array_state");
-		/* wait for the dirty state to be recorded in the metadata */
-		wait_event(mddev->sb_wait,
-			   !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
-			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
} else } else
spin_unlock_irq(&mddev->write_lock); spin_unlock_irq(&mddev->write_lock);
if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
return -EAGAIN;
else
return 0;
} }
EXPORT_SYMBOL_GPL(md_allow_write); EXPORT_SYMBOL_GPL(md_allow_write);
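Editor's note: since md_allow_write() can now fail with -EAGAIN (external metadata: the dirty transition was signalled to user space but cannot be confirmed while the mddev lock is held), every caller that relies on the array being write-active has to check the result, as the raid1_reshape() and resize_stripes() hunks below do:

	/* Usage pattern adopted by callers later in this commit. */
	int err = md_allow_write(mddev);
	if (err)
		return err;	/* typically -EAGAIN; the caller retries later */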
...@@ -5625,9 +5778,11 @@ void md_do_sync(mddev_t *mddev) ...@@ -5625,9 +5778,11 @@ void md_do_sync(mddev_t *mddev)
max_sectors = mddev->resync_max_sectors; max_sectors = mddev->resync_max_sectors;
mddev->resync_mismatches = 0; mddev->resync_mismatches = 0;
/* we don't use the checkpoint if there's a bitmap */ /* we don't use the checkpoint if there's a bitmap */
-		if (!mddev->bitmap &&
-		    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			j = mddev->resync_min;
+		else if (!mddev->bitmap)
 			j = mddev->recovery_cp;
} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->size << 1; max_sectors = mddev->size << 1;
else { else {
...@@ -5796,6 +5951,7 @@ void md_do_sync(mddev_t *mddev) ...@@ -5796,6 +5951,7 @@ void md_do_sync(mddev_t *mddev)
skip: skip:
mddev->curr_resync = 0; mddev->curr_resync = 0;
mddev->resync_min = 0;
mddev->resync_max = MaxSector; mddev->resync_max = MaxSector;
sysfs_notify(&mddev->kobj, NULL, "sync_completed"); sysfs_notify(&mddev->kobj, NULL, "sync_completed");
wake_up(&resync_wait); wake_up(&resync_wait);
...@@ -5845,7 +6001,8 @@ static int remove_and_add_spares(mddev_t *mddev) ...@@ -5845,7 +6001,8 @@ static int remove_and_add_spares(mddev_t *mddev)
if (rdev->raid_disk < 0 if (rdev->raid_disk < 0
&& !test_bit(Faulty, &rdev->flags)) { && !test_bit(Faulty, &rdev->flags)) {
rdev->recovery_offset = 0; rdev->recovery_offset = 0;
-			if (mddev->pers->hot_add_disk(mddev,rdev)) {
+			if (mddev->pers->
+			    hot_add_disk(mddev, rdev) == 0) {
char nm[20]; char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk); sprintf(nm, "rd%d", rdev->raid_disk);
if (sysfs_create_link(&mddev->kobj, if (sysfs_create_link(&mddev->kobj,
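Editor's note: this caller-side change pairs with the personality hunks below (multipath, raid1, raid10): ->hot_add_disk() used to return 1 on success and 0 on failure, and now follows the usual kernel convention of 0 on success and a negative errno on failure, so success is tested with == 0. In sketch form (link_spare_in_sysfs() is a hypothetical helper for illustration):

	int err = mddev->pers->hot_add_disk(mddev, rdev);

	if (err)	/* new convention: negative errno, e.g. -EEXIST or -EBUSY */
		printk(KERN_WARNING "md: hot add failed: %d\n", err);
	else		/* 0 now means success, where the old interface returned 1 */
		link_spare_in_sysfs(mddev, rdev);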
...@@ -5920,23 +6077,31 @@ void md_check_recovery(mddev_t *mddev) ...@@ -5920,23 +6077,31 @@ void md_check_recovery(mddev_t *mddev)
int spares = 0; int spares = 0;
if (!mddev->external) { if (!mddev->external) {
int did_change = 0;
spin_lock_irq(&mddev->write_lock); spin_lock_irq(&mddev->write_lock);
if (mddev->safemode && if (mddev->safemode &&
!atomic_read(&mddev->writes_pending) && !atomic_read(&mddev->writes_pending) &&
!mddev->in_sync && !mddev->in_sync &&
mddev->recovery_cp == MaxSector) { mddev->recovery_cp == MaxSector) {
mddev->in_sync = 1; mddev->in_sync = 1;
did_change = 1;
if (mddev->persistent) if (mddev->persistent)
set_bit(MD_CHANGE_CLEAN, &mddev->flags); set_bit(MD_CHANGE_CLEAN, &mddev->flags);
} }
if (mddev->safemode == 1) if (mddev->safemode == 1)
mddev->safemode = 0; mddev->safemode = 0;
spin_unlock_irq(&mddev->write_lock); spin_unlock_irq(&mddev->write_lock);
if (did_change)
sysfs_notify(&mddev->kobj, NULL, "array_state");
} }
if (mddev->flags) if (mddev->flags)
md_update_sb(mddev, 0); md_update_sb(mddev, 0);
rdev_for_each(rdev, rtmp, mddev)
if (test_and_clear_bit(StateChanged, &rdev->flags))
sysfs_notify(&rdev->kobj, NULL, "state");
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
...@@ -5951,7 +6116,9 @@ void md_check_recovery(mddev_t *mddev) ...@@ -5951,7 +6116,9 @@ void md_check_recovery(mddev_t *mddev)
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* success...*/ /* success...*/
/* activate any spares */ /* activate any spares */
-			mddev->pers->spare_active(mddev);
+			if (mddev->pers->spare_active(mddev))
+				sysfs_notify(&mddev->kobj, NULL,
+					     "degraded");
} }
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
...@@ -5965,13 +6132,18 @@ void md_check_recovery(mddev_t *mddev) ...@@ -5965,13 +6132,18 @@ void md_check_recovery(mddev_t *mddev)
mddev->recovery = 0; mddev->recovery = 0;
/* flag recovery needed just to double check */ /* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify(&mddev->kobj, NULL, "sync_action");
md_new_event(mddev); md_new_event(mddev);
goto unlock; goto unlock;
} }
+		/* Set RUNNING before clearing NEEDED to avoid
+		 * any transients in the value of "sync_action".
+		 */
+		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		/* Clear some bits that don't mean anything, but
 		 * might be left set
 		 */
-		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
...@@ -5989,17 +6161,19 @@ void md_check_recovery(mddev_t *mddev) ...@@ -5989,17 +6161,19 @@ void md_check_recovery(mddev_t *mddev)
/* Cannot proceed */ /* Cannot proceed */
goto unlock; goto unlock;
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if ((spares = remove_and_add_spares(mddev))) { } else if ((spares = remove_and_add_spares(mddev))) {
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if (mddev->recovery_cp < MaxSector) { } else if (mddev->recovery_cp < MaxSector) {
set_bit(MD_RECOVERY_SYNC, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
/* nothing to be done ... */ /* nothing to be done ... */
goto unlock; goto unlock;
if (mddev->pers->sync_request) { if (mddev->pers->sync_request) {
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
if (spares && mddev->bitmap && ! mddev->bitmap->file) { if (spares && mddev->bitmap && ! mddev->bitmap->file) {
/* We are adding a device or devices to an array /* We are adding a device or devices to an array
* which has the bitmap stored on all devices. * which has the bitmap stored on all devices.
...@@ -6018,9 +6192,16 @@ void md_check_recovery(mddev_t *mddev) ...@@ -6018,9 +6192,16 @@ void md_check_recovery(mddev_t *mddev)
mddev->recovery = 0; mddev->recovery = 0;
} else } else
md_wakeup_thread(mddev->sync_thread); md_wakeup_thread(mddev->sync_thread);
sysfs_notify(&mddev->kobj, NULL, "sync_action");
md_new_event(mddev); md_new_event(mddev);
} }
unlock: unlock:
if (!mddev->sync_thread) {
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
if (test_and_clear_bit(MD_RECOVERY_RECOVER,
&mddev->recovery))
sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
mddev_unlock(mddev); mddev_unlock(mddev);
} }
} }
...@@ -6047,7 +6228,7 @@ static int md_notify_reboot(struct notifier_block *this, ...@@ -6047,7 +6228,7 @@ static int md_notify_reboot(struct notifier_block *this,
for_each_mddev(mddev, tmp) for_each_mddev(mddev, tmp)
if (mddev_trylock(mddev)) { if (mddev_trylock(mddev)) {
do_md_stop (mddev, 1); do_md_stop (mddev, 1, 0);
mddev_unlock(mddev); mddev_unlock(mddev);
} }
/* /*
......
...@@ -281,13 +281,18 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -281,13 +281,18 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{ {
multipath_conf_t *conf = mddev->private; multipath_conf_t *conf = mddev->private;
struct request_queue *q; struct request_queue *q;
int found = 0; int err = -EEXIST;
int path; int path;
struct multipath_info *p; struct multipath_info *p;
int first = 0;
int last = mddev->raid_disks - 1;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
print_multipath_conf(conf); print_multipath_conf(conf);
for (path=0; path<mddev->raid_disks; path++) for (path = first; path <= last; path++)
if ((p=conf->multipaths+path)->rdev == NULL) { if ((p=conf->multipaths+path)->rdev == NULL) {
q = rdev->bdev->bd_disk->queue; q = rdev->bdev->bd_disk->queue;
blk_queue_stack_limits(mddev->queue, q); blk_queue_stack_limits(mddev->queue, q);
...@@ -307,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -307,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->raid_disk = path; rdev->raid_disk = path;
set_bit(In_sync, &rdev->flags); set_bit(In_sync, &rdev->flags);
rcu_assign_pointer(p->rdev, rdev); rcu_assign_pointer(p->rdev, rdev);
found = 1; err = 0;
break;
} }
print_multipath_conf(conf); print_multipath_conf(conf);
-	return found;
+	return err;
} }
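Editor's note: the first/last pair implements one idea across multipath, raid1 and raid10: if the rdev arrives with a pre-assigned slot (rdev->raid_disk >= 0, e.g. a device whose role is recorded in externally managed metadata), the search is pinned to exactly that slot; otherwise the whole range is scanned. Reduced to its core (try_claim_slot() is a hypothetical helper, not in the patch):

	/* Sketch of the slot-selection pattern shared by the personalities. */
	int slot;
	int first = 0;
	int last = mddev->raid_disks - 1;

	if (rdev->raid_disk >= 0)		/* role pre-assigned by metadata */
		first = last = rdev->raid_disk;	/* honour exactly that slot */

	for (slot = first; slot <= last; slot++)
		if (try_claim_slot(conf, slot, rdev) == 0)
			break;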
static int multipath_remove_disk(mddev_t *mddev, int number) static int multipath_remove_disk(mddev_t *mddev, int number)
...@@ -497,7 +504,7 @@ static int multipath_run (mddev_t *mddev) ...@@ -497,7 +504,7 @@ static int multipath_run (mddev_t *mddev)
/* /*
* Ok, everything is just fine now * Ok, everything is just fine now
*/ */
mddev->array_size = mddev->size; mddev->array_sectors = mddev->size * 2;
mddev->queue->unplug_fn = multipath_unplug; mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->backing_dev_info.congested_fn = multipath_congested; mddev->queue->backing_dev_info.congested_fn = multipath_congested;
......
...@@ -295,16 +295,16 @@ static int raid0_run (mddev_t *mddev) ...@@ -295,16 +295,16 @@ static int raid0_run (mddev_t *mddev)
goto out_free_conf; goto out_free_conf;
/* calculate array device size */ /* calculate array device size */
mddev->array_size = 0; mddev->array_sectors = 0;
rdev_for_each(rdev, tmp, mddev) rdev_for_each(rdev, tmp, mddev)
mddev->array_size += rdev->size; mddev->array_sectors += rdev->size * 2;
printk("raid0 : md_size is %llu blocks.\n", printk("raid0 : md_size is %llu blocks.\n",
(unsigned long long)mddev->array_size); (unsigned long long)mddev->array_sectors / 2);
printk("raid0 : conf->hash_spacing is %llu blocks.\n", printk("raid0 : conf->hash_spacing is %llu blocks.\n",
(unsigned long long)conf->hash_spacing); (unsigned long long)conf->hash_spacing);
{ {
sector_t s = mddev->array_size; sector_t s = mddev->array_sectors / 2;
sector_t space = conf->hash_spacing; sector_t space = conf->hash_spacing;
int round; int round;
conf->preshift = 0; conf->preshift = 0;
......
...@@ -1100,11 +1100,16 @@ static int raid1_spare_active(mddev_t *mddev) ...@@ -1100,11 +1100,16 @@ static int raid1_spare_active(mddev_t *mddev)
static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{ {
conf_t *conf = mddev->private; conf_t *conf = mddev->private;
int found = 0; int err = -EEXIST;
int mirror = 0; int mirror = 0;
mirror_info_t *p; mirror_info_t *p;
int first = 0;
int last = mddev->raid_disks - 1;
-	for (mirror=0; mirror < mddev->raid_disks; mirror++)
+	if (rdev->raid_disk >= 0)
+		first = last = rdev->raid_disk;
+
+	for (mirror = first; mirror <= last; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) { if ( !(p=conf->mirrors+mirror)->rdev) {
blk_queue_stack_limits(mddev->queue, blk_queue_stack_limits(mddev->queue,
...@@ -1119,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1119,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
p->head_position = 0; p->head_position = 0;
rdev->raid_disk = mirror; rdev->raid_disk = mirror;
found = 1; err = 0;
/* As all devices are equivalent, we don't need a full recovery /* As all devices are equivalent, we don't need a full recovery
* if this was recently any drive of the array * if this was recently any drive of the array
*/ */
...@@ -1130,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1130,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
} }
print_conf(conf); print_conf(conf);
return found; return err;
} }
static int raid1_remove_disk(mddev_t *mddev, int number) static int raid1_remove_disk(mddev_t *mddev, int number)
...@@ -2038,7 +2043,7 @@ static int run(mddev_t *mddev) ...@@ -2038,7 +2043,7 @@ static int run(mddev_t *mddev)
/* /*
* Ok, everything is just fine now * Ok, everything is just fine now
*/ */
mddev->array_size = mddev->size; mddev->array_sectors = mddev->size * 2;
mddev->queue->unplug_fn = raid1_unplug; mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_fn = raid1_congested;
...@@ -2100,14 +2105,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) ...@@ -2100,14 +2105,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems * any io in the removed space completes, but it hardly seems
* worth it. * worth it.
*/ */
mddev->array_size = sectors>>1; mddev->array_sectors = sectors;
set_capacity(mddev->gendisk, mddev->array_size << 1); set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1; mddev->changed = 1;
-	if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
+	if (mddev->array_sectors / 2 > mddev->size &&
+	    mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->size << 1; mddev->recovery_cp = mddev->size << 1;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
} }
mddev->size = mddev->array_size; mddev->size = mddev->array_sectors / 2;
mddev->resync_max_sectors = sectors; mddev->resync_max_sectors = sectors;
return 0; return 0;
} }
...@@ -2131,7 +2137,7 @@ static int raid1_reshape(mddev_t *mddev) ...@@ -2131,7 +2137,7 @@ static int raid1_reshape(mddev_t *mddev)
conf_t *conf = mddev_to_conf(mddev); conf_t *conf = mddev_to_conf(mddev);
int cnt, raid_disks; int cnt, raid_disks;
unsigned long flags; unsigned long flags;
int d, d2; int d, d2, err;
/* Cannot change chunk_size, layout, or level */ /* Cannot change chunk_size, layout, or level */
if (mddev->chunk_size != mddev->new_chunk || if (mddev->chunk_size != mddev->new_chunk ||
...@@ -2143,7 +2149,9 @@ static int raid1_reshape(mddev_t *mddev) ...@@ -2143,7 +2149,9 @@ static int raid1_reshape(mddev_t *mddev)
return -EINVAL; return -EINVAL;
} }
-	md_allow_write(mddev);
+	err = md_allow_write(mddev);
+	if (err)
+		return err;
raid_disks = mddev->raid_disks + mddev->delta_disks; raid_disks = mddev->raid_disks + mddev->delta_disks;
......
...@@ -1114,24 +1114,30 @@ static int raid10_spare_active(mddev_t *mddev) ...@@ -1114,24 +1114,30 @@ static int raid10_spare_active(mddev_t *mddev)
static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{ {
conf_t *conf = mddev->private; conf_t *conf = mddev->private;
int found = 0; int err = -EEXIST;
int mirror; int mirror;
mirror_info_t *p; mirror_info_t *p;
int first = 0;
int last = mddev->raid_disks - 1;
if (mddev->recovery_cp < MaxSector) if (mddev->recovery_cp < MaxSector)
/* only hot-add to in-sync arrays, as recovery is /* only hot-add to in-sync arrays, as recovery is
* very different from resync * very different from resync
*/ */
-		return 0;
+		return -EBUSY;
 	if (!enough(conf))
-		return 0;
+		return -EINVAL;
+
+	if (rdev->raid_disk)
+		first = last = rdev->raid_disk;

 	if (rdev->saved_raid_disk >= 0 &&
+	    rdev->saved_raid_disk >= first &&
 	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
 		mirror = rdev->saved_raid_disk;
 	else
-		mirror = 0;
-	for ( ; mirror < mddev->raid_disks; mirror++)
+		mirror = first;
+	for ( ; mirror <= last ; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) { if ( !(p=conf->mirrors+mirror)->rdev) {
blk_queue_stack_limits(mddev->queue, blk_queue_stack_limits(mddev->queue,
...@@ -1146,7 +1152,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1146,7 +1152,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
p->head_position = 0; p->head_position = 0;
rdev->raid_disk = mirror; rdev->raid_disk = mirror;
found = 1; err = 0;
if (rdev->saved_raid_disk != mirror) if (rdev->saved_raid_disk != mirror)
conf->fullsync = 1; conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev); rcu_assign_pointer(p->rdev, rdev);
...@@ -1154,7 +1160,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1154,7 +1160,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
} }
print_conf(conf); print_conf(conf);
return found; return err;
} }
static int raid10_remove_disk(mddev_t *mddev, int number) static int raid10_remove_disk(mddev_t *mddev, int number)
...@@ -2159,7 +2165,7 @@ static int run(mddev_t *mddev) ...@@ -2159,7 +2165,7 @@ static int run(mddev_t *mddev)
/* /*
* Ok, everything is just fine now * Ok, everything is just fine now
*/ */
mddev->array_size = size << (conf->chunk_shift-1); mddev->array_sectors = size << conf->chunk_shift;
mddev->resync_max_sectors = size << conf->chunk_shift; mddev->resync_max_sectors = size << conf->chunk_shift;
mddev->queue->unplug_fn = raid10_unplug; mddev->queue->unplug_fn = raid10_unplug;
......
...@@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi) ...@@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi)
return_bi = bi->bi_next; return_bi = bi->bi_next;
bi->bi_next = NULL; bi->bi_next = NULL;
bi->bi_size = 0; bi->bi_size = 0;
-		bi->bi_end_io(bi,
-			      test_bit(BIO_UPTODATE, &bi->bi_flags)
-				? 0 : -EIO);
+		bio_endio(bi, 0);
bi = return_bi; bi = return_bi;
} }
} }
static void print_raid5_conf (raid5_conf_t *conf); static void print_raid5_conf (raid5_conf_t *conf);
static int stripe_operations_active(struct stripe_head *sh)
{
return sh->check_state || sh->reconstruct_state ||
test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{ {
if (atomic_dec_and_test(&sh->count)) { if (atomic_dec_and_test(&sh->count)) {
...@@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) ...@@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
} }
md_wakeup_thread(conf->mddev->thread); md_wakeup_thread(conf->mddev->thread);
} else { } else {
BUG_ON(sh->ops.pending); BUG_ON(stripe_operations_active(sh));
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
atomic_dec(&conf->preread_active_stripes); atomic_dec(&conf->preread_active_stripes);
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
...@@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int ...@@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); BUG_ON(stripe_operations_active(sh));
CHECK_DEVLOCK(); CHECK_DEVLOCK();
pr_debug("init_stripe called, stripe %llu\n", pr_debug("init_stripe called, stripe %llu\n",
...@@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector ...@@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
return sh; return sh;
} }
/* test_and_ack_op() ensures that we only dequeue an operation once */
#define test_and_ack_op(op, pend) \
do { \
if (test_bit(op, &sh->ops.pending) && \
!test_bit(op, &sh->ops.complete)) { \
if (test_and_set_bit(op, &sh->ops.ack)) \
clear_bit(op, &pend); \
else \
ack++; \
} else \
clear_bit(op, &pend); \
} while (0)
/* find new work to run, do not resubmit work that is already
* in flight
*/
static unsigned long get_stripe_work(struct stripe_head *sh)
{
unsigned long pending;
int ack = 0;
pending = sh->ops.pending;
test_and_ack_op(STRIPE_OP_BIOFILL, pending);
test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
test_and_ack_op(STRIPE_OP_PREXOR, pending);
test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
test_and_ack_op(STRIPE_OP_POSTXOR, pending);
test_and_ack_op(STRIPE_OP_CHECK, pending);
if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
ack++;
sh->ops.count -= ack;
if (unlikely(sh->ops.count < 0)) {
printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
"ops.complete: %#lx\n", pending, sh->ops.pending,
sh->ops.ack, sh->ops.complete);
BUG();
}
return pending;
}
static void static void
raid5_end_read_request(struct bio *bi, int error); raid5_end_read_request(struct bio *bi, int error);
static void static void
raid5_end_write_request(struct bio *bi, int error); raid5_end_write_request(struct bio *bi, int error);
static void ops_run_io(struct stripe_head *sh) static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{ {
raid5_conf_t *conf = sh->raid_conf; raid5_conf_t *conf = sh->raid_conf;
int i, disks = sh->disks; int i, disks = sh->disks;
 	might_sleep();
-	set_bit(STRIPE_IO_STARTED, &sh->state);
for (i = disks; i--; ) { for (i = disks; i--; ) {
int rw; int rw;
struct bio *bi; struct bio *bi;
...@@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh) ...@@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh)
rcu_read_unlock(); rcu_read_unlock();
if (rdev) { if (rdev) {
-			if (test_bit(STRIPE_SYNCING, &sh->state) ||
-			    test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
-			    test_bit(STRIPE_EXPAND_READY, &sh->state))
+			if (s->syncing || s->expanding || s->expanded)
 				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+
+			set_bit(STRIPE_IO_STARTED, &sh->state);
bi->bi_bdev = rdev->bdev; bi->bi_bdev = rdev->bdev;
pr_debug("%s: for %llu schedule op %ld on disc %d\n", pr_debug("%s: for %llu schedule op %ld on disc %d\n",
__func__, (unsigned long long)sh->sector, __func__, (unsigned long long)sh->sector,
...@@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref) ...@@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref)
(unsigned long long)sh->sector); (unsigned long long)sh->sector);
/* clear completed biofills */ /* clear completed biofills */
spin_lock_irq(&conf->device_lock);
for (i = sh->disks; i--; ) { for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
/* acknowledge completion of a biofill operation */ /* acknowledge completion of a biofill operation */
/* and check if we need to reply to a read request, /* and check if we need to reply to a read request,
* new R5_Wantfill requests are held off until * new R5_Wantfill requests are held off until
* !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) * !STRIPE_BIOFILL_RUN
*/ */
if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
struct bio *rbi, *rbi2; struct bio *rbi, *rbi2;
/* The access to dev->read is outside of the
* spin_lock_irq(&conf->device_lock), but is protected
* by the STRIPE_OP_BIOFILL pending bit
*/
BUG_ON(!dev->read); BUG_ON(!dev->read);
rbi = dev->read; rbi = dev->read;
dev->read = NULL; dev->read = NULL;
while (rbi && rbi->bi_sector < while (rbi && rbi->bi_sector <
dev->sector + STRIPE_SECTORS) { dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector); rbi2 = r5_next_bio(rbi, dev->sector);
spin_lock_irq(&conf->device_lock);
if (--rbi->bi_phys_segments == 0) { if (--rbi->bi_phys_segments == 0) {
rbi->bi_next = return_bi; rbi->bi_next = return_bi;
return_bi = rbi; return_bi = rbi;
} }
spin_unlock_irq(&conf->device_lock);
rbi = rbi2; rbi = rbi2;
} }
} }
} }
-	set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
+	spin_unlock_irq(&conf->device_lock);
+	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
return_io(return_bi); return_io(return_bi);
...@@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref) ...@@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref)
set_bit(R5_UPTODATE, &tgt->flags); set_bit(R5_UPTODATE, &tgt->flags);
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
clear_bit(R5_Wantcompute, &tgt->flags); clear_bit(R5_Wantcompute, &tgt->flags);
-	set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
+	if (sh->check_state == check_state_compute_run)
+		sh->check_state = check_state_compute_result;
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); release_stripe(sh);
} }
-static struct dma_async_tx_descriptor *
-ops_run_compute5(struct stripe_head *sh, unsigned long pending)
+static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
{ {
/* kernel stack size limits the total number of disks */ /* kernel stack size limits the total number of disks */
int disks = sh->disks; int disks = sh->disks;
...@@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) ...@@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
ASYNC_TX_XOR_ZERO_DST, NULL, ASYNC_TX_XOR_ZERO_DST, NULL,
ops_complete_compute5, sh); ops_complete_compute5, sh);
/* ack now if postxor is not set to be run */
if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
async_tx_ack(tx);
return tx; return tx;
} }
...@@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref) ...@@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref)
pr_debug("%s: stripe %llu\n", __func__, pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector); (unsigned long long)sh->sector);
set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
} }
static struct dma_async_tx_descriptor * static struct dma_async_tx_descriptor *
...@@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) ...@@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
for (i = disks; i--; ) { for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
/* Only process blocks that are known to be uptodate */ /* Only process blocks that are known to be uptodate */
if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) if (test_bit(R5_Wantdrain, &dev->flags))
xor_srcs[count++] = dev->page; xor_srcs[count++] = dev->page;
} }
...@@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) ...@@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
} }
 static struct dma_async_tx_descriptor *
-ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
-		 unsigned long pending)
+ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{ {
int disks = sh->disks; int disks = sh->disks;
int pd_idx = sh->pd_idx, i; int i;
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (Wantprexor)
*/
int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
pr_debug("%s: stripe %llu\n", __func__, pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector); (unsigned long long)sh->sector);
...@@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, ...@@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
for (i = disks; i--; ) { for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
struct bio *chosen; struct bio *chosen;
int towrite;
towrite = 0;
if (prexor) { /* rmw */
if (dev->towrite &&
test_bit(R5_Wantprexor, &dev->flags))
towrite = 1;
} else { /* rcw */
if (i != pd_idx && dev->towrite &&
test_bit(R5_LOCKED, &dev->flags))
towrite = 1;
}
if (towrite) { if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
struct bio *wbi; struct bio *wbi;
spin_lock(&sh->lock); spin_lock(&sh->lock);
...@@ -745,18 +679,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, ...@@ -745,18 +679,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
} }
static void ops_complete_postxor(void *stripe_head_ref) static void ops_complete_postxor(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
}
static void ops_complete_write(void *stripe_head_ref)
{ {
struct stripe_head *sh = stripe_head_ref; struct stripe_head *sh = stripe_head_ref;
int disks = sh->disks, i, pd_idx = sh->pd_idx; int disks = sh->disks, i, pd_idx = sh->pd_idx;
...@@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref) ...@@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref)
set_bit(R5_UPTODATE, &dev->flags); set_bit(R5_UPTODATE, &dev->flags);
} }
-	set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
-	set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+	if (sh->reconstruct_state == reconstruct_state_drain_run)
+		sh->reconstruct_state = reconstruct_state_drain_result;
+	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
+		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
+	else {
+		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
+		sh->reconstruct_state = reconstruct_state_result;
+	}
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); release_stripe(sh);
} }
 static void
-ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
-		unsigned long pending)
+ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{ {
/* kernel stack size limits the total number of disks */ /* kernel stack size limits the total number of disks */
int disks = sh->disks; int disks = sh->disks;
...@@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, ...@@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
int count = 0, pd_idx = sh->pd_idx, i; int count = 0, pd_idx = sh->pd_idx, i;
struct page *xor_dest; struct page *xor_dest;
int prexor = test_bit(STRIPE_OP_PREXOR, &pending); int prexor = 0;
unsigned long flags; unsigned long flags;
dma_async_tx_callback callback;
pr_debug("%s: stripe %llu\n", __func__, pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector); (unsigned long long)sh->sector);
...@@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, ...@@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
/* check if prexor is active which means only process blocks /* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written) * that are part of a read-modify-write (written)
*/ */
-	if (prexor) {
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		prexor = 1;
xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
for (i = disks; i--; ) { for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
...@@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, ...@@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
} }
} }
/* check whether this postxor is part of a write */
callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
ops_complete_write : ops_complete_postxor;
/* 1/ if we prexor'd then the dest is reused as a source /* 1/ if we prexor'd then the dest is reused as a source
* 2/ if we did not prexor then we are redoing the parity * 2/ if we did not prexor then we are redoing the parity
* set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
...@@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, ...@@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
if (unlikely(count == 1)) { if (unlikely(count == 1)) {
flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
flags, tx, callback, sh); flags, tx, ops_complete_postxor, sh);
} else } else
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
flags, tx, callback, sh); flags, tx, ops_complete_postxor, sh);
} }
static void ops_complete_check(void *stripe_head_ref) static void ops_complete_check(void *stripe_head_ref)
{ {
struct stripe_head *sh = stripe_head_ref; struct stripe_head *sh = stripe_head_ref;
-	int pd_idx = sh->pd_idx;

 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);

-	if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
-	    sh->ops.zero_sum_result == 0)
-		set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-	set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
+	sh->check_state = check_state_check_result;
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); release_stripe(sh);
} }
...@@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh) ...@@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh)
tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
&sh->ops.zero_sum_result, 0, NULL, NULL, NULL); &sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
if (tx)
set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
else
clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
atomic_inc(&sh->count); atomic_inc(&sh->count);
tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
ops_complete_check, sh); ops_complete_check, sh);
} }
static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
{ {
int overlap_clear = 0, i, disks = sh->disks; int overlap_clear = 0, i, disks = sh->disks;
struct dma_async_tx_descriptor *tx = NULL; struct dma_async_tx_descriptor *tx = NULL;
if (test_bit(STRIPE_OP_BIOFILL, &pending)) { if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
ops_run_biofill(sh); ops_run_biofill(sh);
overlap_clear++; overlap_clear++;
} }
-	if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
-		tx = ops_run_compute5(sh, pending);
+	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
+		tx = ops_run_compute5(sh);
+		/* terminate the chain if postxor is not set to be run */
+		if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
+			async_tx_ack(tx);
+	}
if (test_bit(STRIPE_OP_PREXOR, &pending)) if (test_bit(STRIPE_OP_PREXOR, &ops_request))
tx = ops_run_prexor(sh, tx); tx = ops_run_prexor(sh, tx);
if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
tx = ops_run_biodrain(sh, tx, pending); tx = ops_run_biodrain(sh, tx);
overlap_clear++; overlap_clear++;
} }
if (test_bit(STRIPE_OP_POSTXOR, &pending)) if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
ops_run_postxor(sh, tx, pending); ops_run_postxor(sh, tx);
if (test_bit(STRIPE_OP_CHECK, &pending)) if (test_bit(STRIPE_OP_CHECK, &ops_request))
ops_run_check(sh); ops_run_check(sh);
if (test_bit(STRIPE_OP_IO, &pending))
ops_run_io(sh);
if (overlap_clear) if (overlap_clear)
for (i = disks; i--; ) { for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
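Editor's note: the raid5 rework replaces the three-phase ops.pending/ops.ack/ops.complete bitmasks (and their sh->ops.count bookkeeping) with two explicit state machines on the stripe, check_state and reconstruct_state, plus a per-call ops_request bitmask that raid5_run_ops() consumes exactly once. A compact standalone model of the new shape (the enum names mirror the patch, but their ordering and the driver code are illustrative):

	#include <stdio.h>

	/* Model of the per-stripe state machines introduced by this commit. */
	enum check_states {
		check_state_idle = 0,
		check_state_run,		/* parity check in flight */
		check_state_check_result,	/* set by ops_complete_check() */
		check_state_compute_run,	/* repairing a block */
		check_state_compute_result,
	};

	enum reconstruct_states {
		reconstruct_state_idle = 0,
		reconstruct_state_run,			/* parity-only rebuild */
		reconstruct_state_drain_run,		/* rcw: drain then xor */
		reconstruct_state_drain_result,
		reconstruct_state_prexor_drain_run,	/* rmw: prexor, drain, xor */
		reconstruct_state_prexor_drain_result,
		reconstruct_state_result,
	};

	int main(void)
	{
		enum check_states cs = check_state_run;

		/* a completion callback advances the machine exactly once */
		cs = check_state_check_result;
		printf("check state: %d\n", cs);
		return 0;
	}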
...@@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) ...@@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
struct stripe_head *osh, *nsh; struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes); LIST_HEAD(newstripes);
struct disk_info *ndisks; struct disk_info *ndisks;
int err = 0; int err;
struct kmem_cache *sc; struct kmem_cache *sc;
int i; int i;
if (newsize <= conf->pool_size) if (newsize <= conf->pool_size)
return 0; /* never bother to shrink */ return 0; /* never bother to shrink */
-	md_allow_write(conf->mddev);
+	err = md_allow_write(conf->mddev);
+	if (err)
+		return err;
/* Step 1 */ /* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name], sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
...@@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) ...@@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
} }
} }
-static int
-handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
+static void
+schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
+			 int rcw, int expand)
{ {
int i, pd_idx = sh->pd_idx, disks = sh->disks; int i, pd_idx = sh->pd_idx, disks = sh->disks;
int locked = 0;
if (rcw) { if (rcw) {
/* if we are not expanding this is a proper write request, and /* if we are not expanding this is a proper write request, and
...@@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) ...@@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
* stripe cache * stripe cache
*/ */
 		if (!expand) {
-			set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
-			sh->ops.count++;
-		}
-		set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-		sh->ops.count++;
+			sh->reconstruct_state = reconstruct_state_drain_run;
+			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+		} else
+			sh->reconstruct_state = reconstruct_state_run;
+
+		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
for (i = disks; i--; ) { for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
if (dev->towrite) { if (dev->towrite) {
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantdrain, &dev->flags);
if (!expand) if (!expand)
clear_bit(R5_UPTODATE, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags);
locked++; s->locked++;
} }
} }
if (locked + 1 == disks) if (s->locked + 1 == disks)
if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
atomic_inc(&sh->raid_conf->pending_full_writes); atomic_inc(&sh->raid_conf->pending_full_writes);
} else { } else {
BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
-		set_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
-		set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
-		set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
-		sh->ops.count += 3;
+		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
+		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
+		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+		set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
for (i = disks; i--; ) { for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
if (i == pd_idx) if (i == pd_idx)
continue; continue;
/* For a read-modify write there may be blocks that are
* locked for reading while others are ready to be
* written so we distinguish these blocks by the
* R5_Wantprexor bit
*/
if (dev->towrite && if (dev->towrite &&
(test_bit(R5_UPTODATE, &dev->flags) || (test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) { test_bit(R5_Wantcompute, &dev->flags))) {
set_bit(R5_Wantprexor, &dev->flags); set_bit(R5_Wantdrain, &dev->flags);
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
clear_bit(R5_UPTODATE, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags);
locked++; s->locked++;
} }
} }
} }
...@@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) ...@@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
*/ */
set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
locked++; s->locked++;
pr_debug("%s: stripe %llu locked: %d pending: %lx\n", pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
__func__, (unsigned long long)sh->sector, __func__, (unsigned long long)sh->sector,
locked, sh->ops.pending); s->locked, s->ops_request);
return locked;
} }
/* /*
...@@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) ...@@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
} }
static void static void
handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks, struct stripe_head_state *s, int disks,
struct bio **return_bi) struct bio **return_bi)
{ {
...@@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, ...@@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
md_wakeup_thread(conf->mddev->thread); md_wakeup_thread(conf->mddev->thread);
} }
/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks /* fetch_block5 - checks the given member device to see if its data needs
* to process * to be read or computed to satisfy a request.
*
* Returns 1 when no more member devices need to be checked, otherwise returns
* 0 to tell the loop in handle_stripe_fill5 to continue
*/ */
static int __handle_issuing_new_read_requests5(struct stripe_head *sh, static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
struct stripe_head_state *s, int disk_idx, int disks) int disk_idx, int disks)
{ {
struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *dev = &sh->dev[disk_idx];
struct r5dev *failed_dev = &sh->dev[s->failed_num]; struct r5dev *failed_dev = &sh->dev[s->failed_num];
/* don't schedule compute operations or reads on the parity block while
* a check is in flight
*/
if ((disk_idx == sh->pd_idx) &&
test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
return ~0;
/* is the data in this block needed, and can we get it? */ /* is the data in this block needed, and can we get it? */
if (!test_bit(R5_LOCKED, &dev->flags) && if (!test_bit(R5_LOCKED, &dev->flags) &&
!test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || !test_bit(R5_UPTODATE, &dev->flags) &&
(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || (dev->toread ||
s->syncing || s->expanding || (s->failed && (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
(failed_dev->toread || (failed_dev->towrite && s->syncing || s->expanding ||
!test_bit(R5_OVERWRITE, &failed_dev->flags) (s->failed &&
))))) { (failed_dev->toread ||
/* 1/ We would like to get this block, possibly by computing it, (failed_dev->towrite &&
* but we might not be able to. !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
* /* We would like to get this block, possibly by computing it,
* 2/ Since parity check operations potentially make the parity * otherwise read it if the backing disk is insync
* block !uptodate it will need to be refreshed before any
* compute operations on data disks are scheduled.
*
* 3/ We hold off parity block re-reads until check operations
* have quiesced.
*/ */
if ((s->uptodate == disks - 1) && if ((s->uptodate == disks - 1) &&
(s->failed && disk_idx == s->failed_num) && (s->failed && disk_idx == s->failed_num)) {
!test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { set_bit(STRIPE_COMPUTE_RUN, &sh->state);
set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
set_bit(R5_Wantcompute, &dev->flags); set_bit(R5_Wantcompute, &dev->flags);
sh->ops.target = disk_idx; sh->ops.target = disk_idx;
s->req_compute = 1; s->req_compute = 1;
sh->ops.count++;
/* Careful: from this point on 'uptodate' is in the eye /* Careful: from this point on 'uptodate' is in the eye
* of raid5_run_ops which services 'compute' operations * of raid5_run_ops which services 'compute' operations
* before writes. R5_Wantcompute flags a block that will * before writes. R5_Wantcompute flags a block that will
...@@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, ...@@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
* subsequent operation. * subsequent operation.
*/ */
s->uptodate++; s->uptodate++;
return 0; /* uptodate + compute == disks */ return 1; /* uptodate + compute == disks */
} else if (test_bit(R5_Insync, &dev->flags)) { } else if (test_bit(R5_Insync, &dev->flags)) {
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags); set_bit(R5_Wantread, &dev->flags);
if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
s->locked++; s->locked++;
pr_debug("Reading block %d (sync=%d)\n", disk_idx, pr_debug("Reading block %d (sync=%d)\n", disk_idx,
s->syncing); s->syncing);
} }
} }
return ~0; return 0;
} }
static void handle_issuing_new_read_requests5(struct stripe_head *sh, /**
* handle_stripe_fill5 - read or compute data to satisfy pending requests.
*/
static void handle_stripe_fill5(struct stripe_head *sh,
struct stripe_head_state *s, int disks) struct stripe_head_state *s, int disks)
{ {
int i; int i;
/* Clear completed compute operations. Parity recovery
* (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
* later on in this routine
*/
if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
!test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
}
/* look for blocks to read/compute, skip this if a compute /* look for blocks to read/compute, skip this if a compute
* is already in flight, or if the stripe contents are in the * is already in flight, or if the stripe contents are in the
* midst of changing due to a write * midst of changing due to a write
*/ */
if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
!test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && !sh->reconstruct_state)
!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
for (i = disks; i--; ) for (i = disks; i--; )
if (__handle_issuing_new_read_requests5( if (fetch_block5(sh, s, i, disks))
sh, s, i, disks) == 0)
break; break;
}
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
} }
static void handle_issuing_new_read_requests6(struct stripe_head *sh, static void handle_stripe_fill6(struct stripe_head *sh,
struct stripe_head_state *s, struct r6_state *r6s, struct stripe_head_state *s, struct r6_state *r6s,
int disks) int disks)
{ {
...@@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, ...@@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
} }
/* handle_completed_write_requests /* handle_stripe_clean_event
* any written block on an uptodate or failed drive can be returned. * any written block on an uptodate or failed drive can be returned.
* Note that if we 'wrote' to a failed drive, it will be UPTODATE, but * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
* never LOCKED, so we don't need to test 'failed' directly. * never LOCKED, so we don't need to test 'failed' directly.
*/ */
static void handle_completed_write_requests(raid5_conf_t *conf, static void handle_stripe_clean_event(raid5_conf_t *conf,
struct stripe_head *sh, int disks, struct bio **return_bi) struct stripe_head *sh, int disks, struct bio **return_bi)
{ {
int i; int i;
...@@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf, ...@@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
md_wakeup_thread(conf->mddev->thread); md_wakeup_thread(conf->mddev->thread);
} }
static void handle_issuing_new_write_requests5(raid5_conf_t *conf, static void handle_stripe_dirtying5(raid5_conf_t *conf,
struct stripe_head *sh, struct stripe_head_state *s, int disks) struct stripe_head *sh, struct stripe_head_state *s, int disks)
{ {
int rmw = 0, rcw = 0, i; int rmw = 0, rcw = 0, i;
...@@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, ...@@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
"%d for r-m-w\n", i); "%d for r-m-w\n", i);
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags); set_bit(R5_Wantread, &dev->flags);
if (!test_and_set_bit(
STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
s->locked++; s->locked++;
} else { } else {
set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_DELAYED, &sh->state);
...@@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, ...@@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
"%d for Reconstruct\n", i); "%d for Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags); set_bit(R5_Wantread, &dev->flags);
if (!test_and_set_bit(
STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
s->locked++; s->locked++;
} else { } else {
set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_DELAYED, &sh->state);
...@@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, ...@@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
* simultaneously. If this is not the case then new writes need to be * simultaneously. If this is not the case then new writes need to be
* held off until the compute completes. * held off until the compute completes.
*/ */
if ((s->req_compute || if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && (s->locked == 0 && (rcw == 0 || rmw == 0) &&
(s->locked == 0 && (rcw == 0 || rmw == 0) && !test_bit(STRIPE_BIT_DELAY, &sh->state)))
!test_bit(STRIPE_BIT_DELAY, &sh->state))) schedule_reconstruction5(sh, s, rcw == 0, 0);
s->locked += handle_write_operations5(sh, rcw == 0, 0);
} }
static void handle_issuing_new_write_requests6(raid5_conf_t *conf, static void handle_stripe_dirtying6(raid5_conf_t *conf,
struct stripe_head *sh, struct stripe_head_state *s, struct stripe_head *sh, struct stripe_head_state *s,
struct r6_state *r6s, int disks) struct r6_state *r6s, int disks)
{ {
...@@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, ...@@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks) struct stripe_head_state *s, int disks)
{ {
int canceled_check = 0; struct r5dev *dev = NULL;
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
/* complete a check operation */ switch (sh->check_state) {
if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { case check_state_idle:
clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); /* start a new check operation if there are no failures */
clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
if (s->failed == 0) { if (s->failed == 0) {
if (sh->ops.zero_sum_result == 0)
/* parity is correct (on disc,
* not in buffer any more)
*/
set_bit(STRIPE_INSYNC, &sh->state);
else {
conf->mddev->resync_mismatches +=
STRIPE_SECTORS;
if (test_bit(
MD_RECOVERY_CHECK, &conf->mddev->recovery))
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
else {
set_bit(STRIPE_OP_COMPUTE_BLK,
&sh->ops.pending);
set_bit(STRIPE_OP_MOD_REPAIR_PD,
&sh->ops.pending);
set_bit(R5_Wantcompute,
&sh->dev[sh->pd_idx].flags);
sh->ops.target = sh->pd_idx;
sh->ops.count++;
s->uptodate++;
}
}
} else
canceled_check = 1; /* STRIPE_INSYNC is not set */
}
/* start a new check operation if there are no failures, the stripe is
* not insync, and a repair is not in flight
*/
if (s->failed == 0 &&
!test_bit(STRIPE_INSYNC, &sh->state) &&
!test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
BUG_ON(s->uptodate != disks); BUG_ON(s->uptodate != disks);
sh->check_state = check_state_run;
set_bit(STRIPE_OP_CHECK, &s->ops_request);
clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
sh->ops.count++;
s->uptodate--; s->uptodate--;
break;
} }
} dev = &sh->dev[s->failed_num];
/* fall through */
/* check if we can clear a parity disk reconstruct */ case check_state_compute_result:
if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && sh->check_state = check_state_idle;
test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { if (!dev)
dev = &sh->dev[sh->pd_idx];
clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); /* check that a write has not made the stripe insync */
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); if (test_bit(STRIPE_INSYNC, &sh->state))
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); break;
}
/* Wait for check parity and compute block operations to complete
* before write-back. If a failure occurred while the check operation
* was in flight we need to cycle this stripe through handle_stripe
* since the parity block may not be uptodate
*/
if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
!test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
struct r5dev *dev;
/* either failed parity check, or recovery is happening */ /* either failed parity check, or recovery is happening */
if (s->failed == 0)
s->failed_num = sh->pd_idx;
dev = &sh->dev[s->failed_num];
BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
BUG_ON(s->uptodate != disks); BUG_ON(s->uptodate != disks);
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
s->locked++;
set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_Wantwrite, &dev->flags);
if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
clear_bit(STRIPE_DEGRADED, &sh->state); clear_bit(STRIPE_DEGRADED, &sh->state);
s->locked++;
set_bit(STRIPE_INSYNC, &sh->state); set_bit(STRIPE_INSYNC, &sh->state);
break;
case check_state_run:
break; /* we will be called again upon completion */
case check_state_check_result:
sh->check_state = check_state_idle;
/* if a failure occurred during the check operation, leave
* STRIPE_INSYNC not set and let the stripe be handled again
*/
if (s->failed)
break;
/* handle a successful check operation, if parity is correct
* we are done. Otherwise update the mismatch count and repair
* parity if !MD_RECOVERY_CHECK
*/
if (sh->ops.zero_sum_result == 0)
/* parity is correct (on disc,
* not in buffer any more)
*/
set_bit(STRIPE_INSYNC, &sh->state);
else {
conf->mddev->resync_mismatches += STRIPE_SECTORS;
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
else {
sh->check_state = check_state_compute_run;
set_bit(STRIPE_COMPUTE_RUN, &sh->state);
set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
set_bit(R5_Wantcompute,
&sh->dev[sh->pd_idx].flags);
sh->ops.target = sh->pd_idx;
s->uptodate++;
}
}
break;
case check_state_compute_run:
break;
default:
printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
__func__, sh->check_state,
(unsigned long long) sh->sector);
BUG();
} }
} }
...@@ -2641,15 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2641,15 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh)
struct bio *return_bi = NULL; struct bio *return_bi = NULL;
struct stripe_head_state s; struct stripe_head_state s;
struct r5dev *dev; struct r5dev *dev;
unsigned long pending = 0;
mdk_rdev_t *blocked_rdev = NULL; mdk_rdev_t *blocked_rdev = NULL;
int prexor; int prexor;
memset(&s, 0, sizeof(s)); memset(&s, 0, sizeof(s));
pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
"ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
atomic_read(&sh->count), sh->pd_idx, atomic_read(&sh->count), sh->pd_idx, sh->check_state,
sh->ops.pending, sh->ops.ack, sh->ops.complete); sh->reconstruct_state);
spin_lock(&sh->lock); spin_lock(&sh->lock);
clear_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_HANDLE, &sh->state);
...@@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh)
s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
/* Now to look around and see what can be done */
/* clean-up completed biofill operations */
if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
}
/* Now to look around and see what can be done */
rcu_read_lock(); rcu_read_lock();
for (i=disks; i--; ) { for (i=disks; i--; ) {
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
...@@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh)
/* maybe we can request a biofill operation /* maybe we can request a biofill operation
* *
* new wantfill requests are only permitted while * new wantfill requests are only permitted while
* STRIPE_OP_BIOFILL is clear * ops_complete_biofill is guaranteed to be inactive
*/ */
if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
set_bit(R5_Wantfill, &dev->flags); set_bit(R5_Wantfill, &dev->flags);
/* now count some things */ /* now count some things */
...@@ -2727,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2727,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh)
goto unlock; goto unlock;
} }
if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
sh->ops.count++; set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
set_bit(STRIPE_BIOFILL_RUN, &sh->state);
}
pr_debug("locked=%d uptodate=%d to_read=%d" pr_debug("locked=%d uptodate=%d to_read=%d"
" to_write=%d failed=%d failed_num=%d\n", " to_write=%d failed=%d failed_num=%d\n",
...@@ -2738,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2738,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh)
* need to be failed * need to be failed
*/ */
if (s.failed > 1 && s.to_read+s.to_write+s.written) if (s.failed > 1 && s.to_read+s.to_write+s.written)
handle_requests_to_failed_array(conf, sh, &s, disks, handle_failed_stripe(conf, sh, &s, disks, &return_bi);
&return_bi);
if (s.failed > 1 && s.syncing) { if (s.failed > 1 && s.syncing) {
md_done_sync(conf->mddev, STRIPE_SECTORS,0); md_done_sync(conf->mddev, STRIPE_SECTORS,0);
clear_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_SYNCING, &sh->state);
...@@ -2755,48 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2755,48 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh)
!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
test_bit(R5_UPTODATE, &dev->flags)) || test_bit(R5_UPTODATE, &dev->flags)) ||
(s.failed == 1 && s.failed_num == sh->pd_idx))) (s.failed == 1 && s.failed_num == sh->pd_idx)))
handle_completed_write_requests(conf, sh, disks, &return_bi); handle_stripe_clean_event(conf, sh, disks, &return_bi);
/* Now we might consider reading some blocks, either to check/generate /* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests * parity, or to satisfy requests
* or to load a block that is being partially written. * or to load a block that is being partially written.
*/ */
if (s.to_read || s.non_overwrite || if (s.to_read || s.non_overwrite ||
(s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) handle_stripe_fill5(sh, &s, disks);
handle_issuing_new_read_requests5(sh, &s, disks);
/* Now we check to see if any write operations have recently /* Now we check to see if any write operations have recently
* completed * completed
*/ */
/* leave prexor set until postxor is done, allows us to distinguish
* a rmw from a rcw during biodrain
*/
prexor = 0; prexor = 0;
if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
prexor = 1; prexor = 1;
clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); if (sh->reconstruct_state == reconstruct_state_drain_result ||
clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); sh->reconstruct_state = reconstruct_state_idle;
for (i = disks; i--; )
clear_bit(R5_Wantprexor, &sh->dev[i].flags);
}
/* if only POSTXOR is set then this is an 'expand' postxor */
if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
/* All the 'written' buffers and the parity block are ready to /* All the 'written' buffers and the parity block are ready to
* be written back to disk * be written back to disk
...@@ -2808,9 +2651,6 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2808,9 +2651,6 @@ static void handle_stripe5(struct stripe_head *sh)
(i == sh->pd_idx || dev->written)) { (i == sh->pd_idx || dev->written)) {
pr_debug("Writing block %d\n", i); pr_debug("Writing block %d\n", i);
set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_Wantwrite, &dev->flags);
if (!test_and_set_bit(
STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
if (prexor) if (prexor)
continue; continue;
if (!test_bit(R5_Insync, &dev->flags) || if (!test_bit(R5_Insync, &dev->flags) ||
...@@ -2832,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2832,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh)
* 2/ A 'check' operation is in flight, as it may clobber the parity * 2/ A 'check' operation is in flight, as it may clobber the parity
* block. * block.
*/ */
if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && if (s.to_write && !sh->reconstruct_state && !sh->check_state)
!test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) handle_stripe_dirtying5(conf, sh, &s, disks);
handle_issuing_new_write_requests5(conf, sh, &s, disks);
/* maybe we need to check and possibly fix the parity for this stripe /* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough * Any reads will already have been scheduled, so we just see if enough
* data is available. The parity check is held off while parity * data is available. The parity check is held off while parity
* dependent operations are in flight. * dependent operations are in flight.
*/ */
if ((s.syncing && s.locked == 0 && if (sh->check_state ||
!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && (s.syncing && s.locked == 0 &&
!test_bit(STRIPE_INSYNC, &sh->state)) || !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || !test_bit(STRIPE_INSYNC, &sh->state)))
test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
handle_parity_checks5(conf, sh, &s, disks); handle_parity_checks5(conf, sh, &s, disks);
if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
...@@ -2864,52 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2864,52 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh)
dev = &sh->dev[s.failed_num]; dev = &sh->dev[s.failed_num];
if (!test_bit(R5_ReWrite, &dev->flags)) { if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_Wantwrite, &dev->flags);
if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
set_bit(R5_ReWrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags);
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
s.locked++; s.locked++;
} else { } else {
/* let's read it back */ /* let's read it back */
set_bit(R5_Wantread, &dev->flags); set_bit(R5_Wantread, &dev->flags);
if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
s.locked++; s.locked++;
} }
} }
/* Finish postxor operations initiated by the expansion /* Finish reconstruct operations initiated by the expansion process */
* process if (sh->reconstruct_state == reconstruct_state_result) {
*/ sh->reconstruct_state = reconstruct_state_idle;
if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
!test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
clear_bit(STRIPE_EXPANDING, &sh->state); clear_bit(STRIPE_EXPANDING, &sh->state);
for (i = conf->raid_disks; i--; )
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
for (i = conf->raid_disks; i--; ) {
set_bit(R5_Wantwrite, &sh->dev[i].flags); set_bit(R5_Wantwrite, &sh->dev[i].flags);
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
s.locked++; s.locked++;
if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
}
} }
if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { !sh->reconstruct_state) {
/* Need to write out all blocks after computing parity */ /* Need to write out all blocks after computing parity */
sh->disks = conf->raid_disks; sh->disks = conf->raid_disks;
sh->pd_idx = stripe_to_pdidx(sh->sector, conf, sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
conf->raid_disks); conf->raid_disks);
s.locked += handle_write_operations5(sh, 1, 1); schedule_reconstruction5(sh, &s, 1, 1);
} else if (s.expanded && } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
s.locked == 0 &&
!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
clear_bit(STRIPE_EXPAND_READY, &sh->state); clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes); atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap); wake_up(&conf->wait_for_overlap);
...@@ -2917,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2917,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh)
} }
if (s.expanding && s.locked == 0 && if (s.expanding && s.locked == 0 &&
!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
handle_stripe_expansion(conf, sh, NULL); handle_stripe_expansion(conf, sh, NULL);
if (sh->ops.count)
pending = get_stripe_work(sh);
unlock: unlock:
spin_unlock(&sh->lock); spin_unlock(&sh->lock);
...@@ -2930,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -2930,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh)
if (unlikely(blocked_rdev)) if (unlikely(blocked_rdev))
md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
if (pending) if (s.ops_request)
raid5_run_ops(sh, pending); raid5_run_ops(sh, s.ops_request);
return_io(return_bi); ops_run_io(sh, &s);
return_io(return_bi);
} }
static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
...@@ -3042,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ...@@ -3042,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
* might need to be failed * might need to be failed
*/ */
if (s.failed > 2 && s.to_read+s.to_write+s.written) if (s.failed > 2 && s.to_read+s.to_write+s.written)
handle_requests_to_failed_array(conf, sh, &s, disks, handle_failed_stripe(conf, sh, &s, disks, &return_bi);
&return_bi);
if (s.failed > 2 && s.syncing) { if (s.failed > 2 && s.syncing) {
md_done_sync(conf->mddev, STRIPE_SECTORS,0); md_done_sync(conf->mddev, STRIPE_SECTORS,0);
clear_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_SYNCING, &sh->state);
...@@ -3068,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ...@@ -3068,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
&& !test_bit(R5_LOCKED, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags)
&& test_bit(R5_UPTODATE, &qdev->flags))))) && test_bit(R5_UPTODATE, &qdev->flags)))))
handle_completed_write_requests(conf, sh, disks, &return_bi); handle_stripe_clean_event(conf, sh, disks, &return_bi);
/* Now we might consider reading some blocks, either to check/generate /* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests * parity, or to satisfy requests
...@@ -3076,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ...@@ -3076,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
*/ */
if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
(s.syncing && (s.uptodate < disks)) || s.expanding) (s.syncing && (s.uptodate < disks)) || s.expanding)
handle_issuing_new_read_requests6(sh, &s, &r6s, disks); handle_stripe_fill6(sh, &s, &r6s, disks);
/* now to consider writing and what else, if anything should be read */ /* now to consider writing and what else, if anything should be read */
if (s.to_write) if (s.to_write)
handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
/* maybe we need to check and possibly fix the parity for this stripe /* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough * Any reads will already have been scheduled, so we just see if enough
...@@ -3136,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ...@@ -3136,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
} }
if (s.expanding && s.locked == 0 && if (s.expanding && s.locked == 0 &&
!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
handle_stripe_expansion(conf, sh, &r6s); handle_stripe_expansion(conf, sh, &r6s);
unlock: unlock:
...@@ -3146,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ...@@ -3146,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
if (unlikely(blocked_rdev)) if (unlikely(blocked_rdev))
md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
return_io(return_bi); ops_run_io(sh, &s);
for (i=disks; i-- ;) {
int rw;
struct bio *bi;
mdk_rdev_t *rdev;
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
rw = WRITE;
else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ;
else
continue;
set_bit(STRIPE_IO_STARTED, &sh->state);
bi = &sh->dev[i].req;
bi->bi_rw = rw;
if (rw == WRITE)
bi->bi_end_io = raid5_end_write_request;
else
bi->bi_end_io = raid5_end_read_request;
rcu_read_lock();
rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev)
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
if (rdev) { return_io(return_bi);
if (s.syncing || s.expanding || s.expanded)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
bi->bi_bdev = rdev->bdev;
pr_debug("for %llu schedule op %ld on disc %d\n",
(unsigned long long)sh->sector, bi->bi_rw, i);
atomic_inc(&sh->count);
bi->bi_sector = sh->sector + rdev->data_offset;
bi->bi_flags = 1 << BIO_UPTODATE;
bi->bi_vcnt = 1;
bi->bi_max_vecs = 1;
bi->bi_idx = 0;
bi->bi_io_vec = &sh->dev[i].vec;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
bi->bi_size = STRIPE_SIZE;
bi->bi_next = NULL;
if (rw == WRITE &&
test_bit(R5_ReWrite, &sh->dev[i].flags))
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
generic_make_request(bi);
} else {
if (rw == WRITE)
set_bit(STRIPE_DEGRADED, &sh->state);
pr_debug("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
}
}
} }
static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
...@@ -3697,9 +3456,7 @@ static int make_request(struct request_queue *q, struct bio * bi) ...@@ -3697,9 +3456,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
if ( rw == WRITE ) if ( rw == WRITE )
md_write_end(mddev); md_write_end(mddev);
bi->bi_end_io(bi, bio_endio(bi, 0);
test_bit(BIO_UPTODATE, &bi->bi_flags)
? 0 : -EIO);
} }
return 0; return 0;
} }
...@@ -3785,7 +3542,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped ...@@ -3785,7 +3542,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
j == raid6_next_disk(sh->pd_idx, sh->disks)) j == raid6_next_disk(sh->pd_idx, sh->disks))
continue; continue;
s = compute_blocknr(sh, j); s = compute_blocknr(sh, j);
if (s < (mddev->array_size<<1)) { if (s < mddev->array_sectors) {
skipped = 1; skipped = 1;
continue; continue;
} }
...@@ -4002,12 +3759,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) ...@@ -4002,12 +3759,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
remaining = --raid_bio->bi_phys_segments; remaining = --raid_bio->bi_phys_segments;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
if (remaining == 0) { if (remaining == 0)
bio_endio(raid_bio, 0);
raid_bio->bi_end_io(raid_bio,
test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
? 0 : -EIO);
}
if (atomic_dec_and_test(&conf->active_aligned_reads)) if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_stripe); wake_up(&conf->wait_for_stripe);
return handled; return handled;
...@@ -4094,6 +3847,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) ...@@ -4094,6 +3847,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
{ {
raid5_conf_t *conf = mddev_to_conf(mddev); raid5_conf_t *conf = mddev_to_conf(mddev);
unsigned long new; unsigned long new;
int err;
if (len >= PAGE_SIZE) if (len >= PAGE_SIZE)
return -EINVAL; return -EINVAL;
if (!conf) if (!conf)
...@@ -4109,7 +3864,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) ...@@ -4109,7 +3864,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
else else
break; break;
} }
md_allow_write(mddev); err = md_allow_write(mddev);
if (err)
return err;
while (new > conf->max_nr_stripes) { while (new > conf->max_nr_stripes) {
if (grow_one_stripe(conf)) if (grow_one_stripe(conf))
conf->max_nr_stripes++; conf->max_nr_stripes++;
...@@ -4434,7 +4191,7 @@ static int run(mddev_t *mddev) ...@@ -4434,7 +4191,7 @@ static int run(mddev_t *mddev)
mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_data = mddev;
mddev->queue->backing_dev_info.congested_fn = raid5_congested; mddev->queue->backing_dev_info.congested_fn = raid5_congested;
mddev->array_size = mddev->size * (conf->previous_raid_disks - mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
conf->max_degraded); conf->max_degraded);
blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
...@@ -4609,35 +4366,41 @@ static int raid5_remove_disk(mddev_t *mddev, int number) ...@@ -4609,35 +4366,41 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{ {
raid5_conf_t *conf = mddev->private; raid5_conf_t *conf = mddev->private;
int found = 0; int err = -EEXIST;
int disk; int disk;
struct disk_info *p; struct disk_info *p;
int first = 0;
int last = conf->raid_disks - 1;
if (mddev->degraded > conf->max_degraded) if (mddev->degraded > conf->max_degraded)
/* no point adding a device */ /* no point adding a device */
return 0; return -EINVAL;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
/* /*
* find the disk ... but prefer rdev->saved_raid_disk * find the disk ... but prefer rdev->saved_raid_disk
* if possible. * if possible.
*/ */
if (rdev->saved_raid_disk >= 0 && if (rdev->saved_raid_disk >= 0 &&
rdev->saved_raid_disk >= first &&
conf->disks[rdev->saved_raid_disk].rdev == NULL) conf->disks[rdev->saved_raid_disk].rdev == NULL)
disk = rdev->saved_raid_disk; disk = rdev->saved_raid_disk;
else else
disk = 0; disk = first;
for ( ; disk < conf->raid_disks; disk++) for ( ; disk <= last ; disk++)
if ((p=conf->disks + disk)->rdev == NULL) { if ((p=conf->disks + disk)->rdev == NULL) {
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
rdev->raid_disk = disk; rdev->raid_disk = disk;
found = 1; err = 0;
if (rdev->saved_raid_disk != disk) if (rdev->saved_raid_disk != disk)
conf->fullsync = 1; conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev); rcu_assign_pointer(p->rdev, rdev);
break; break;
} }
print_raid5_conf(conf); print_raid5_conf(conf);
return found; return err;
} }
static int raid5_resize(mddev_t *mddev, sector_t sectors) static int raid5_resize(mddev_t *mddev, sector_t sectors)
...@@ -4652,8 +4415,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) ...@@ -4652,8 +4415,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
raid5_conf_t *conf = mddev_to_conf(mddev); raid5_conf_t *conf = mddev_to_conf(mddev);
sectors &= ~((sector_t)mddev->chunk_size/512 - 1); sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; mddev->array_sectors = sectors * (mddev->raid_disks
set_capacity(mddev->gendisk, mddev->array_size << 1); - conf->max_degraded);
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1; mddev->changed = 1;
if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->size << 1; mddev->recovery_cp = mddev->size << 1;
...@@ -4738,7 +4502,7 @@ static int raid5_start_reshape(mddev_t *mddev) ...@@ -4738,7 +4502,7 @@ static int raid5_start_reshape(mddev_t *mddev)
rdev_for_each(rdev, rtmp, mddev) rdev_for_each(rdev, rtmp, mddev)
if (rdev->raid_disk < 0 && if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) { !test_bit(Faulty, &rdev->flags)) {
if (raid5_add_disk(mddev, rdev)) { if (raid5_add_disk(mddev, rdev) == 0) {
char nm[20]; char nm[20];
set_bit(In_sync, &rdev->flags); set_bit(In_sync, &rdev->flags);
added_devices++; added_devices++;
...@@ -4786,15 +4550,16 @@ static void end_reshape(raid5_conf_t *conf) ...@@ -4786,15 +4550,16 @@ static void end_reshape(raid5_conf_t *conf)
struct block_device *bdev; struct block_device *bdev;
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
conf->mddev->array_size = conf->mddev->size * conf->mddev->array_sectors = 2 * conf->mddev->size *
(conf->raid_disks - conf->max_degraded); (conf->raid_disks - conf->max_degraded);
set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
conf->mddev->changed = 1; conf->mddev->changed = 1;
bdev = bdget_disk(conf->mddev->gendisk, 0); bdev = bdget_disk(conf->mddev->gendisk, 0);
if (bdev) { if (bdev) {
mutex_lock(&bdev->bd_inode->i_mutex); mutex_lock(&bdev->bd_inode->i_mutex);
i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10); i_size_write(bdev->bd_inode,
(loff_t)conf->mddev->array_sectors << 9);
mutex_unlock(&bdev->bd_inode->i_mutex); mutex_unlock(&bdev->bd_inode->i_mutex);
bdput(bdev); bdput(bdev);
} }
......
...@@ -221,6 +221,7 @@ struct bitmap { ...@@ -221,6 +221,7 @@ struct bitmap {
unsigned long syncchunk; unsigned long syncchunk;
__u64 events_cleared; __u64 events_cleared;
int need_sync;
/* bitmap spinlock */ /* bitmap spinlock */
spinlock_t lock; spinlock_t lock;
......
...@@ -16,7 +16,7 @@ struct linear_private_data ...@@ -16,7 +16,7 @@ struct linear_private_data
struct linear_private_data *prev; /* earlier version */ struct linear_private_data *prev; /* earlier version */
dev_info_t **hash_table; dev_info_t **hash_table;
sector_t hash_spacing; sector_t hash_spacing;
sector_t array_size; sector_t array_sectors;
int preshift; /* shift before dividing by hash_spacing */ int preshift; /* shift before dividing by hash_spacing */
dev_info_t disks[0]; dev_info_t disks[0];
}; };
......
...@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, ...@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
struct page *page, int rw); struct page *page, int rw);
extern void md_do_sync(mddev_t *mddev); extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev); extern void md_new_event(mddev_t *mddev);
extern void md_allow_write(mddev_t *mddev); extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
#endif /* CONFIG_MD */ #endif /* CONFIG_MD */
......
...@@ -59,7 +59,7 @@ struct mdk_rdev_s ...@@ -59,7 +59,7 @@ struct mdk_rdev_s
int sb_loaded; int sb_loaded;
__u64 sb_events; __u64 sb_events;
sector_t data_offset; /* start of data in array */ sector_t data_offset; /* start of data in array */
sector_t sb_offset; sector_t sb_start; /* offset of the super block (in 512byte sectors) */
int sb_size; /* bytes in the superblock */ int sb_size; /* bytes in the superblock */
int preferred_minor; /* autorun support */ int preferred_minor; /* autorun support */
...@@ -87,6 +87,9 @@ struct mdk_rdev_s ...@@ -87,6 +87,9 @@ struct mdk_rdev_s
#define Blocked 8 /* An error occured on an externally #define Blocked 8 /* An error occured on an externally
* managed array, don't allow writes * managed array, don't allow writes
* until it is cleared */ * until it is cleared */
#define StateChanged 9 /* Faulty or Blocked has changed during
* interrupt, so it needs to be
* notified by the thread */
wait_queue_head_t blocked_wait; wait_queue_head_t blocked_wait;
int desc_nr; /* descriptor index in the superblock */ int desc_nr; /* descriptor index in the superblock */
...@@ -147,7 +150,7 @@ struct mddev_s ...@@ -147,7 +150,7 @@ struct mddev_s
int raid_disks; int raid_disks;
int max_disks; int max_disks;
sector_t size; /* used size of component devices */ sector_t size; /* used size of component devices */
sector_t array_size; /* exported array size */ sector_t array_sectors; /* exported array size */
__u64 events; __u64 events;
char uuid[16]; char uuid[16];
...@@ -188,6 +191,7 @@ struct mddev_s ...@@ -188,6 +191,7 @@ struct mddev_s
* NEEDED: we might need to start a resync/recover * NEEDED: we might need to start a resync/recover
* RUNNING: a thread is running, or about to be started * RUNNING: a thread is running, or about to be started
* SYNC: actually doing a resync, not a recovery * SYNC: actually doing a resync, not a recovery
* RECOVER: doing recovery, or need to try it.
* INTR: resync needs to be aborted for some reason * INTR: resync needs to be aborted for some reason
* DONE: thread is done and is waiting to be reaped * DONE: thread is done and is waiting to be reaped
* REQUEST: user-space has requested a sync (used with SYNC) * REQUEST: user-space has requested a sync (used with SYNC)
...@@ -198,6 +202,7 @@ struct mddev_s ...@@ -198,6 +202,7 @@ struct mddev_s
*/ */
#define MD_RECOVERY_RUNNING 0 #define MD_RECOVERY_RUNNING 0
#define MD_RECOVERY_SYNC 1 #define MD_RECOVERY_SYNC 1
#define MD_RECOVERY_RECOVER 2
#define MD_RECOVERY_INTR 3 #define MD_RECOVERY_INTR 3
#define MD_RECOVERY_DONE 4 #define MD_RECOVERY_DONE 4
#define MD_RECOVERY_NEEDED 5 #define MD_RECOVERY_NEEDED 5
...@@ -210,7 +215,8 @@ struct mddev_s ...@@ -210,7 +215,8 @@ struct mddev_s
int in_sync; /* know to not need resync */ int in_sync; /* know to not need resync */
struct mutex reconfig_mutex; struct mutex reconfig_mutex;
atomic_t active; atomic_t active; /* general refcount */
atomic_t openers; /* number of active opens */
int changed; /* true if we might need to reread partition info */ int changed; /* true if we might need to reread partition info */
int degraded; /* whether md should consider int degraded; /* whether md should consider
...@@ -227,6 +233,8 @@ struct mddev_s ...@@ -227,6 +233,8 @@ struct mddev_s
atomic_t recovery_active; /* blocks scheduled, but not written */ atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait; wait_queue_head_t recovery_wait;
sector_t recovery_cp; sector_t recovery_cp;
sector_t resync_min; /* user requested sync
* starts here */
sector_t resync_max; /* resync should pause sector_t resync_max; /* resync should pause
* when it gets here */ * when it gets here */
...@@ -331,6 +339,9 @@ static inline char * mdname (mddev_t * mddev) ...@@ -331,6 +339,9 @@ static inline char * mdname (mddev_t * mddev)
#define rdev_for_each(rdev, tmp, mddev) \ #define rdev_for_each(rdev, tmp, mddev) \
rdev_for_each_list(rdev, tmp, (mddev)->disks) rdev_for_each_list(rdev, tmp, (mddev)->disks)
#define rdev_for_each_rcu(rdev, mddev) \
list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
typedef struct mdk_thread_s { typedef struct mdk_thread_s {
void (*run) (mddev_t *mddev); void (*run) (mddev_t *mddev);
mddev_t *mddev; mddev_t *mddev;
......
...@@ -43,14 +43,11 @@ ...@@ -43,14 +43,11 @@
*/ */
#define MD_RESERVED_BYTES (64 * 1024) #define MD_RESERVED_BYTES (64 * 1024)
#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) #define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) #define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
#define MD_SB_BYTES 4096 #define MD_SB_BYTES 4096
#define MD_SB_WORDS (MD_SB_BYTES / 4) #define MD_SB_WORDS (MD_SB_BYTES / 4)
#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
#define MD_SB_SECTORS (MD_SB_BYTES / 512) #define MD_SB_SECTORS (MD_SB_BYTES / 512)
/* /*
......
...@@ -158,6 +158,43 @@ ...@@ -158,6 +158,43 @@
* the compute block completes. * the compute block completes.
*/ */
/*
* Operations state - intermediate states that are visible outside of sh->lock
* In general _idle indicates nothing is running, _run indicates a data
* processing operation is active, and _result means the data processing result
* is stable and can be acted upon. For simple operations like biofill and
* compute that only have an _idle and _run state they are indicated with
* sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
*/
/**
* enum check_states - handles syncing / repairing a stripe
* @check_state_idle - check operations are quiesced
* @check_state_run - check operation is running
* @check_state_result - set outside lock when check result is valid
* @check_state_compute_run - check failed and we are repairing
* @check_state_compute_result - set outside lock when compute result is valid
*/
enum check_states {
check_state_idle = 0,
check_state_run, /* parity check */
check_state_check_result,
check_state_compute_run, /* parity repair */
check_state_compute_result,
};
/**
* enum reconstruct_states - handles writing or expanding a stripe
*/
enum reconstruct_states {
reconstruct_state_idle = 0,
reconstruct_state_prexor_drain_run, /* prexor-write */
reconstruct_state_drain_run, /* write */
reconstruct_state_run, /* expand */
reconstruct_state_prexor_drain_result,
reconstruct_state_drain_result,
reconstruct_state_result,
};
struct stripe_head { struct stripe_head {
struct hlist_node hash; struct hlist_node hash;
struct list_head lru; /* inactive_list or handle_list */ struct list_head lru; /* inactive_list or handle_list */
...@@ -169,19 +206,13 @@ struct stripe_head { ...@@ -169,19 +206,13 @@ struct stripe_head {
spinlock_t lock; spinlock_t lock;
int bm_seq; /* sequence number for bitmap flushes */ int bm_seq; /* sequence number for bitmap flushes */
int disks; /* disks in stripe */ int disks; /* disks in stripe */
enum check_states check_state;
enum reconstruct_states reconstruct_state;
/* stripe_operations /* stripe_operations
* @pending - pending ops flags (set for request->issue->complete)
* @ack - submitted ops flags (set for issue->complete)
* @complete - completed ops flags (set for complete)
* @target - STRIPE_OP_COMPUTE_BLK target * @target - STRIPE_OP_COMPUTE_BLK target
* @count - raid5_runs_ops is set to run when this is non-zero
*/ */
struct stripe_operations { struct stripe_operations {
unsigned long pending;
unsigned long ack;
unsigned long complete;
int target; int target;
int count;
u32 zero_sum_result; u32 zero_sum_result;
} ops; } ops;
struct r5dev { struct r5dev {
...@@ -202,6 +233,7 @@ struct stripe_head_state { ...@@ -202,6 +233,7 @@ struct stripe_head_state {
int locked, uptodate, to_read, to_write, failed, written; int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite; int to_fill, compute, req_compute, non_overwrite;
int failed_num; int failed_num;
unsigned long ops_request;
}; };
/* r6_state - extra state data only relevant to r6 */ /* r6_state - extra state data only relevant to r6 */
...@@ -228,9 +260,7 @@ struct r6_state { ...@@ -228,9 +260,7 @@ struct r6_state {
#define R5_Wantfill 12 /* dev->toread contains a bio that needs #define R5_Wantfill 12 /* dev->toread contains a bio that needs
* filling * filling
*/ */
#define R5_Wantprexor 13 /* distinguish blocks ready for rmw from #define R5_Wantdrain 13 /* dev->towrite needs to be drained */
* other "towrites"
*/
/* /*
* Write method * Write method
*/ */
...@@ -254,8 +284,10 @@ struct r6_state { ...@@ -254,8 +284,10 @@ struct r6_state {
#define STRIPE_EXPAND_READY 11 #define STRIPE_EXPAND_READY 11
#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */ #define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */ #define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
#define STRIPE_BIOFILL_RUN 14
#define STRIPE_COMPUTE_RUN 15
/* /*
* Operations flags (in issue order) * Operation request flags
*/ */
#define STRIPE_OP_BIOFILL 0 #define STRIPE_OP_BIOFILL 0
#define STRIPE_OP_COMPUTE_BLK 1 #define STRIPE_OP_COMPUTE_BLK 1
...@@ -263,14 +295,6 @@ struct r6_state { ...@@ -263,14 +295,6 @@ struct r6_state {
#define STRIPE_OP_BIODRAIN 3 #define STRIPE_OP_BIODRAIN 3
#define STRIPE_OP_POSTXOR 4 #define STRIPE_OP_POSTXOR 4
#define STRIPE_OP_CHECK 5 #define STRIPE_OP_CHECK 5
#define STRIPE_OP_IO 6
/* modifiers to the base operations
* STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
* STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check
*/
#define STRIPE_OP_MOD_REPAIR_PD 7
#define STRIPE_OP_MOD_DMA_CHECK 8
/* /*
* Plugging: * Plugging:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment