Commit 8a392625 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md: (52 commits)
  md: Protect access to mddev->disks list using RCU
  md: only count actual openers as access which prevent a 'stop'
  md: linear: Make array_size sector-based and rename it to array_sectors.
  md: Make mddev->array_size sector-based.
  md: Make super_type->rdev_size_change() take sector-based sizes.
  md: Fix check for overlapping devices.
  md: Tidy up rdev_size_store a bit:
  md: Remove some unused macros.
  md: Turn rdev->sb_offset into a sector-based quantity.
  md: Make calc_dev_sboffset() return a sector count.
  md: Replace calc_dev_size() by calc_num_sectors().
  md: Make update_size() take the number of sectors.
  md: Better control of when do_md_stop is allowed to stop the array.
  md: get_disk_info(): Don't convert between signed and unsigned and back.
  md: Simplify restart_array().
  md: alloc_disk_sb(): Return proper error value.
  md: Simplify sb_equal().
  md: Simplify uuid_equal().
  md: sb_equal(): Fix misleading printk.
  md: Fix a typo in the comment to cmd_match().
  ...
parents 519f0141 4b80991c
......@@ -236,6 +236,11 @@ All md devices contain:
writing the word for the desired state, however some states
cannot be explicitly set, and some transitions are not allowed.
Select/poll works on this file. All changes except between
active_idle and active (which can be frequent and are not
very interesting) are notified. active->active_idle is
reported if the metadata is externally managed.
clear
No devices, no size, no level
Writing is equivalent to STOP_ARRAY ioctl
......@@ -292,6 +297,10 @@ Each directory contains:
writemostly - device will only be subject to read
requests if there are no other options.
This applies only to raid1 arrays.
blocked - device has failed, metadata is "external",
and the failure hasn't been acknowledged yet.
Writes that would write to this device if
it were not faulty are blocked.
spare - device is working, but not a full member.
This includes spares that are in the process
of being recovered to
......@@ -301,6 +310,12 @@ Each directory contains:
Writing "remove" removes the device from the array.
Writing "writemostly" sets the writemostly flag.
Writing "-writemostly" clears the writemostly flag.
Writing "blocked" sets the "blocked" flag.
Writing "-blocked" clear the "blocked" flag and allows writes
to complete.
This file responds to select/poll. Any change to 'faulty'
or 'blocked' causes an event.
errors
An approximate count of read errors that have been detected on
......@@ -332,7 +347,7 @@ Each directory contains:
for storage of data. This will normally be the same as the
component_size. This can be written while assembling an
array. If a value less than the current component_size is
written, component_size will be reduced to this value.
written, it will be rejected.
An active md device will also contain and entry for each active device
......@@ -381,6 +396,19 @@ also have
'check' and 'repair' will start the appropriate process
providing the current state is 'idle'.
This file responds to select/poll. Any important change in the value
triggers a poll event. Sometimes the value will briefly be
"recover" if a recovery seems to be needed, but cannot be
achieved. In that case, the transition to "recover" isn't
notified, but the transition away is.
degraded
This contains a count of the number of devices by which the
arrays is degraded. So an optimal array with show '0'. A
single failed/missing drive will show '1', etc.
This file responds to select/poll, any increase or decrease
in the count of missing devices will trigger an event.
mismatch_count
When performing 'check' and 'repair', and possibly when
performing 'resync', md will count the number of errors that are
......
......@@ -225,7 +225,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
|| test_bit(Faulty, &rdev->flags))
continue;
target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) {
page->index = index;
......@@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
{
mdk_rdev_t *rdev;
struct list_head *tmp;
mddev_t *mddev = bitmap->mddev;
rdev_for_each(rdev, tmp, mddev)
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags)) {
int size = PAGE_SIZE;
......@@ -260,32 +260,37 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+ (long)(page->index * (PAGE_SIZE/512))
+ size/512 > 0)
/* bitmap runs in to metadata */
return -EINVAL;
goto bad_alignment;
if (rdev->data_offset + mddev->size*2
> rdev->sb_offset*2 + bitmap->offset)
> rdev->sb_start + bitmap->offset)
/* data runs in to bitmap */
return -EINVAL;
} else if (rdev->sb_offset*2 < rdev->data_offset) {
goto bad_alignment;
} else if (rdev->sb_start < rdev->data_offset) {
/* METADATA BITMAP DATA */
if (rdev->sb_offset*2
if (rdev->sb_start
+ bitmap->offset
+ page->index*(PAGE_SIZE/512) + size/512
> rdev->data_offset)
/* bitmap runs in to data */
return -EINVAL;
goto bad_alignment;
} else {
/* DATA METADATA BITMAP - no problems */
}
md_super_write(mddev, rdev,
(rdev->sb_offset<<1) + bitmap->offset
rdev->sb_start + bitmap->offset
+ page->index * (PAGE_SIZE/512),
size,
page);
}
rcu_read_unlock();
if (wait)
md_super_wait(mddev);
return 0;
bad_alignment:
rcu_read_unlock();
return -EINVAL;
}
static void bitmap_file_kick(struct bitmap *bitmap);
......@@ -454,8 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
spin_unlock_irqrestore(&bitmap->lock, flags);
sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
sb->events = cpu_to_le64(bitmap->mddev->events);
if (!bitmap->mddev->degraded)
sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
if (bitmap->mddev->events < bitmap->events_cleared) {
/* rocking back to read-only */
bitmap->events_cleared = bitmap->mddev->events;
sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
}
kunmap_atomic(sb, KM_USER0);
write_page(bitmap, bitmap->sb_page, 1);
}
......@@ -1085,9 +1093,19 @@ void bitmap_daemon_work(struct bitmap *bitmap)
} else
spin_unlock_irqrestore(&bitmap->lock, flags);
lastpage = page;
/*
printk("bitmap clean at page %lu\n", j);
*/
/* We are possibly going to clear some bits, so make
* sure that events_cleared is up-to-date.
*/
if (bitmap->need_sync) {
bitmap_super_t *sb;
bitmap->need_sync = 0;
sb = kmap_atomic(bitmap->sb_page, KM_USER0);
sb->events_cleared =
cpu_to_le64(bitmap->events_cleared);
kunmap_atomic(sb, KM_USER0);
write_page(bitmap, bitmap->sb_page, 1);
}
spin_lock_irqsave(&bitmap->lock, flags);
clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
}
......@@ -1257,6 +1275,12 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
return;
}
if (success &&
bitmap->events_cleared < bitmap->mddev->events) {
bitmap->events_cleared = bitmap->mddev->events;
bitmap->need_sync = 1;
}
if (!success && ! (*bmc & NEEDED_MASK))
*bmc |= NEEDED_MASK;
......
......@@ -297,7 +297,7 @@ static int run(mddev_t *mddev)
rdev_for_each(rdev, tmp, mddev)
conf->rdev = rdev;
mddev->array_size = mddev->size;
mddev->array_sectors = mddev->size * 2;
mddev->private = conf;
reconfig(mddev, mddev->layout, -1);
......
......@@ -122,13 +122,13 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
return NULL;
cnt = 0;
conf->array_size = 0;
conf->array_sectors = 0;
rdev_for_each(rdev, tmp, mddev) {
int j = rdev->raid_disk;
dev_info_t *disk = conf->disks + j;
if (j < 0 || j > raid_disks || disk->rdev) {
if (j < 0 || j >= raid_disks || disk->rdev) {
printk("linear: disk numbering problem. Aborting!\n");
goto out;
}
......@@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
disk->size = rdev->size;
conf->array_size += rdev->size;
conf->array_sectors += rdev->size * 2;
cnt++;
}
......@@ -155,7 +155,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
goto out;
}
min_spacing = conf->array_size;
min_spacing = conf->array_sectors / 2;
sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
/* min_spacing is the minimum spacing that will fit the hash
......@@ -164,7 +164,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
* that is larger than min_spacing as use the size of that as
* the actual spacing
*/
conf->hash_spacing = conf->array_size;
conf->hash_spacing = conf->array_sectors / 2;
for (i=0; i < cnt-1 ; i++) {
sector_t sz = 0;
int j;
......@@ -194,7 +194,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
unsigned round;
unsigned long base;
sz = conf->array_size >> conf->preshift;
sz = conf->array_sectors >> (conf->preshift + 1);
sz += 1; /* force round-up */
base = conf->hash_spacing >> conf->preshift;
round = sector_div(sz, base);
......@@ -221,7 +221,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
curr_offset = 0;
i = 0;
for (curr_offset = 0;
curr_offset < conf->array_size;
curr_offset < conf->array_sectors / 2;
curr_offset += conf->hash_spacing) {
while (i < raid_disks-1 &&
......@@ -258,7 +258,7 @@ static int linear_run (mddev_t *mddev)
if (!conf)
return 1;
mddev->private = conf;
mddev->array_size = conf->array_size;
mddev->array_sectors = conf->array_sectors;
blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
mddev->queue->unplug_fn = linear_unplug;
......@@ -292,8 +292,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
newconf->prev = mddev_to_conf(mddev);
mddev->private = newconf;
mddev->raid_disks++;
mddev->array_size = newconf->array_size;
set_capacity(mddev->gendisk, mddev->array_size << 1);
mddev->array_sectors = newconf->array_sectors;
set_capacity(mddev->gendisk, mddev->array_sectors);
return 0;
}
......
......@@ -169,7 +169,6 @@ void md_new_event(mddev_t *mddev)
{
atomic_inc(&md_event_count);
wake_up(&md_event_waiters);
sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
EXPORT_SYMBOL_GPL(md_new_event);
......@@ -274,10 +273,12 @@ static mddev_t * mddev_find(dev_t unit)
INIT_LIST_HEAD(&new->all_mddevs);
init_timer(&new->safemode_timer);
atomic_set(&new->active, 1);
atomic_set(&new->openers, 0);
spin_lock_init(&new->write_lock);
init_waitqueue_head(&new->sb_wait);
init_waitqueue_head(&new->recovery_wait);
new->reshape_position = MaxSector;
new->resync_min = 0;
new->resync_max = MaxSector;
new->level = LEVEL_NONE;
......@@ -347,21 +348,20 @@ static struct mdk_personality *find_pers(int level, char *clevel)
return NULL;
}
/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct block_device *bdev)
{
sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
return MD_NEW_SIZE_BLOCKS(size);
sector_t num_sectors = bdev->bd_inode->i_size / 512;
return MD_NEW_SIZE_SECTORS(num_sectors);
}
static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
{
sector_t size;
size = rdev->sb_offset;
sector_t num_sectors = rdev->sb_start;
if (chunk_size)
size &= ~((sector_t)chunk_size/1024 - 1);
return size;
num_sectors &= ~((sector_t)chunk_size/512 - 1);
return num_sectors;
}
static int alloc_disk_sb(mdk_rdev_t * rdev)
......@@ -372,7 +372,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
rdev->sb_page = alloc_page(GFP_KERNEL);
if (!rdev->sb_page) {
printk(KERN_ALERT "md: out of memory.\n");
return -EINVAL;
return -ENOMEM;
}
return 0;
......@@ -384,7 +384,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
put_page(rdev->sb_page);
rdev->sb_loaded = 0;
rdev->sb_page = NULL;
rdev->sb_offset = 0;
rdev->sb_start = 0;
rdev->size = 0;
}
}
......@@ -530,7 +530,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
return 0;
if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
goto fail;
rdev->sb_loaded = 1;
return 0;
......@@ -543,17 +543,12 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
(sb1->set_uuid1 == sb2->set_uuid1) &&
(sb1->set_uuid2 == sb2->set_uuid2) &&
(sb1->set_uuid3 == sb2->set_uuid3))
return 1;
return 0;
return sb1->set_uuid0 == sb2->set_uuid0 &&
sb1->set_uuid1 == sb2->set_uuid1 &&
sb1->set_uuid2 == sb2->set_uuid2 &&
sb1->set_uuid3 == sb2->set_uuid3;
}
static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
int ret;
......@@ -564,7 +559,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
if (!tmp1 || !tmp2) {
ret = 0;
printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
goto abort;
}
......@@ -577,11 +572,7 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
tmp1->nr_disks = 0;
tmp2->nr_disks = 0;
if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
ret = 0;
else
ret = 1;
ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
kfree(tmp1);
kfree(tmp2);
......@@ -660,9 +651,12 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
struct super_type {
char *name;
struct module *owner;
int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
int minor_version);
int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev,
sector_t num_sectors);
};
/*
......@@ -673,16 +667,14 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
mdp_super_t *sb;
int ret;
sector_t sb_offset;
/*
* Calculate the position of the superblock,
* Calculate the position of the superblock (512byte sectors),
* it's at the end of the disk.
*
* It also happens to be a multiple of 4Kb.
*/
sb_offset = calc_dev_sboffset(rdev->bdev);
rdev->sb_offset = sb_offset;
rdev->sb_start = calc_dev_sboffset(rdev->bdev);
ret = read_disk_sb(rdev, MD_SB_BYTES);
if (ret) return ret;
......@@ -759,7 +751,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
else
ret = 0;
}
rdev->size = calc_dev_size(rdev, sb->chunk_size);
rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
if (rdev->size < sb->size && sb->level > 1)
/* "this cannot possibly happen" ... */
......@@ -1003,6 +995,26 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->sb_csum = calc_sb_csum(sb);
}
/*
* rdev_size_change for 0.90.0
*/
static unsigned long long
super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
{
if (num_sectors && num_sectors < rdev->mddev->size * 2)
return 0; /* component must fit device */
if (rdev->mddev->bitmap_offset)
return 0; /* can't move bitmap */
rdev->sb_start = calc_dev_sboffset(rdev->bdev);
if (!num_sectors || num_sectors > rdev->sb_start)
num_sectors = rdev->sb_start;
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
rdev->sb_page);
md_super_wait(rdev->mddev);
return num_sectors / 2; /* kB for sysfs */
}
/*
* version 1 superblock
*/
......@@ -1034,12 +1046,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
struct mdp_superblock_1 *sb;
int ret;
sector_t sb_offset;
sector_t sb_start;
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
int bmask;
/*
* Calculate the position of the superblock.
* Calculate the position of the superblock in 512byte sectors.
* It is always aligned to a 4K boundary and
* depeding on minor_version, it can be:
* 0: At least 8K, but less than 12K, from end of device
......@@ -1048,22 +1060,20 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
*/
switch(minor_version) {
case 0:
sb_offset = rdev->bdev->bd_inode->i_size >> 9;
sb_offset -= 8*2;
sb_offset &= ~(sector_t)(4*2-1);
/* convert from sectors to K */
sb_offset /= 2;
sb_start = rdev->bdev->bd_inode->i_size >> 9;
sb_start -= 8*2;
sb_start &= ~(sector_t)(4*2-1);
break;
case 1:
sb_offset = 0;
sb_start = 0;
break;
case 2:
sb_offset = 4;
sb_start = 8;
break;
default:
return -EINVAL;
}
rdev->sb_offset = sb_offset;
rdev->sb_start = sb_start;
/* superblock is rarely larger than 1K, but it can be larger,
* and it is safe to read 4k, so we do that
......@@ -1077,7 +1087,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
sb->major_version != cpu_to_le32(1) ||
le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
le64_to_cpu(sb->super_offset) != rdev->sb_start ||
(le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
return -EINVAL;
......@@ -1113,7 +1123,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
rdev->sb_size = (rdev->sb_size | bmask) + 1;
if (minor_version
&& rdev->data_offset < sb_offset + (rdev->sb_size/512))
&& rdev->data_offset < sb_start + (rdev->sb_size/512))
return -EINVAL;
if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
......@@ -1149,7 +1159,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
if (minor_version)
rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
else
rdev->size = rdev->sb_offset;
rdev->size = rdev->sb_start / 2;
if (rdev->size < le64_to_cpu(sb->data_size)/2)
return -EINVAL;
rdev->size = le64_to_cpu(sb->data_size)/2;
......@@ -1328,6 +1338,41 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
sb->sb_csum = calc_sb_1_csum(sb);
}
static unsigned long long
super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
{
struct mdp_superblock_1 *sb;
sector_t max_sectors;
if (num_sectors && num_sectors < rdev->mddev->size * 2)
return 0; /* component must fit device */
if (rdev->sb_start < rdev->data_offset) {
/* minor versions 1 and 2; superblock before data */
max_sectors = rdev->bdev->bd_inode->i_size >> 9;
max_sectors -= rdev->data_offset;
if (!num_sectors || num_sectors > max_sectors)
num_sectors = max_sectors;
} else if (rdev->mddev->bitmap_offset) {
/* minor version 0 with bitmap we can't move */
return 0;
} else {
/* minor version 0; superblock after data */
sector_t sb_start;
sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
sb_start &= ~(sector_t)(4*2 - 1);
max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
if (!num_sectors || num_sectors > max_sectors)
num_sectors = max_sectors;
rdev->sb_start = sb_start;
}
sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
sb->data_size = cpu_to_le64(num_sectors);
sb->super_offset = rdev->sb_start;
sb->sb_csum = calc_sb_1_csum(sb);
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
rdev->sb_page);
md_super_wait(rdev->mddev);
return num_sectors / 2; /* kB for sysfs */
}
static struct super_type super_types[] = {
[0] = {
......@@ -1336,6 +1381,7 @@ static struct super_type super_types[] = {
.load_super = super_90_load,
.validate_super = super_90_validate,
.sync_super = super_90_sync,
.rdev_size_change = super_90_rdev_size_change,
},
[1] = {
.name = "md-1",
......@@ -1343,20 +1389,23 @@ static struct super_type super_types[] = {
.load_super = super_1_load,
.validate_super = super_1_validate,
.sync_super = super_1_sync,
.rdev_size_change = super_1_rdev_size_change,
},
};
static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
struct list_head *tmp, *tmp2;
mdk_rdev_t *rdev, *rdev2;
rdev_for_each(rdev, tmp, mddev1)
rdev_for_each(rdev2, tmp2, mddev2)
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev1)
rdev_for_each_rcu(rdev2, mddev2)
if (rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains)
rdev2->bdev->bd_contains) {
rcu_read_unlock();
return 1;
}
rcu_read_unlock();
return 0;
}
......@@ -1423,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
kobject_del(&rdev->kobj);
goto fail;
}
list_add(&rdev->same_set, &mddev->disks);
list_add_rcu(&rdev->same_set, &mddev->disks);
bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
return 0;
......@@ -1448,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
return;
}
bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
list_del_init(&rdev->same_set);
list_del_rcu(&rdev->same_set);
printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
/* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state"
* writing to 'remove' to "dev/state". We also need
* to delay it due to rcu usage.
*/
synchronize_rcu();
INIT_WORK(&rdev->del_work, md_delayed_delete);
kobject_get(&rdev->kobj);
schedule_work(&rdev->del_work);
......@@ -1511,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev)
if (rdev->mddev)
MD_BUG();
free_disk_sb(rdev);
list_del_init(&rdev->same_set);
#ifndef MODULE
if (test_bit(AutoDetected, &rdev->flags))
md_autodetect_dev(rdev->bdev->bd_dev);
......@@ -1758,11 +1808,11 @@ static void md_update_sb(mddev_t * mddev, int force_change)
dprintk("%s ", bdevname(rdev->bdev,b));
if (!test_bit(Faulty, &rdev->flags)) {
md_super_write(mddev,rdev,
rdev->sb_offset<<1, rdev->sb_size,
rdev->sb_start, rdev->sb_size,
rdev->sb_page);
dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
bdevname(rdev->bdev,b),
(unsigned long long)rdev->sb_offset);
(unsigned long long)rdev->sb_start);
rdev->sb_events = mddev->events;
} else
......@@ -1787,7 +1837,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
}
/* words written to sysfs files may, or my not, be \n terminated.
/* words written to sysfs files may, or may not, be \n terminated.
* We want to accept with case. For this we use cmd_match.
*/
static int cmd_match(const char *cmd, const char *str)
......@@ -1886,6 +1936,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
err = 0;
}
if (!err)
sysfs_notify(&rdev->kobj, NULL, "state");
return err ? err : len;
}
static struct rdev_sysfs_entry rdev_state =
......@@ -1931,7 +1983,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
slot = -1;
else if (e==buf || (*e && *e!= '\n'))
return -EINVAL;
if (rdev->mddev->pers) {
if (rdev->mddev->pers && slot == -1) {
/* Setting 'slot' on an active array requires also
* updating the 'rd%d' link, and communicating
* with the personality with ->hot_*_disk.
......@@ -1939,8 +1991,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
* failed/spare devices. This normally happens automatically,
* but not when the metadata is externally managed.
*/
if (slot != -1)
return -EBUSY;
if (rdev->raid_disk == -1)
return -EEXIST;
/* personality does all needed checks */
......@@ -1954,6 +2004,43 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
sysfs_remove_link(&rdev->mddev->kobj, nm);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
} else if (rdev->mddev->pers) {
mdk_rdev_t *rdev2;
struct list_head *tmp;
/* Activating a spare .. or possibly reactivating
* if we every get bitmaps working here.
*/
if (rdev->raid_disk != -1)
return -EBUSY;
if (rdev->mddev->pers->hot_add_disk == NULL)
return -EINVAL;
rdev_for_each(rdev2, tmp, rdev->mddev)
if (rdev2->raid_disk == slot)
return -EEXIST;
rdev->raid_disk = slot;
if (test_bit(In_sync, &rdev->flags))
rdev->saved_raid_disk = slot;
else
rdev->saved_raid_disk = -1;
err = rdev->mddev->pers->
hot_add_disk(rdev->mddev, rdev);
if (err) {
rdev->raid_disk = -1;
return err;
} else
sysfs_notify(&rdev->kobj, NULL, "state");
sprintf(nm, "rd%d", rdev->raid_disk);
if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
printk(KERN_WARNING
"md: cannot register "
"%s for %s\n",
nm, mdname(rdev->mddev));
/* don't wakeup anyone, leave that to userspace. */
} else {
if (slot >= rdev->mddev->raid_disks)
return -ENOSPC;
......@@ -1962,6 +2049,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
clear_bit(Faulty, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
set_bit(In_sync, &rdev->flags);
sysfs_notify(&rdev->kobj, NULL, "state");
}
return len;
}
......@@ -1983,7 +2071,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
unsigned long long offset = simple_strtoull(buf, &e, 10);
if (e==buf || (*e && *e != '\n'))
return -EINVAL;
if (rdev->mddev->pers)
if (rdev->mddev->pers && rdev->raid_disk >= 0)
return -EBUSY;
if (rdev->size && rdev->mddev->external)
/* Must set offset before size, so overlap checks
......@@ -2015,17 +2103,30 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
static ssize_t
rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
char *e;
unsigned long long size = simple_strtoull(buf, &e, 10);
unsigned long long size;
unsigned long long oldsize = rdev->size;
mddev_t *my_mddev = rdev->mddev;
if (e==buf || (*e && *e != '\n'))
if (strict_strtoull(buf, 10, &size) < 0)
return -EINVAL;
if (size < my_mddev->size)
return -EINVAL;
if (my_mddev->pers)
if (my_mddev->pers && rdev->raid_disk >= 0) {
if (my_mddev->persistent) {
size = super_types[my_mddev->major_version].
rdev_size_change(rdev, size * 2);
if (!size)
return -EBUSY;
} else if (!size) {
size = (rdev->bdev->bd_inode->i_size >> 10);
size -= rdev->data_offset/2;
}
if (size < my_mddev->size)
return -EINVAL; /* component must fit device */
}
rdev->size = size;
if (size > oldsize && rdev->mddev->external) {
if (size > oldsize && my_mddev->external) {
/* need to check that all other rdevs with the same ->bdev
* do not overlap. We need to unlock the mddev to avoid
* a deadlock. We have already changed rdev->size, and if
......@@ -2044,8 +2145,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
if (test_bit(AllReserved, &rdev2->flags) ||
(rdev->bdev == rdev2->bdev &&
rdev != rdev2 &&
overlaps(rdev->data_offset, rdev->size,
rdev2->data_offset, rdev2->size))) {
overlaps(rdev->data_offset, rdev->size * 2,
rdev2->data_offset,
rdev2->size * 2))) {
overlap = 1;
break;
}
......@@ -2067,8 +2169,6 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
return -EBUSY;
}
}
if (size < my_mddev->size || my_mddev->size == 0)
my_mddev->size = size;
return len;
}
......@@ -2512,7 +2612,7 @@ __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
* When written, doesn't tear down array, but just stops it
* suspended (not supported yet)
* All IO requests will block. The array can be reconfigured.
* Writing this, if accepted, will block until array is quiessent
* Writing this, if accepted, will block until array is quiescent
* readonly
* no resync can happen. no superblocks get written.
* write requests fail
......@@ -2585,7 +2685,7 @@ array_state_show(mddev_t *mddev, char *page)
return sprintf(page, "%s\n", array_states[st]);
}
static int do_md_stop(mddev_t * mddev, int ro);
static int do_md_stop(mddev_t * mddev, int ro, int is_open);
static int do_md_run(mddev_t * mddev);
static int restart_array(mddev_t *mddev);
......@@ -2599,16 +2699,16 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
break;
case clear:
/* stopping an active array */
if (atomic_read(&mddev->active) > 1)
if (atomic_read(&mddev->openers) > 0)
return -EBUSY;
err = do_md_stop(mddev, 0);
err = do_md_stop(mddev, 0, 0);
break;
case inactive:
/* stopping an active array */
if (mddev->pers) {
if (atomic_read(&mddev->active) > 1)
if (atomic_read(&mddev->openers) > 0)
return -EBUSY;
err = do_md_stop(mddev, 2);
err = do_md_stop(mddev, 2, 0);
} else
err = 0; /* already inactive */
break;
......@@ -2616,7 +2716,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
break; /* not supported yet */
case readonly:
if (mddev->pers)
err = do_md_stop(mddev, 1);
err = do_md_stop(mddev, 1, 0);
else {
mddev->ro = 1;
set_disk_ro(mddev->gendisk, 1);
......@@ -2626,7 +2726,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
case read_auto:
if (mddev->pers) {
if (mddev->ro != 1)
err = do_md_stop(mddev, 1);
err = do_md_stop(mddev, 1, 0);
else
err = restart_array(mddev);
if (err == 0) {
......@@ -2681,8 +2781,10 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
}
if (err)
return err;
else
else {
sysfs_notify(&mddev->kobj, NULL, "array_state");
return len;
}
}
static struct md_sysfs_entry md_array_state =
__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
......@@ -2785,7 +2887,7 @@ size_show(mddev_t *mddev, char *page)
return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
}
static int update_size(mddev_t *mddev, unsigned long size);
static int update_size(mddev_t *mddev, sector_t num_sectors);
static ssize_t
size_store(mddev_t *mddev, const char *buf, size_t len)
......@@ -2802,7 +2904,7 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
return -EINVAL;
if (mddev->pers) {
err = update_size(mddev, size);
err = update_size(mddev, size * 2);
md_update_sb(mddev, 1);
} else {
if (mddev->size == 0 ||
......@@ -2899,7 +3001,7 @@ action_show(mddev_t *mddev, char *page)
type = "check";
else
type = "repair";
} else
} else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
type = "recover";
}
return sprintf(page, "%s\n", type);
......@@ -2921,15 +3023,19 @@ action_store(mddev_t *mddev, const char *page, size_t len)
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
return -EBUSY;
else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
else if (cmd_match(page, "resync"))
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
else if (cmd_match(page, "recover")) {
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
else if (cmd_match(page, "reshape")) {
} else if (cmd_match(page, "reshape")) {
int err;
if (mddev->pers->start_reshape == NULL)
return -EINVAL;
err = mddev->pers->start_reshape(mddev);
if (err)
return err;
sysfs_notify(&mddev->kobj, NULL, "degraded");
} else {
if (cmd_match(page, "check"))
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
......@@ -2940,6 +3046,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
}
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
sysfs_notify(&mddev->kobj, NULL, "sync_action");
return len;
}
......@@ -3049,11 +3156,11 @@ static ssize_t
sync_speed_show(mddev_t *mddev, char *page)
{
unsigned long resync, dt, db;
resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active));
dt = ((jiffies - mddev->resync_mark) / HZ);
resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
dt = (jiffies - mddev->resync_mark) / HZ;
if (!dt) dt++;
db = resync - (mddev->resync_mark_cnt);
return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
db = resync - mddev->resync_mark_cnt;
return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
}
static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
......@@ -3074,6 +3181,36 @@ sync_completed_show(mddev_t *mddev, char *page)
static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
static ssize_t
min_sync_show(mddev_t *mddev, char *page)
{
return sprintf(page, "%llu\n",
(unsigned long long)mddev->resync_min);
}
static ssize_t
min_sync_store(mddev_t *mddev, const char *buf, size_t len)
{
unsigned long long min;
if (strict_strtoull(buf, 10, &min))
return -EINVAL;
if (min > mddev->resync_max)
return -EINVAL;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
/* Must be a multiple of chunk_size */
if (mddev->chunk_size) {
if (min & (sector_t)((mddev->chunk_size>>9)-1))
return -EINVAL;
}
mddev->resync_min = min;
return len;
}
static struct md_sysfs_entry md_min_sync =
__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
static ssize_t
max_sync_show(mddev_t *mddev, char *page)
{
......@@ -3089,9 +3226,10 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
if (strncmp(buf, "max", 3) == 0)
mddev->resync_max = MaxSector;
else {
char *ep;
unsigned long long max = simple_strtoull(buf, &ep, 10);
if (ep == buf || (*ep != 0 && *ep != '\n'))
unsigned long long max;
if (strict_strtoull(buf, 10, &max))
return -EINVAL;
if (max < mddev->resync_min)
return -EINVAL;
if (max < mddev->resync_max &&
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
......@@ -3222,6 +3360,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_sync_speed.attr,
&md_sync_force_parallel.attr,
&md_sync_completed.attr,
&md_min_sync.attr,
&md_max_sync.attr,
&md_suspend_lo.attr,
&md_suspend_hi.attr,
......@@ -3326,9 +3465,9 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
disk->queue = mddev->queue;
add_disk(disk);
mddev->gendisk = disk;
mutex_unlock(&disks_mutex);
error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
"%s", "md");
mutex_unlock(&disks_mutex);
if (error)
printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
disk->disk_name);
......@@ -3341,7 +3480,11 @@ static void md_safemode_timeout(unsigned long data)
{
mddev_t *mddev = (mddev_t *) data;
if (!atomic_read(&mddev->writes_pending)) {
mddev->safemode = 1;
if (mddev->external)
sysfs_notify(&mddev->kobj, NULL, "array_state");
}
md_wakeup_thread(mddev->thread);
}
......@@ -3432,22 +3575,23 @@ static int do_md_run(mddev_t * mddev)
* We don't want the data to overlap the metadata,
* Internal Bitmap issues has handled elsewhere.
*/
if (rdev->data_offset < rdev->sb_offset) {
if (rdev->data_offset < rdev->sb_start) {
if (mddev->size &&
rdev->data_offset + mddev->size*2
> rdev->sb_offset*2) {
> rdev->sb_start) {
printk("md: %s: data overlaps metadata\n",
mdname(mddev));
return -EINVAL;
}
} else {
if (rdev->sb_offset*2 + rdev->sb_size/512
if (rdev->sb_start + rdev->sb_size/512
> rdev->data_offset) {
printk("md: %s: metadata overlaps data\n",
mdname(mddev));
return -EINVAL;
}
}
sysfs_notify(&rdev->kobj, NULL, "state");
}
md_probe(mddev->unit, NULL, NULL);
......@@ -3519,7 +3663,9 @@ static int do_md_run(mddev_t * mddev)
mddev->ro = 2; /* read-only, but switch on first write */
err = mddev->pers->run(mddev);
if (!err && mddev->pers->sync_request) {
if (err)
printk(KERN_ERR "md: pers->run() failed ...\n");
else if (mddev->pers->sync_request) {
err = bitmap_create(mddev);
if (err) {
printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
......@@ -3528,7 +3674,6 @@ static int do_md_run(mddev_t * mddev)
}
}
if (err) {
printk(KERN_ERR "md: pers->run() failed ...\n");
module_put(mddev->pers->owner);
mddev->pers = NULL;
bitmap_destroy(mddev);
......@@ -3563,7 +3708,7 @@ static int do_md_run(mddev_t * mddev)
if (mddev->flags)
md_update_sb(mddev, 0);
set_capacity(disk, mddev->array_size<<1);
set_capacity(disk, mddev->array_sectors);
/* If we call blk_queue_make_request here, it will
* re-initialise max_sectors etc which may have been
......@@ -3608,6 +3753,9 @@ static int do_md_run(mddev_t * mddev)
mddev->changed = 1;
md_new_event(mddev);
sysfs_notify(&mddev->kobj, NULL, "array_state");
sysfs_notify(&mddev->kobj, NULL, "sync_action");
sysfs_notify(&mddev->kobj, NULL, "degraded");
kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
return 0;
}
......@@ -3615,38 +3763,25 @@ static int do_md_run(mddev_t * mddev)
static int restart_array(mddev_t *mddev)
{
struct gendisk *disk = mddev->gendisk;
int err;
/*
* Complain if it has no devices
*/
err = -ENXIO;
/* Complain if it has no devices */
if (list_empty(&mddev->disks))
goto out;
if (mddev->pers) {
err = -EBUSY;
return -ENXIO;
if (!mddev->pers)
return -EINVAL;
if (!mddev->ro)
goto out;
return -EBUSY;
mddev->safemode = 0;
mddev->ro = 0;
set_disk_ro(disk, 0);
printk(KERN_INFO "md: %s switched to read-write mode.\n",
mdname(mddev));
/*
* Kick recovery or resync if necessary
*/
/* Kick recovery or resync if necessary */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread);
err = 0;
} else
err = -EINVAL;
out:
return err;
sysfs_notify(&mddev->kobj, NULL, "array_state");
return 0;
}
/* similar to deny_write_access, but accounts for our holding a reference
......@@ -3680,17 +3815,18 @@ static void restore_bitmap_write_access(struct file *file)
* 1 - switch to readonly
* 2 - stop but do not disassemble array
*/
static int do_md_stop(mddev_t * mddev, int mode)
static int do_md_stop(mddev_t * mddev, int mode, int is_open)
{
int err = 0;
struct gendisk *disk = mddev->gendisk;
if (mddev->pers) {
if (atomic_read(&mddev->active)>2) {
if (atomic_read(&mddev->openers) > is_open) {
printk("md: %s still in use.\n",mdname(mddev));
return -EBUSY;
}
if (mddev->pers) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
......@@ -3773,10 +3909,11 @@ static int do_md_stop(mddev_t * mddev, int mode)
export_array(mddev);
mddev->array_size = 0;
mddev->array_sectors = 0;
mddev->size = 0;
mddev->raid_disks = 0;
mddev->recovery_cp = 0;
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->reshape_position = MaxSector;
mddev->external = 0;
......@@ -3811,6 +3948,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
mdname(mddev));
err = 0;
md_new_event(mddev);
sysfs_notify(&mddev->kobj, NULL, "array_state");
out:
return err;
}
......@@ -3836,7 +3974,7 @@ static void autorun_array(mddev_t *mddev)
err = do_md_run (mddev);
if (err) {
printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
do_md_stop (mddev, 0);
do_md_stop (mddev, 0, 0);
}
}
......@@ -3927,8 +4065,10 @@ static void autorun_devices(int part)
/* on success, candidates will be empty, on error
* it won't...
*/
rdev_for_each_list(rdev, tmp, candidates)
rdev_for_each_list(rdev, tmp, candidates) {
list_del_init(&rdev->same_set);
export_rdev(rdev);
}
mddev_put(mddev);
}
printk(KERN_INFO "md: ... autorun DONE.\n");
......@@ -4009,9 +4149,11 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
char *ptr, *buf = NULL;
int err = -ENOMEM;
md_allow_write(mddev);
if (md_allow_write(mddev))
file = kmalloc(sizeof(*file), GFP_NOIO);
else
file = kmalloc(sizeof(*file), GFP_KERNEL);
if (!file)
goto out;
......@@ -4044,15 +4186,12 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg)
static int get_disk_info(mddev_t * mddev, void __user * arg)
{
mdu_disk_info_t info;
unsigned int nr;
mdk_rdev_t *rdev;
if (copy_from_user(&info, arg, sizeof(info)))
return -EFAULT;
nr = info.number;
rdev = find_rdev_nr(mddev, nr);
rdev = find_rdev_nr(mddev, info.number);
if (rdev) {
info.major = MAJOR(rdev->bdev->bd_dev);
info.minor = MINOR(rdev->bdev->bd_dev);
......@@ -4172,8 +4311,12 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
}
if (err)
export_rdev(rdev);
else
sysfs_notify(&rdev->kobj, NULL, "state");
md_update_sb(mddev, 1);
if (mddev->degraded)
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return err;
......@@ -4212,10 +4355,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
if (!mddev->persistent) {
printk(KERN_INFO "md: nonpersistent superblock ...\n");
rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
} else
rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
rdev->size = calc_dev_size(rdev, mddev->chunk_size);
rdev->sb_start = calc_dev_sboffset(rdev->bdev);
rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
err = bind_rdev_to_array(rdev, mddev);
if (err) {
......@@ -4232,9 +4375,6 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
char b[BDEVNAME_SIZE];
mdk_rdev_t *rdev;
if (!mddev->pers)
return -ENODEV;
rdev = find_rdev(mddev, dev);
if (!rdev)
return -ENXIO;
......@@ -4257,7 +4397,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
{
char b[BDEVNAME_SIZE];
int err;
unsigned int size;
mdk_rdev_t *rdev;
if (!mddev->pers)
......@@ -4285,13 +4424,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
}
if (mddev->persistent)
rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
rdev->sb_start = calc_dev_sboffset(rdev->bdev);
else
rdev->sb_offset =
rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
size = calc_dev_size(rdev, mddev->chunk_size);
rdev->size = size;
rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2;
if (test_bit(Faulty, &rdev->flags)) {
printk(KERN_WARNING
......@@ -4476,24 +4613,24 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
return 0;
}
static int update_size(mddev_t *mddev, unsigned long size)
static int update_size(mddev_t *mddev, sector_t num_sectors)
{
mdk_rdev_t * rdev;
int rv;
struct list_head *tmp;
int fit = (size == 0);
int fit = (num_sectors == 0);
if (mddev->pers->resize == NULL)
return -EINVAL;
/* The "size" is the amount of each device that is used.
* This can only make sense for arrays with redundancy.
* linear and raid0 always use whatever space is available
* We can only consider changing the size if no resync
* or reconstruction is happening, and if the new size
* is acceptable. It must fit before the sb_offset or,
* if that is <data_offset, it must fit before the
* size of each device.
* If size is zero, we find the largest size that fits.
/* The "num_sectors" is the number of sectors of each device that
* is used. This can only make sense for arrays with redundancy.
* linear and raid0 always use whatever space is available. We can only
* consider changing this number if no resync or reconstruction is
* happening, and if the new size is acceptable. It must fit before the
* sb_start or, if that is <data_offset, it must fit before the size
* of each device. If num_sectors is zero, we find the largest size
* that fits.
*/
if (mddev->sync_thread)
return -EBUSY;
......@@ -4501,19 +4638,20 @@ static int update_size(mddev_t *mddev, unsigned long size)
sector_t avail;
avail = rdev->size * 2;
if (fit && (size == 0 || size > avail/2))
size = avail/2;
if (avail < ((sector_t)size << 1))
if (fit && (num_sectors == 0 || num_sectors > avail))
num_sectors = avail;
if (avail < num_sectors)
return -ENOSPC;
}
rv = mddev->pers->resize(mddev, (sector_t)size *2);
rv = mddev->pers->resize(mddev, num_sectors);
if (!rv) {
struct block_device *bdev;
bdev = bdget_disk(mddev->gendisk, 0);
if (bdev) {
mutex_lock(&bdev->bd_inode->i_mutex);
i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
i_size_write(bdev->bd_inode,
(loff_t)mddev->array_sectors << 9);
mutex_unlock(&bdev->bd_inode->i_mutex);
bdput(bdev);
}
......@@ -4588,7 +4726,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
return mddev->pers->reconfig(mddev, info->layout, -1);
}
if (info->size >= 0 && mddev->size != info->size)
rv = update_size(mddev, info->size);
rv = update_size(mddev, (sector_t)info->size * 2);
if (mddev->raid_disks != info->raid_disks)
rv = update_raid_disks(mddev, info->raid_disks);
......@@ -4641,6 +4779,12 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
return 0;
}
/*
* We have a problem here : there is no easy way to give a CHS
* virtual geometry. We currently pretend that we have a 2 heads
* 4 sectors (with a BIG number of cylinders...). This drives
* dosfs just mad... ;-)
*/
static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
mddev_t *mddev = bdev->bd_disk->private_data;
......@@ -4785,19 +4929,13 @@ static int md_ioctl(struct inode *inode, struct file *file,
goto done_unlock;
case STOP_ARRAY:
err = do_md_stop (mddev, 0);
err = do_md_stop (mddev, 0, 1);
goto done_unlock;
case STOP_ARRAY_RO:
err = do_md_stop (mddev, 1);
err = do_md_stop (mddev, 1, 1);
goto done_unlock;
/*
* We have a problem here : there is no easy way to give a CHS
* virtual geometry. We currently pretend that we have a 2 heads
* 4 sectors (with a BIG number of cylinders...). This drives
* dosfs just mad... ;-)
*/
}
/*
......@@ -4807,13 +4945,12 @@ static int md_ioctl(struct inode *inode, struct file *file,
* here and hit the 'default' below, so only disallow
* 'md' ioctls, and switch to rw mode if started auto-readonly.
*/
if (_IOC_TYPE(cmd) == MD_MAJOR &&
mddev->ro && mddev->pers) {
if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
if (mddev->ro == 2) {
mddev->ro = 0;
sysfs_notify(&mddev->kobj, NULL, "array_state");
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
} else {
err = -EROFS;
goto abort_unlock;
......@@ -4883,6 +5020,7 @@ static int md_open(struct inode *inode, struct file *file)
err = 0;
mddev_get(mddev);
atomic_inc(&mddev->openers);
mddev_unlock(mddev);
check_disk_change(inode->i_bdev);
......@@ -4895,6 +5033,7 @@ static int md_release(struct inode *inode, struct file * file)
mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
BUG_ON(!mddev);
atomic_dec(&mddev->openers);
mddev_put(mddev);
return 0;
......@@ -5029,6 +5168,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
if (!mddev->pers->error_handler)
return;
mddev->pers->error_handler(mddev,rdev);
if (mddev->degraded)
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(StateChanged, &rdev->flags);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
......@@ -5258,7 +5400,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
if (!list_empty(&mddev->disks)) {
if (mddev->pers)
seq_printf(seq, "\n %llu blocks",
(unsigned long long)mddev->array_size);
(unsigned long long)
mddev->array_sectors / 2);
else
seq_printf(seq, "\n %llu blocks",
(unsigned long long)size);
......@@ -5391,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p)
static int is_mddev_idle(mddev_t *mddev)
{
mdk_rdev_t * rdev;
struct list_head *tmp;
int idle;
long curr_events;
idle = 1;
rdev_for_each(rdev, tmp, mddev) {
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
curr_events = disk_stat_read(disk, sectors[0]) +
disk_stat_read(disk, sectors[1]) -
......@@ -5428,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev)
idle = 0;
}
}
rcu_read_unlock();
return idle;
}
......@@ -5451,6 +5595,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
*/
void md_write_start(mddev_t *mddev, struct bio *bi)
{
int did_change = 0;
if (bio_data_dir(bi) != WRITE)
return;
......@@ -5461,6 +5606,7 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread);
did_change = 1;
}
atomic_inc(&mddev->writes_pending);
if (mddev->safemode == 1)
......@@ -5471,10 +5617,12 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
mddev->in_sync = 0;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
md_wakeup_thread(mddev->thread);
did_change = 1;
}
spin_unlock_irq(&mddev->write_lock);
sysfs_notify(&mddev->kobj, NULL, "array_state");
}
if (did_change)
sysfs_notify(&mddev->kobj, NULL, "array_state");
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
......@@ -5495,13 +5643,18 @@ void md_write_end(mddev_t *mddev)
* may proceed without blocking. It is important to call this before
* attempting a GFP_KERNEL allocation while holding the mddev lock.
* Must be called with mddev_lock held.
*
* In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
* is dropped, so return -EAGAIN after notifying userspace.
*/
void md_allow_write(mddev_t *mddev)
int md_allow_write(mddev_t *mddev)
{
if (!mddev->pers)
return;
return 0;
if (mddev->ro)
return;
return 0;
if (!mddev->pers->sync_request)
return 0;
spin_lock_irq(&mddev->write_lock);
if (mddev->in_sync) {
......@@ -5512,14 +5665,14 @@ void md_allow_write(mddev_t *mddev)
mddev->safemode = 1;
spin_unlock_irq(&mddev->write_lock);
md_update_sb(mddev, 0);
sysfs_notify(&mddev->kobj, NULL, "array_state");
/* wait for the dirty state to be recorded in the metadata */
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
} else
spin_unlock_irq(&mddev->write_lock);
if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
return -EAGAIN;
else
return 0;
}
EXPORT_SYMBOL_GPL(md_allow_write);
......@@ -5625,9 +5778,11 @@ void md_do_sync(mddev_t *mddev)
max_sectors = mddev->resync_max_sectors;
mddev->resync_mismatches = 0;
/* we don't use the checkpoint if there's a bitmap */
if (!mddev->bitmap &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
j = mddev->resync_min;
else if (!mddev->bitmap)
j = mddev->recovery_cp;
} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->size << 1;
else {
......@@ -5796,6 +5951,7 @@ void md_do_sync(mddev_t *mddev)
skip:
mddev->curr_resync = 0;
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
wake_up(&resync_wait);
......@@ -5845,7 +6001,8 @@ static int remove_and_add_spares(mddev_t *mddev)
if (rdev->raid_disk < 0
&& !test_bit(Faulty, &rdev->flags)) {
rdev->recovery_offset = 0;
if (mddev->pers->hot_add_disk(mddev,rdev)) {
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
char nm[20];
sprintf(nm, "rd%d", rdev->raid_disk);
if (sysfs_create_link(&mddev->kobj,
......@@ -5920,23 +6077,31 @@ void md_check_recovery(mddev_t *mddev)
int spares = 0;
if (!mddev->external) {
int did_change = 0;
spin_lock_irq(&mddev->write_lock);
if (mddev->safemode &&
!atomic_read(&mddev->writes_pending) &&
!mddev->in_sync &&
mddev->recovery_cp == MaxSector) {
mddev->in_sync = 1;
did_change = 1;
if (mddev->persistent)
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
}
if (mddev->safemode == 1)
mddev->safemode = 0;
spin_unlock_irq(&mddev->write_lock);
if (did_change)
sysfs_notify(&mddev->kobj, NULL, "array_state");
}
if (mddev->flags)
md_update_sb(mddev, 0);
rdev_for_each(rdev, rtmp, mddev)
if (test_and_clear_bit(StateChanged, &rdev->flags))
sysfs_notify(&rdev->kobj, NULL, "state");
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
......@@ -5951,7 +6116,9 @@ void md_check_recovery(mddev_t *mddev)
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* success...*/
/* activate any spares */
mddev->pers->spare_active(mddev);
if (mddev->pers->spare_active(mddev))
sysfs_notify(&mddev->kobj, NULL,
"degraded");
}
md_update_sb(mddev, 1);
......@@ -5965,13 +6132,18 @@ void md_check_recovery(mddev_t *mddev)
mddev->recovery = 0;
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify(&mddev->kobj, NULL, "sync_action");
md_new_event(mddev);
goto unlock;
}
/* Set RUNNING before clearing NEEDED to avoid
* any transients in the value of "sync_action".
*/
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
/* Clear some bits that don't mean anything, but
* might be left set
*/
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
......@@ -5989,17 +6161,19 @@ void md_check_recovery(mddev_t *mddev)
/* Cannot proceed */
goto unlock;
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if ((spares = remove_and_add_spares(mddev))) {
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if (mddev->recovery_cp < MaxSector) {
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
/* nothing to be done ... */
goto unlock;
if (mddev->pers->sync_request) {
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
if (spares && mddev->bitmap && ! mddev->bitmap->file) {
/* We are adding a device or devices to an array
* which has the bitmap stored on all devices.
......@@ -6018,9 +6192,16 @@ void md_check_recovery(mddev_t *mddev)
mddev->recovery = 0;
} else
md_wakeup_thread(mddev->sync_thread);
sysfs_notify(&mddev->kobj, NULL, "sync_action");
md_new_event(mddev);
}
unlock:
if (!mddev->sync_thread) {
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
if (test_and_clear_bit(MD_RECOVERY_RECOVER,
&mddev->recovery))
sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
mddev_unlock(mddev);
}
}
......@@ -6047,7 +6228,7 @@ static int md_notify_reboot(struct notifier_block *this,
for_each_mddev(mddev, tmp)
if (mddev_trylock(mddev)) {
do_md_stop (mddev, 1);
do_md_stop (mddev, 1, 0);
mddev_unlock(mddev);
}
/*
......
......@@ -281,13 +281,18 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
multipath_conf_t *conf = mddev->private;
struct request_queue *q;
int found = 0;
int err = -EEXIST;
int path;
struct multipath_info *p;
int first = 0;
int last = mddev->raid_disks - 1;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
print_multipath_conf(conf);
for (path=0; path<mddev->raid_disks; path++)
for (path = first; path <= last; path++)
if ((p=conf->multipaths+path)->rdev == NULL) {
q = rdev->bdev->bd_disk->queue;
blk_queue_stack_limits(mddev->queue, q);
......@@ -307,11 +312,13 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
rdev->raid_disk = path;
set_bit(In_sync, &rdev->flags);
rcu_assign_pointer(p->rdev, rdev);
found = 1;
err = 0;
break;
}
print_multipath_conf(conf);
return found;
return err;
}
static int multipath_remove_disk(mddev_t *mddev, int number)
......@@ -497,7 +504,7 @@ static int multipath_run (mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
mddev->array_size = mddev->size;
mddev->array_sectors = mddev->size * 2;
mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->backing_dev_info.congested_fn = multipath_congested;
......
......@@ -295,16 +295,16 @@ static int raid0_run (mddev_t *mddev)
goto out_free_conf;
/* calculate array device size */
mddev->array_size = 0;
mddev->array_sectors = 0;
rdev_for_each(rdev, tmp, mddev)
mddev->array_size += rdev->size;
mddev->array_sectors += rdev->size * 2;
printk("raid0 : md_size is %llu blocks.\n",
(unsigned long long)mddev->array_size);
(unsigned long long)mddev->array_sectors / 2);
printk("raid0 : conf->hash_spacing is %llu blocks.\n",
(unsigned long long)conf->hash_spacing);
{
sector_t s = mddev->array_size;
sector_t s = mddev->array_sectors / 2;
sector_t space = conf->hash_spacing;
int round;
conf->preshift = 0;
......
......@@ -1100,11 +1100,16 @@ static int raid1_spare_active(mddev_t *mddev)
static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
conf_t *conf = mddev->private;
int found = 0;
int err = -EEXIST;
int mirror = 0;
mirror_info_t *p;
int first = 0;
int last = mddev->raid_disks - 1;
for (mirror=0; mirror < mddev->raid_disks; mirror++)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
for (mirror = first; mirror <= last; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
blk_queue_stack_limits(mddev->queue,
......@@ -1119,7 +1124,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
p->head_position = 0;
rdev->raid_disk = mirror;
found = 1;
err = 0;
/* As all devices are equivalent, we don't need a full recovery
* if this was recently any drive of the array
*/
......@@ -1130,7 +1135,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
}
print_conf(conf);
return found;
return err;
}
static int raid1_remove_disk(mddev_t *mddev, int number)
......@@ -2038,7 +2043,7 @@ static int run(mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
mddev->array_size = mddev->size;
mddev->array_sectors = mddev->size * 2;
mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
......@@ -2100,14 +2105,15 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems
* worth it.
*/
mddev->array_size = sectors>>1;
set_capacity(mddev->gendisk, mddev->array_size << 1);
mddev->array_sectors = sectors;
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
if (mddev->array_sectors / 2 > mddev->size &&
mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->size << 1;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
}
mddev->size = mddev->array_size;
mddev->size = mddev->array_sectors / 2;
mddev->resync_max_sectors = sectors;
return 0;
}
......@@ -2131,7 +2137,7 @@ static int raid1_reshape(mddev_t *mddev)
conf_t *conf = mddev_to_conf(mddev);
int cnt, raid_disks;
unsigned long flags;
int d, d2;
int d, d2, err;
/* Cannot change chunk_size, layout, or level */
if (mddev->chunk_size != mddev->new_chunk ||
......@@ -2143,7 +2149,9 @@ static int raid1_reshape(mddev_t *mddev)
return -EINVAL;
}
md_allow_write(mddev);
err = md_allow_write(mddev);
if (err)
return err;
raid_disks = mddev->raid_disks + mddev->delta_disks;
......
......@@ -1114,24 +1114,30 @@ static int raid10_spare_active(mddev_t *mddev)
static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
conf_t *conf = mddev->private;
int found = 0;
int err = -EEXIST;
int mirror;
mirror_info_t *p;
int first = 0;
int last = mddev->raid_disks - 1;
if (mddev->recovery_cp < MaxSector)
/* only hot-add to in-sync arrays, as recovery is
* very different from resync
*/
return 0;
return -EBUSY;
if (!enough(conf))
return 0;
return -EINVAL;
if (rdev->raid_disk)
first = last = rdev->raid_disk;
if (rdev->saved_raid_disk >= 0 &&
rdev->saved_raid_disk >= first &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk;
else
mirror = 0;
for ( ; mirror < mddev->raid_disks; mirror++)
mirror = first;
for ( ; mirror <= last ; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
blk_queue_stack_limits(mddev->queue,
......@@ -1146,7 +1152,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
p->head_position = 0;
rdev->raid_disk = mirror;
found = 1;
err = 0;
if (rdev->saved_raid_disk != mirror)
conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev);
......@@ -1154,7 +1160,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
}
print_conf(conf);
return found;
return err;
}
static int raid10_remove_disk(mddev_t *mddev, int number)
......@@ -2159,7 +2165,7 @@ static int run(mddev_t *mddev)
/*
* Ok, everything is just fine now
*/
mddev->array_size = size << (conf->chunk_shift-1);
mddev->array_sectors = size << conf->chunk_shift;
mddev->resync_max_sectors = size << conf->chunk_shift;
mddev->queue->unplug_fn = raid10_unplug;
......
......@@ -115,15 +115,20 @@ static void return_io(struct bio *return_bi)
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
bi->bi_end_io(bi,
test_bit(BIO_UPTODATE, &bi->bi_flags)
? 0 : -EIO);
bio_endio(bi, 0);
bi = return_bi;
}
}
static void print_raid5_conf (raid5_conf_t *conf);
static int stripe_operations_active(struct stripe_head *sh)
{
return sh->check_state || sh->reconstruct_state ||
test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}
static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
if (atomic_dec_and_test(&sh->count)) {
......@@ -143,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
}
md_wakeup_thread(conf->mddev->thread);
} else {
BUG_ON(sh->ops.pending);
BUG_ON(stripe_operations_active(sh));
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
atomic_dec(&conf->preread_active_stripes);
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
......@@ -245,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);
BUG_ON(stripe_operations_active(sh));
CHECK_DEVLOCK();
pr_debug("init_stripe called, stripe %llu\n",
......@@ -346,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
return sh;
}
/* test_and_ack_op() ensures that we only dequeue an operation once */
#define test_and_ack_op(op, pend) \
do { \
if (test_bit(op, &sh->ops.pending) && \
!test_bit(op, &sh->ops.complete)) { \
if (test_and_set_bit(op, &sh->ops.ack)) \
clear_bit(op, &pend); \
else \
ack++; \
} else \
clear_bit(op, &pend); \
} while (0)
/* find new work to run, do not resubmit work that is already
* in flight
*/
static unsigned long get_stripe_work(struct stripe_head *sh)
{
unsigned long pending;
int ack = 0;
pending = sh->ops.pending;
test_and_ack_op(STRIPE_OP_BIOFILL, pending);
test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending);
test_and_ack_op(STRIPE_OP_PREXOR, pending);
test_and_ack_op(STRIPE_OP_BIODRAIN, pending);
test_and_ack_op(STRIPE_OP_POSTXOR, pending);
test_and_ack_op(STRIPE_OP_CHECK, pending);
if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
ack++;
sh->ops.count -= ack;
if (unlikely(sh->ops.count < 0)) {
printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx "
"ops.complete: %#lx\n", pending, sh->ops.pending,
sh->ops.ack, sh->ops.complete);
BUG();
}
return pending;
}
static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);
static void ops_run_io(struct stripe_head *sh)
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
raid5_conf_t *conf = sh->raid_conf;
int i, disks = sh->disks;
might_sleep();
set_bit(STRIPE_IO_STARTED, &sh->state);
for (i = disks; i--; ) {
int rw;
struct bio *bi;
......@@ -430,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh)
rcu_read_unlock();
if (rdev) {
if (test_bit(STRIPE_SYNCING, &sh->state) ||
test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
test_bit(STRIPE_EXPAND_READY, &sh->state))
if (s->syncing || s->expanding || s->expanded)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
set_bit(STRIPE_IO_STARTED, &sh->state);
bi->bi_bdev = rdev->bdev;
pr_debug("%s: for %llu schedule op %ld on disc %d\n",
__func__, (unsigned long long)sh->sector,
......@@ -528,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref)
(unsigned long long)sh->sector);
/* clear completed biofills */
spin_lock_irq(&conf->device_lock);
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
/* acknowledge completion of a biofill operation */
/* and check if we need to reply to a read request,
* new R5_Wantfill requests are held off until
* !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)
* !STRIPE_BIOFILL_RUN
*/
if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
struct bio *rbi, *rbi2;
/* The access to dev->read is outside of the
* spin_lock_irq(&conf->device_lock), but is protected
* by the STRIPE_OP_BIOFILL pending bit
*/
BUG_ON(!dev->read);
rbi = dev->read;
dev->read = NULL;
while (rbi && rbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
spin_lock_irq(&conf->device_lock);
if (--rbi->bi_phys_segments == 0) {
rbi->bi_next = return_bi;
return_bi = rbi;
}
spin_unlock_irq(&conf->device_lock);
rbi = rbi2;
}
}
}
set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
spin_unlock_irq(&conf->device_lock);
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
return_io(return_bi);
......@@ -610,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref)
set_bit(R5_UPTODATE, &tgt->flags);
BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
clear_bit(R5_Wantcompute, &tgt->flags);
set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
if (sh->check_state == check_state_compute_run)
sh->check_state = check_state_compute_result;
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
}
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, unsigned long pending)
static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh)
{
/* kernel stack size limits the total number of disks */
int disks = sh->disks;
......@@ -646,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
ASYNC_TX_XOR_ZERO_DST, NULL,
ops_complete_compute5, sh);
/* ack now if postxor is not set to be run */
if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending))
async_tx_ack(tx);
return tx;
}
......@@ -659,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref)
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
}
static struct dma_async_tx_descriptor *
......@@ -680,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
/* Only process blocks that are known to be uptodate */
if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags))
if (test_bit(R5_Wantdrain, &dev->flags))
xor_srcs[count++] = dev->page;
}
......@@ -692,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
}
static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
unsigned long pending)
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
int disks = sh->disks;
int pd_idx = sh->pd_idx, i;
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (Wantprexor)
*/
int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
int i;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
......@@ -709,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
struct bio *chosen;
int towrite;
towrite = 0;
if (prexor) { /* rmw */
if (dev->towrite &&
test_bit(R5_Wantprexor, &dev->flags))
towrite = 1;
} else { /* rcw */
if (i != pd_idx && dev->towrite &&
test_bit(R5_LOCKED, &dev->flags))
towrite = 1;
}
if (towrite) {
if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
struct bio *wbi;
spin_lock(&sh->lock);
......@@ -745,18 +679,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
}
static void ops_complete_postxor(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
}
static void ops_complete_write(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
int disks = sh->disks, i, pd_idx = sh->pd_idx;
......@@ -770,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref)
set_bit(R5_UPTODATE, &dev->flags);
}
set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
if (sh->reconstruct_state == reconstruct_state_drain_run)
sh->reconstruct_state = reconstruct_state_drain_result;
else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
sh->reconstruct_state = reconstruct_state_prexor_drain_result;
else {
BUG_ON(sh->reconstruct_state != reconstruct_state_run);
sh->reconstruct_state = reconstruct_state_result;
}
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
}
static void
ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
unsigned long pending)
ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
/* kernel stack size limits the total number of disks */
int disks = sh->disks;
......@@ -787,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
int count = 0, pd_idx = sh->pd_idx, i;
struct page *xor_dest;
int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
int prexor = 0;
unsigned long flags;
dma_async_tx_callback callback;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
......@@ -797,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
/* check if prexor is active which means only process blocks
* that are part of a read-modify-write (written)
*/
if (prexor) {
if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
prexor = 1;
xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
......@@ -813,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
}
}
/* check whether this postxor is part of a write */
callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ?
ops_complete_write : ops_complete_postxor;
/* 1/ if we prexor'd then the dest is reused as a source
* 2/ if we did not prexor then we are redoing the parity
* set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
......@@ -830,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
if (unlikely(count == 1)) {
flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST);
tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE,
flags, tx, callback, sh);
flags, tx, ops_complete_postxor, sh);
} else
tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
flags, tx, callback, sh);
flags, tx, ops_complete_postxor, sh);
}
static void ops_complete_check(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
int pd_idx = sh->pd_idx;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
sh->ops.zero_sum_result == 0)
set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
set_bit(STRIPE_OP_CHECK, &sh->ops.complete);
sh->check_state = check_state_check_result;
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
}
......@@ -875,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh)
tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
&sh->ops.zero_sum_result, 0, NULL, NULL, NULL);
if (tx)
set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
else
clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending);
atomic_inc(&sh->count);
tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx,
ops_complete_check, sh);
}
static void raid5_run_ops(struct stripe_head *sh, unsigned long pending)
static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
int overlap_clear = 0, i, disks = sh->disks;
struct dma_async_tx_descriptor *tx = NULL;
if (test_bit(STRIPE_OP_BIOFILL, &pending)) {
if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
ops_run_biofill(sh);
overlap_clear++;
}
if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
tx = ops_run_compute5(sh, pending);
if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
tx = ops_run_compute5(sh);
/* terminate the chain if postxor is not set to be run */
if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request))
async_tx_ack(tx);
}
if (test_bit(STRIPE_OP_PREXOR, &pending))
if (test_bit(STRIPE_OP_PREXOR, &ops_request))
tx = ops_run_prexor(sh, tx);
if (test_bit(STRIPE_OP_BIODRAIN, &pending)) {
tx = ops_run_biodrain(sh, tx, pending);
if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
tx = ops_run_biodrain(sh, tx);
overlap_clear++;
}
if (test_bit(STRIPE_OP_POSTXOR, &pending))
ops_run_postxor(sh, tx, pending);
if (test_bit(STRIPE_OP_POSTXOR, &ops_request))
ops_run_postxor(sh, tx);
if (test_bit(STRIPE_OP_CHECK, &pending))
if (test_bit(STRIPE_OP_CHECK, &ops_request))
ops_run_check(sh);
if (test_bit(STRIPE_OP_IO, &pending))
ops_run_io(sh);
if (overlap_clear)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
......@@ -997,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes);
struct disk_info *ndisks;
int err = 0;
int err;
struct kmem_cache *sc;
int i;
if (newsize <= conf->pool_size)
return 0; /* never bother to shrink */
md_allow_write(conf->mddev);
err = md_allow_write(conf->mddev);
if (err)
return err;
/* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
......@@ -1703,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
}
}
static int
handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
static void
schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s,
int rcw, int expand)
{
int i, pd_idx = sh->pd_idx, disks = sh->disks;
int locked = 0;
if (rcw) {
/* if we are not expanding this is a proper write request, and
......@@ -1715,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
* stripe cache
*/
if (!expand) {
set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
sh->ops.count++;
}
sh->reconstruct_state = reconstruct_state_drain_run;
set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
} else
sh->reconstruct_state = reconstruct_state_run;
set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
sh->ops.count++;
set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (dev->towrite) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantdrain, &dev->flags);
if (!expand)
clear_bit(R5_UPTODATE, &dev->flags);
locked++;
s->locked++;
}
}
if (locked + 1 == disks)
if (s->locked + 1 == disks)
if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
atomic_inc(&sh->raid_conf->pending_full_writes);
} else {
BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
set_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
sh->ops.count += 3;
sh->reconstruct_state = reconstruct_state_prexor_drain_run;
set_bit(STRIPE_OP_PREXOR, &s->ops_request);
set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
set_bit(STRIPE_OP_POSTXOR, &s->ops_request);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (i == pd_idx)
continue;
/* For a read-modify write there may be blocks that are
* locked for reading while others are ready to be
* written so we distinguish these blocks by the
* R5_Wantprexor bit
*/
if (dev->towrite &&
(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
set_bit(R5_Wantprexor, &dev->flags);
set_bit(R5_Wantdrain, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
clear_bit(R5_UPTODATE, &dev->flags);
locked++;
s->locked++;
}
}
}
......@@ -1771,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
*/
set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
locked++;
s->locked++;
pr_debug("%s: stripe %llu locked: %d pending: %lx\n",
pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
__func__, (unsigned long long)sh->sector,
locked, sh->ops.pending);
return locked;
s->locked, s->ops_request);
}
/*
......@@ -1876,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
}
static void
handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks,
struct bio **return_bi)
{
......@@ -1967,48 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
md_wakeup_thread(conf->mddev->thread);
}
/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
* to process
/* fetch_block5 - checks the given member device to see if its data needs
* to be read or computed to satisfy a request.
*
* Returns 1 when no more member devices need to be checked, otherwise returns
* 0 to tell the loop in handle_stripe_fill5 to continue
*/
static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
struct stripe_head_state *s, int disk_idx, int disks)
static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
int disk_idx, int disks)
{
struct r5dev *dev = &sh->dev[disk_idx];
struct r5dev *failed_dev = &sh->dev[s->failed_num];
/* don't schedule compute operations or reads on the parity block while
* a check is in flight
*/
if ((disk_idx == sh->pd_idx) &&
test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
return ~0;
/* is the data in this block needed, and can we get it? */
if (!test_bit(R5_LOCKED, &dev->flags) &&
!test_bit(R5_UPTODATE, &dev->flags) && (dev->toread ||
!test_bit(R5_UPTODATE, &dev->flags) &&
(dev->toread ||
(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
s->syncing || s->expanding || (s->failed &&
(failed_dev->toread || (failed_dev->towrite &&
!test_bit(R5_OVERWRITE, &failed_dev->flags)
))))) {
/* 1/ We would like to get this block, possibly by computing it,
* but we might not be able to.
*
* 2/ Since parity check operations potentially make the parity
* block !uptodate it will need to be refreshed before any
* compute operations on data disks are scheduled.
*
* 3/ We hold off parity block re-reads until check operations
* have quiesced.
s->syncing || s->expanding ||
(s->failed &&
(failed_dev->toread ||
(failed_dev->towrite &&
!test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
/* We would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync
*/
if ((s->uptodate == disks - 1) &&
(s->failed && disk_idx == s->failed_num) &&
!test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
(s->failed && disk_idx == s->failed_num)) {
set_bit(STRIPE_COMPUTE_RUN, &sh->state);
set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
set_bit(R5_Wantcompute, &dev->flags);
sh->ops.target = disk_idx;
s->req_compute = 1;
sh->ops.count++;
/* Careful: from this point on 'uptodate' is in the eye
* of raid5_run_ops which services 'compute' operations
* before writes. R5_Wantcompute flags a block that will
......@@ -2016,53 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
* subsequent operation.
*/
s->uptodate++;
return 0; /* uptodate + compute == disks */
return 1; /* uptodate + compute == disks */
} else if (test_bit(R5_Insync, &dev->flags)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
s->locked++;
pr_debug("Reading block %d (sync=%d)\n", disk_idx,
s->syncing);
}
}
return ~0;
return 0;
}
static void handle_issuing_new_read_requests5(struct stripe_head *sh,
/**
* handle_stripe_fill5 - read or compute data to satisfy pending requests.
*/
static void handle_stripe_fill5(struct stripe_head *sh,
struct stripe_head_state *s, int disks)
{
int i;
/* Clear completed compute operations. Parity recovery
* (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled
* later on in this routine
*/
if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
!test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
}
/* look for blocks to read/compute, skip this if a compute
* is already in flight, or if the stripe contents are in the
* midst of changing due to a write
*/
if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
!test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
!sh->reconstruct_state)
for (i = disks; i--; )
if (__handle_issuing_new_read_requests5(
sh, s, i, disks) == 0)
if (fetch_block5(sh, s, i, disks))
break;
}
set_bit(STRIPE_HANDLE, &sh->state);
}
static void handle_issuing_new_read_requests6(struct stripe_head *sh,
static void handle_stripe_fill6(struct stripe_head *sh,
struct stripe_head_state *s, struct r6_state *r6s,
int disks)
{
......@@ -2121,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
}
/* handle_completed_write_requests
/* handle_stripe_clean_event
* any written block on an uptodate or failed drive can be returned.
* Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
* never LOCKED, so we don't need to test 'failed' directly.
*/
static void handle_completed_write_requests(raid5_conf_t *conf,
static void handle_stripe_clean_event(raid5_conf_t *conf,
struct stripe_head *sh, int disks, struct bio **return_bi)
{
int i;
......@@ -2171,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
md_wakeup_thread(conf->mddev->thread);
}
static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
static void handle_stripe_dirtying5(raid5_conf_t *conf,
struct stripe_head *sh, struct stripe_head_state *s, int disks)
{
int rmw = 0, rcw = 0, i;
......@@ -2215,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
"%d for r-m-w\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
if (!test_and_set_bit(
STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
s->locked++;
} else {
set_bit(STRIPE_DELAYED, &sh->state);
......@@ -2241,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
"%d for Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
if (!test_and_set_bit(
STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
s->locked++;
} else {
set_bit(STRIPE_DELAYED, &sh->state);
......@@ -2261,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
* simultaneously. If this is not the case then new writes need to be
* held off until the compute completes.
*/
if ((s->req_compute ||
!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) &&
if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
(s->locked == 0 && (rcw == 0 || rmw == 0) &&
!test_bit(STRIPE_BIT_DELAY, &sh->state)))
s->locked += handle_write_operations5(sh, rcw == 0, 0);
schedule_reconstruction5(sh, s, rcw == 0, 0);
}
static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
static void handle_stripe_dirtying6(raid5_conf_t *conf,
struct stripe_head *sh, struct stripe_head_state *s,
struct r6_state *r6s, int disks)
{
......@@ -2371,92 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks)
{
int canceled_check = 0;
struct r5dev *dev = NULL;
set_bit(STRIPE_HANDLE, &sh->state);
/* complete a check operation */
if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
switch (sh->check_state) {
case check_state_idle:
/* start a new check operation if there are no failures */
if (s->failed == 0) {
if (sh->ops.zero_sum_result == 0)
/* parity is correct (on disc,
* not in buffer any more)
*/
set_bit(STRIPE_INSYNC, &sh->state);
else {
conf->mddev->resync_mismatches +=
STRIPE_SECTORS;
if (test_bit(
MD_RECOVERY_CHECK, &conf->mddev->recovery))
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
else {
set_bit(STRIPE_OP_COMPUTE_BLK,
&sh->ops.pending);
set_bit(STRIPE_OP_MOD_REPAIR_PD,
&sh->ops.pending);
set_bit(R5_Wantcompute,
&sh->dev[sh->pd_idx].flags);
sh->ops.target = sh->pd_idx;
sh->ops.count++;
s->uptodate++;
}
}
} else
canceled_check = 1; /* STRIPE_INSYNC is not set */
}
/* start a new check operation if there are no failures, the stripe is
* not insync, and a repair is not in flight
*/
if (s->failed == 0 &&
!test_bit(STRIPE_INSYNC, &sh->state) &&
!test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
BUG_ON(s->uptodate != disks);
sh->check_state = check_state_run;
set_bit(STRIPE_OP_CHECK, &s->ops_request);
clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
sh->ops.count++;
s->uptodate--;
break;
}
}
/* check if we can clear a parity disk reconstruct */
if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
}
dev = &sh->dev[s->failed_num];
/* fall through */
case check_state_compute_result:
sh->check_state = check_state_idle;
if (!dev)
dev = &sh->dev[sh->pd_idx];
/* check that a write has not made the stripe insync */
if (test_bit(STRIPE_INSYNC, &sh->state))
break;
/* Wait for check parity and compute block operations to complete
* before write-back. If a failure occurred while the check operation
* was in flight we need to cycle this stripe through handle_stripe
* since the parity block may not be uptodate
*/
if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) &&
!test_bit(STRIPE_OP_CHECK, &sh->ops.pending) &&
!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
struct r5dev *dev;
/* either failed parity check, or recovery is happening */
if (s->failed == 0)
s->failed_num = sh->pd_idx;
dev = &sh->dev[s->failed_num];
BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
BUG_ON(s->uptodate != disks);
set_bit(R5_LOCKED, &dev->flags);
s->locked++;
set_bit(R5_Wantwrite, &dev->flags);
if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
clear_bit(STRIPE_DEGRADED, &sh->state);
s->locked++;
set_bit(STRIPE_INSYNC, &sh->state);
break;
case check_state_run:
break; /* we will be called again upon completion */
case check_state_check_result:
sh->check_state = check_state_idle;
/* if a failure occurred during the check operation, leave
* STRIPE_INSYNC not set and let the stripe be handled again
*/
if (s->failed)
break;
/* handle a successful check operation, if parity is correct
* we are done. Otherwise update the mismatch count and repair
* parity if !MD_RECOVERY_CHECK
*/
if (sh->ops.zero_sum_result == 0)
/* parity is correct (on disc,
* not in buffer any more)
*/
set_bit(STRIPE_INSYNC, &sh->state);
else {
conf->mddev->resync_mismatches += STRIPE_SECTORS;
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
else {
sh->check_state = check_state_compute_run;
set_bit(STRIPE_COMPUTE_RUN, &sh->state);
set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
set_bit(R5_Wantcompute,
&sh->dev[sh->pd_idx].flags);
sh->ops.target = sh->pd_idx;
s->uptodate++;
}
}
break;
case check_state_compute_run:
break;
default:
printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
__func__, sh->check_state,
(unsigned long long) sh->sector);
BUG();
}
}
......@@ -2641,15 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh)
struct bio *return_bi = NULL;
struct stripe_head_state s;
struct r5dev *dev;
unsigned long pending = 0;
mdk_rdev_t *blocked_rdev = NULL;
int prexor;
memset(&s, 0, sizeof(s));
pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
"ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state,
atomic_read(&sh->count), sh->pd_idx,
sh->ops.pending, sh->ops.ack, sh->ops.complete);
pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
"reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
atomic_read(&sh->count), sh->pd_idx, sh->check_state,
sh->reconstruct_state);
spin_lock(&sh->lock);
clear_bit(STRIPE_HANDLE, &sh->state);
......@@ -2658,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh)
s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
/* Now to look around and see what can be done */
/* clean-up completed biofill operations */
if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete);
}
/* Now to look around and see what can be done */
rcu_read_lock();
for (i=disks; i--; ) {
mdk_rdev_t *rdev;
......@@ -2680,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh)
/* maybe we can request a biofill operation
*
* new wantfill requests are only permitted while
* STRIPE_OP_BIOFILL is clear
* ops_complete_biofill is guaranteed to be inactive
*/
if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
!test_bit(STRIPE_BIOFILL_RUN, &sh->state))
set_bit(R5_Wantfill, &dev->flags);
/* now count some things */
......@@ -2727,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh)
goto unlock;
}
if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
sh->ops.count++;
if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
set_bit(STRIPE_BIOFILL_RUN, &sh->state);
}
pr_debug("locked=%d uptodate=%d to_read=%d"
" to_write=%d failed=%d failed_num=%d\n",
......@@ -2738,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh)
* need to be failed
*/
if (s.failed > 1 && s.to_read+s.to_write+s.written)
handle_requests_to_failed_array(conf, sh, &s, disks,
&return_bi);
handle_failed_stripe(conf, sh, &s, disks, &return_bi);
if (s.failed > 1 && s.syncing) {
md_done_sync(conf->mddev, STRIPE_SECTORS,0);
clear_bit(STRIPE_SYNCING, &sh->state);
......@@ -2755,48 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh)
!test_bit(R5_LOCKED, &dev->flags) &&
test_bit(R5_UPTODATE, &dev->flags)) ||
(s.failed == 1 && s.failed_num == sh->pd_idx)))
handle_completed_write_requests(conf, sh, disks, &return_bi);
handle_stripe_clean_event(conf, sh, disks, &return_bi);
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
* or to load a block that is being partially written.
*/
if (s.to_read || s.non_overwrite ||
(s.syncing && (s.uptodate + s.compute < disks)) || s.expanding ||
test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
handle_issuing_new_read_requests5(sh, &s, disks);
(s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
handle_stripe_fill5(sh, &s, disks);
/* Now we check to see if any write operations have recently
* completed
*/
/* leave prexor set until postxor is done, allows us to distinguish
* a rmw from a rcw during biodrain
*/
prexor = 0;
if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
prexor = 1;
clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
for (i = disks; i--; )
clear_bit(R5_Wantprexor, &sh->dev[i].flags);
}
/* if only POSTXOR is set then this is an 'expand' postxor */
if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending);
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
if (sh->reconstruct_state == reconstruct_state_drain_result ||
sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
sh->reconstruct_state = reconstruct_state_idle;
/* All the 'written' buffers and the parity block are ready to
* be written back to disk
......@@ -2808,9 +2651,6 @@ static void handle_stripe5(struct stripe_head *sh)
(i == sh->pd_idx || dev->written)) {
pr_debug("Writing block %d\n", i);
set_bit(R5_Wantwrite, &dev->flags);
if (!test_and_set_bit(
STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
if (prexor)
continue;
if (!test_bit(R5_Insync, &dev->flags) ||
......@@ -2832,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh)
* 2/ A 'check' operation is in flight, as it may clobber the parity
* block.
*/
if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) &&
!test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
handle_issuing_new_write_requests5(conf, sh, &s, disks);
if (s.to_write && !sh->reconstruct_state && !sh->check_state)
handle_stripe_dirtying5(conf, sh, &s, disks);
/* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough
* data is available. The parity check is held off while parity
* dependent operations are in flight.
*/
if ((s.syncing && s.locked == 0 &&
!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
!test_bit(STRIPE_INSYNC, &sh->state)) ||
test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending))
if (sh->check_state ||
(s.syncing && s.locked == 0 &&
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
!test_bit(STRIPE_INSYNC, &sh->state)))
handle_parity_checks5(conf, sh, &s, disks);
if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
......@@ -2864,52 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh)
dev = &sh->dev[s.failed_num];
if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags);
if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
set_bit(R5_ReWrite, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
s.locked++;
} else {
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
set_bit(R5_LOCKED, &dev->flags);
s.locked++;
}
}
/* Finish postxor operations initiated by the expansion
* process
*/
if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
!test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state);
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
for (i = conf->raid_disks; i--; ) {
for (i = conf->raid_disks; i--; )
set_bit(R5_Wantwrite, &sh->dev[i].flags);
set_bit(R5_LOCKED, &dev->flags);
s.locked++;
if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
sh->ops.count++;
}
}
if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
!sh->reconstruct_state) {
/* Need to write out all blocks after computing parity */
sh->disks = conf->raid_disks;
sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
conf->raid_disks);
s.locked += handle_write_operations5(sh, 1, 1);
} else if (s.expanded &&
s.locked == 0 &&
!test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
schedule_reconstruction5(sh, &s, 1, 1);
} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
......@@ -2917,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh)
}
if (s.expanding && s.locked == 0 &&
!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
!test_bit(STRIPE_COMPUTE_RUN, &sh->state))
handle_stripe_expansion(conf, sh, NULL);
if (sh->ops.count)
pending = get_stripe_work(sh);
unlock:
spin_unlock(&sh->lock);
......@@ -2930,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh)
if (unlikely(blocked_rdev))
md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
if (pending)
raid5_run_ops(sh, pending);
if (s.ops_request)
raid5_run_ops(sh, s.ops_request);
return_io(return_bi);
ops_run_io(sh, &s);
return_io(return_bi);
}
static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
......@@ -3042,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
* might need to be failed
*/
if (s.failed > 2 && s.to_read+s.to_write+s.written)
handle_requests_to_failed_array(conf, sh, &s, disks,
&return_bi);
handle_failed_stripe(conf, sh, &s, disks, &return_bi);
if (s.failed > 2 && s.syncing) {
md_done_sync(conf->mddev, STRIPE_SECTORS,0);
clear_bit(STRIPE_SYNCING, &sh->state);
......@@ -3068,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
&& !test_bit(R5_LOCKED, &qdev->flags)
&& test_bit(R5_UPTODATE, &qdev->flags)))))
handle_completed_write_requests(conf, sh, disks, &return_bi);
handle_stripe_clean_event(conf, sh, disks, &return_bi);
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
......@@ -3076,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
*/
if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
(s.syncing && (s.uptodate < disks)) || s.expanding)
handle_issuing_new_read_requests6(sh, &s, &r6s, disks);
handle_stripe_fill6(sh, &s, &r6s, disks);
/* now to consider writing and what else, if anything should be read */
if (s.to_write)
handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks);
handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
/* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough
......@@ -3136,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
}
if (s.expanding && s.locked == 0 &&
!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
!test_bit(STRIPE_COMPUTE_RUN, &sh->state))
handle_stripe_expansion(conf, sh, &r6s);
unlock:
......@@ -3146,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
if (unlikely(blocked_rdev))
md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
return_io(return_bi);
for (i=disks; i-- ;) {
int rw;
struct bio *bi;
mdk_rdev_t *rdev;
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
rw = WRITE;
else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ;
else
continue;
ops_run_io(sh, &s);
set_bit(STRIPE_IO_STARTED, &sh->state);
bi = &sh->dev[i].req;
bi->bi_rw = rw;
if (rw == WRITE)
bi->bi_end_io = raid5_end_write_request;
else
bi->bi_end_io = raid5_end_read_request;
rcu_read_lock();
rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev)
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
if (rdev) {
if (s.syncing || s.expanding || s.expanded)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
bi->bi_bdev = rdev->bdev;
pr_debug("for %llu schedule op %ld on disc %d\n",
(unsigned long long)sh->sector, bi->bi_rw, i);
atomic_inc(&sh->count);
bi->bi_sector = sh->sector + rdev->data_offset;
bi->bi_flags = 1 << BIO_UPTODATE;
bi->bi_vcnt = 1;
bi->bi_max_vecs = 1;
bi->bi_idx = 0;
bi->bi_io_vec = &sh->dev[i].vec;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
bi->bi_size = STRIPE_SIZE;
bi->bi_next = NULL;
if (rw == WRITE &&
test_bit(R5_ReWrite, &sh->dev[i].flags))
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
generic_make_request(bi);
} else {
if (rw == WRITE)
set_bit(STRIPE_DEGRADED, &sh->state);
pr_debug("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
}
}
return_io(return_bi);
}
static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
......@@ -3697,9 +3456,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
if ( rw == WRITE )
md_write_end(mddev);
bi->bi_end_io(bi,
test_bit(BIO_UPTODATE, &bi->bi_flags)
? 0 : -EIO);
bio_endio(bi, 0);
}
return 0;
}
......@@ -3785,7 +3542,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
j == raid6_next_disk(sh->pd_idx, sh->disks))
continue;
s = compute_blocknr(sh, j);
if (s < (mddev->array_size<<1)) {
if (s < mddev->array_sectors) {
skipped = 1;
continue;
}
......@@ -4002,12 +3759,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
spin_lock_irq(&conf->device_lock);
remaining = --raid_bio->bi_phys_segments;
spin_unlock_irq(&conf->device_lock);
if (remaining == 0) {
raid_bio->bi_end_io(raid_bio,
test_bit(BIO_UPTODATE, &raid_bio->bi_flags)
? 0 : -EIO);
}
if (remaining == 0)
bio_endio(raid_bio, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_stripe);
return handled;
......@@ -4094,6 +3847,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
{
raid5_conf_t *conf = mddev_to_conf(mddev);
unsigned long new;
int err;
if (len >= PAGE_SIZE)
return -EINVAL;
if (!conf)
......@@ -4109,7 +3864,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
else
break;
}
md_allow_write(mddev);
err = md_allow_write(mddev);
if (err)
return err;
while (new > conf->max_nr_stripes) {
if (grow_one_stripe(conf))
conf->max_nr_stripes++;
......@@ -4434,7 +4191,7 @@ static int run(mddev_t *mddev)
mddev->queue->backing_dev_info.congested_data = mddev;
mddev->queue->backing_dev_info.congested_fn = raid5_congested;
mddev->array_size = mddev->size * (conf->previous_raid_disks -
mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
conf->max_degraded);
blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
......@@ -4609,35 +4366,41 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{
raid5_conf_t *conf = mddev->private;
int found = 0;
int err = -EEXIST;
int disk;
struct disk_info *p;
int first = 0;
int last = conf->raid_disks - 1;
if (mddev->degraded > conf->max_degraded)
/* no point adding a device */
return 0;
return -EINVAL;
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
/*
* find the disk ... but prefer rdev->saved_raid_disk
* if possible.
*/
if (rdev->saved_raid_disk >= 0 &&
rdev->saved_raid_disk >= first &&
conf->disks[rdev->saved_raid_disk].rdev == NULL)
disk = rdev->saved_raid_disk;
else
disk = 0;
for ( ; disk < conf->raid_disks; disk++)
disk = first;
for ( ; disk <= last ; disk++)
if ((p=conf->disks + disk)->rdev == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->raid_disk = disk;
found = 1;
err = 0;
if (rdev->saved_raid_disk != disk)
conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev);
break;
}
print_raid5_conf(conf);
return found;
return err;
}
static int raid5_resize(mddev_t *mddev, sector_t sectors)
......@@ -4652,8 +4415,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
raid5_conf_t *conf = mddev_to_conf(mddev);
sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1;
set_capacity(mddev->gendisk, mddev->array_size << 1);
mddev->array_sectors = sectors * (mddev->raid_disks
- conf->max_degraded);
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->size << 1;
......@@ -4738,7 +4502,7 @@ static int raid5_start_reshape(mddev_t *mddev)
rdev_for_each(rdev, rtmp, mddev)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) {
if (raid5_add_disk(mddev, rdev)) {
if (raid5_add_disk(mddev, rdev) == 0) {
char nm[20];
set_bit(In_sync, &rdev->flags);
added_devices++;
......@@ -4786,15 +4550,16 @@ static void end_reshape(raid5_conf_t *conf)
struct block_device *bdev;
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
conf->mddev->array_size = conf->mddev->size *
conf->mddev->array_sectors = 2 * conf->mddev->size *
(conf->raid_disks - conf->max_degraded);
set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1);
set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
conf->mddev->changed = 1;
bdev = bdget_disk(conf->mddev->gendisk, 0);
if (bdev) {
mutex_lock(&bdev->bd_inode->i_mutex);
i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10);
i_size_write(bdev->bd_inode,
(loff_t)conf->mddev->array_sectors << 9);
mutex_unlock(&bdev->bd_inode->i_mutex);
bdput(bdev);
}
......
......@@ -221,6 +221,7 @@ struct bitmap {
unsigned long syncchunk;
__u64 events_cleared;
int need_sync;
/* bitmap spinlock */
spinlock_t lock;
......
......@@ -16,7 +16,7 @@ struct linear_private_data
struct linear_private_data *prev; /* earlier version */
dev_info_t **hash_table;
sector_t hash_spacing;
sector_t array_size;
sector_t array_sectors;
int preshift; /* shift before dividing by hash_spacing */
dev_info_t disks[0];
};
......
......@@ -95,7 +95,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
struct page *page, int rw);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern void md_allow_write(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
#endif /* CONFIG_MD */
......
......@@ -59,7 +59,7 @@ struct mdk_rdev_s
int sb_loaded;
__u64 sb_events;
sector_t data_offset; /* start of data in array */
sector_t sb_offset;
sector_t sb_start; /* offset of the super block (in 512byte sectors) */
int sb_size; /* bytes in the superblock */
int preferred_minor; /* autorun support */
......@@ -87,6 +87,9 @@ struct mdk_rdev_s
#define Blocked 8 /* An error occured on an externally
* managed array, don't allow writes
* until it is cleared */
#define StateChanged 9 /* Faulty or Blocked has changed during
* interrupt, so it needs to be
* notified by the thread */
wait_queue_head_t blocked_wait;
int desc_nr; /* descriptor index in the superblock */
......@@ -147,7 +150,7 @@ struct mddev_s
int raid_disks;
int max_disks;
sector_t size; /* used size of component devices */
sector_t array_size; /* exported array size */
sector_t array_sectors; /* exported array size */
__u64 events;
char uuid[16];
......@@ -188,6 +191,7 @@ struct mddev_s
* NEEDED: we might need to start a resync/recover
* RUNNING: a thread is running, or about to be started
* SYNC: actually doing a resync, not a recovery
* RECOVER: doing recovery, or need to try it.
* INTR: resync needs to be aborted for some reason
* DONE: thread is done and is waiting to be reaped
* REQUEST: user-space has requested a sync (used with SYNC)
......@@ -198,6 +202,7 @@ struct mddev_s
*/
#define MD_RECOVERY_RUNNING 0
#define MD_RECOVERY_SYNC 1
#define MD_RECOVERY_RECOVER 2
#define MD_RECOVERY_INTR 3
#define MD_RECOVERY_DONE 4
#define MD_RECOVERY_NEEDED 5
......@@ -210,7 +215,8 @@ struct mddev_s
int in_sync; /* know to not need resync */
struct mutex reconfig_mutex;
atomic_t active;
atomic_t active; /* general refcount */
atomic_t openers; /* number of active opens */
int changed; /* true if we might need to reread partition info */
int degraded; /* whether md should consider
......@@ -227,6 +233,8 @@ struct mddev_s
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
sector_t recovery_cp;
sector_t resync_min; /* user requested sync
* starts here */
sector_t resync_max; /* resync should pause
* when it gets here */
......@@ -331,6 +339,9 @@ static inline char * mdname (mddev_t * mddev)
#define rdev_for_each(rdev, tmp, mddev) \
rdev_for_each_list(rdev, tmp, (mddev)->disks)
#define rdev_for_each_rcu(rdev, mddev) \
list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
typedef struct mdk_thread_s {
void (*run) (mddev_t *mddev);
mddev_t *mddev;
......
......@@ -43,14 +43,11 @@
*/
#define MD_RESERVED_BYTES (64 * 1024)
#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
#define MD_SB_BYTES 4096
#define MD_SB_WORDS (MD_SB_BYTES / 4)
#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
#define MD_SB_SECTORS (MD_SB_BYTES / 512)
/*
......
......@@ -158,6 +158,43 @@
* the compute block completes.
*/
/*
* Operations state - intermediate states that are visible outside of sh->lock
* In general _idle indicates nothing is running, _run indicates a data
* processing operation is active, and _result means the data processing result
* is stable and can be acted upon. For simple operations like biofill and
* compute that only have an _idle and _run state they are indicated with
* sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN)
*/
/**
* enum check_states - handles syncing / repairing a stripe
* @check_state_idle - check operations are quiesced
* @check_state_run - check operation is running
* @check_state_result - set outside lock when check result is valid
* @check_state_compute_run - check failed and we are repairing
* @check_state_compute_result - set outside lock when compute result is valid
*/
enum check_states {
check_state_idle = 0,
check_state_run, /* parity check */
check_state_check_result,
check_state_compute_run, /* parity repair */
check_state_compute_result,
};
/**
* enum reconstruct_states - handles writing or expanding a stripe
*/
enum reconstruct_states {
reconstruct_state_idle = 0,
reconstruct_state_prexor_drain_run, /* prexor-write */
reconstruct_state_drain_run, /* write */
reconstruct_state_run, /* expand */
reconstruct_state_prexor_drain_result,
reconstruct_state_drain_result,
reconstruct_state_result,
};
struct stripe_head {
struct hlist_node hash;
struct list_head lru; /* inactive_list or handle_list */
......@@ -169,19 +206,13 @@ struct stripe_head {
spinlock_t lock;
int bm_seq; /* sequence number for bitmap flushes */
int disks; /* disks in stripe */
enum check_states check_state;
enum reconstruct_states reconstruct_state;
/* stripe_operations
* @pending - pending ops flags (set for request->issue->complete)
* @ack - submitted ops flags (set for issue->complete)
* @complete - completed ops flags (set for complete)
* @target - STRIPE_OP_COMPUTE_BLK target
* @count - raid5_runs_ops is set to run when this is non-zero
*/
struct stripe_operations {
unsigned long pending;
unsigned long ack;
unsigned long complete;
int target;
int count;
u32 zero_sum_result;
} ops;
struct r5dev {
......@@ -202,6 +233,7 @@ struct stripe_head_state {
int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite;
int failed_num;
unsigned long ops_request;
};
/* r6_state - extra state data only relevant to r6 */
......@@ -228,9 +260,7 @@ struct r6_state {
#define R5_Wantfill 12 /* dev->toread contains a bio that needs
* filling
*/
#define R5_Wantprexor 13 /* distinguish blocks ready for rmw from
* other "towrites"
*/
#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
/*
* Write method
*/
......@@ -254,8 +284,10 @@ struct r6_state {
#define STRIPE_EXPAND_READY 11
#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
#define STRIPE_BIOFILL_RUN 14
#define STRIPE_COMPUTE_RUN 15
/*
* Operations flags (in issue order)
* Operation request flags
*/
#define STRIPE_OP_BIOFILL 0
#define STRIPE_OP_COMPUTE_BLK 1
......@@ -263,14 +295,6 @@ struct r6_state {
#define STRIPE_OP_BIODRAIN 3
#define STRIPE_OP_POSTXOR 4
#define STRIPE_OP_CHECK 5
#define STRIPE_OP_IO 6
/* modifiers to the base operations
* STRIPE_OP_MOD_REPAIR_PD - compute the parity block and write it back
* STRIPE_OP_MOD_DMA_CHECK - parity is not corrupted by the check
*/
#define STRIPE_OP_MOD_REPAIR_PD 7
#define STRIPE_OP_MOD_DMA_CHECK 8
/*
* Plugging:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment