Commit a5e0d731 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'md-3.9' of git://neil.brown.name/md

Pull md updates from NeilBrown:
 "Mostly little bugfixes.

  Only "feature" is a new RAID10 layout which slightly improves the
  number of sets of devices that can concurrently fail, without data
  loss."

* tag 'md-3.9' of git://neil.brown.name/md:
  md: expedite metadata update when switching  read-auto -> active
  md: remove CONFIG_MULTICORE_RAID456
  md/raid1,raid10: fix deadlock with freeze_array()
  md/raid0: improve error message when converting RAID4-with-spares to RAID0
  md: raid0: fix error return from create_stripe_zones.
  md: fix two bugs when attempting to resize RAID0 array.
  DM RAID: Add support for MD's RAID10 "far" and "offset" algorithms
  MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 2)
  MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 1)
  MD RAID10: Minor non-functional code changes
  md: raid1,10: Handle REQ_WRITE_SAME flag in write bios
  md: protect against crash upon fsync on ro array
parents 6dbe51c2 f3378b48
......@@ -30,6 +30,7 @@ The target is named "raid" and it accepts the following parameters:
raid10 Various RAID10 inspired algorithms chosen by additional params
- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
- RAID1E: Integrated Adjacent Stripe Mirroring
- RAID1E: Integrated Offset Stripe Mirroring
- and other similar RAID10 variants
Reference: Chapter 4 of
......@@ -64,15 +65,15 @@ The target is named "raid" and it accepts the following parameters:
synchronisation state for each region.
[raid10_copies <# copies>]
[raid10_format near]
[raid10_format <near|far|offset>]
These two options are used to alter the default layout of
a RAID10 configuration. The number of copies can be
specified, but the default is 2. There are other variations
to how the copies are laid down - the default and only current
option is "near". Near copies are what most people think of
with respect to mirroring. If these options are left
unspecified, or 'raid10_copies 2' and/or 'raid10_format near'
are given, then the layouts for 2, 3 and 4 devices are:
specified, but the default is 2. There are also three
variations to how the copies are laid down - the default
is "near". Near copies are what most people think of with
respect to mirroring. If these options are left unspecified,
or 'raid10_copies 2' and/or 'raid10_format near' are given,
then the layouts for 2, 3 and 4 devices are:
2 drives 3 drives 4 drives
-------- ---------- --------------
A1 A1 A1 A1 A2 A1 A1 A2 A2
......@@ -85,6 +86,33 @@ The target is named "raid" and it accepts the following parameters:
3-device layout is what might be called a 'RAID1E - Integrated
Adjacent Stripe Mirroring'.
If 'raid10_copies 2' and 'raid10_format far', then the layouts
for 2, 3 and 4 devices are:
2 drives 3 drives 4 drives
-------- -------------- --------------------
A1 A2 A1 A2 A3 A1 A2 A3 A4
A3 A4 A4 A5 A6 A5 A6 A7 A8
A5 A6 A7 A8 A9 A9 A10 A11 A12
.. .. .. .. .. .. .. .. ..
A2 A1 A3 A1 A2 A2 A1 A4 A3
A4 A3 A6 A4 A5 A6 A5 A8 A7
A6 A5 A9 A7 A8 A10 A9 A12 A11
.. .. .. .. .. .. .. .. ..
If 'raid10_copies 2' and 'raid10_format offset', then the
layouts for 2, 3 and 4 devices are:
2 drives 3 drives 4 drives
-------- ------------ -----------------
A1 A2 A1 A2 A3 A1 A2 A3 A4
A2 A1 A3 A1 A2 A2 A1 A4 A3
A3 A4 A4 A5 A6 A5 A6 A7 A8
A4 A3 A6 A4 A5 A6 A5 A8 A7
A5 A6 A7 A8 A9 A9 A10 A11 A12
A6 A5 A9 A7 A8 A10 A9 A12 A11
.. .. .. .. .. .. .. .. ..
Here we see layouts closely akin to 'RAID1E - Integrated
Offset Stripe Mirroring'.
<#raid_devs>: The number of devices composing the array.
Each device consists of two entries. The first is the device
containing the metadata (if any); the second is the one containing the
......@@ -142,3 +170,5 @@ Version History
1.3.0 Added support for RAID 10
1.3.1 Allow device replacement/rebuild for RAID 10
1.3.2 Fix/improve redundancy checking for RAID10
1.4.0 Non-functional change. Removes arg from mapping function.
1.4.1 Add RAID10 "far" and "offset" algorithm support.
......@@ -154,17 +154,6 @@ config MD_RAID456
If unsure, say Y.
config MULTICORE_RAID456
bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
depends on MD_RAID456
depends on SMP
depends on EXPERIMENTAL
---help---
Enable the raid456 module to dispatch per-stripe raid operations to a
thread pool.
If unsure, say N.
config MD_MULTIPATH
tristate "Multipath I/O support"
depends on BLK_DEV_MD
......
......@@ -91,15 +91,44 @@ static struct raid_type {
{"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
};
/*
 * Translate an MD RAID10 layout word into the dm-raid format name.
 *
 * Bit 16 of the layout is the "offset" flag and bit 17 is the
 * "use_far_sets" flag (see MD's raid10.c).  Only when both are set is
 * the layout reported as "offset".  Otherwise a near-copy count (low
 * byte) greater than one means "near", and anything else is "far".
 *
 * Returns a pointer to a constant string; the caller must not modify it.
 */
static char *raid10_md_layout_to_format(int layout)
{
	const int offset_bits = 0x10000 | 0x20000;
	char *format = "far";

	if ((layout & offset_bits) == offset_bits)
		format = "offset";
	else if ((layout & 0xFF) > 1)
		format = "near";

	return format;
}
/*
 * Extract the number of data copies from an MD RAID10 layout word.
 *
 * The low byte of the layout holds the near-copy count and the second
 * byte holds the far-copy count.  For a "near" layout (near count > 1)
 * the near count is the answer; for "far" and "offset" layouts the near
 * count is 1 and the copy count lives in the second byte.
 *
 * A stale unconditional 'return layout & 0xFF;' used to precede this
 * logic, which made far/offset arrays always report a single copy.
 */
static unsigned raid10_md_layout_to_copies(int layout)
{
	if ((layout & 0xFF) > 1)
		return layout & 0xFF;

	return (layout >> 8) & 0xFF;
}
/*
 * Build an MD RAID10 layout word from a dm-raid format name and a copy
 * count.
 *
 * @format: "near", "far" or "offset"
 * @copies: number of data copies requested
 *
 * For "near" the copies go in the low byte (near copies) with one far
 * copy; for every other format they go in the second byte (far copies)
 * with one near copy.  "far" additionally sets bit 17 (use_far_sets) and
 * "offset" sets bits 16 and 17 (offset + use_far_sets).  Any other
 * string falls through to a layout with no flag bits set.
 *
 * A stale unconditional 'return (1 << 8) | (copies & 0xFF);' used to
 * precede this logic, which silently encoded every format as "near".
 */
static int raid10_format_to_md_layout(char *format, unsigned copies)
{
	unsigned n = 1, f = 1;

	if (!strcmp("near", format))
		n = copies;
	else
		f = copies;

	if (!strcmp("offset", format))
		return 0x30000 | (f << 8) | n;

	if (!strcmp("far", format))
		return 0x20000 | (f << 8) | n;

	return (f << 8) | n;
}
static struct raid_type *get_raid_type(char *name)
......@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
{
unsigned i, rebuild_cnt = 0;
unsigned rebuilds_per_group, copies, d;
unsigned group_size, last_group_start;
for (i = 0; i < rs->md.raid_disks; i++)
if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
......@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
* as long as the failed devices occur in different mirror
* groups (i.e. different stripes).
*
* Right now, we only allow for "near" copies. When other
* formats are added, we will have to check those too.
*
* When checking "near" format, make sure no adjacent devices
* have failed beyond what can be handled. In addition to the
* simple case where the number of devices is a multiple of the
......@@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)
* A A B B C
* C D D E E
*/
for (i = 0; i < rs->md.raid_disks * copies; i++) {
if (!(i % copies))
if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
for (i = 0; i < rs->md.raid_disks * copies; i++) {
if (!(i % copies))
rebuilds_per_group = 0;
d = i % rs->md.raid_disks;
if ((!rs->dev[d].rdev.sb_page ||
!test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
(++rebuilds_per_group >= copies))
goto too_many;
}
break;
}
/*
* When checking "far" and "offset" formats, we need to ensure
* that the device that holds its copy is not also dead or
* being rebuilt. (Note that "far" and "offset" formats only
* support two copies right now. These formats also only ever
* use the 'use_far_sets' variant.)
*
* This check is somewhat complicated by the need to account
* for arrays that are not a multiple of (far) copies. This
* results in the need to treat the last (potentially larger)
* set differently.
*/
group_size = (rs->md.raid_disks / copies);
last_group_start = (rs->md.raid_disks / group_size) - 1;
last_group_start *= group_size;
for (i = 0; i < rs->md.raid_disks; i++) {
if (!(i % copies) && !(i > last_group_start))
rebuilds_per_group = 0;
d = i % rs->md.raid_disks;
if ((!rs->dev[d].rdev.sb_page ||
!test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
if ((!rs->dev[i].rdev.sb_page ||
!test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
(++rebuilds_per_group >= copies))
goto too_many;
goto too_many;
}
break;
default:
......@@ -433,7 +487,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
*
* RAID10-only options:
* [raid10_copies <# copies>] Number of copies. (Default: 2)
* [raid10_format <near>] Layout algorithm. (Default: near)
* [raid10_format <near|far|offset>] Layout algorithm. (Default: near)
*/
static int parse_raid_params(struct raid_set *rs, char **argv,
unsigned num_raid_params)
......@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
return -EINVAL;
}
if (strcmp("near", argv[i])) {
if (strcmp("near", argv[i]) &&
strcmp("far", argv[i]) &&
strcmp("offset", argv[i])) {
rs->ti->error = "Invalid 'raid10_format' value given";
return -EINVAL;
}
......@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
return -EINVAL;
}
/*
* If the format is not "near", we only support
* two copies at the moment.
*/
if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
rs->ti->error = "Too many copies for given RAID10 format.";
return -EINVAL;
}
/* (Len * #mirrors) / #devices */
sectors_per_dev = rs->ti->len * raid10_copies;
sector_div(sectors_per_dev, rs->md.raid_disks);
......@@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
/*
* Reshaping is not currently allowed
*/
if ((le32_to_cpu(sb->level) != mddev->level) ||
(le32_to_cpu(sb->layout) != mddev->layout) ||
(le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
DMERR("Reshaping arrays not yet supported.");
if (le32_to_cpu(sb->level) != mddev->level) {
DMERR("Reshaping arrays not yet supported. (RAID level change)");
return -EINVAL;
}
if (le32_to_cpu(sb->layout) != mddev->layout) {
DMERR("Reshaping arrays not yet supported. (RAID layout change)");
DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
DMERR(" Old layout: %s w/ %d copies",
raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
DMERR(" New layout: %s w/ %d copies",
raid10_md_layout_to_format(mddev->layout),
raid10_md_layout_to_copies(mddev->layout));
return -EINVAL;
}
if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
return -EINVAL;
}
/* We can only change the number of devices in RAID1 right now */
if ((rs->raid_type->level != 1) &&
(le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
DMERR("Reshaping arrays not yet supported.");
DMERR("Reshaping arrays not yet supported. (device count change)");
return -EINVAL;
}
......@@ -1329,7 +1407,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
raid10_md_layout_to_copies(rs->md.layout));
if (rs->print_flags & DMPF_RAID10_FORMAT)
DMEMIT(" raid10_format near");
DMEMIT(" raid10_format %s",
raid10_md_layout_to_format(rs->md.layout));
DMEMIT(" %d", rs->md.raid_disks);
for (i = 0; i < rs->md.raid_disks; i++) {
......@@ -1418,6 +1497,10 @@ static struct target_type raid_target = {
/*
 * Module initialisation: log the three components of raid_target.version,
 * then register raid_target via dm_register_target() and return that
 * call's result unchanged.
 */
static int __init dm_raid_init(void)
{
DMINFO("Loading target version %u.%u.%u",
raid_target.version[0],
raid_target.version[1],
raid_target.version[2]);
return dm_register_target(&raid_target);
}
......
......@@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
bio_io_error(bio);
return;
}
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
return;
}
smp_rmb(); /* Ensure implications of 'active' are visible */
rcu_read_lock();
if (mddev->suspended) {
......@@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (!sectors)
sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
rdev->data_offset;
if (!my_mddev->pers->resize)
/* Cannot change size for RAID0 or Linear etc */
return -EINVAL;
}
if (sectors < my_mddev->dev_sectors)
return -EINVAL; /* component must fit device */
......@@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
mddev->ro = 0;
sysfs_notify_dirent_safe(mddev->sysfs_state);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
/* mddev_unlock will wake thread */
/* If a device failed while we were read-only, we
* need to make sure the metadata is updated now.
*/
if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
mddev_unlock(mddev);
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
mddev_lock(mddev);
}
} else {
err = -EROFS;
goto abort_unlock;
......
......@@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
rdev1->new_raid_disk = j;
}
if (j < 0 || j >= mddev->raid_disks) {
if (j < 0) {
printk(KERN_ERR
"md/raid0:%s: remove inactive devices before converting to RAID0\n",
mdname(mddev));
goto abort;
}
if (j >= mddev->raid_disks) {
printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
"aborting!\n", mdname(mddev), j);
goto abort;
......@@ -289,7 +295,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
kfree(conf->strip_zone);
kfree(conf->devlist);
kfree(conf);
*private_conf = NULL;
*private_conf = ERR_PTR(err);
return err;
}
......@@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
"%s does not support generic reshape\n", __func__);
rdev_for_each(rdev, mddev)
array_sectors += rdev->sectors;
array_sectors += (rdev->sectors &
~(sector_t)(mddev->chunk_sectors-1));
return array_sectors;
}
......
......@@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread);
kfree(plug);
return;
......@@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
const unsigned long do_discard = (bio->bi_rw
& (REQ_DISCARD | REQ_SECURE));
const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid1_plug_cb *plug = NULL;
......@@ -1301,7 +1303,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
conf->mirrors[i].rdev->data_offset);
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
mbio->bi_rw =
WRITE | do_flush_fua | do_sync | do_discard | do_same;
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);
......@@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
if (mddev->queue)
blk_queue_max_write_same_sectors(mddev->queue,
mddev->chunk_sectors);
rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
continue;
......
......@@ -38,21 +38,36 @@
* near_copies (stored in low byte of layout)
* far_copies (stored in second byte of layout)
* far_offset (stored in bit 16 of layout )
* use_far_sets (stored in bit 17 of layout )
*
* The data to be stored is divided into chunks using chunksize.
* Each device is divided into far_copies sections.
* In each section, chunks are laid out in a style similar to raid0, but
* near_copies copies of each chunk is stored (each on a different drive).
* The starting device for each section is offset near_copies from the starting
* device of the previous section.
* Thus they are (near_copies*far_copies) of each chunk, and each is on a different
* drive.
* near_copies and far_copies must be at least one, and their product is at most
* raid_disks.
* The data to be stored is divided into chunks using chunksize. Each device
* is divided into far_copies sections. In each section, chunks are laid out
* in a style similar to raid0, but near_copies copies of each chunk are stored
* (each on a different drive). The starting device for each section is offset
* near_copies from the starting device of the previous section. Thus there
* are (near_copies * far_copies) of each chunk, and each is on a different
* drive. near_copies and far_copies must be at least one, and their product
* is at most raid_disks.
*
* If far_offset is true, then the far_copies are handled a bit differently.
* The copies are still in different stripes, but instead of be very far apart
* on disk, there are adjacent stripes.
* The copies are still in different stripes, but instead of being very far
* apart on disk, they are in adjacent stripes.
*
* The far and offset algorithms are handled slightly differently if
* 'use_far_sets' is true. In this case, the array's devices are grouped into
* sets that are (near_copies * far_copies) in size. The far copied stripes
* are still shifted by 'near_copies' devices, but this shifting stays confined
* to the set rather than the entire array. This is done to improve the number
* of device combinations that can fail without causing the array to fail.
* Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
* on a device):
* A B C D A B C D E
* ... ...
* D A B C E A B C D
* Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
* [A B] [C D] [A B] [C D E]
* |...| |...| |...| | ... |
* [B A] [D C] [B A] [E C D]
*/
/*
......@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
sector_t stripe;
int dev;
int slot = 0;
int last_far_set_start, last_far_set_size;
last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
last_far_set_start *= geo->far_set_size;
last_far_set_size = geo->far_set_size;
last_far_set_size += (geo->raid_disks % geo->far_set_size);
/* now calculate first sector/dev */
chunk = r10bio->sector >> geo->chunk_shift;
......@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
/* and calculate all the others */
for (n = 0; n < geo->near_copies; n++) {
int d = dev;
int set;
sector_t s = sector;
r10bio->devs[slot].addr = sector;
r10bio->devs[slot].devnum = d;
r10bio->devs[slot].addr = s;
slot++;
for (f = 1; f < geo->far_copies; f++) {
set = d / geo->far_set_size;
d += geo->near_copies;
if (d >= geo->raid_disks)
d -= geo->raid_disks;
if ((geo->raid_disks % geo->far_set_size) &&
(d > last_far_set_start)) {
d -= last_far_set_start;
d %= last_far_set_size;
d += last_far_set_start;
} else {
d %= geo->far_set_size;
d += geo->far_set_size * set;
}
s += geo->stride;
r10bio->devs[slot].devnum = d;
r10bio->devs[slot].addr = s;
......@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
* or recovery, so reshape isn't happening
*/
struct geom *geo = &conf->geo;
int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
int far_set_size = geo->far_set_size;
int last_far_set_start;
if (geo->raid_disks % geo->far_set_size) {
last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
last_far_set_start *= geo->far_set_size;
if (dev >= last_far_set_start) {
far_set_size = geo->far_set_size;
far_set_size += (geo->raid_disks % geo->far_set_size);
far_set_start = last_far_set_start;
}
}
offset = sector & geo->chunk_mask;
if (geo->far_offset) {
......@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
chunk = sector >> geo->chunk_shift;
fc = sector_div(chunk, geo->far_copies);
dev -= fc * geo->near_copies;
if (dev < 0)
dev += geo->raid_disks;
if (dev < far_set_start)
dev += far_set_size;
} else {
while (sector >= geo->stride) {
sector -= geo->stride;
if (dev < geo->near_copies)
dev += geo->raid_disks - geo->near_copies;
if (dev < (geo->near_copies + far_set_start))
dev += far_set_size - geo->near_copies;
else
dev -= geo->near_copies;
}
......@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_barrier);
md_wakeup_thread(mddev->thread);
kfree(plug);
return;
......@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
const unsigned long do_discard = (bio->bi_rw
& (REQ_DISCARD | REQ_SECURE));
const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
unsigned long flags;
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
......@@ -1460,7 +1508,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
mbio->bi_rw =
WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
......@@ -1502,7 +1551,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
mbio->bi_rw =
WRITE | do_sync | do_fua | do_discard | do_same;
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
......@@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
disks = mddev->raid_disks + mddev->delta_disks;
break;
}
if (layout >> 17)
if (layout >> 18)
return -1;
if (chunk < (PAGE_SIZE >> 9) ||
!is_power_of_2(chunk))
......@@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
geo->near_copies = nc;
geo->far_copies = fc;
geo->far_offset = fo;
geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
geo->chunk_mask = chunk - 1;
geo->chunk_shift = ffz(~chunk);
return nc*fc;
......@@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue,
mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, chunk_size);
if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
......
......@@ -33,6 +33,11 @@ struct r10conf {
* far_offset, in which case it is
* 1 stripe.
*/
int far_set_size; /* The number of devices in a set,
* where a 'set' are devices that
* contain far/offset copies of
* each other.
*/
int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask;
} prev, geo;
......
......@@ -1403,7 +1403,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
int overlap_clear = 0, i, disks = sh->disks;
struct dma_async_tx_descriptor *tx = NULL;
......@@ -1468,36 +1468,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
#ifdef CONFIG_MULTICORE_RAID456
static void async_run_ops(void *param, async_cookie_t cookie)
{
struct stripe_head *sh = param;
unsigned long ops_request = sh->ops.request;
clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
wake_up(&sh->ops.wait_for_ops);
__raid_run_ops(sh, ops_request);
release_stripe(sh);
}
static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
/* since handle_stripe can be called outside of raid5d context
* we need to ensure sh->ops.request is de-staged before another
* request arrives
*/
wait_event(sh->ops.wait_for_ops,
!test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
sh->ops.request = ops_request;
atomic_inc(&sh->count);
async_schedule(async_run_ops, sh);
}
#else
#define raid_run_ops __raid_run_ops
#endif
static int grow_one_stripe(struct r5conf *conf)
{
struct stripe_head *sh;
......@@ -1506,9 +1476,6 @@ static int grow_one_stripe(struct r5conf *conf)
return 0;
sh->raid_conf = conf;
#ifdef CONFIG_MULTICORE_RAID456
init_waitqueue_head(&sh->ops.wait_for_ops);
#endif
spin_lock_init(&sh->stripe_lock);
......@@ -1627,9 +1594,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
break;
nsh->raid_conf = conf;
#ifdef CONFIG_MULTICORE_RAID456
init_waitqueue_head(&nsh->ops.wait_for_ops);
#endif
spin_lock_init(&nsh->stripe_lock);
list_add(&nsh->lru, &newstripes);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment