Commit 775a2e29 authored by Linus Torvalds

Merge tag 'dm-4.10-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - various fixes and improvements to request-based DM and DM multipath

 - some locking improvements in DM bufio

 - add a Kconfig option to disable the DM block manager's extra locking,
   which mainly serves as a developer tool

 - a few bug fixes to DM's persistent-data

 - a couple of changes to prepare for multipage biovec support in the
   block layer

 - various improvements and cleanups in the DM core, DM cache, DM raid
   and DM crypt

 - add the ability to have DM crypt use keys from the kernel key
   retention service

 - add a new "error_writes" feature to the DM flakey target; reads are
   left unchanged in this mode (see the sketch just below)
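
 A minimal usage sketch of the new flakey feature; the device path, mapped
 device name and interval values are illustrative, not taken from this pull
 request:

   #!/bin/sh
   # During each 5-second "down" interval writes are failed with -EIO,
   # while reads keep passing through to the underlying device unchanged.
   dmsetup create flakey0 --table \
     "0 `blockdev --getsize /dev/sdb` flakey /dev/sdb 0 5 5 1 error_writes"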

* tag 'dm-4.10-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (40 commits)
  dm flakey: introduce "error_writes" feature
  dm cache policy smq: use hash_32() instead of hash_32_generic()
  dm crypt: reject key strings containing whitespace chars
  dm space map: always set ev if sm_ll_mutate() succeeds
  dm space map metadata: skip useless memcpy in metadata_ll_init_index()
  dm space map metadata: fix 'struct sm_metadata' leak on failed create
  Documentation: dm raid: define data_offset status field
  dm raid: fix discard support regression
  dm raid: don't allow "write behind" with raid4/5/6
  dm mpath: use hw_handler_params if attached hw_handler is same as requested
  dm crypt: add ability to use keys from the kernel key retention service
  dm array: remove a dead assignment in populate_ablock_with_values()
  dm ioctl: use offsetof() instead of open-coding it
  dm rq: simplify use_blk_mq initialization
  dm: use blk_set_queue_dying() in __dm_destroy()
  dm bufio: drop the lock when doing GFP_NOIO allocation
  dm bufio: don't take the lock in dm_bufio_shrink_count
  dm bufio: avoid sleeping while holding the dm_bufio lock
  dm table: simplify dm_table_determine_type()
  dm table: an 'all_blk_mq' table must be loaded for a blk-mq DM device
  ...
parents 2a4c32ed ef548c55
......@@ -21,13 +21,30 @@ Parameters: <cipher> <key> <iv_offset> <device path> \
/proc/crypto contains supported crypto modes
<key>
Key used for encryption. It is encoded as a hexadecimal number.
Key used for encryption. It is encoded either as a hexadecimal number
or it can be passed as <key_string> prefixed with a single colon
character (':') for keys residing in the kernel keyring service.
You can only use key sizes that are valid for the selected cipher
in combination with the selected iv mode.
Note that for some iv modes the key string can contain additional
keys (for example IV seed) so the key contains more parts concatenated
into a single string.
<key_string>
The kernel keyring key is identified by a string in the following format:
<key_size>:<key_type>:<key_description>.
<key_size>
The encryption key size in bytes. The kernel key payload size must match
the value passed in <key_size>.
<key_type>
Either 'logon' or 'user' kernel key type.
<key_description>
The kernel keyring key description the crypt target should look for
when loading a key of <key_type>.
<keycount>
Multi-key compatibility mode. You can define <keycount> keys and
then sectors are encrypted according to their offsets (sector 0 uses key0;
......@@ -88,6 +105,12 @@ https://gitlab.com/cryptsetup/cryptsetup
dmsetup create crypt1 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0"
]]
[[
#!/bin/sh
# Create a crypt device using dmsetup when encryption key is stored in keyring service
dmsetup create crypt2 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 :32:logon:my_prefix:my_key 0 $1 0"
]]
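A complementary sketch (not part of the upstream document): it assumes the
keyutils tooling is installed, picks the user keyring as an arbitrary choice,
and uses a hypothetical key file path. The key only has to be resident in a
keyring that the process issuing the dmsetup ioctl can search.
[[
#!/bin/sh
# Load a 32-byte raw key as a 'logon' key whose description matches the
# <key_description> used above (my_prefix:my_key); the payload is read from
# stdin and the user keyring (@u) is an assumption.
keyctl padd logon my_prefix:my_key @u < /path/to/32-byte-keyfile
]]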
[[
#!/bin/sh
# Create a crypt device using cryptsetup and LUKS header with default cipher
......
......@@ -242,6 +242,10 @@ recovery. Here is a fuller description of the individual fields:
in RAID1/10 or wrong parity values found in RAID4/5/6.
This value is valid only after a "check" of the array
is performed. A healthy array has a 'mismatch_cnt' of 0.
<data_offset> The current data offset to the start of the user data on
each component device of a raid set (see the respective
raid parameter to support out-of-place reshaping).
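
A hedged reading sketch (the mapped device name below is hypothetical):
the new field is reported in the target's status line after the
mismatch_cnt field described above, e.g. via

    dmsetup status my_raid
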
Message Interface
-----------------
......
......@@ -240,9 +240,17 @@ config DM_BUFIO
as a cache, holding recently-read blocks in memory and performing
delayed writes.
config DM_DEBUG_BLOCK_MANAGER_LOCKING
bool "Block manager locking"
depends on DM_BUFIO
---help---
Block manager locking can catch various metadata corruption issues.
If unsure, say N.
config DM_DEBUG_BLOCK_STACK_TRACING
bool "Keep stack trace of persistent data block lock holders"
depends on STACKTRACE_SUPPORT && DM_BUFIO
depends on STACKTRACE_SUPPORT && DM_DEBUG_BLOCK_MANAGER_LOCKING
select STACKTRACE
---help---
Enable this for messages that may help debug problems with the
......
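
A hedged sketch of enabling the new option, together with the stack-tracing
option that now depends on it, in a kernel build tree; it assumes the
DM_BUFIO and STACKTRACE_SUPPORT prerequisites are already satisfied:

  #!/bin/sh
  # Turn on the block manager locking checks and the dependent stack tracing,
  # then let Kconfig resolve the remaining options.
  ./scripts/config --enable DM_DEBUG_BLOCK_MANAGER_LOCKING \
                   --enable DM_DEBUG_BLOCK_STACK_TRACING
  make olddefconfig
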
......@@ -820,12 +820,14 @@ enum new_flag {
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
struct dm_buffer *b;
bool tried_noio_alloc = false;
/*
* dm-bufio is resistant to allocation failures (it just keeps
* one buffer reserved in cases all the allocations fail).
* So set flags to not try too hard:
* GFP_NOIO: don't recurse into the I/O layer
* GFP_NOWAIT: don't wait; if we need to sleep we'll release our
* mutex and wait ourselves.
* __GFP_NORETRY: don't retry and rather return failure
* __GFP_NOMEMALLOC: don't use emergency reserves
* __GFP_NOWARN: don't print a warning in case of failure
......@@ -835,7 +837,7 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
*/
while (1) {
if (dm_bufio_cache_size_latch != 1) {
b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
if (b)
return b;
}
......@@ -843,6 +845,15 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
if (nf == NF_PREFETCH)
return NULL;
if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
dm_bufio_unlock(c);
b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
dm_bufio_lock(c);
if (b)
return b;
tried_noio_alloc = true;
}
if (!list_empty(&c->reserved_buffers)) {
b = list_entry(c->reserved_buffers.next,
struct dm_buffer, lru_list);
......@@ -1585,18 +1596,9 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long
dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
struct dm_bufio_client *c;
unsigned long count;
struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
c = container_of(shrink, struct dm_bufio_client, shrinker);
if (sc->gfp_mask & __GFP_FS)
dm_bufio_lock(c);
else if (!dm_bufio_trylock(c))
return 0;
count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
dm_bufio_unlock(c);
return count;
return ACCESS_ONCE(c->n_buffers[LIST_CLEAN]) + ACCESS_ONCE(c->n_buffers[LIST_DIRTY]);
}
/*
......
......@@ -383,7 +383,6 @@ static int __format_metadata(struct dm_cache_metadata *cmd)
goto bad;
dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
if (r < 0)
goto bad;
......@@ -789,7 +788,7 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size)
{
if (cmd->data_block_size != data_block_size) {
DMERR("data_block_size (%llu) different from that in metadata (%llu)\n",
DMERR("data_block_size (%llu) different from that in metadata (%llu)",
(unsigned long long) data_block_size,
(unsigned long long) cmd->data_block_size);
return false;
......
......@@ -1361,7 +1361,7 @@ static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
static unsigned random_level(dm_cblock_t cblock)
{
return hash_32_generic(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
}
static int smq_load_mapping(struct dm_cache_policy *p,
......
......@@ -989,7 +989,8 @@ static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mod
enum cache_metadata_mode old_mode = get_cache_mode(cache);
if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
DMERR("unable to read needs_check flag, setting failure mode");
DMERR("%s: unable to read needs_check flag, setting failure mode.",
cache_device_name(cache));
new_mode = CM_FAIL;
}
......
......@@ -36,7 +36,8 @@ struct flakey_c {
};
enum feature_flag_bits {
DROP_WRITES
DROP_WRITES,
ERROR_WRITES
};
struct per_bio_data {
......@@ -76,6 +77,25 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
if (test_and_set_bit(DROP_WRITES, &fc->flags)) {
ti->error = "Feature drop_writes duplicated";
return -EINVAL;
} else if (test_bit(ERROR_WRITES, &fc->flags)) {
ti->error = "Feature drop_writes conflicts with feature error_writes";
return -EINVAL;
}
continue;
}
/*
* error_writes
*/
if (!strcasecmp(arg_name, "error_writes")) {
if (test_and_set_bit(ERROR_WRITES, &fc->flags)) {
ti->error = "Feature error_writes duplicated";
return -EINVAL;
} else if (test_bit(DROP_WRITES, &fc->flags)) {
ti->error = "Feature error_writes conflicts with feature drop_writes";
return -EINVAL;
}
continue;
......@@ -135,6 +155,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
return -EINVAL;
} else if (test_bit(ERROR_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
ti->error = "error_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
return -EINVAL;
}
return 0;
......@@ -200,11 +224,13 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (!(fc->up_interval + fc->down_interval)) {
ti->error = "Total (up + down) interval is zero";
r = -EINVAL;
goto bad;
}
if (fc->up_interval + fc->down_interval < fc->up_interval) {
ti->error = "Interval overflow";
r = -EINVAL;
goto bad;
}
......@@ -289,22 +315,27 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
pb->bio_submitted = true;
/*
* Error reads if neither corrupt_bio_byte or drop_writes are set.
* Error reads if neither corrupt_bio_byte or drop_writes or error_writes are set.
* Otherwise, flakey_end_io() will decide if the reads should be modified.
*/
if (bio_data_dir(bio) == READ) {
if (!fc->corrupt_bio_byte && !test_bit(DROP_WRITES, &fc->flags))
if (!fc->corrupt_bio_byte && !test_bit(DROP_WRITES, &fc->flags) &&
!test_bit(ERROR_WRITES, &fc->flags))
return -EIO;
goto map_bio;
}
/*
* Drop writes?
* Drop or error writes?
*/
if (test_bit(DROP_WRITES, &fc->flags)) {
bio_endio(bio);
return DM_MAPIO_SUBMITTED;
}
else if (test_bit(ERROR_WRITES, &fc->flags)) {
bio_io_error(bio);
return DM_MAPIO_SUBMITTED;
}
/*
* Corrupt matching writes.
......@@ -340,10 +371,11 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error)
*/
corrupt_bio_data(bio, fc);
} else if (!test_bit(DROP_WRITES, &fc->flags)) {
} else if (!test_bit(DROP_WRITES, &fc->flags) &&
!test_bit(ERROR_WRITES, &fc->flags)) {
/*
* Error read during the down_interval if drop_writes
* wasn't configured.
* and error_writes were not configured.
*/
return -EIO;
}
......@@ -357,7 +389,7 @@ static void flakey_status(struct dm_target *ti, status_type_t type,
{
unsigned sz = 0;
struct flakey_c *fc = ti->private;
unsigned drop_writes;
unsigned drop_writes, error_writes;
switch (type) {
case STATUSTYPE_INFO:
......@@ -370,10 +402,13 @@ static void flakey_status(struct dm_target *ti, status_type_t type,
fc->down_interval);
drop_writes = test_bit(DROP_WRITES, &fc->flags);
DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5);
error_writes = test_bit(ERROR_WRITES, &fc->flags);
DMEMIT("%u ", drop_writes + error_writes + (fc->corrupt_bio_byte > 0) * 5);
if (drop_writes)
DMEMIT("drop_writes ");
else if (error_writes)
DMEMIT("error_writes ");
if (fc->corrupt_bio_byte)
DMEMIT("corrupt_bio_byte %u %c %u %u ",
......@@ -410,7 +445,7 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
static struct target_type flakey_target = {
.name = "flakey",
.version = {1, 3, 1},
.version = {1, 4, 0},
.module = THIS_MODULE,
.ctr = flakey_ctr,
.dtr = flakey_dtr,
......
......@@ -162,7 +162,10 @@ struct dpages {
struct page **p, unsigned long *len, unsigned *offset);
void (*next_page)(struct dpages *dp);
union {
unsigned context_u;
struct bvec_iter context_bi;
};
void *context_ptr;
void *vma_invalidate_address;
......@@ -204,25 +207,36 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse
static void bio_get_page(struct dpages *dp, struct page **p,
unsigned long *len, unsigned *offset)
{
struct bio_vec *bvec = dp->context_ptr;
*p = bvec->bv_page;
*len = bvec->bv_len - dp->context_u;
*offset = bvec->bv_offset + dp->context_u;
struct bio_vec bvec = bvec_iter_bvec((struct bio_vec *)dp->context_ptr,
dp->context_bi);
*p = bvec.bv_page;
*len = bvec.bv_len;
*offset = bvec.bv_offset;
/* avoid figuring it out again in bio_next_page() */
dp->context_bi.bi_sector = (sector_t)bvec.bv_len;
}
static void bio_next_page(struct dpages *dp)
{
struct bio_vec *bvec = dp->context_ptr;
dp->context_ptr = bvec + 1;
dp->context_u = 0;
unsigned int len = (unsigned int)dp->context_bi.bi_sector;
bvec_iter_advance((struct bio_vec *)dp->context_ptr,
&dp->context_bi, len);
}
static void bio_dp_init(struct dpages *dp, struct bio *bio)
{
dp->get_page = bio_get_page;
dp->next_page = bio_next_page;
dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
dp->context_u = bio->bi_iter.bi_bvec_done;
/*
* We just use bvec iterator to retrieve pages, so it is ok to
* access the bvec table directly here
*/
dp->context_ptr = bio->bi_io_vec;
dp->context_bi = bio->bi_iter;
}
/*
......
......@@ -1697,7 +1697,7 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
{
struct dm_ioctl *dmi;
int secure_data;
const size_t minimum_data_size = sizeof(*param_kernel) - sizeof(param_kernel->data);
const size_t minimum_data_size = offsetof(struct dm_ioctl, data);
if (copy_from_user(param_kernel, user, minimum_data_size))
return -EFAULT;
......
......@@ -372,16 +372,13 @@ static int __pg_init_all_paths(struct multipath *m)
return atomic_read(&m->pg_init_in_progress);
}
static int pg_init_all_paths(struct multipath *m)
static void pg_init_all_paths(struct multipath *m)
{
int r;
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
r = __pg_init_all_paths(m);
__pg_init_all_paths(m);
spin_unlock_irqrestore(&m->lock, flags);
return r;
}
static void __switch_pg(struct multipath *m, struct priority_group *pg)
......@@ -583,16 +580,17 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
* .request_fn stacked on blk-mq path(s) and
* blk-mq stacked on blk-mq path(s).
*/
*__clone = blk_mq_alloc_request(bdev_get_queue(bdev),
clone = blk_mq_alloc_request(bdev_get_queue(bdev),
rq_data_dir(rq), BLK_MQ_REQ_NOWAIT);
if (IS_ERR(*__clone)) {
/* ENOMEM, requeue */
if (IS_ERR(clone)) {
/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
clear_request_fn_mpio(m, map_context);
return r;
}
(*__clone)->bio = (*__clone)->biotail = NULL;
(*__clone)->rq_disk = bdev->bd_disk;
(*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
clone->bio = clone->biotail = NULL;
clone->rq_disk = bdev->bd_disk;
clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
*__clone = clone;
}
if (pgpath->pg->ps.type->start_io)
......@@ -851,19 +849,23 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
retain:
attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
if (attached_handler_name) {
/*
* Clear any hw_handler_params associated with a
* handler that isn't already attached.
*/
if (m->hw_handler_name && strcmp(attached_handler_name, m->hw_handler_name)) {
kfree(m->hw_handler_params);
m->hw_handler_params = NULL;
}
/*
* Reset hw_handler_name to match the attached handler
* and clear any hw_handler_params associated with the
* ignored handler.
*
* NB. This modifies the table line to show the actual
* handler instead of the original table passed in.
*/
kfree(m->hw_handler_name);
m->hw_handler_name = attached_handler_name;
kfree(m->hw_handler_params);
m->hw_handler_params = NULL;
}
}
......@@ -1002,6 +1004,8 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
}
m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
if (!m->hw_handler_name)
return -EINVAL;
if (hw_argc > 1) {
char *p;
......@@ -1362,7 +1366,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
char dummy;
if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
(pgnum > m->nr_priority_groups)) {
!m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
DMWARN("invalid PG number supplied to switch_pg_num");
return -EINVAL;
}
......@@ -1394,7 +1398,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
char dummy;
if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
(pgnum > m->nr_priority_groups)) {
!m->nr_priority_groups || (pgnum > m->nr_priority_groups)) {
DMWARN("invalid PG number supplied to bypass_pg");
return -EINVAL;
}
......
......@@ -160,7 +160,6 @@ struct raid_dev {
CTR_FLAG_DAEMON_SLEEP | \
CTR_FLAG_MIN_RECOVERY_RATE | \
CTR_FLAG_MAX_RECOVERY_RATE | \
CTR_FLAG_MAX_WRITE_BEHIND | \
CTR_FLAG_STRIPE_CACHE | \
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
......@@ -171,7 +170,6 @@ struct raid_dev {
CTR_FLAG_DAEMON_SLEEP | \
CTR_FLAG_MIN_RECOVERY_RATE | \
CTR_FLAG_MAX_RECOVERY_RATE | \
CTR_FLAG_MAX_WRITE_BEHIND | \
CTR_FLAG_STRIPE_CACHE | \
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
......@@ -2050,16 +2048,17 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
mddev->reshape_position = MaxSector;
mddev->raid_disks = le32_to_cpu(sb->num_devices);
mddev->level = le32_to_cpu(sb->level);
mddev->layout = le32_to_cpu(sb->layout);
mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors);
/*
* Reshaping is supported, e.g. reshape_position is valid
* in superblock and superblock content is authoritative.
*/
if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) {
/* Superblock is authoritative wrt given raid set layout! */
mddev->raid_disks = le32_to_cpu(sb->num_devices);
mddev->level = le32_to_cpu(sb->level);
mddev->layout = le32_to_cpu(sb->layout);
mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors);
mddev->new_level = le32_to_cpu(sb->new_level);
mddev->new_layout = le32_to_cpu(sb->new_layout);
mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors);
......@@ -2087,38 +2086,44 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
/*
* No takeover/reshaping, because we don't have the extended v1.9.0 metadata
*/
if (le32_to_cpu(sb->level) != mddev->new_level) {
DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)");
struct raid_type *rt_cur = get_raid_type_by_ll(mddev->level, mddev->layout);
struct raid_type *rt_new = get_raid_type_by_ll(mddev->new_level, mddev->new_layout);
if (rs_takeover_requested(rs)) {
if (rt_cur && rt_new)
DMERR("Takeover raid sets from %s to %s not yet supported by metadata. (raid level change)",
rt_cur->name, rt_new->name);
else
DMERR("Takeover raid sets not yet supported by metadata. (raid level change)");
return -EINVAL;
} else if (rs_reshape_requested(rs)) {
DMERR("Reshaping raid sets not yet supported by metadata. (raid layout change keeping level)");
if (mddev->layout != mddev->new_layout) {
if (rt_cur && rt_new)
DMERR(" current layout %s vs new layout %s",
rt_cur->name, rt_new->name);
else
DMERR(" current layout 0x%X vs new layout 0x%X",
le32_to_cpu(sb->layout), mddev->new_layout);
}
if (le32_to_cpu(sb->layout) != mddev->new_layout) {
DMERR("Reshaping raid sets not yet supported. (raid layout change)");
DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
DMERR(" Old layout: %s w/ %d copies",
raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
DMERR(" New layout: %s w/ %d copies",
if (mddev->chunk_sectors != mddev->new_chunk_sectors)
DMERR(" current stripe sectors %u vs new stripe sectors %u",
mddev->chunk_sectors, mddev->new_chunk_sectors);
if (rs->delta_disks)
DMERR(" current %u disks vs new %u disks",
mddev->raid_disks, mddev->raid_disks + rs->delta_disks);
if (rs_is_raid10(rs)) {
DMERR(" Old layout: %s w/ %u copies",
raid10_md_layout_to_format(mddev->layout),
raid10_md_layout_to_copies(mddev->layout));
return -EINVAL;
DMERR(" New layout: %s w/ %u copies",
raid10_md_layout_to_format(mddev->new_layout),
raid10_md_layout_to_copies(mddev->new_layout));
}
if (le32_to_cpu(sb->stripe_sectors) != mddev->new_chunk_sectors) {
DMERR("Reshaping raid sets not yet supported. (stripe sectors change)");
return -EINVAL;
}
/* We can only change the number of devices in raid1 with old (i.e. pre 1.0.7) metadata */
if (!rt_is_raid1(rs->raid_type) &&
(le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
DMERR("Reshaping raid sets not yet supported. (device count change from %u to %u)",
sb->num_devices, mddev->raid_disks);
return -EINVAL;
}
DMINFO("Discovered old metadata format; upgrading to extended metadata format");
/* Table line is checked vs. authoritative superblock */
rs_set_new(rs);
}
if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
......@@ -2211,7 +2216,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
continue;
if (role != r->raid_disk) {
if (__is_raid10_near(mddev->layout)) {
if (rs_is_raid10(rs) && __is_raid10_near(mddev->layout)) {
if (mddev->raid_disks % __raid10_near_copies(mddev->layout) ||
rs->raid_disks % rs->raid10_copies) {
rs->ti->error =
......@@ -2994,6 +2999,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
}
/* Disable/enable discard support on raid set. */
configure_discard_support(rs);
mddev_unlock(&rs->md);
return 0;
......@@ -3580,12 +3588,6 @@ static int raid_preresume(struct dm_target *ti)
if (test_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags))
rs_update_sbs(rs);
/*
* Disable/enable discard support on raid set after any
* conversion, because devices can have been added
*/
configure_discard_support(rs);
/* Load the bitmap from disk unless raid0 */
r = __load_dirty_region_bitmap(rs);
if (r)
......
......@@ -23,11 +23,7 @@ static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
#define RESERVED_REQUEST_BASED_IOS 256
static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
#ifdef CONFIG_DM_MQ_DEFAULT
static bool use_blk_mq = true;
#else
static bool use_blk_mq = false;
#endif
static bool use_blk_mq = IS_ENABLED(CONFIG_DM_MQ_DEFAULT);
bool dm_use_blk_mq_default(void)
{
......@@ -210,6 +206,9 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
*/
static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
struct request_queue *q = md->queue;
unsigned long flags;
atomic_dec(&md->pending[rw]);
/* nudge anyone waiting on suspend queue */
......@@ -222,8 +221,11 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
* back into ->request_fn() could deadlock attempting to grab the
* queue lock again.
*/
if (!md->queue->mq_ops && run_queue)
blk_run_queue_async(md->queue);
if (!q->mq_ops && run_queue) {
spin_lock_irqsave(q->queue_lock, flags);
blk_run_queue_async(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
/*
* dm_put() must be at the end of this function. See the comment above
......@@ -798,7 +800,7 @@ static void dm_old_request_fn(struct request_queue *q)
pos = blk_rq_pos(rq);
if ((dm_old_request_peeked_before_merge_deadline(md) &&
md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
md_in_flight(md) && rq->bio && !bio_multiple_segments(rq->bio) &&
md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
(ti->type->busy && ti->type->busy(ti))) {
blk_delay_queue(q, 10);
......
......@@ -871,7 +871,7 @@ static int dm_table_determine_type(struct dm_table *t)
{
unsigned i;
unsigned bio_based = 0, request_based = 0, hybrid = 0;
bool verify_blk_mq = false;
unsigned sq_count = 0, mq_count = 0;
struct dm_target *tgt;
struct dm_dev_internal *dd;
struct list_head *devices = dm_table_get_devices(t);
......@@ -924,12 +924,6 @@ static int dm_table_determine_type(struct dm_table *t)
BUG_ON(!request_based); /* No targets in this table */
if (list_empty(devices) && __table_type_request_based(live_md_type)) {
/* inherit live MD type */
t->type = live_md_type;
return 0;
}
/*
* The only way to establish DM_TYPE_MQ_REQUEST_BASED is by
* having a compatible target use dm_table_set_type.
......@@ -948,6 +942,19 @@ static int dm_table_determine_type(struct dm_table *t)
return -EINVAL;
}
if (list_empty(devices)) {
int srcu_idx;
struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx);
/* inherit live table's type and all_blk_mq */
if (live_table) {
t->type = live_table->type;
t->all_blk_mq = live_table->all_blk_mq;
}
dm_put_live_table(t->md, srcu_idx);
return 0;
}
/* Non-request-stackable devices can't be used for request-based dm */
list_for_each_entry(dd, devices, list) {
struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
......@@ -959,19 +966,19 @@ static int dm_table_determine_type(struct dm_table *t)
}
if (q->mq_ops)
verify_blk_mq = true;
mq_count++;
else
sq_count++;
}
if (verify_blk_mq) {
/* verify _all_ devices in the table are blk-mq devices */
list_for_each_entry(dd, devices, list)
if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) {
DMERR("table load rejected: not all devices"
" are blk-mq request-stackable");
if (sq_count && mq_count) {
DMERR("table load rejected: not all devices are blk-mq request-stackable");
return -EINVAL;
}
t->all_blk_mq = mq_count > 0;
t->all_blk_mq = true;
if (t->type == DM_TYPE_MQ_REQUEST_BASED && !t->all_blk_mq) {
DMERR("table load rejected: all devices are not blk-mq request-stackable");
return -EINVAL;
}
return 0;
......
......@@ -868,7 +868,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
if (r) {
ti->error = "Data device lookup failed";
ti->error = "Hash device lookup failed";
goto bad;
}
......
......@@ -1886,9 +1886,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
set_bit(DMF_FREEING, &md->flags);
spin_unlock(&_minor_lock);
spin_lock_irq(q->queue_lock);
queue_flag_set(QUEUE_FLAG_DYING, q);
spin_unlock_irq(q->queue_lock);
blk_set_queue_dying(q);
if (dm_request_based(md) && md->kworker_task)
kthread_flush_worker(&md->kworker);
......
......@@ -700,13 +700,11 @@ static int populate_ablock_with_values(struct dm_array_info *info, struct array_
{
int r;
unsigned i;
uint32_t nr_entries;
struct dm_btree_value_type *vt = &info->value_type;
BUG_ON(le32_to_cpu(ab->nr_entries));
BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
nr_entries = le32_to_cpu(ab->nr_entries);
for (i = 0; i < new_nr; i++) {
r = fn(base + i, element_at(info, ab, i), context);
if (r)
......
......@@ -18,6 +18,8 @@
/*----------------------------------------------------------------*/
#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING
/*
* This is a read/write semaphore with a couple of differences.
*
......@@ -302,6 +304,18 @@ static void report_recursive_bug(dm_block_t b, int r)
(unsigned long long) b);
}
#else /* !CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */
#define bl_init(x) do { } while (0)
#define bl_down_read(x) 0
#define bl_down_read_nonblock(x) 0
#define bl_up_read(x) do { } while (0)
#define bl_down_write(x) 0
#define bl_up_write(x) do { } while (0)
#define report_recursive_bug(x, y) do { } while (0)
#endif /* CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */
/*----------------------------------------------------------------*/
/*
......@@ -330,8 +344,11 @@ EXPORT_SYMBOL_GPL(dm_block_data);
struct buffer_aux {
struct dm_block_validator *validator;
struct block_lock lock;
int write_locked;
#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING
struct block_lock lock;
#endif
};
static void dm_block_manager_alloc_callback(struct dm_buffer *buf)
......
......@@ -464,7 +464,8 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
ll->nr_allocated--;
le32_add_cpu(&ie_disk.nr_free, 1);
ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit));
}
} else
*ev = SM_NONE;
return ll->save_ie(ll, index, &ie_disk);
}
......@@ -547,7 +548,6 @@ static int metadata_ll_init_index(struct ll_disk *ll)
if (r < 0)
return r;
memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
ll->bitmap_root = dm_block_location(b);
dm_tm_unlock(ll->tm, b);
......
......@@ -775,17 +775,15 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
r = sm_ll_new_metadata(&smm->ll, tm);
if (r)
return r;
if (!r) {
if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
r = sm_ll_extend(&smm->ll, nr_blocks);
}
memcpy(&smm->sm, &ops, sizeof(smm->sm));
if (r)
return r;
memcpy(&smm->sm, &ops, sizeof(smm->sm));
/*
* Now we need to update the newly created data structures with the
* allocated blocks that they were built from.
......
......@@ -7,6 +7,7 @@
#ifndef __DM_LOG_USERSPACE_H__
#define __DM_LOG_USERSPACE_H__
#include <linux/types.h>
#include <linux/dm-ioctl.h> /* For DM_UUID_LEN */
/*
......@@ -147,12 +148,12 @@
/*
* DM_ULOG_GET_REGION_SIZE corresponds to (found in dm-dirty-log.h):
* uint32_t (*get_region_size)(struct dm_dirty_log *log);
* __u32 (*get_region_size)(struct dm_dirty_log *log);
*
* Payload-to-userspace:
* None.
* Payload-to-kernel:
* uint64_t - contains the region size
* __u64 - contains the region size
*
* The region size is something that was determined at constructor time.
* It is returned in the payload area and 'data_size' is set to
......@@ -168,11 +169,11 @@
* int (*is_clean)(struct dm_dirty_log *log, region_t region);
*
* Payload-to-userspace:
* uint64_t - the region to get clean status on
* __u64 - the region to get clean status on
* Payload-to-kernel:
* int64_t - 1 if clean, 0 otherwise
* __s64 - 1 if clean, 0 otherwise
*
* Payload is sizeof(uint64_t) and contains the region for which the clean
* Payload is sizeof(__u64) and contains the region for which the clean
* status is being made.
*
* When the request has been processed, user-space must return the
......@@ -187,9 +188,9 @@
* int can_block);
*
* Payload-to-userspace:
* uint64_t - the region to get sync status on
* __u64 - the region to get sync status on
* Payload-to-kernel:
* int64_t - 1 if in-sync, 0 otherwise
* __s64 - 1 if in-sync, 0 otherwise
*
* Exactly the same as 'is_clean' above, except this time asking "has the
* region been recovered?" vs. "is the region not being modified?"
......@@ -203,7 +204,7 @@
* Payload-to-userspace:
* If the 'integrated_flush' directive is present in the constructor
* table, the payload is as same as DM_ULOG_MARK_REGION:
* uint64_t [] - region(s) to mark
* __u64 [] - region(s) to mark
* else
* None
* Payload-to-kernel:
......@@ -225,13 +226,13 @@
* void (*mark_region)(struct dm_dirty_log *log, region_t region);
*
* Payload-to-userspace:
* uint64_t [] - region(s) to mark
* __u64 [] - region(s) to mark
* Payload-to-kernel:
* None.
*
* Incoming payload contains the one or more regions to mark dirty.
* The number of regions contained in the payload can be determined from
* 'data_size/sizeof(uint64_t)'.
* 'data_size/sizeof(__u64)'.
*
* When the request has been processed, user-space must return the
* dm_ulog_request to the kernel - setting the 'error' field and clearing
......@@ -244,13 +245,13 @@
* void (*clear_region)(struct dm_dirty_log *log, region_t region);
*
* Payload-to-userspace:
* uint64_t [] - region(s) to clear
* __u64 [] - region(s) to clear
* Payload-to-kernel:
* None.
*
* Incoming payload contains the one or more regions to mark clean.
* The number of regions contained in the payload can be determined from
* 'data_size/sizeof(uint64_t)'.
* 'data_size/sizeof(__u64)'.
*
* When the request has been processed, user-space must return the
* dm_ulog_request to the kernel - setting the 'error' field and clearing
......@@ -266,8 +267,8 @@
* None.
* Payload-to-kernel:
* {
* int64_t i; -- 1 if recovery necessary, 0 otherwise
* uint64_t r; -- The region to recover if i=1
* __s64 i; -- 1 if recovery necessary, 0 otherwise
* __u64 r; -- The region to recover if i=1
* }
* 'data_size' should be set appropriately.
*
......@@ -283,8 +284,8 @@
*
* Payload-to-userspace:
* {
* uint64_t - region to set sync state on
* int64_t - 0 if not-in-sync, 1 if in-sync
* __u64 - region to set sync state on
* __s64 - 0 if not-in-sync, 1 if in-sync
* }
* Payload-to-kernel:
* None.
......@@ -302,7 +303,7 @@
* Payload-to-userspace:
* None.
* Payload-to-kernel:
* uint64_t - the number of in-sync regions
* __u64 - the number of in-sync regions
*
* No incoming payload. Kernel-bound payload contains the number of
* regions that are in-sync (in a size_t).
......@@ -350,11 +351,11 @@
* int (*is_remote_recovering)(struct dm_dirty_log *log, region_t region);
*
* Payload-to-userspace:
* uint64_t - region to determine recovery status on
* __u64 - region to determine recovery status on
* Payload-to-kernel:
* {
* int64_t is_recovering; -- 0 if no, 1 if yes
* uint64_t in_sync_hint; -- lowest region still needing resync
* __s64 is_recovering; -- 0 if no, 1 if yes
* __u64 in_sync_hint; -- lowest region still needing resync
* }
*
* When the request has been processed, user-space must return the
......@@ -413,16 +414,16 @@ struct dm_ulog_request {
* differentiate between logs that are being swapped and have the
* same 'uuid'. (Think "live" and "inactive" device-mapper tables.)
*/
uint64_t luid;
__u64 luid;
char uuid[DM_UUID_LEN];
char padding[3]; /* Padding because DM_UUID_LEN = 129 */
uint32_t version; /* See DM_ULOG_REQUEST_VERSION */
int32_t error; /* Used to report back processing errors */
__u32 version; /* See DM_ULOG_REQUEST_VERSION */
__s32 error; /* Used to report back processing errors */
uint32_t seq; /* Sequence number for request */
uint32_t request_type; /* DM_ULOG_* defined above */
uint32_t data_size; /* How much data (not including this struct) */
__u32 seq; /* Sequence number for request */
__u32 request_type; /* DM_ULOG_* defined above */
__u32 data_size; /* How much data (not including this struct) */
char data[0];
};
......