Commit 325b7640 authored by Linus Torvalds

Merge tag 'for-5.12/dm-changes' of...

Merge tag 'for-5.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - Fix DM integrity's HMAC support to provide enhanced security of
   internal_hash and journal_mac capabilities.

 - Various DM writecache fixes to address performance, fix table output
   to match what was provided at table creation, fix writing beyond end
   of device when shrinking underlying data device, and a couple other
   small cleanups.

 - Add DM crypt support for using trusted keys.

 - Fix deadlock when swapping to DM crypt device by throttling number of
   in-flight REQ_SWAP bios. Implemented in DM core so that other
   bio-based targets can opt-in by setting ti->limit_swap_bios.

 - Fix various inverted logic bugs in the .iterate_devices callout
   functions that are used to assess if specific feature or capability
   is supported across all devices being combined/stacked by DM.

 - Fix DM era target bugs that exposed users to lost writes or memory
   leaks.

 - Add DM core support for passing through inline crypto support of
   underlying devices. Includes block/keyslot-manager changes that
   enable extending this support to DM.

 - Various small fixes and cleanups (spelling fixes, front padding
   calculation cleanup, cleanup conditional zoned support in targets,
   etc).

* tag 'for-5.12/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (31 commits)
  dm: fix deadlock when swapping to encrypted device
  dm: simplify target code conditional on CONFIG_BLK_DEV_ZONED
  dm: set DM_TARGET_PASSES_CRYPTO feature for some targets
  dm: support key eviction from keyslot managers of underlying devices
  dm: add support for passing through inline crypto support
  block/keyslot-manager: Introduce functions for device mapper support
  block/keyslot-manager: Introduce passthrough keyslot manager
  dm era: only resize metadata in preresume
  dm era: Use correct value size in equality function of writeset tree
  dm era: Fix bitset memory leaks
  dm era: Verify the data block size hasn't changed
  dm era: Reinitialize bitset cache before digesting a new writeset
  dm era: Update in-core bitset after committing the metadata
  dm era: Recover committed writeset after crash
  dm writecache: use bdev_nr_sectors() instead of open-coded equivalent
  dm writecache: fix writing beyond end of underlying device when shrinking
  dm table: remove needless request_queue NULL pointer checks
  dm table: fix zoned iterate_devices based device capability checks
  dm table: fix DAX iterate_devices based device capability checks
  dm table: fix iterate_devices based device capability checks
  ...
parents a99163e9 a666e5c0
......@@ -67,7 +67,7 @@ Parameters::
the value passed in <key_size>.
<key_type>
Either 'logon', 'user' or 'encrypted' kernel key type.
Either 'logon', 'user', 'encrypted' or 'trusted' kernel key type.
<key_description>
The kernel keyring key description crypt target should look for
......
......@@ -186,6 +186,17 @@ fix_padding
space-efficient. If this option is not present, large padding is
used - that is for compatibility with older kernels.
fix_hmac
Improve security of internal_hash and journal_mac:
- the section number is mixed into the MAC, so that an attacker can't
copy sectors from one journal section to another journal section
- the superblock is protected by journal_mac
- a 16-byte salt stored in the superblock is mixed into the MAC, so
that the attacker can't detect that two disks have the same HMAC
key, and so that the attacker can't move sectors from one
disk to another
legacy_recalculate
Allow recalculating of volumes with HMAC keys. This is disabled by
default for security reasons - an attacker could modify the volume,
......
......@@ -409,3 +409,4 @@ int blk_crypto_evict_key(struct request_queue *q,
*/
return blk_crypto_fallback_evict_key(key);
}
EXPORT_SYMBOL_GPL(blk_crypto_evict_key);
......@@ -63,6 +63,11 @@ static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm)
pm_runtime_put_sync(ksm->dev);
}
/* Report whether @ksm was set up without any keyslots (passthrough). */
static inline bool blk_ksm_is_passthrough(struct blk_keyslot_manager *ksm)
{
	return !ksm->num_slots;
}
/**
* blk_ksm_init() - Initialize a keyslot manager
* @ksm: The keyslot_manager to initialize.
......@@ -234,6 +239,10 @@ blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
int err;
*slot_ptr = NULL;
if (blk_ksm_is_passthrough(ksm))
return BLK_STS_OK;
down_read(&ksm->lock);
slot = blk_ksm_find_and_grab_keyslot(ksm, key);
up_read(&ksm->lock);
......@@ -354,6 +363,16 @@ int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
struct blk_ksm_keyslot *slot;
int err = 0;
if (blk_ksm_is_passthrough(ksm)) {
if (ksm->ksm_ll_ops.keyslot_evict) {
blk_ksm_hw_enter(ksm);
err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, -1);
blk_ksm_hw_exit(ksm);
return err;
}
return 0;
}
blk_ksm_hw_enter(ksm);
slot = blk_ksm_find_keyslot(ksm, key);
if (!slot)
......@@ -389,6 +408,9 @@ void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm)
{
unsigned int slot;
if (blk_ksm_is_passthrough(ksm))
return;
/* This is for device initialization, so don't resume the device */
down_write(&ksm->lock);
for (slot = 0; slot < ksm->num_slots; slot++) {
......@@ -430,3 +452,127 @@ void blk_ksm_unregister(struct request_queue *q)
{
q->ksm = NULL;
}
/**
 * blk_ksm_intersect_modes() - restrict supported modes by child device
 * @parent: The keyslot manager for parent device
 * @child: The keyslot manager for child device, or NULL
 *
 * Narrow @parent down to the capabilities it shares with @child: every crypto
 * mode support bit that @child lacks is cleared in @parent, and @parent's
 * maximum DUN size is lowered to @child's if that is smaller.  A NULL @child
 * wipes all of @parent's capabilities.
 *
 * Only use this when setting up the keyslot manager for a layered device,
 * before it's been exposed yet.
 */
void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
			     const struct blk_keyslot_manager *child)
{
	unsigned int mode;

	if (!child) {
		/* No child at all: nothing stays supported. */
		parent->max_dun_bytes_supported = 0;
		memset(parent->crypto_modes_supported, 0,
		       sizeof(parent->crypto_modes_supported));
		return;
	}

	if (child->max_dun_bytes_supported < parent->max_dun_bytes_supported)
		parent->max_dun_bytes_supported =
			child->max_dun_bytes_supported;

	for (mode = 0; mode < ARRAY_SIZE(child->crypto_modes_supported); mode++)
		parent->crypto_modes_supported[mode] &=
			child->crypto_modes_supported[mode];
}
EXPORT_SYMBOL_GPL(blk_ksm_intersect_modes);
/**
 * blk_ksm_is_superset() - Check if a KSM supports a superset of crypto modes
 *			   and DUN bytes that another KSM supports. Here,
 *			   "superset" refers to the mathematical meaning of the
 *			   word - i.e. if two KSMs have the *same* capabilities,
 *			   they *are* considered supersets of each other.
 * @ksm_superset: The KSM that we want to verify is a superset
 * @ksm_subset: The KSM that we want to verify is a subset
 *
 * A NULL @ksm_subset is covered by anything; a NULL @ksm_superset covers
 * only a NULL @ksm_subset.
 *
 * Return: True if @ksm_superset supports a superset of the crypto modes and DUN
 *	   bytes that @ksm_subset supports.
 */
bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
			 struct blk_keyslot_manager *ksm_subset)
{
	int idx;

	if (!ksm_subset)
		return true;

	if (!ksm_superset)
		return false;

	/* Any mode bit set in the subset but not in the superset disqualifies. */
	for (idx = 0; idx < ARRAY_SIZE(ksm_superset->crypto_modes_supported); idx++) {
		if ((ksm_subset->crypto_modes_supported[idx] &
		     ~ksm_superset->crypto_modes_supported[idx]) != 0)
			return false;
	}

	/* The superset must handle DUNs at least as large as the subset's. */
	return ksm_subset->max_dun_bytes_supported <=
	       ksm_superset->max_dun_bytes_supported;
}
EXPORT_SYMBOL_GPL(blk_ksm_is_superset);
/**
 * blk_ksm_update_capabilities() - Update the restrictions of a KSM to those of
 *				   another KSM
 * @target_ksm: The KSM whose restrictions to update.
 * @reference_ksm: The KSM whose restrictions are copied into @target_ksm.
 *
 * Blk-crypto requires that the crypto capabilities advertised when a bio was
 * created remain supported by the device until that bio is ended.  A device
 * therefore cannot shrink its advertised crypto capabilities without explicit
 * synchronization with upper layers.  Without such synchronization,
 * @reference_ksm must support every crypto capability that @target_ksm does
 * (i.e. we need blk_ksm_is_superset(@reference_ksm, @target_ksm) == true).
 *
 * Note also that as long as the crypto capabilities are being expanded, the
 * order in which the updates become visible is not important: a stale value
 * only makes blk-crypto believe that a capability isn't supported when it
 * actually is (which might result in blk-crypto-fallback being used if
 * available, or the bio being failed).
 */
void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
				 struct blk_keyslot_manager *reference_ksm)
{
	const size_t modes_len = sizeof(target_ksm->crypto_modes_supported);

	memcpy(target_ksm->crypto_modes_supported,
	       reference_ksm->crypto_modes_supported, modes_len);

	target_ksm->max_dun_bytes_supported =
		reference_ksm->max_dun_bytes_supported;
}
EXPORT_SYMBOL_GPL(blk_ksm_update_capabilities);
/**
 * blk_ksm_init_passthrough() - Init a passthrough keyslot manager
 * @ksm: The keyslot manager to init
 *
 * Initialize a passthrough keyslot manager.
 * Called by e.g. storage drivers to set up a keyslot manager in their
 * request_queue, when the storage driver wants to manage its keys by itself.
 * This is useful for inline encryption hardware that doesn't have the concept
 * of keyslots, and for layered devices.
 *
 * The whole structure is zeroed, so num_slots ends up 0, which is the
 * condition blk_ksm_is_passthrough() tests for.
 */
void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm)
{
/* Zero everything first; the lock is initialised on top of the cleared struct. */
memset(ksm, 0, sizeof(*ksm));
init_rwsem(&ksm->lock);
}
EXPORT_SYMBOL_GPL(blk_ksm_init_passthrough);
......@@ -270,6 +270,7 @@ config DM_CRYPT
tristate "Crypt target support"
depends on BLK_DEV_DM
depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n)
depends on (TRUSTED_KEYS || TRUSTED_KEYS=n)
select CRYPTO
select CRYPTO_CBC
select CRYPTO_ESSIV
......
......@@ -13,6 +13,7 @@
#include <linux/ktime.h>
#include <linux/genhd.h>
#include <linux/blk-mq.h>
#include <linux/keyslot-manager.h>
#include <trace/events/block.h>
......@@ -102,6 +103,10 @@ struct mapped_device {
/* kobject and completion */
struct dm_kobject_holder kobj_holder;
int swap_bios;
struct semaphore swap_bios_semaphore;
struct mutex swap_bios_lock;
struct dm_stats stats;
/* for blk-mq request-based DM support */
......@@ -162,6 +167,10 @@ struct dm_table {
void *event_context;
struct dm_md_mempools *mempools;
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
struct blk_keyslot_manager *ksm;
#endif
};
static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
......
......@@ -37,6 +37,7 @@
#include <linux/key-type.h>
#include <keys/user-type.h>
#include <keys/encrypted-type.h>
#include <keys/trusted-type.h>
#include <linux/device-mapper.h>
......@@ -133,7 +134,7 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
DM_CRYPT_WRITE_INLINE };
enum cipher_flags {
CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */
CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cipher */
CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */
CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */
};
......@@ -2436,7 +2437,6 @@ static int set_key_user(struct crypt_config *cc, struct key *key)
return 0;
}
#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE)
static int set_key_encrypted(struct crypt_config *cc, struct key *key)
{
const struct encrypted_key_payload *ekp;
......@@ -2452,7 +2452,22 @@ static int set_key_encrypted(struct crypt_config *cc, struct key *key)
return 0;
}
#endif /* CONFIG_ENCRYPTED_KEYS */
/*
 * Load the key material of a trusted key into the crypt configuration.
 *
 * Returns -EKEYREVOKED when the key has no payload attached, -EINVAL when the
 * payload length does not equal cc->key_size, and 0 once cc->key_size bytes
 * have been copied into cc->key.
 */
static int set_key_trusted(struct crypt_config *cc, struct key *key)
{
	const struct trusted_key_payload *payload = key->payload.data[0];

	if (!payload)
		return -EKEYREVOKED;

	if (payload->key_len != cc->key_size)
		return -EINVAL;

	memcpy(cc->key, payload->key, cc->key_size);

	return 0;
}
static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string)
{
......@@ -2482,11 +2497,14 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string
} else if (!strncmp(key_string, "user:", key_desc - key_string + 1)) {
type = &key_type_user;
set_key = set_key_user;
#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE)
} else if (!strncmp(key_string, "encrypted:", key_desc - key_string + 1)) {
} else if (IS_ENABLED(CONFIG_ENCRYPTED_KEYS) &&
!strncmp(key_string, "encrypted:", key_desc - key_string + 1)) {
type = &key_type_encrypted;
set_key = set_key_encrypted;
#endif
} else if (IS_ENABLED(CONFIG_TRUSTED_KEYS) &&
!strncmp(key_string, "trusted:", key_desc - key_string + 1)) {
type = &key_type_trusted;
set_key = set_key_trusted;
} else {
return -EINVAL;
}
......@@ -3116,7 +3134,6 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
}
#ifdef CONFIG_BLK_DEV_ZONED
static int crypt_report_zones(struct dm_target *ti,
struct dm_report_zones_args *args, unsigned int nr_zones)
{
......@@ -3127,7 +3144,8 @@ static int crypt_report_zones(struct dm_target *ti,
return blkdev_report_zones(cc->dev->bdev, sector, nr_zones,
dm_report_zones_cb, args);
}
#else
#define crypt_report_zones NULL
#endif
/*
......@@ -3324,6 +3342,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
wake_up_process(cc->write_thread);
ti->num_flush_bios = 1;
ti->limit_swap_bios = true;
return 0;
......@@ -3558,14 +3577,12 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = {
.name = "crypt",
.version = {1, 22, 0},
.version = {1, 23, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
#ifdef CONFIG_BLK_DEV_ZONED
.features = DM_TARGET_ZONED_HM,
.report_zones = crypt_report_zones,
#endif
.map = crypt_map,
.status = crypt_status,
.postsuspend = crypt_postsuspend,
......
......@@ -130,7 +130,7 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block,
dd->badblock_count++;
if (!dd->quiet_mode) {
DMINFO("%s: badblock added at block %llu with write fail count %hhu",
DMINFO("%s: badblock added at block %llu with write fail count %u",
__func__, block, wr_fail_cnt);
}
spin_unlock_irqrestore(&dd->dust_lock, flags);
......
......@@ -47,6 +47,7 @@ struct writeset {
/*
 * Release the in-core bitset of a writeset.
 */
static void writeset_free(struct writeset *ws)
{
vfree(ws->bits);
/* Drop the stale pointer so ws no longer refers to the freed buffer. */
ws->bits = NULL;
}
static int setup_on_disk_bitset(struct dm_disk_bitset *info,
......@@ -71,8 +72,6 @@ static size_t bitset_size(unsigned nr_bits)
*/
static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks)
{
ws->md.nr_bits = nr_blocks;
ws->md.root = INVALID_WRITESET_ROOT;
ws->bits = vzalloc(bitset_size(nr_blocks));
if (!ws->bits) {
DMERR("%s: couldn't allocate in memory bitset", __func__);
......@@ -85,12 +84,14 @@ static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks)
/*
* Wipes the in-core bitset, and creates a new on disk bitset.
*/
static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws)
static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws,
dm_block_t nr_blocks)
{
int r;
memset(ws->bits, 0, bitset_size(ws->md.nr_bits));
memset(ws->bits, 0, bitset_size(nr_blocks));
ws->md.nr_bits = nr_blocks;
r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root);
if (r) {
DMERR("%s: setup_on_disk_bitset failed", __func__);
......@@ -134,7 +135,7 @@ static int writeset_test_and_set(struct dm_disk_bitset *info,
{
int r;
if (!test_and_set_bit(block, ws->bits)) {
if (!test_bit(block, ws->bits)) {
r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root);
if (r) {
/* FIXME: fail mode */
......@@ -388,7 +389,7 @@ static void ws_dec(void *context, const void *value)
static int ws_eq(void *context, const void *value1, const void *value2)
{
return !memcmp(value1, value2, sizeof(struct writeset_metadata));
return !memcmp(value1, value2, sizeof(struct writeset_disk));
}
/*----------------------------------------------------------------*/
......@@ -564,6 +565,15 @@ static int open_metadata(struct era_metadata *md)
}
disk = dm_block_data(sblock);
/* Verify the data block size hasn't changed */
if (le32_to_cpu(disk->data_block_size) != md->block_size) {
DMERR("changing the data block size (from %u to %llu) is not supported",
le32_to_cpu(disk->data_block_size), md->block_size);
r = -EINVAL;
goto bad;
}
r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION,
disk->metadata_space_map_root,
sizeof(disk->metadata_space_map_root),
......@@ -575,10 +585,10 @@ static int open_metadata(struct era_metadata *md)
setup_infos(md);
md->block_size = le32_to_cpu(disk->data_block_size);
md->nr_blocks = le32_to_cpu(disk->nr_blocks);
md->current_era = le32_to_cpu(disk->current_era);
ws_unpack(&disk->current_writeset, &md->current_writeset->md);
md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root);
md->era_array_root = le64_to_cpu(disk->era_array_root);
md->metadata_snap = le64_to_cpu(disk->metadata_snap);
......@@ -746,6 +756,12 @@ static int metadata_digest_lookup_writeset(struct era_metadata *md,
ws_unpack(&disk, &d->writeset);
d->value = cpu_to_le32(key);
/*
* We initialise another bitset info to avoid any caching side effects
* with the previous one.
*/
dm_disk_bitset_init(md->tm, &d->info);
d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks);
d->current_bit = 0;
d->step = metadata_digest_transcribe_writeset;
......@@ -759,12 +775,6 @@ static int metadata_digest_start(struct era_metadata *md, struct digest *d)
return 0;
memset(d, 0, sizeof(*d));
/*
* We initialise another bitset info to avoid any caching side
* effects with the previous one.
*/
dm_disk_bitset_init(md->tm, &d->info);
d->step = metadata_digest_lookup_writeset;
return 0;
......@@ -802,6 +812,8 @@ static struct era_metadata *metadata_open(struct block_device *bdev,
/*
 * Tear down an era_metadata object: free the in-core bitsets of both
 * writesets, hand md to destroy_persistent_data_objects(), then free md.
 */
static void metadata_close(struct era_metadata *md)
{
/* The bitsets hang off md, so they must be released before md is freed. */
writeset_free(&md->writesets[0]);
writeset_free(&md->writesets[1]);
destroy_persistent_data_objects(md);
kfree(md);
}
......@@ -839,6 +851,7 @@ static int metadata_resize(struct era_metadata *md, void *arg)
r = writeset_alloc(&md->writesets[1], *new_size);
if (r) {
DMERR("%s: writeset_alloc failed for writeset 1", __func__);
writeset_free(&md->writesets[0]);
return r;
}
......@@ -849,6 +862,8 @@ static int metadata_resize(struct era_metadata *md, void *arg)
&value, &md->era_array_root);
if (r) {
DMERR("%s: dm_array_resize failed", __func__);
writeset_free(&md->writesets[0]);
writeset_free(&md->writesets[1]);
return r;
}
......@@ -870,7 +885,6 @@ static int metadata_era_archive(struct era_metadata *md)
}
ws_pack(&md->current_writeset->md, &value);
md->current_writeset->md.root = INVALID_WRITESET_ROOT;
keys[0] = md->current_era;
__dm_bless_for_disk(&value);
......@@ -882,6 +896,7 @@ static int metadata_era_archive(struct era_metadata *md)
return r;
}
md->current_writeset->md.root = INVALID_WRITESET_ROOT;
md->archived_writesets = true;
return 0;
......@@ -898,7 +913,7 @@ static int metadata_new_era(struct era_metadata *md)
int r;
struct writeset *new_writeset = next_writeset(md);
r = writeset_init(&md->bitset_info, new_writeset);
r = writeset_init(&md->bitset_info, new_writeset, md->nr_blocks);
if (r) {
DMERR("%s: writeset_init failed", __func__);
return r;
......@@ -951,7 +966,7 @@ static int metadata_commit(struct era_metadata *md)
int r;
struct dm_block *sblock;
if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) {
if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) {
r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
&md->current_writeset->md.root);
if (r) {
......@@ -1225,8 +1240,10 @@ static void process_deferred_bios(struct era *era)
int r;
struct bio_list deferred_bios, marked_bios;
struct bio *bio;
struct blk_plug plug;
bool commit_needed = false;
bool failed = false;
struct writeset *ws = era->md->current_writeset;
bio_list_init(&deferred_bios);
bio_list_init(&marked_bios);
......@@ -1236,9 +1253,11 @@ static void process_deferred_bios(struct era *era)
bio_list_init(&era->deferred_bios);
spin_unlock(&era->deferred_lock);
if (bio_list_empty(&deferred_bios))
return;
while ((bio = bio_list_pop(&deferred_bios))) {
r = writeset_test_and_set(&era->md->bitset_info,
era->md->current_writeset,
r = writeset_test_and_set(&era->md->bitset_info, ws,
get_block(era, bio));
if (r < 0) {
/*
......@@ -1246,7 +1265,6 @@ static void process_deferred_bios(struct era *era)
* FIXME: finish.
*/
failed = true;
} else if (r == 0)
commit_needed = true;
......@@ -1262,9 +1280,19 @@ static void process_deferred_bios(struct era *era)
if (failed)
while ((bio = bio_list_pop(&marked_bios)))
bio_io_error(bio);
else
while ((bio = bio_list_pop(&marked_bios)))
else {
blk_start_plug(&plug);
while ((bio = bio_list_pop(&marked_bios))) {
/*
* Only update the in-core writeset if the on-disk one
* was updated too.
*/
if (commit_needed)
set_bit(get_block(era, bio), ws->bits);
submit_bio_noacct(bio);
}
blk_finish_plug(&plug);
}
}
static void process_rpc_calls(struct era *era)
......@@ -1473,15 +1501,6 @@ static int era_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
era->md = md;
era->nr_blocks = calc_nr_blocks(era);
r = metadata_resize(era->md, &era->nr_blocks);
if (r) {
ti->error = "couldn't resize metadata";
era_destroy(era);
return -ENOMEM;
}
era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
if (!era->wq) {
ti->error = "could not create workqueue for metadata object";
......@@ -1556,16 +1575,24 @@ static int era_preresume(struct dm_target *ti)
dm_block_t new_size = calc_nr_blocks(era);
if (era->nr_blocks != new_size) {
r = in_worker1(era, metadata_resize, &new_size);
if (r)
r = metadata_resize(era->md, &new_size);
if (r) {
DMERR("%s: metadata_resize failed", __func__);
return r;
}
r = metadata_commit(era->md);
if (r) {
DMERR("%s: metadata_commit failed", __func__);
return r;
}
era->nr_blocks = new_size;
}
start_worker(era);
r = in_worker0(era, metadata_new_era);
r = in_worker0(era, metadata_era_rollover);
if (r) {
DMERR("%s: metadata_era_rollover failed", __func__);
return r;
......
......@@ -469,6 +469,8 @@ static int flakey_report_zones(struct dm_target *ti,
return blkdev_report_zones(fc->dev->bdev, sector, nr_zones,
dm_report_zones_cb, args);
}
#else
#define flakey_report_zones NULL
#endif
static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
......@@ -481,10 +483,8 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
static struct target_type flakey_target = {
.name = "flakey",
.version = {1, 5, 0},
#ifdef CONFIG_BLK_DEV_ZONED
.features = DM_TARGET_ZONED_HM,
.features = DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO,
.report_zones = flakey_report_zones,
#endif
.module = THIS_MODULE,
.ctr = flakey_ctr,
.dtr = flakey_dtr,
......
......@@ -40,6 +40,7 @@
#define BITMAP_BLOCK_SIZE 4096 /* don't change it */
#define BITMAP_FLUSH_INTERVAL (10 * HZ)
#define DISCARD_FILLER 0xf6
#define SALT_SIZE 16
/*
* Warning - DEBUG_PRINT prints security-sensitive data to the log,
......@@ -57,6 +58,7 @@
#define SB_VERSION_2 2
#define SB_VERSION_3 3
#define SB_VERSION_4 4
#define SB_VERSION_5 5
#define SB_SECTORS 8
#define MAX_SECTORS_PER_BLOCK 8
......@@ -72,12 +74,15 @@ struct superblock {
__u8 log2_blocks_per_bitmap_bit;
__u8 pad[2];
__u64 recalc_sector;
__u8 pad2[8];
__u8 salt[SALT_SIZE];
};
#define SB_FLAG_HAVE_JOURNAL_MAC 0x1
#define SB_FLAG_RECALCULATING 0x2
#define SB_FLAG_DIRTY_BITMAP 0x4
#define SB_FLAG_FIXED_PADDING 0x8
#define SB_FLAG_FIXED_HMAC 0x10
#define JOURNAL_ENTRY_ROUNDUP 8
......@@ -259,6 +264,7 @@ struct dm_integrity_c {
bool recalculate_flag;
bool discard;
bool fix_padding;
bool fix_hmac;
bool legacy_recalculate;
struct alg_spec internal_hash_alg;
......@@ -389,8 +395,11 @@ static int dm_integrity_failed(struct dm_integrity_c *ic)
static bool dm_integrity_disable_recalculate(struct dm_integrity_c *ic)
{
if ((ic->internal_hash_alg.key || ic->journal_mac_alg.key) &&
!ic->legacy_recalculate)
if (ic->legacy_recalculate)
return false;
if (!(ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) ?
ic->internal_hash_alg.key || ic->journal_mac_alg.key :
ic->internal_hash_alg.key && !ic->journal_mac_alg.key)
return true;
return false;
}
......@@ -477,7 +486,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
static void sb_set_version(struct dm_integrity_c *ic)
{
if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING))
if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC))
ic->sb->version = SB_VERSION_5;
else if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING))
ic->sb->version = SB_VERSION_4;
else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
ic->sb->version = SB_VERSION_3;
......@@ -487,10 +498,58 @@ static void sb_set_version(struct dm_integrity_c *ic)
ic->sb->version = SB_VERSION_1;
}
static int sb_mac(struct dm_integrity_c *ic, bool wr)
{
SHASH_DESC_ON_STACK(desc, ic->journal_mac);
int r;
unsigned size = crypto_shash_digestsize(ic->journal_mac);
if (sizeof(struct superblock) + size > 1 << SECTOR_SHIFT) {
dm_integrity_io_error(ic, "digest is too long", -EINVAL);
return -EINVAL;
}
desc->tfm = ic->journal_mac;
r = crypto_shash_init(desc);
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_init", r);
return r;
}
r = crypto_shash_update(desc, (__u8 *)ic->sb, (1 << SECTOR_SHIFT) - size);
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_update", r);
return r;
}
if (likely(wr)) {
r = crypto_shash_final(desc, (__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size);
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_final", r);
return r;
}
} else {
__u8 result[HASH_MAX_DIGESTSIZE];
r = crypto_shash_final(desc, result);
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_final", r);
return r;
}
if (memcmp((__u8 *)ic->sb + (1 << SECTOR_SHIFT) - size, result, size)) {
dm_integrity_io_error(ic, "superblock mac", -EILSEQ);
return -EILSEQ;
}
}
return 0;
}
static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
{
struct dm_io_request io_req;
struct dm_io_region io_loc;
int r;
io_req.bi_op = op;
io_req.bi_op_flags = op_flags;
......@@ -502,10 +561,28 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
io_loc.sector = ic->start;
io_loc.count = SB_SECTORS;
if (op == REQ_OP_WRITE)
if (op == REQ_OP_WRITE) {
sb_set_version(ic);
if (ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
r = sb_mac(ic, true);
if (unlikely(r))
return r;
}
}
return dm_io(&io_req, 1, &io_loc, NULL);
r = dm_io(&io_req, 1, &io_loc, NULL);
if (unlikely(r))
return r;
if (op == REQ_OP_READ) {
if (ic->mode != 'R' && ic->journal_mac && ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
r = sb_mac(ic, false);
if (unlikely(r))
return r;
}
}
return 0;
}
#define BITMAP_OP_TEST_ALL_SET 0
......@@ -722,15 +799,32 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result
desc->tfm = ic->journal_mac;
r = crypto_shash_init(desc);
if (unlikely(r)) {
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_init", r);
goto err;
}
if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
uint64_t section_le;
r = crypto_shash_update(desc, (__u8 *)&ic->sb->salt, SALT_SIZE);
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_update", r);
goto err;
}
section_le = cpu_to_le64(section);
r = crypto_shash_update(desc, (__u8 *)&section_le, sizeof section_le);
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_update", r);
goto err;
}
}
for (j = 0; j < ic->journal_section_entries; j++) {
struct journal_entry *je = access_journal_entry(ic, section, j);
r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector);
if (unlikely(r)) {
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_update", r);
goto err;
}
......@@ -740,7 +834,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result
if (likely(size <= JOURNAL_MAC_SIZE)) {
r = crypto_shash_final(desc, result);
if (unlikely(r)) {
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_final", r);
goto err;
}
......@@ -753,7 +847,7 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result
goto err;
}
r = crypto_shash_final(desc, digest);
if (unlikely(r)) {
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_final", r);
goto err;
}
......@@ -1399,7 +1493,7 @@ static void flush_notify(unsigned long error, void *fr_)
{
struct flush_request *fr = fr_;
if (unlikely(error != 0))
dm_integrity_io_error(fr->ic, "flusing disk cache", -EIO);
dm_integrity_io_error(fr->ic, "flushing disk cache", -EIO);
complete(&fr->comp);
}
......@@ -1556,6 +1650,14 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector
goto failed;
}
if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
r = crypto_shash_update(req, (__u8 *)&ic->sb->salt, SALT_SIZE);
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_update", r);
goto failed;
}
}
r = crypto_shash_update(req, (const __u8 *)&sector_le, sizeof sector_le);
if (unlikely(r < 0)) {
dm_integrity_io_error(ic, "crypto_shash_update", r);
......@@ -3149,6 +3251,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
arg_count += !!ic->journal_crypt_alg.alg_string;
arg_count += !!ic->journal_mac_alg.alg_string;
arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0;
arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0;
arg_count += ic->legacy_recalculate;
DMEMIT("%s %llu %u %c %u", ic->dev->name, ic->start,
ic->tag_size, ic->mode, arg_count);
......@@ -3173,6 +3276,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
}
if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0)
DMEMIT(" fix_padding");
if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) != 0)
DMEMIT(" fix_hmac");
if (ic->legacy_recalculate)
DMEMIT(" legacy_recalculate");
......@@ -3310,6 +3415,11 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec
if (!journal_sections)
journal_sections = 1;
if (ic->fix_hmac && (ic->internal_hash_alg.alg_string || ic->journal_mac_alg.alg_string)) {
ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_HMAC);
get_random_bytes(ic->sb->salt, SALT_SIZE);
}
if (!ic->meta_dev) {
if (ic->fix_padding)
ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING);
......@@ -3804,7 +3914,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
unsigned extra_args;
struct dm_arg_set as;
static const struct dm_arg _args[] = {
{0, 16, "Invalid number of feature args"},
{0, 17, "Invalid number of feature args"},
};
unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
bool should_write_sb;
......@@ -3942,7 +4052,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (r)
goto bad;
} else if (!strncmp(opt_string, "journal_mac:", strlen("journal_mac:"))) {
r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error,
r = get_alg_and_key(opt_string, &ic->journal_mac_alg, &ti->error,
"Invalid journal_mac argument");
if (r)
goto bad;
......@@ -3952,6 +4062,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ic->discard = true;
} else if (!strcmp(opt_string, "fix_padding")) {
ic->fix_padding = true;
} else if (!strcmp(opt_string, "fix_hmac")) {
ic->fix_hmac = true;
} else if (!strcmp(opt_string, "legacy_recalculate")) {
ic->legacy_recalculate = true;
} else {
......@@ -4110,7 +4222,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
should_write_sb = true;
}
if (!ic->sb->version || ic->sb->version > SB_VERSION_4) {
if (!ic->sb->version || ic->sb->version > SB_VERSION_5) {
r = -EINVAL;
ti->error = "Unknown version";
goto bad;
......@@ -4442,7 +4554,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
static struct target_type integrity_target = {
.name = "integrity",
.version = {1, 6, 0},
.version = {1, 7, 0},
.module = THIS_MODULE,
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
.ctr = dm_integrity_ctr,
......
......@@ -146,6 +146,8 @@ static int linear_report_zones(struct dm_target *ti,
return blkdev_report_zones(lc->dev->bdev, sector, nr_zones,
dm_report_zones_cb, args);
}
#else
#define linear_report_zones NULL
#endif
static int linear_iterate_devices(struct dm_target *ti,
......@@ -227,13 +229,9 @@ static int linear_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
static struct target_type linear_target = {
.name = "linear",
.version = {1, 4, 0},
#ifdef CONFIG_BLK_DEV_ZONED
.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
DM_TARGET_ZONED_HM,
DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO,
.report_zones = linear_report_zones,
#else
.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT,
#endif
.module = THIS_MODULE,
.ctr = linear_ctr,
.dtr = linear_dtr,
......
This diff is collapsed.
......@@ -148,6 +148,7 @@ struct dm_writecache {
size_t metadata_sectors;
size_t n_blocks;
uint64_t seq_count;
sector_t data_device_sectors;
void *block_start;
struct wc_entry *entries;
unsigned block_size;
......@@ -159,14 +160,22 @@ struct dm_writecache {
bool overwrote_committed:1;
bool memory_vmapped:1;
bool start_sector_set:1;
bool high_wm_percent_set:1;
bool low_wm_percent_set:1;
bool max_writeback_jobs_set:1;
bool autocommit_blocks_set:1;
bool autocommit_time_set:1;
bool max_age_set:1;
bool writeback_fua_set:1;
bool flush_on_suspend:1;
bool cleaner:1;
bool cleaner_set:1;
unsigned high_wm_percent_value;
unsigned low_wm_percent_value;
unsigned autocommit_time_value;
unsigned max_age_value;
unsigned writeback_all;
struct workqueue_struct *writeback_wq;
......@@ -523,7 +532,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
region.bdev = wc->ssd_dev->bdev;
region.sector = 0;
region.count = PAGE_SIZE;
region.count = PAGE_SIZE >> SECTOR_SHIFT;
if (unlikely(region.sector + region.count > wc->metadata_sectors))
region.count = wc->metadata_sectors - region.sector;
......@@ -969,6 +978,8 @@ static void writecache_resume(struct dm_target *ti)
wc_lock(wc);
wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);
if (WC_MODE_PMEM(wc)) {
persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
} else {
......@@ -1638,6 +1649,10 @@ static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t
void *address = memory_data(wc, e);
persistent_memory_flush_cache(address, block_size);
if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
return true;
return bio_add_page(&wb->bio, persistent_memory_page(address),
block_size, persistent_memory_page_offset(address)) != 0;
}
......@@ -1709,6 +1724,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
if (writecache_has_error(wc)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
} else if (unlikely(!bio_sectors(bio))) {
bio->bi_status = BLK_STS_OK;
bio_endio(bio);
} else {
submit_bio(bio);
}
......@@ -1752,6 +1770,14 @@ static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writebac
e = f;
}
if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
if (to.sector >= wc->data_device_sectors) {
writecache_copy_endio(0, 0, c);
continue;
}
from.count = to.count = wc->data_device_sectors - to.sector;
}
dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
__writeback_throttle(wc, wbl);
......@@ -2004,8 +2030,7 @@ static void writecache_dtr(struct dm_target *ti)
if (wc->ssd_dev)
dm_put_device(ti, wc->ssd_dev);
if (wc->entries)
vfree(wc->entries);
vfree(wc->entries);
if (wc->memory_map) {
if (WC_MODE_PMEM(wc))
......@@ -2020,8 +2045,7 @@ static void writecache_dtr(struct dm_target *ti)
if (wc->dm_io)
dm_io_client_destroy(wc->dm_io);
if (wc->dirty_bitmap)
vfree(wc->dirty_bitmap);
vfree(wc->dirty_bitmap);
kfree(wc);
}
......@@ -2205,6 +2229,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
goto invalid_optional;
wc->start_sector = start_sector;
wc->start_sector_set = true;
if (wc->start_sector != start_sector ||
wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
goto invalid_optional;
......@@ -2214,6 +2239,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto invalid_optional;
if (high_wm_percent < 0 || high_wm_percent > 100)
goto invalid_optional;
wc->high_wm_percent_value = high_wm_percent;
wc->high_wm_percent_set = true;
} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
string = dm_shift_arg(&as), opt_params--;
......@@ -2221,6 +2247,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto invalid_optional;
if (low_wm_percent < 0 || low_wm_percent > 100)
goto invalid_optional;
wc->low_wm_percent_value = low_wm_percent;
wc->low_wm_percent_set = true;
} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
string = dm_shift_arg(&as), opt_params--;
......@@ -2240,6 +2267,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (autocommit_msecs > 3600000)
goto invalid_optional;
wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
wc->autocommit_time_value = autocommit_msecs;
wc->autocommit_time_set = true;
} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
unsigned max_age_msecs;
......@@ -2249,7 +2277,10 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (max_age_msecs > 86400000)
goto invalid_optional;
wc->max_age = msecs_to_jiffies(max_age_msecs);
wc->max_age_set = true;
wc->max_age_value = max_age_msecs;
} else if (!strcasecmp(string, "cleaner")) {
wc->cleaner_set = true;
wc->cleaner = true;
} else if (!strcasecmp(string, "fua")) {
if (WC_MODE_PMEM(wc)) {
......@@ -2455,7 +2486,6 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
struct dm_writecache *wc = ti->private;
unsigned extra_args;
unsigned sz = 0;
uint64_t x;
switch (type) {
case STATUSTYPE_INFO:
......@@ -2467,11 +2497,11 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
wc->dev->name, wc->ssd_dev->name, wc->block_size);
extra_args = 0;
if (wc->start_sector)
if (wc->start_sector_set)
extra_args += 2;
if (wc->high_wm_percent_set && !wc->cleaner)
if (wc->high_wm_percent_set)
extra_args += 2;
if (wc->low_wm_percent_set && !wc->cleaner)
if (wc->low_wm_percent_set)
extra_args += 2;
if (wc->max_writeback_jobs_set)
extra_args += 2;
......@@ -2479,37 +2509,29 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
extra_args += 2;
if (wc->autocommit_time_set)
extra_args += 2;
if (wc->max_age != MAX_AGE_UNSPECIFIED)
if (wc->max_age_set)
extra_args += 2;
if (wc->cleaner)
if (wc->cleaner_set)
extra_args++;
if (wc->writeback_fua_set)
extra_args++;
DMEMIT("%u", extra_args);
if (wc->start_sector)
if (wc->start_sector_set)
DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
if (wc->high_wm_percent_set && !wc->cleaner) {
x = (uint64_t)wc->freelist_high_watermark * 100;
x += wc->n_blocks / 2;
do_div(x, (size_t)wc->n_blocks);
DMEMIT(" high_watermark %u", 100 - (unsigned)x);
}
if (wc->low_wm_percent_set && !wc->cleaner) {
x = (uint64_t)wc->freelist_low_watermark * 100;
x += wc->n_blocks / 2;
do_div(x, (size_t)wc->n_blocks);
DMEMIT(" low_watermark %u", 100 - (unsigned)x);
}
if (wc->high_wm_percent_set)
DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
if (wc->low_wm_percent_set)
DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
if (wc->max_writeback_jobs_set)
DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
if (wc->autocommit_blocks_set)
DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
if (wc->autocommit_time_set)
DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
if (wc->max_age != MAX_AGE_UNSPECIFIED)
DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age));
if (wc->cleaner)
DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
if (wc->max_age_set)
DMEMIT(" max_age %u", wc->max_age_value);
if (wc->cleaner_set)
DMEMIT(" cleaner");
if (wc->writeback_fua_set)
DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
......@@ -2519,7 +2541,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
static struct target_type writecache_target = {
.name = "writecache",
.version = {1, 3, 0},
.version = {1, 4, 0},
.module = THIS_MODULE,
.ctr = writecache_ctr,
.dtr = writecache_dtr,
......
......@@ -28,6 +28,7 @@
#include <linux/refcount.h>
#include <linux/part_stat.h>
#include <linux/blk-crypto.h>
#include <linux/keyslot-manager.h>
#define DM_MSG_PREFIX "core"
......@@ -105,12 +106,16 @@ struct dm_io {
struct dm_target_io tio;
};
#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
#define DM_IO_BIO_OFFSET \
(offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
void *dm_per_bio_data(struct bio *bio, size_t data_size)
{
struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
if (!tio->inside_dm_io)
return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
return (char *)bio - DM_IO_BIO_OFFSET - data_size;
}
EXPORT_SYMBOL_GPL(dm_per_bio_data);
......@@ -118,9 +123,9 @@ struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
{
struct dm_io *io = (struct dm_io *)((char *)data + data_size);
if (io->magic == DM_IO_MAGIC)
return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
BUG_ON(io->magic != DM_TIO_MAGIC);
return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
}
EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
......@@ -148,6 +153,16 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;
#define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE)
static int swap_bios = DEFAULT_SWAP_BIOS;
/*
 * Return the limit on in-flight swap bios taken from the swap_bios
 * module parameter.  The parameter is sampled exactly once so the value
 * that is validated is the value that is returned; a zero or negative
 * setting is replaced by DEFAULT_SWAP_BIOS.
 */
static int get_swap_bios(void)
{
	int limit = READ_ONCE(swap_bios);

	return unlikely(limit <= 0) ? DEFAULT_SWAP_BIOS : limit;
}
/*
* For mempools pre-allocation at the table loading time.
*/
......@@ -969,6 +984,11 @@ void disable_write_zeroes(struct mapped_device *md)
limits->max_write_zeroes_sectors = 0;
}
/*
 * Tell whether @bio is subject to swap bio throttling on target @ti:
 * the bio must carry REQ_SWAP and the target must have set
 * limit_swap_bios.  Both conditions are hinted as unlikely.
 */
static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
{
	/* Not a swap bio: the target's flag is never consulted. */
	if (!unlikely((bio->bi_opf & REQ_SWAP) != 0))
		return false;

	return unlikely(ti->limit_swap_bios);
}
static void clone_endio(struct bio *bio)
{
blk_status_t error = bio->bi_status;
......@@ -1020,6 +1040,11 @@ static void clone_endio(struct bio *bio)
}
}
if (unlikely(swap_bios_limit(tio->ti, bio))) {
struct mapped_device *md = io->md;
up(&md->swap_bios_semaphore);
}
free_tio(tio);
dec_pending(io, error);
}
......@@ -1129,7 +1154,7 @@ static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bd
if (!map)
goto out;
ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize);
out:
dm_put_live_table(md, srcu_idx);
......@@ -1253,6 +1278,22 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
/*
 * Bring @md's swap bio limit in line with @latch.
 *
 * md->swap_bios records the limit the semaphore currently reflects.
 * With swap_bios_lock held, the semaphore is taken once for every unit
 * the limit has to drop and released once for every unit it has to
 * rise, with md->swap_bios updated after each step.  cond_resched() is
 * called before every step.
 */
static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
{
	mutex_lock(&md->swap_bios_lock);

	/* Limit lowered: take one semaphore slot per step down. */
	for (; md->swap_bios > latch; md->swap_bios--) {
		cond_resched();
		down(&md->swap_bios_semaphore);
	}

	/* Limit raised: hand one semaphore slot back per step up. */
	for (; md->swap_bios < latch; md->swap_bios++) {
		cond_resched();
		up(&md->swap_bios_semaphore);
	}

	mutex_unlock(&md->swap_bios_lock);
}
static blk_qc_t __map_bio(struct dm_target_io *tio)
{
int r;
......@@ -1272,6 +1313,14 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
atomic_inc(&io->io_count);
sector = clone->bi_iter.bi_sector;
if (unlikely(swap_bios_limit(ti, clone))) {
struct mapped_device *md = io->md;
int latch = get_swap_bios();
if (unlikely(latch != md->swap_bios))
__set_swap_bios_limit(md, latch);
down(&md->swap_bios_semaphore);
}
r = ti->type->map(ti, clone);
switch (r) {
case DM_MAPIO_SUBMITTED:
......@@ -1282,10 +1331,18 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
ret = submit_bio_noacct(clone);
break;
case DM_MAPIO_KILL:
if (unlikely(swap_bios_limit(ti, clone))) {
struct mapped_device *md = io->md;
up(&md->swap_bios_semaphore);
}
free_tio(tio);
dec_pending(io, BLK_STS_IOERR);
break;
case DM_MAPIO_REQUEUE:
if (unlikely(swap_bios_limit(ti, clone))) {
struct mapped_device *md = io->md;
up(&md->swap_bios_semaphore);
}
free_tio(tio);
dec_pending(io, BLK_STS_DM_REQUEUE);
break;
......@@ -1718,6 +1775,19 @@ static const struct dax_operations dm_dax_ops;
static void dm_wq_work(struct work_struct *work);
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
/*
 * Hand the keyslot manager attached to request queue @q (q->ksm) to
 * dm_destroy_keyslot_manager().
 */
static void dm_queue_destroy_keyslot_manager(struct request_queue *q)
{
dm_destroy_keyslot_manager(q->ksm);
}
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
/*
 * Empty stub used when CONFIG_BLK_INLINE_ENCRYPTION is not set, so that
 * callers need no #ifdef of their own.
 */
static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q)
{
}
#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
static void cleanup_mapped_device(struct mapped_device *md)
{
if (md->wq)
......@@ -1739,14 +1809,17 @@ static void cleanup_mapped_device(struct mapped_device *md)
put_disk(md->disk);
}
if (md->queue)
if (md->queue) {
dm_queue_destroy_keyslot_manager(md->queue);
blk_cleanup_queue(md->queue);
}
cleanup_srcu_struct(&md->io_barrier);
mutex_destroy(&md->suspend_lock);
mutex_destroy(&md->type_lock);
mutex_destroy(&md->table_devices_lock);
mutex_destroy(&md->swap_bios_lock);
dm_mq_cleanup_mapped_device(md);
}
......@@ -1814,6 +1887,10 @@ static struct mapped_device *alloc_dev(int minor)
init_waitqueue_head(&md->eventq);
init_completion(&md->kobj_holder.completion);
md->swap_bios = get_swap_bios();
sema_init(&md->swap_bios_semaphore, md->swap_bios);
mutex_init(&md->swap_bios_lock);
md->disk->major = _major;
md->disk->first_minor = minor;
md->disk->fops = &dm_blk_dops;
......@@ -2849,8 +2926,8 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
case DM_TYPE_BIO_BASED:
case DM_TYPE_DAX_BIO_BASED:
pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
if (ret)
goto out;
......@@ -3097,6 +3174,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
module_param(swap_bios, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
......@@ -73,7 +73,7 @@ void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
bool dm_table_supports_dax(struct dm_table *t, iterate_devices_callout_fn fn,
int *blocksize);
int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data);
void dm_lock_md_type(struct mapped_device *md);
......
......@@ -100,7 +100,7 @@ struct dm_block *shadow_parent(struct shadow_spine *s);
int shadow_has_parent(struct shadow_spine *s);
int shadow_root(struct shadow_spine *s);
dm_block_t shadow_root(struct shadow_spine *s);
/*
* Some inlines.
......
......@@ -235,7 +235,7 @@ int shadow_has_parent(struct shadow_spine *s)
return s->count >= 2;
}
int shadow_root(struct shadow_spine *s)
dm_block_t shadow_root(struct shadow_spine *s)
{
return s->root;
}
......
......@@ -93,9 +93,18 @@ typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv,
typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device **bdev);
#ifdef CONFIG_BLK_DEV_ZONED
typedef int (*dm_report_zones_fn) (struct dm_target *ti,
struct dm_report_zones_args *args,
unsigned int nr_zones);
#else
/*
* Define dm_report_zones_fn so that targets can assign it to NULL if
* CONFIG_BLK_DEV_ZONED is disabled. Otherwise each target would need
* awkward #ifdefs in its target_type, etc.
*/
typedef int (*dm_report_zones_fn) (struct dm_target *dummy);
#endif
/*
* These iteration functions are typically used to check (and combine)
......@@ -187,9 +196,7 @@ struct target_type {
dm_status_fn status;
dm_message_fn message;
dm_prepare_ioctl_fn prepare_ioctl;
#ifdef CONFIG_BLK_DEV_ZONED
dm_report_zones_fn report_zones;
#endif
dm_busy_fn busy;
dm_iterate_devices_fn iterate_devices;
dm_io_hints_fn io_hints;
......@@ -248,8 +255,13 @@ struct target_type {
/*
* Indicates that a target supports host-managed zoned block devices.
*/
#ifdef CONFIG_BLK_DEV_ZONED
#define DM_TARGET_ZONED_HM 0x00000040
#define dm_target_supports_zoned_hm(type) ((type)->features & DM_TARGET_ZONED_HM)
#else
#define DM_TARGET_ZONED_HM 0x00000000
#define dm_target_supports_zoned_hm(type) (false)
#endif
/*
* A target handles REQ_NOWAIT
......@@ -257,6 +269,12 @@ struct target_type {
#define DM_TARGET_NOWAIT 0x00000080
#define dm_target_supports_nowait(type) ((type)->features & DM_TARGET_NOWAIT)
/*
* A target supports passing through inline crypto support.
*/
#define DM_TARGET_PASSES_CRYPTO 0x00000100
#define dm_target_passes_crypto(type) ((type)->features & DM_TARGET_PASSES_CRYPTO)
struct dm_target {
struct dm_table *table;
struct target_type *type;
......@@ -325,6 +343,11 @@ struct dm_target {
* whether or not its underlying devices have support.
*/
bool discards_supported:1;
/*
* Set if we need to limit the number of in-flight bios when swapping.
*/
bool limit_swap_bios:1;
};
void *dm_per_bio_data(struct bio *bio, size_t data_size);
......@@ -533,6 +556,11 @@ void dm_table_run_md_queue_async(struct dm_table *t);
struct dm_table *dm_swap_table(struct mapped_device *md,
struct dm_table *t);
/*
* Table keyslot manager functions
*/
void dm_destroy_keyslot_manager(struct blk_keyslot_manager *ksm);
/*
* A wrapper around vmalloc.
*/
......
......@@ -106,4 +106,15 @@ void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm);
void blk_ksm_destroy(struct blk_keyslot_manager *ksm);
void blk_ksm_intersect_modes(struct blk_keyslot_manager *parent,
const struct blk_keyslot_manager *child);
void blk_ksm_init_passthrough(struct blk_keyslot_manager *ksm);
bool blk_ksm_is_superset(struct blk_keyslot_manager *ksm_superset,
struct blk_keyslot_manager *ksm_subset);
void blk_ksm_update_capabilities(struct blk_keyslot_manager *target_ksm,
struct blk_keyslot_manager *reference_ksm);
#endif /* __LINUX_KEYSLOT_MANAGER_H */
......@@ -272,9 +272,9 @@ enum {
#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
#define DM_VERSION_MAJOR 4
#define DM_VERSION_MINOR 43
#define DM_VERSION_MINOR 44
#define DM_VERSION_PATCHLEVEL 0
#define DM_VERSION_EXTRA "-ioctl (2020-10-01)"
#define DM_VERSION_EXTRA "-ioctl (2021-02-01)"
/* Status bits */
#define DM_READONLY_FLAG (1 << 0) /* In/Out */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment