Commit e9f8ca0a authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.6/dm-changes' of...

Merge tag 'for-5.6/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - Fix DM core's potential for q->make_request_fn NULL pointer in the
   unlikely case that a DM device is created without a DM table and then
   accessed due to upper-layer userspace code or user error.

 - Fix DM thin-provisioning's metadata_pre_commit_callback to not use
   memory after it is free'd. Also refactor code to disallow changing
   the thin-pool's data device once in use -- doing so guarantees same
   lifetime of pool's data device relative to the pool metadata.

 - Fix DM space maps used by DM thinp and DM cache to avoid reuse of an
   already used block. This race was identified with extremely heavy
   snapshot use in the context of DM thin provisioning.

 - Fix DM raid's table status relative to an active rebuild.

 - Fix DM crypt to use GFP_NOIO rather than GFP_NOFS in call to
   skcipher_request_alloc(). Also fix benbi IV constructor crash if used
   in authenticated mode.

 - Add DM crypt support for Elephant diffuser to allow for Bitlocker
   compatibility.

 - Fix DM verity target to not prefetch hash blocks for data that has
   already been verified.

 - Fix DM writecache's incorrect flush sequence during commit when in
   SSD mode.

 - Improve DM writecache's sequential write performance on SSDs.

 - Add DM zoned target support for zone sizes smaller than 128MiB.

 - Add DM multipath 'queue_if_no_path_timeout_secs' module param to
   allow timeout if path isn't reinstated. This allows users a kernel
   safety-net against IO hanging indefinitely, due to no active paths,
   that has historically only been provided by multipathd userspace.

 - Various DM code cleanups to use true/false rather than 1/0, a
   variable rename in dm-dust, and fix for a math error in comment for
   DM thin metadata's ondisk format.

* tag 'for-5.6/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (21 commits)
  dm: fix potential for q->make_request_fn NULL pointer
  dm writecache: improve performance of large linear writes on SSDs
  dm mpath: Add timeout mechanism for queue_if_no_path
  dm thin: change data device's flush_bio to be member of struct pool
  dm thin: don't allow changing data device during thin-pool reload
  dm thin: fix use-after-free in metadata_pre_commit_callback
  dm thin metadata: use pool locking at end of dm_pool_metadata_close
  dm writecache: fix incorrect flush sequence when doing SSD mode commit
  dm crypt: fix benbi IV constructor crash if used in authenticated mode
  dm crypt: Implement Elephant diffuser for Bitlocker compatibility
  dm space map common: fix to ensure new block isn't already in use
  dm verity: don't prefetch hash blocks for already-verified data
  dm crypt: fix GFP flags passed to skcipher_request_alloc()
  dm thin metadata: Fix trivial math error in on-disk format documentation
  dm thin metadata: use true/false for bool variable
  dm snapshot: use true/false for bool variable
  dm bio prison v2: use true/false for bool variable
  dm mpath: use true/false for bool variable
  dm zoned: support zone sizes smaller than 128MiB
  dm raid: table line rebuild status fixes
  ...
parents 05ef8b97 47ace7e0
...@@ -419,3 +419,5 @@ Version History ...@@ -419,3 +419,5 @@ Version History
rebuild errors. rebuild errors.
1.15.0 Fix size extensions not being synchronized in case of new MD bitmap 1.15.0 Fix size extensions not being synchronized in case of new MD bitmap
pages allocated; also fix those not occuring after previous reductions pages allocated; also fix those not occuring after previous reductions
1.15.1 Fix argument count and arguments for rebuild/write_mostly/journal_(dev|mode)
on the status line.
...@@ -324,7 +324,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison, ...@@ -324,7 +324,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison,
bio_list_init(&cell->bios); bio_list_init(&cell->bios);
if (cell->shared_count) { if (cell->shared_count) {
cell->exclusive_lock = 0; cell->exclusive_lock = false;
return false; return false;
} }
......
/* /*
* Copyright (C) 2003 Jana Saout <jana@saout.de> * Copyright (C) 2003 Jana Saout <jana@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
* Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved. * Copyright (C) 2006-2020 Red Hat, Inc. All rights reserved.
* Copyright (C) 2013-2017 Milan Broz <gmazyland@gmail.com> * Copyright (C) 2013-2020 Milan Broz <gmazyland@gmail.com>
* *
* This file is released under the GPL. * This file is released under the GPL.
*/ */
...@@ -115,6 +115,11 @@ struct iv_tcw_private { ...@@ -115,6 +115,11 @@ struct iv_tcw_private {
u8 *whitening; u8 *whitening;
}; };
#define ELEPHANT_MAX_KEY_SIZE 32
/*
 * Private state of the "elephant" IV mode: an extra cipher handle.
 * crypt_iv_elephant_ctr() allocates it as "ecb(aes)", crypt_iv_elephant_init()
 * keys it, and crypt_iv_elephant() uses it to derive the per-sector key.
 */
struct iv_elephant_private {
struct crypto_skcipher *tfm;
};
/* /*
* Crypt: maps a linear range of a block device * Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time. * and encrypts / decrypts at the same time.
...@@ -125,6 +130,7 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, ...@@ -125,6 +130,7 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
enum cipher_flags { enum cipher_flags {
CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */
CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */ CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */
CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */
}; };
/* /*
...@@ -152,6 +158,7 @@ struct crypt_config { ...@@ -152,6 +158,7 @@ struct crypt_config {
struct iv_benbi_private benbi; struct iv_benbi_private benbi;
struct iv_lmk_private lmk; struct iv_lmk_private lmk;
struct iv_tcw_private tcw; struct iv_tcw_private tcw;
struct iv_elephant_private elephant;
} iv_gen_private; } iv_gen_private;
u64 iv_offset; u64 iv_offset;
unsigned int iv_size; unsigned int iv_size;
...@@ -285,6 +292,11 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc) ...@@ -285,6 +292,11 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
* eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode) * eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode)
* The IV is encrypted little-endian byte-offset (with the same key * The IV is encrypted little-endian byte-offset (with the same key
* and cipher as the volume). * and cipher as the volume).
*
* elephant: The extended version of eboiv with additional Elephant diffuser
* used with Bitlocker CBC mode.
* This mode was used in older Windows systems
* http://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf
*/ */
static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
...@@ -331,8 +343,14 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, ...@@ -331,8 +343,14 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts) const char *opts)
{ {
unsigned bs = crypto_skcipher_blocksize(any_tfm(cc)); unsigned bs;
int log = ilog2(bs); int log;
if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags))
bs = crypto_aead_blocksize(any_tfm_aead(cc));
else
bs = crypto_skcipher_blocksize(any_tfm(cc));
log = ilog2(bs);
/* we need to calculate how far we must shift the sector count /* we need to calculate how far we must shift the sector count
* to get the cipher block count, we use this shift in _gen */ * to get the cipher block count, we use this shift in _gen */
...@@ -717,7 +735,7 @@ static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv, ...@@ -717,7 +735,7 @@ static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
struct crypto_wait wait; struct crypto_wait wait;
int err; int err;
req = skcipher_request_alloc(any_tfm(cc), GFP_KERNEL | GFP_NOFS); req = skcipher_request_alloc(any_tfm(cc), GFP_NOIO);
if (!req) if (!req)
return -ENOMEM; return -ENOMEM;
...@@ -734,6 +752,290 @@ static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv, ...@@ -734,6 +752,290 @@ static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv,
return err; return err;
} }
/*
 * Destructor of the elephant IV mode: release the extra cipher handle and
 * clear the stored pointer.
 */
static void crypt_iv_elephant_dtr(struct crypt_config *cc)
{
	struct crypto_skcipher **tfmp = &cc->iv_gen_private.elephant.tfm;

	crypto_free_skcipher(*tfmp);
	*tfmp = NULL;
}
/*
 * Constructor of the elephant IV mode.
 *
 * Allocates the "ecb(aes)" cipher used for the sector key and then runs the
 * eboiv constructor.  If the eboiv constructor fails, the cipher handle is
 * released again.
 *
 * Returns 0 on success or a negative error code.
 */
static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti,
				 const char *opts)
{
	struct iv_elephant_private *priv = &cc->iv_gen_private.elephant;
	struct crypto_skcipher *tfm;
	int err;

	tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
	if (IS_ERR(tfm)) {
		/* Never leave an ERR_PTR behind for the destructor. */
		priv->tfm = NULL;
		return PTR_ERR(tfm);
	}
	priv->tfm = tfm;

	err = crypt_iv_eboiv_ctr(cc, ti, NULL);
	if (err)
		crypt_iv_elephant_dtr(cc);

	return err;
}
/*
 * Convert n 32-bit words in d from little-endian to CPU byte order, in place.
 * Compiles to an empty function when __LITTLE_ENDIAN is defined.
 */
static void diffuser_disk_to_cpu(u32 *d, size_t n)
{
#ifndef __LITTLE_ENDIAN
	int idx = 0;

	while (idx < n) {
		d[idx] = le32_to_cpu((__le32)d[idx]);
		idx++;
	}
#endif
}
/*
 * Convert n 32-bit words in d from CPU byte order to little-endian, in place.
 * Compiles to an empty function when __LITTLE_ENDIAN is defined.
 */
static void diffuser_cpu_to_disk(__le32 *d, size_t n)
{
#ifndef __LITTLE_ENDIAN
	int idx = 0;

	while (idx < n) {
		d[idx] = cpu_to_le32((u32)d[idx]);
		idx++;
	}
#endif
}
/*
 * Diffuser A, decrypt direction, applied in place to n words in d.
 *
 * Runs 5 passes over the buffer from the first word to the last.  In each
 * pass every word d[i1] has d[i2] ^ rot(d[i3]) added to it, where i2 and i3
 * start at n - 2 and n - 5 (i.e. 2 and 5 words behind i1, wrapping around the
 * end of the buffer).  The rotation applied to d[i3] cycles through
 * "left by 9", none, "left by 13", none for consecutive words.
 *
 * NOTE(review): the loop handles four words per iteration without a bounds
 * check in between, so n is presumably a multiple of 4; the visible callers
 * pass cc->sector_size / sizeof(u32).
 */
static void diffuser_a_decrypt(u32 *d, size_t n)
{
int i, i1, i2, i3;
/* 5 passes over the whole buffer */
for (i = 0; i < 5; i++) {
i1 = 0;
i2 = n - 2;
i3 = n - 5;
while (i1 < (n - 1)) {
/* (x << 9 | x >> 23): rotate a 32-bit word left by 9 */
d[i1] += d[i2] ^ (d[i3] << 9 | d[i3] >> 23);
i1++; i2++; i3++;
/* wrap i3 back to the start of the buffer */
if (i3 >= n)
i3 -= n;
d[i1] += d[i2] ^ d[i3];
i1++; i2++; i3++;
/* wrap i2 back to the start of the buffer */
if (i2 >= n)
i2 -= n;
/* (x << 13 | x >> 19): rotate a 32-bit word left by 13 */
d[i1] += d[i2] ^ (d[i3] << 13 | d[i3] >> 19);
i1++; i2++; i3++;
d[i1] += d[i2] ^ d[i3];
i1++; i2++; i3++;
}
}
}
/*
 * Diffuser A, encrypt direction, applied in place to n words in d.
 *
 * Runs 5 passes over the buffer from the last word down to the first.  In
 * each pass every word d[i1] has d[i2] ^ rot(d[i3]) subtracted from it, where
 * i2 and i3 start 2 and 5 words behind i1 and wrap around to the end of the
 * buffer when they drop below 0.  The rotation applied to d[i3] cycles
 * through none, "left by 13", none, "left by 9" as i1 decreases.
 *
 * The indices are signed ints on purpose: the wrap checks test for < 0.
 *
 * NOTE(review): four words are handled per iteration without a bounds check
 * in between, so n is presumably a multiple of 4.
 */
static void diffuser_a_encrypt(u32 *d, size_t n)
{
int i, i1, i2, i3;
/* 5 passes over the whole buffer */
for (i = 0; i < 5; i++) {
i1 = n - 1;
i2 = n - 2 - 1;
i3 = n - 5 - 1;
while (i1 > 0) {
d[i1] -= d[i2] ^ d[i3];
i1--; i2--; i3--;
/* (x << 13 | x >> 19): rotate a 32-bit word left by 13 */
d[i1] -= d[i2] ^ (d[i3] << 13 | d[i3] >> 19);
i1--; i2--; i3--;
/* wrap i2 around to the end of the buffer */
if (i2 < 0)
i2 += n;
d[i1] -= d[i2] ^ d[i3];
i1--; i2--; i3--;
/* wrap i3 around to the end of the buffer */
if (i3 < 0)
i3 += n;
/* (x << 9 | x >> 23): rotate a 32-bit word left by 9 */
d[i1] -= d[i2] ^ (d[i3] << 9 | d[i3] >> 23);
i1--; i2--; i3--;
}
}
}
/*
 * Diffuser B, decrypt direction, applied in place to n words in d.
 *
 * Runs 3 passes over the buffer from the first word to the last.  In each
 * pass every word d[i1] has d[i2] ^ rot(d[i3]) added to it, where i2 and i3
 * start at 2 and 5 (i.e. 2 and 5 words ahead of i1) and wrap to the start of
 * the buffer when they reach n.  The rotation applied to d[i3] cycles through
 * none, "left by 10", none, "left by 25" for consecutive words.
 *
 * NOTE(review): four words are handled per iteration without a bounds check
 * in between, so n is presumably a multiple of 4.
 */
static void diffuser_b_decrypt(u32 *d, size_t n)
{
int i, i1, i2, i3;
/* 3 passes over the whole buffer */
for (i = 0; i < 3; i++) {
i1 = 0;
i2 = 2;
i3 = 5;
while (i1 < (n - 1)) {
d[i1] += d[i2] ^ d[i3];
i1++; i2++; i3++;
/* (x << 10 | x >> 22): rotate a 32-bit word left by 10 */
d[i1] += d[i2] ^ (d[i3] << 10 | d[i3] >> 22);
i1++; i2++; i3++;
/* wrap i2 back to the start of the buffer */
if (i2 >= n)
i2 -= n;
d[i1] += d[i2] ^ d[i3];
i1++; i2++; i3++;
/* wrap i3 back to the start of the buffer */
if (i3 >= n)
i3 -= n;
/* (x << 25 | x >> 7): rotate a 32-bit word left by 25 */
d[i1] += d[i2] ^ (d[i3] << 25 | d[i3] >> 7);
i1++; i2++; i3++;
}
}
}
/*
 * Diffuser B, encrypt direction, applied in place to n words in d.
 *
 * Runs 3 passes over the buffer from the last word down to the first.  In
 * each pass every word d[i1] has d[i2] ^ rot(d[i3]) subtracted from it.  i2
 * and i3 start at 1 and 4, which are the positions 2 and 5 words ahead of the
 * last word once wrapped around the end of the buffer; they wrap back to the
 * end when they drop below 0.  The rotation applied to d[i3] cycles through
 * "left by 25", none, "left by 10", none as i1 decreases.
 *
 * The indices are signed ints on purpose: the wrap checks test for < 0.
 *
 * NOTE(review): four words are handled per iteration without a bounds check
 * in between, so n is presumably a multiple of 4.
 */
static void diffuser_b_encrypt(u32 *d, size_t n)
{
int i, i1, i2, i3;
/* 3 passes over the whole buffer */
for (i = 0; i < 3; i++) {
i1 = n - 1;
i2 = 2 - 1;
i3 = 5 - 1;
while (i1 > 0) {
/* (x << 25 | x >> 7): rotate a 32-bit word left by 25 */
d[i1] -= d[i2] ^ (d[i3] << 25 | d[i3] >> 7);
i1--; i2--; i3--;
/* wrap i3 around to the end of the buffer */
if (i3 < 0)
i3 += n;
d[i1] -= d[i2] ^ d[i3];
i1--; i2--; i3--;
/* wrap i2 around to the end of the buffer */
if (i2 < 0)
i2 += n;
/* (x << 10 | x >> 22): rotate a 32-bit word left by 10 */
d[i1] -= d[i2] ^ (d[i3] << 10 | d[i3] >> 22);
i1--; i2--; i3--;
d[i1] -= d[i2] ^ d[i3];
i1--; i2--; i3--;
}
}
}
static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *dmreq)
{
struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant;
u8 *es, *ks, *data, *data2, *data_offset;
struct skcipher_request *req;
struct scatterlist *sg, *sg2, src, dst;
struct crypto_wait wait;
int i, r;
req = skcipher_request_alloc(elephant->tfm, GFP_NOIO);
es = kzalloc(16, GFP_NOIO); /* Key for AES */
ks = kzalloc(32, GFP_NOIO); /* Elephant sector key */
if (!req || !es || !ks) {
r = -ENOMEM;
goto out;
}
*(__le64 *)es = cpu_to_le64(dmreq->iv_sector * cc->sector_size);
/* E(Ks, e(s)) */
sg_init_one(&src, es, 16);
sg_init_one(&dst, ks, 16);
skcipher_request_set_crypt(req, &src, &dst, 16, NULL);
skcipher_request_set_callback(req, 0, crypto_req_done, &wait);
r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
if (r)
goto out;
/* E(Ks, e'(s)) */
es[15] = 0x80;
sg_init_one(&dst, &ks[16], 16);
r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
if (r)
goto out;
sg = crypt_get_sg_data(cc, dmreq->sg_out);
data = kmap_atomic(sg_page(sg));
data_offset = data + sg->offset;
/* Cannot modify original bio, copy to sg_out and apply Elephant to it */
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
sg2 = crypt_get_sg_data(cc, dmreq->sg_in);
data2 = kmap_atomic(sg_page(sg2));
memcpy(data_offset, data2 + sg2->offset, cc->sector_size);
kunmap_atomic(data2);
}
if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_b_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_a_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32));
}
for (i = 0; i < (cc->sector_size / 32); i++)
crypto_xor(data_offset + i * 32, ks, 32);
if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_a_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_b_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32));
diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32));
}
kunmap_atomic(data);
out:
kzfree(ks);
kzfree(es);
skcipher_request_free(req);
return r;
}
/*
 * IV generator of the elephant mode.
 *
 * For writes the Elephant layer is applied first; if that fails its error is
 * returned.  In every other case the IV itself comes from the eboiv
 * generator.
 */
static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv,
				 struct dm_crypt_request *dmreq)
{
	int err = 0;

	if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
		err = crypt_iv_elephant(cc, dmreq);

	return err ? err : crypt_iv_eboiv_gen(cc, iv, dmreq);
}
/*
 * Post hook of the elephant mode: writes need no post-processing (returns 0);
 * every other request gets the Elephant layer applied here.
 */
static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv,
				  struct dm_crypt_request *dmreq)
{
	if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
		return 0;

	return crypt_iv_elephant(cc, dmreq);
}
/*
 * Key the elephant cipher with the last key_extra_size bytes of cc->key.
 * Returns the result of crypto_skcipher_setkey().
 */
static int crypt_iv_elephant_init(struct crypt_config *cc)
{
	int off = cc->key_size - cc->key_extra_size;

	return crypto_skcipher_setkey(cc->iv_gen_private.elephant.tfm,
				      cc->key + off, cc->key_extra_size);
}
/*
 * Replace the elephant cipher key with key_extra_size zero bytes.
 * Returns the result of crypto_skcipher_setkey().
 */
static int crypt_iv_elephant_wipe(struct crypt_config *cc)
{
	u8 zero_key[ELEPHANT_MAX_KEY_SIZE];

	memset(zero_key, 0, cc->key_extra_size);

	return crypto_skcipher_setkey(cc->iv_gen_private.elephant.tfm,
				      zero_key, cc->key_extra_size);
}
static const struct crypt_iv_operations crypt_iv_plain_ops = { static const struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen .generator = crypt_iv_plain_gen
}; };
...@@ -787,6 +1089,15 @@ static struct crypt_iv_operations crypt_iv_eboiv_ops = { ...@@ -787,6 +1089,15 @@ static struct crypt_iv_operations crypt_iv_eboiv_ops = {
.generator = crypt_iv_eboiv_gen .generator = crypt_iv_eboiv_gen
}; };
static struct crypt_iv_operations crypt_iv_elephant_ops = {
.ctr = crypt_iv_elephant_ctr,
.dtr = crypt_iv_elephant_dtr,
.init = crypt_iv_elephant_init,
.wipe = crypt_iv_elephant_wipe,
.generator = crypt_iv_elephant_gen,
.post = crypt_iv_elephant_post
};
/* /*
* Integrity extensions * Integrity extensions
*/ */
...@@ -1103,6 +1414,9 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, ...@@ -1103,6 +1414,9 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
r = cc->iv_gen_ops->generator(cc, org_iv, dmreq); r = cc->iv_gen_ops->generator(cc, org_iv, dmreq);
if (r < 0) if (r < 0)
return r; return r;
/* Data can be already preprocessed in generator */
if (test_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags))
sg_in = sg_out;
/* Store generated IV in integrity metadata */ /* Store generated IV in integrity metadata */
if (cc->integrity_iv_size) if (cc->integrity_iv_size)
memcpy(tag_iv, org_iv, cc->integrity_iv_size); memcpy(tag_iv, org_iv, cc->integrity_iv_size);
...@@ -2191,7 +2505,14 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) ...@@ -2191,7 +2505,14 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
cc->iv_gen_ops = &crypt_iv_null_ops; cc->iv_gen_ops = &crypt_iv_null_ops;
else if (strcmp(ivmode, "eboiv") == 0) else if (strcmp(ivmode, "eboiv") == 0)
cc->iv_gen_ops = &crypt_iv_eboiv_ops; cc->iv_gen_ops = &crypt_iv_eboiv_ops;
else if (strcmp(ivmode, "lmk") == 0) { else if (strcmp(ivmode, "elephant") == 0) {
cc->iv_gen_ops = &crypt_iv_elephant_ops;
cc->key_parts = 2;
cc->key_extra_size = cc->key_size / 2;
if (cc->key_extra_size > ELEPHANT_MAX_KEY_SIZE)
return -EINVAL;
set_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags);
} else if (strcmp(ivmode, "lmk") == 0) {
cc->iv_gen_ops = &crypt_iv_lmk_ops; cc->iv_gen_ops = &crypt_iv_lmk_ops;
/* /*
* Version 2 and 3 is recognised according * Version 2 and 3 is recognised according
...@@ -2959,7 +3280,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) ...@@ -2959,7 +3280,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = { static struct target_type crypt_target = {
.name = "crypt", .name = "crypt",
.version = {1, 19, 0}, .version = {1, 20, 0},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = crypt_ctr, .ctr = crypt_ctr,
.dtr = crypt_dtr, .dtr = crypt_dtr,
......
...@@ -207,16 +207,16 @@ static int dust_map_write(struct dust_device *dd, sector_t thisblock, ...@@ -207,16 +207,16 @@ static int dust_map_write(struct dust_device *dd, sector_t thisblock,
bool fail_read_on_bb) bool fail_read_on_bb)
{ {
unsigned long flags; unsigned long flags;
int ret = DM_MAPIO_REMAPPED; int r = DM_MAPIO_REMAPPED;
if (fail_read_on_bb) { if (fail_read_on_bb) {
thisblock >>= dd->sect_per_block_shift; thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags); spin_lock_irqsave(&dd->dust_lock, flags);
ret = __dust_map_write(dd, thisblock); r = __dust_map_write(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags); spin_unlock_irqrestore(&dd->dust_lock, flags);
} }
return ret; return r;
} }
static int dust_map(struct dm_target *ti, struct bio *bio) static int dust_map(struct dm_target *ti, struct bio *bio)
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/time.h> #include <linux/time.h>
#include <linux/timer.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/delay.h> #include <linux/delay.h>
#include <scsi/scsi_dh.h> #include <scsi/scsi_dh.h>
...@@ -29,6 +30,9 @@ ...@@ -29,6 +30,9 @@
#define DM_MSG_PREFIX "multipath" #define DM_MSG_PREFIX "multipath"
#define DM_PG_INIT_DELAY_MSECS 2000 #define DM_PG_INIT_DELAY_MSECS 2000
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0
static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT;
/* Path properties */ /* Path properties */
struct pgpath { struct pgpath {
...@@ -91,6 +95,8 @@ struct multipath { ...@@ -91,6 +95,8 @@ struct multipath {
struct work_struct process_queued_bios; struct work_struct process_queued_bios;
struct bio_list queued_bios; struct bio_list queued_bios;
struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
}; };
/* /*
...@@ -108,6 +114,7 @@ static void trigger_event(struct work_struct *work); ...@@ -108,6 +114,7 @@ static void trigger_event(struct work_struct *work);
static void activate_or_offline_path(struct pgpath *pgpath); static void activate_or_offline_path(struct pgpath *pgpath);
static void activate_path_work(struct work_struct *work); static void activate_path_work(struct work_struct *work);
static void process_queued_bios(struct work_struct *work); static void process_queued_bios(struct work_struct *work);
static void queue_if_no_path_timeout_work(struct timer_list *t);
/*----------------------------------------------- /*-----------------------------------------------
* Multipath state flags. * Multipath state flags.
...@@ -195,6 +202,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti) ...@@ -195,6 +202,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m->ti = ti; m->ti = ti;
ti->private = m; ti->private = m;
timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0);
} }
return m; return m;
...@@ -717,6 +726,43 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, ...@@ -717,6 +726,43 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
return 0; return 0;
} }
/*
 * Timer callback for the queue_if_no_path timeout: warn, naming the mapped
 * device, and then turn queue_if_no_path off so that queued I/O is no longer
 * held back.
 */
static void queue_if_no_path_timeout_work(struct timer_list *t)
{
	struct multipath *m = from_timer(m, t, nopath_timer);

	DMWARN("queue_if_no_path timeout on %s, failing queued IO",
	       dm_device_name(dm_table_get_md(m->ti->table)));
	queue_if_no_path(m, false, false);
}
/*
* Enable the queue_if_no_path timeout if necessary.
* Called with m->lock held.
*/
static void enable_nopath_timeout(struct multipath *m)
{
unsigned long queue_if_no_path_timeout =
READ_ONCE(queue_if_no_path_timeout_secs) * HZ;
lockdep_assert_held(&m->lock);
if (queue_if_no_path_timeout > 0 &&
atomic_read(&m->nr_valid_paths) == 0 &&
test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
mod_timer(&m->nopath_timer,
jiffies + queue_if_no_path_timeout);
}
}
/*
 * Cancel the queue_if_no_path timer.
 *
 * NOTE(review): del_timer_sync() also waits for a running timer callback to
 * finish, so this presumably must not be called while holding a lock the
 * callback takes (e.g. m->lock) -- confirm against queue_if_no_path().
 */
static void disable_nopath_timeout(struct multipath *m)
{
del_timer_sync(&m->nopath_timer);
}
/* /*
* An event is triggered whenever a path is taken out of use. * An event is triggered whenever a path is taken out of use.
* Includes path failure and PG bypass. * Includes path failure and PG bypass.
...@@ -1090,6 +1136,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) ...@@ -1090,6 +1136,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct dm_arg_set as; struct dm_arg_set as;
unsigned pg_count = 0; unsigned pg_count = 0;
unsigned next_pg_num; unsigned next_pg_num;
unsigned long flags;
as.argc = argc; as.argc = argc;
as.argv = argv; as.argv = argv;
...@@ -1154,6 +1201,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) ...@@ -1154,6 +1201,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad; goto bad;
} }
spin_lock_irqsave(&m->lock, flags);
enable_nopath_timeout(m);
spin_unlock_irqrestore(&m->lock, flags);
ti->num_flush_bios = 1; ti->num_flush_bios = 1;
ti->num_discard_bios = 1; ti->num_discard_bios = 1;
ti->num_write_same_bios = 1; ti->num_write_same_bios = 1;
...@@ -1208,6 +1259,7 @@ static void multipath_dtr(struct dm_target *ti) ...@@ -1208,6 +1259,7 @@ static void multipath_dtr(struct dm_target *ti)
{ {
struct multipath *m = ti->private; struct multipath *m = ti->private;
disable_nopath_timeout(m);
flush_multipath_work(m); flush_multipath_work(m);
free_multipath(m); free_multipath(m);
} }
...@@ -1241,6 +1293,8 @@ static int fail_path(struct pgpath *pgpath) ...@@ -1241,6 +1293,8 @@ static int fail_path(struct pgpath *pgpath)
schedule_work(&m->trigger_event); schedule_work(&m->trigger_event);
enable_nopath_timeout(m);
out: out:
spin_unlock_irqrestore(&m->lock, flags); spin_unlock_irqrestore(&m->lock, flags);
...@@ -1291,6 +1345,9 @@ static int reinstate_path(struct pgpath *pgpath) ...@@ -1291,6 +1345,9 @@ static int reinstate_path(struct pgpath *pgpath)
process_queued_io_list(m); process_queued_io_list(m);
} }
if (pgpath->is_active)
disable_nopath_timeout(m);
return r; return r;
} }
...@@ -1444,7 +1501,7 @@ static void pg_init_done(void *data, int errors) ...@@ -1444,7 +1501,7 @@ static void pg_init_done(void *data, int errors)
break; break;
case SCSI_DH_RETRY: case SCSI_DH_RETRY:
/* Wait before retrying. */ /* Wait before retrying. */
delay_retry = 1; delay_retry = true;
/* fall through */ /* fall through */
case SCSI_DH_IMM_RETRY: case SCSI_DH_IMM_RETRY:
case SCSI_DH_RES_TEMP_UNAVAIL: case SCSI_DH_RES_TEMP_UNAVAIL:
...@@ -1789,6 +1846,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, ...@@ -1789,6 +1846,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
struct dm_dev *dev; struct dm_dev *dev;
struct multipath *m = ti->private; struct multipath *m = ti->private;
action_fn action; action_fn action;
unsigned long flags;
mutex_lock(&m->work_mutex); mutex_lock(&m->work_mutex);
...@@ -1800,9 +1858,13 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, ...@@ -1800,9 +1858,13 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv,
if (argc == 1) { if (argc == 1) {
if (!strcasecmp(argv[0], "queue_if_no_path")) { if (!strcasecmp(argv[0], "queue_if_no_path")) {
r = queue_if_no_path(m, true, false); r = queue_if_no_path(m, true, false);
spin_lock_irqsave(&m->lock, flags);
enable_nopath_timeout(m);
spin_unlock_irqrestore(&m->lock, flags);
goto out; goto out;
} else if (!strcasecmp(argv[0], "fail_if_no_path")) { } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
r = queue_if_no_path(m, false, false); r = queue_if_no_path(m, false, false);
disable_nopath_timeout(m);
goto out; goto out;
} }
} }
...@@ -2065,6 +2127,10 @@ static void __exit dm_multipath_exit(void) ...@@ -2065,6 +2127,10 @@ static void __exit dm_multipath_exit(void)
module_init(dm_multipath_init); module_init(dm_multipath_init);
module_exit(dm_multipath_exit); module_exit(dm_multipath_exit);
module_param_named(queue_if_no_path_timeout_secs,
queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds");
MODULE_DESCRIPTION(DM_NAME " multipath target"); MODULE_DESCRIPTION(DM_NAME " multipath target");
MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>"); MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
...@@ -129,7 +129,9 @@ struct raid_dev { ...@@ -129,7 +129,9 @@ struct raid_dev {
CTR_FLAG_RAID10_COPIES | \ CTR_FLAG_RAID10_COPIES | \
CTR_FLAG_RAID10_FORMAT | \ CTR_FLAG_RAID10_FORMAT | \
CTR_FLAG_DELTA_DISKS | \ CTR_FLAG_DELTA_DISKS | \
CTR_FLAG_DATA_OFFSET) CTR_FLAG_DATA_OFFSET | \
CTR_FLAG_JOURNAL_DEV | \
CTR_FLAG_JOURNAL_MODE)
/* Valid options definitions per raid level... */ /* Valid options definitions per raid level... */
...@@ -3001,7 +3003,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) ...@@ -3001,7 +3003,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{ 1, 254, "Cannot understand number of raid devices parameters" } { 1, 254, "Cannot understand number of raid devices parameters" }
}; };
/* Must have <raid_type> */
arg = dm_shift_arg(&as); arg = dm_shift_arg(&as);
if (!arg) { if (!arg) {
ti->error = "No arguments"; ti->error = "No arguments";
...@@ -3508,8 +3509,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, ...@@ -3508,8 +3509,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
unsigned long recovery; unsigned long recovery;
unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
unsigned int sz = 0; unsigned int sz = 0;
unsigned int rebuild_disks; unsigned int rebuild_writemostly_count = 0;
unsigned int write_mostly_params = 0;
sector_t progress, resync_max_sectors, resync_mismatches; sector_t progress, resync_max_sectors, resync_mismatches;
enum sync_state state; enum sync_state state;
struct raid_type *rt; struct raid_type *rt;
...@@ -3593,18 +3593,20 @@ static void raid_status(struct dm_target *ti, status_type_t type, ...@@ -3593,18 +3593,20 @@ static void raid_status(struct dm_target *ti, status_type_t type,
case STATUSTYPE_TABLE: case STATUSTYPE_TABLE:
/* Report the table line string you would use to construct this raid set */ /* Report the table line string you would use to construct this raid set */
/* Calculate raid parameter count */ /*
for (i = 0; i < rs->raid_disks; i++) * Count any rebuild or writemostly argument pairs and subtract the
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) * hweight count being added below of any rebuild and writemostly ctr flags.
write_mostly_params += 2; */
rebuild_disks = memweight(rs->rebuild_disks, DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks)); for (i = 0; i < rs->raid_disks; i++) {
raid_param_cnt += rebuild_disks * 2 + rebuild_writemostly_count += (test_bit(i, (void *) rs->rebuild_disks) ? 2 : 0) +
write_mostly_params + (test_bit(WriteMostly, &rs->dev[i].rdev.flags) ? 2 : 0);
}
rebuild_writemostly_count -= (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) ? 2 : 0) +
(test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags) ? 2 : 0);
/* Calculate raid parameter count based on ^ rebuild/writemostly argument counts and ctr flags set. */
raid_param_cnt += rebuild_writemostly_count +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
(test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) +
(test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0);
/* Emit table line */ /* Emit table line */
/* This has to be in the documented order for userspace! */ /* This has to be in the documented order for userspace! */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
...@@ -3612,11 +3614,10 @@ static void raid_status(struct dm_target *ti, status_type_t type, ...@@ -3612,11 +3614,10 @@ static void raid_status(struct dm_target *ti, status_type_t type,
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC)); DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC));
if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC));
if (rebuild_disks) if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++) for (i = 0; i < rs->raid_disks; i++)
if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks)) if (test_bit(i, (void *) rs->rebuild_disks))
DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), i);
rs->dev[i].rdev.raid_disk);
if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags))
DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP),
mddev->bitmap_info.daemon_sleep); mddev->bitmap_info.daemon_sleep);
...@@ -3626,7 +3627,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, ...@@ -3626,7 +3627,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE),
mddev->sync_speed_max); mddev->sync_speed_max);
if (write_mostly_params) if (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags))
for (i = 0; i < rs->raid_disks; i++) for (i = 0; i < rs->raid_disks; i++)
if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) if (test_bit(WriteMostly, &rs->dev[i].rdev.flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY), DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY),
...@@ -4029,7 +4030,7 @@ static void raid_resume(struct dm_target *ti) ...@@ -4029,7 +4030,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = { static struct target_type raid_target = {
.name = "raid", .name = "raid",
.version = {1, 15, 0}, .version = {1, 15, 1},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = raid_ctr, .ctr = raid_ctr,
.dtr = raid_dtr, .dtr = raid_dtr,
......
...@@ -1061,7 +1061,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) ...@@ -1061,7 +1061,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
DMERR("Read error in exception store: " DMERR("Read error in exception store: "
"shutting down merge"); "shutting down merge");
down_write(&s->lock); down_write(&s->lock);
s->merge_failed = 1; s->merge_failed = true;
up_write(&s->lock); up_write(&s->lock);
} }
goto shut; goto shut;
...@@ -1149,7 +1149,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) ...@@ -1149,7 +1149,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
shut: shut:
down_write(&s->lock); down_write(&s->lock);
s->merge_failed = 1; s->merge_failed = true;
b = __release_queued_bios_after_merge(s); b = __release_queued_bios_after_merge(s);
up_write(&s->lock); up_write(&s->lock);
error_bios(b); error_bios(b);
...@@ -1314,7 +1314,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ...@@ -1314,7 +1314,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
INIT_LIST_HEAD(&s->list); INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock); spin_lock_init(&s->pe_lock);
s->state_bits = 0; s->state_bits = 0;
s->merge_failed = 0; s->merge_failed = false;
s->first_merging_chunk = 0; s->first_merging_chunk = 0;
s->num_merging_chunks = 0; s->num_merging_chunks = 0;
bio_list_init(&s->bios_queued_during_merge); bio_list_init(&s->bios_queued_during_merge);
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
* *
* - A hierarchical btree, with 2 levels which effectively maps (thin * - A hierarchical btree, with 2 levels which effectively maps (thin
* dev id, virtual block) -> block_time. Block time is a 64-bit * dev id, virtual block) -> block_time. Block time is a 64-bit
* field holding the time in the low 24 bits, and block in the top 48 * field holding the time in the low 24 bits, and block in the top 40
* bits. * bits.
* *
* BTrees consist solely of btree_nodes, that fill a block. Some are * BTrees consist solely of btree_nodes, that fill a block. Some are
...@@ -387,16 +387,15 @@ static int subtree_equal(void *context, const void *value1_le, const void *value ...@@ -387,16 +387,15 @@ static int subtree_equal(void *context, const void *value1_le, const void *value
* Variant that is used for in-core only changes or code that * Variant that is used for in-core only changes or code that
* shouldn't put the pool in service on its own (e.g. commit). * shouldn't put the pool in service on its own (e.g. commit).
*/ */
static inline void __pmd_write_lock(struct dm_pool_metadata *pmd) static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
__acquires(pmd->root_lock) __acquires(pmd->root_lock)
{ {
down_write(&pmd->root_lock); down_write(&pmd->root_lock);
} }
#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd))
static inline void pmd_write_lock(struct dm_pool_metadata *pmd) static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{ {
__pmd_write_lock(pmd); pmd_write_lock_in_core(pmd);
if (unlikely(!pmd->in_service)) if (unlikely(!pmd->in_service))
pmd->in_service = true; pmd->in_service = true;
} }
...@@ -811,7 +810,7 @@ static int __write_changed_details(struct dm_pool_metadata *pmd) ...@@ -811,7 +810,7 @@ static int __write_changed_details(struct dm_pool_metadata *pmd)
return r; return r;
if (td->open_count) if (td->open_count)
td->changed = 0; td->changed = false;
else { else {
list_del(&td->list); list_del(&td->list);
kfree(td); kfree(td);
...@@ -831,6 +830,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) ...@@ -831,6 +830,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
* We need to know if the thin_disk_superblock exceeds a 512-byte sector. * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
*/ */
BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
BUG_ON(!rwsem_is_locked(&pmd->root_lock));
if (unlikely(!pmd->in_service)) if (unlikely(!pmd->in_service))
return 0; return 0;
...@@ -953,6 +953,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) ...@@ -953,6 +953,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
return -EBUSY; return -EBUSY;
} }
pmd_write_lock_in_core(pmd);
if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) { if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
r = __commit_transaction(pmd); r = __commit_transaction(pmd);
if (r < 0) if (r < 0)
...@@ -961,6 +962,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) ...@@ -961,6 +962,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
} }
if (!pmd->fail_io) if (!pmd->fail_io)
__destroy_persistent_data_objects(pmd); __destroy_persistent_data_objects(pmd);
pmd_write_unlock(pmd);
kfree(pmd); kfree(pmd);
return 0; return 0;
...@@ -1106,7 +1108,7 @@ static int __set_snapshot_details(struct dm_pool_metadata *pmd, ...@@ -1106,7 +1108,7 @@ static int __set_snapshot_details(struct dm_pool_metadata *pmd,
if (r) if (r)
return r; return r;
td->changed = 1; td->changed = true;
td->snapshotted_time = time; td->snapshotted_time = time;
snap->mapped_blocks = td->mapped_blocks; snap->mapped_blocks = td->mapped_blocks;
...@@ -1618,7 +1620,7 @@ static int __insert(struct dm_thin_device *td, dm_block_t block, ...@@ -1618,7 +1620,7 @@ static int __insert(struct dm_thin_device *td, dm_block_t block,
if (r) if (r)
return r; return r;
td->changed = 1; td->changed = true;
if (inserted) if (inserted)
td->mapped_blocks++; td->mapped_blocks++;
...@@ -1649,7 +1651,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block) ...@@ -1649,7 +1651,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
return r; return r;
td->mapped_blocks--; td->mapped_blocks--;
td->changed = 1; td->changed = true;
return 0; return 0;
} }
...@@ -1703,7 +1705,7 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_ ...@@ -1703,7 +1705,7 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_
} }
td->mapped_blocks -= total_count; td->mapped_blocks -= total_count;
td->changed = 1; td->changed = true;
/* /*
* Reinsert the mapping tree. * Reinsert the mapping tree.
...@@ -1841,7 +1843,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) ...@@ -1841,7 +1843,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
* Care is taken to not have commit be what * Care is taken to not have commit be what
* triggers putting the thin-pool in-service. * triggers putting the thin-pool in-service.
*/ */
__pmd_write_lock(pmd); pmd_write_lock_in_core(pmd);
if (pmd->fail_io) if (pmd->fail_io)
goto out; goto out;
......
...@@ -231,6 +231,7 @@ struct pool { ...@@ -231,6 +231,7 @@ struct pool {
struct dm_target *ti; /* Only set if a pool target is bound */ struct dm_target *ti; /* Only set if a pool target is bound */
struct mapped_device *pool_md; struct mapped_device *pool_md;
struct block_device *data_dev;
struct block_device *md_dev; struct block_device *md_dev;
struct dm_pool_metadata *pmd; struct dm_pool_metadata *pmd;
...@@ -281,6 +282,8 @@ struct pool { ...@@ -281,6 +282,8 @@ struct pool {
struct dm_bio_prison_cell **cell_sort_array; struct dm_bio_prison_cell **cell_sort_array;
mempool_t mapping_pool; mempool_t mapping_pool;
struct bio flush_bio;
}; };
static void metadata_operation_failed(struct pool *pool, const char *op, int r); static void metadata_operation_failed(struct pool *pool, const char *op, int r);
...@@ -328,7 +331,6 @@ struct pool_c { ...@@ -328,7 +331,6 @@ struct pool_c {
dm_block_t low_water_blocks; dm_block_t low_water_blocks;
struct pool_features requested_pf; /* Features requested during table load */ struct pool_features requested_pf; /* Features requested during table load */
struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */ struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */
struct bio flush_bio;
}; };
/* /*
...@@ -2924,6 +2926,7 @@ static void __pool_destroy(struct pool *pool) ...@@ -2924,6 +2926,7 @@ static void __pool_destroy(struct pool *pool)
if (pool->next_mapping) if (pool->next_mapping)
mempool_free(pool->next_mapping, &pool->mapping_pool); mempool_free(pool->next_mapping, &pool->mapping_pool);
mempool_exit(&pool->mapping_pool); mempool_exit(&pool->mapping_pool);
bio_uninit(&pool->flush_bio);
dm_deferred_set_destroy(pool->shared_read_ds); dm_deferred_set_destroy(pool->shared_read_ds);
dm_deferred_set_destroy(pool->all_io_ds); dm_deferred_set_destroy(pool->all_io_ds);
kfree(pool); kfree(pool);
...@@ -2933,6 +2936,7 @@ static struct kmem_cache *_new_mapping_cache; ...@@ -2933,6 +2936,7 @@ static struct kmem_cache *_new_mapping_cache;
static struct pool *pool_create(struct mapped_device *pool_md, static struct pool *pool_create(struct mapped_device *pool_md,
struct block_device *metadata_dev, struct block_device *metadata_dev,
struct block_device *data_dev,
unsigned long block_size, unsigned long block_size,
int read_only, char **error) int read_only, char **error)
{ {
...@@ -3003,6 +3007,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, ...@@ -3003,6 +3007,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->low_water_triggered = false; pool->low_water_triggered = false;
pool->suspended = true; pool->suspended = true;
pool->out_of_data_space = false; pool->out_of_data_space = false;
bio_init(&pool->flush_bio, NULL, 0);
pool->shared_read_ds = dm_deferred_set_create(); pool->shared_read_ds = dm_deferred_set_create();
if (!pool->shared_read_ds) { if (!pool->shared_read_ds) {
...@@ -3040,6 +3045,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, ...@@ -3040,6 +3045,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->last_commit_jiffies = jiffies; pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md; pool->pool_md = pool_md;
pool->md_dev = metadata_dev; pool->md_dev = metadata_dev;
pool->data_dev = data_dev;
__pool_table_insert(pool); __pool_table_insert(pool);
return pool; return pool;
...@@ -3081,6 +3087,7 @@ static void __pool_dec(struct pool *pool) ...@@ -3081,6 +3087,7 @@ static void __pool_dec(struct pool *pool)
static struct pool *__pool_find(struct mapped_device *pool_md, static struct pool *__pool_find(struct mapped_device *pool_md,
struct block_device *metadata_dev, struct block_device *metadata_dev,
struct block_device *data_dev,
unsigned long block_size, int read_only, unsigned long block_size, int read_only,
char **error, int *created) char **error, int *created)
{ {
...@@ -3091,19 +3098,23 @@ static struct pool *__pool_find(struct mapped_device *pool_md, ...@@ -3091,19 +3098,23 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
*error = "metadata device already in use by a pool"; *error = "metadata device already in use by a pool";
return ERR_PTR(-EBUSY); return ERR_PTR(-EBUSY);
} }
if (pool->data_dev != data_dev) {
*error = "data device already in use by a pool";
return ERR_PTR(-EBUSY);
}
__pool_inc(pool); __pool_inc(pool);
} else { } else {
pool = __pool_table_lookup(pool_md); pool = __pool_table_lookup(pool_md);
if (pool) { if (pool) {
if (pool->md_dev != metadata_dev) { if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
*error = "different pool cannot replace a pool"; *error = "different pool cannot replace a pool";
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
__pool_inc(pool); __pool_inc(pool);
} else { } else {
pool = pool_create(pool_md, metadata_dev, block_size, read_only, error); pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
*created = 1; *created = 1;
} }
} }
...@@ -3124,7 +3135,6 @@ static void pool_dtr(struct dm_target *ti) ...@@ -3124,7 +3135,6 @@ static void pool_dtr(struct dm_target *ti)
__pool_dec(pt->pool); __pool_dec(pt->pool);
dm_put_device(ti, pt->metadata_dev); dm_put_device(ti, pt->metadata_dev);
dm_put_device(ti, pt->data_dev); dm_put_device(ti, pt->data_dev);
bio_uninit(&pt->flush_bio);
kfree(pt); kfree(pt);
mutex_unlock(&dm_thin_pool_table.mutex); mutex_unlock(&dm_thin_pool_table.mutex);
...@@ -3203,11 +3213,11 @@ static void metadata_low_callback(void *context) ...@@ -3203,11 +3213,11 @@ static void metadata_low_callback(void *context)
*/ */
static int metadata_pre_commit_callback(void *context) static int metadata_pre_commit_callback(void *context)
{ {
struct pool_c *pt = context; struct pool *pool = context;
struct bio *flush_bio = &pt->flush_bio; struct bio *flush_bio = &pool->flush_bio;
bio_reset(flush_bio); bio_reset(flush_bio);
bio_set_dev(flush_bio, pt->data_dev->bdev); bio_set_dev(flush_bio, pool->data_dev);
flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
return submit_bio_wait(flush_bio); return submit_bio_wait(flush_bio);
...@@ -3356,7 +3366,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) ...@@ -3356,7 +3366,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out; goto out;
} }
pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
if (IS_ERR(pool)) { if (IS_ERR(pool)) {
r = PTR_ERR(pool); r = PTR_ERR(pool);
...@@ -3381,7 +3391,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) ...@@ -3381,7 +3391,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->data_dev = data_dev; pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks; pt->low_water_blocks = low_water_blocks;
pt->adjusted_pf = pt->requested_pf = pf; pt->adjusted_pf = pt->requested_pf = pf;
bio_init(&pt->flush_bio, NULL, 0);
ti->num_flush_bios = 1; ti->num_flush_bios = 1;
/* /*
...@@ -3408,9 +3417,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) ...@@ -3408,9 +3417,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (r) if (r)
goto out_flags_changed; goto out_flags_changed;
dm_pool_register_pre_commit_callback(pt->pool->pmd, dm_pool_register_pre_commit_callback(pool->pmd,
metadata_pre_commit_callback, metadata_pre_commit_callback, pool);
pt);
pt->callbacks.congested_fn = pool_is_congested; pt->callbacks.congested_fn = pool_is_congested;
dm_table_add_target_callbacks(ti->table, &pt->callbacks); dm_table_add_target_callbacks(ti->table, &pt->callbacks);
...@@ -4099,7 +4107,7 @@ static struct target_type pool_target = { ...@@ -4099,7 +4107,7 @@ static struct target_type pool_target = {
.name = "thin-pool", .name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE, DM_TARGET_IMMUTABLE,
.version = {1, 21, 0}, .version = {1, 22, 0},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = pool_ctr, .ctr = pool_ctr,
.dtr = pool_dtr, .dtr = pool_dtr,
...@@ -4476,7 +4484,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) ...@@ -4476,7 +4484,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = { static struct target_type thin_target = {
.name = "thin", .name = "thin",
.version = {1, 21, 0}, .version = {1, 22, 0},
.module = THIS_MODULE, .module = THIS_MODULE,
.ctr = thin_ctr, .ctr = thin_ctr,
.dtr = thin_dtr, .dtr = thin_dtr,
......
...@@ -611,8 +611,22 @@ static void verity_prefetch_io(struct work_struct *work) ...@@ -611,8 +611,22 @@ static void verity_prefetch_io(struct work_struct *work)
static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
{ {
sector_t block = io->block;
unsigned int n_blocks = io->n_blocks;
struct dm_verity_prefetch_work *pw; struct dm_verity_prefetch_work *pw;
if (v->validated_blocks) {
while (n_blocks && test_bit(block, v->validated_blocks)) {
block++;
n_blocks--;
}
while (n_blocks && test_bit(block + n_blocks - 1,
v->validated_blocks))
n_blocks--;
if (!n_blocks)
return;
}
pw = kmalloc(sizeof(struct dm_verity_prefetch_work), pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
...@@ -621,8 +635,8 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) ...@@ -621,8 +635,8 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
INIT_WORK(&pw->work, verity_prefetch_io); INIT_WORK(&pw->work, verity_prefetch_io);
pw->v = v; pw->v = v;
pw->block = io->block; pw->block = block;
pw->n_blocks = io->n_blocks; pw->n_blocks = n_blocks;
queue_work(v->verify_wq, &pw->work); queue_work(v->verify_wq, &pw->work);
} }
......
...@@ -442,7 +442,13 @@ static void writecache_notify_io(unsigned long error, void *context) ...@@ -442,7 +442,13 @@ static void writecache_notify_io(unsigned long error, void *context)
complete(&endio->c); complete(&endio->c);
} }
static void ssd_commit_flushed(struct dm_writecache *wc) static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
wait_event(wc->bio_in_progress_wait[direction],
!atomic_read(&wc->bio_in_progress[direction]));
}
static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{ {
struct dm_io_region region; struct dm_io_region region;
struct dm_io_request req; struct dm_io_request req;
...@@ -488,17 +494,20 @@ static void ssd_commit_flushed(struct dm_writecache *wc) ...@@ -488,17 +494,20 @@ static void ssd_commit_flushed(struct dm_writecache *wc)
writecache_notify_io(0, &endio); writecache_notify_io(0, &endio);
wait_for_completion_io(&endio.c); wait_for_completion_io(&endio.c);
if (wait_for_ios)
writecache_wait_for_ios(wc, WRITE);
writecache_disk_flush(wc, wc->ssd_dev); writecache_disk_flush(wc, wc->ssd_dev);
memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
} }
static void writecache_commit_flushed(struct dm_writecache *wc) static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
{ {
if (WC_MODE_PMEM(wc)) if (WC_MODE_PMEM(wc))
wmb(); wmb();
else else
ssd_commit_flushed(wc); ssd_commit_flushed(wc, wait_for_ios);
} }
static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
...@@ -522,12 +531,6 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) ...@@ -522,12 +531,6 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
writecache_error(wc, r, "error flushing metadata: %d", r); writecache_error(wc, r, "error flushing metadata: %d", r);
} }
static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
{
wait_event(wc->bio_in_progress_wait[direction],
!atomic_read(&wc->bio_in_progress[direction]));
}
#define WFE_RETURN_FOLLOWING 1 #define WFE_RETURN_FOLLOWING 1
#define WFE_LOWEST_SEQ 2 #define WFE_LOWEST_SEQ 2
...@@ -622,7 +625,7 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry ...@@ -622,7 +625,7 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry
wc->freelist_size++; wc->freelist_size++;
} }
static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{ {
struct wc_entry *e; struct wc_entry *e;
...@@ -631,6 +634,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) ...@@ -631,6 +634,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(!wc->current_free)) if (unlikely(!wc->current_free))
return NULL; return NULL;
e = wc->current_free; e = wc->current_free;
if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
return NULL;
next = rb_next(&e->rb_node); next = rb_next(&e->rb_node);
rb_erase(&e->rb_node, &wc->freetree); rb_erase(&e->rb_node, &wc->freetree);
if (unlikely(!next)) if (unlikely(!next))
...@@ -640,6 +645,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) ...@@ -640,6 +645,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
if (unlikely(list_empty(&wc->freelist))) if (unlikely(list_empty(&wc->freelist)))
return NULL; return NULL;
e = container_of(wc->freelist.next, struct wc_entry, lru); e = container_of(wc->freelist.next, struct wc_entry, lru);
if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
return NULL;
list_del(&e->lru); list_del(&e->lru);
} }
wc->freelist_size--; wc->freelist_size--;
...@@ -724,15 +731,12 @@ static void writecache_flush(struct dm_writecache *wc) ...@@ -724,15 +731,12 @@ static void writecache_flush(struct dm_writecache *wc)
e = e2; e = e2;
cond_resched(); cond_resched();
} }
writecache_commit_flushed(wc); writecache_commit_flushed(wc, true);
if (!WC_MODE_PMEM(wc))
writecache_wait_for_ios(wc, WRITE);
wc->seq_count++; wc->seq_count++;
pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count); writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
writecache_commit_flushed(wc); writecache_commit_flushed(wc, false);
wc->overwrote_committed = false; wc->overwrote_committed = false;
...@@ -756,7 +760,7 @@ static void writecache_flush(struct dm_writecache *wc) ...@@ -756,7 +760,7 @@ static void writecache_flush(struct dm_writecache *wc)
} }
if (need_flush_after_free) if (need_flush_after_free)
writecache_commit_flushed(wc); writecache_commit_flushed(wc, false);
} }
static void writecache_flush_work(struct work_struct *work) static void writecache_flush_work(struct work_struct *work)
...@@ -809,7 +813,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_ ...@@ -809,7 +813,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
} }
if (discarded_something) if (discarded_something)
writecache_commit_flushed(wc); writecache_commit_flushed(wc, false);
} }
static bool writecache_wait_for_writeback(struct dm_writecache *wc) static bool writecache_wait_for_writeback(struct dm_writecache *wc)
...@@ -958,7 +962,7 @@ static void writecache_resume(struct dm_target *ti) ...@@ -958,7 +962,7 @@ static void writecache_resume(struct dm_target *ti)
if (need_flush) { if (need_flush) {
writecache_flush_all_metadata(wc); writecache_flush_all_metadata(wc);
writecache_commit_flushed(wc); writecache_commit_flushed(wc, false);
} }
wc_unlock(wc); wc_unlock(wc);
...@@ -1193,7 +1197,7 @@ static int writecache_map(struct dm_target *ti, struct bio *bio) ...@@ -1193,7 +1197,7 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
goto bio_copy; goto bio_copy;
} }
} }
e = writecache_pop_from_freelist(wc); e = writecache_pop_from_freelist(wc, (sector_t)-1);
if (unlikely(!e)) { if (unlikely(!e)) {
writecache_wait_on_freelist(wc); writecache_wait_on_freelist(wc);
continue; continue;
...@@ -1205,9 +1209,26 @@ static int writecache_map(struct dm_target *ti, struct bio *bio) ...@@ -1205,9 +1209,26 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
if (WC_MODE_PMEM(wc)) { if (WC_MODE_PMEM(wc)) {
bio_copy_block(wc, bio, memory_data(wc, e)); bio_copy_block(wc, bio, memory_data(wc, e));
} else { } else {
dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); unsigned bio_size = wc->block_size;
sector_t start_cache_sec = cache_sector(wc, e);
sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
while (bio_size < bio->bi_iter.bi_size) {
struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
if (!f)
break;
write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
(bio_size >> SECTOR_SHIFT), wc->seq_count);
writecache_insert_entry(wc, f);
wc->uncommitted_blocks++;
bio_size += wc->block_size;
current_cache_sec += wc->block_size >> SECTOR_SHIFT;
}
bio_set_dev(bio, wc->ssd_dev->bdev); bio_set_dev(bio, wc->ssd_dev->bdev);
bio->bi_iter.bi_sector = cache_sector(wc, e); bio->bi_iter.bi_sector = start_cache_sec;
dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
wc->uncommitted_blocks = 0; wc->uncommitted_blocks = 0;
queue_work(wc->writeback_wq, &wc->flush_work); queue_work(wc->writeback_wq, &wc->flush_work);
...@@ -1342,7 +1363,7 @@ static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head * ...@@ -1342,7 +1363,7 @@ static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *
wc->writeback_size--; wc->writeback_size--;
n_walked++; n_walked++;
if (unlikely(n_walked >= ENDIO_LATENCY)) { if (unlikely(n_walked >= ENDIO_LATENCY)) {
writecache_commit_flushed(wc); writecache_commit_flushed(wc, false);
wc_unlock(wc); wc_unlock(wc);
wc_lock(wc); wc_lock(wc);
n_walked = 0; n_walked = 0;
...@@ -1423,7 +1444,7 @@ static int writecache_endio_thread(void *data) ...@@ -1423,7 +1444,7 @@ static int writecache_endio_thread(void *data)
writecache_wait_for_ios(wc, READ); writecache_wait_for_ios(wc, READ);
} }
writecache_commit_flushed(wc); writecache_commit_flushed(wc, false);
wc_unlock(wc); wc_unlock(wc);
} }
...@@ -1766,10 +1787,10 @@ static int init_memory(struct dm_writecache *wc) ...@@ -1766,10 +1787,10 @@ static int init_memory(struct dm_writecache *wc)
write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
writecache_flush_all_metadata(wc); writecache_flush_all_metadata(wc);
writecache_commit_flushed(wc); writecache_commit_flushed(wc, false);
pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC)); pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic); writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
writecache_commit_flushed(wc); writecache_commit_flushed(wc, false);
return 0; return 0;
} }
......
...@@ -134,6 +134,7 @@ struct dmz_metadata { ...@@ -134,6 +134,7 @@ struct dmz_metadata {
sector_t zone_bitmap_size; sector_t zone_bitmap_size;
unsigned int zone_nr_bitmap_blocks; unsigned int zone_nr_bitmap_blocks;
unsigned int zone_bits_per_mblk;
unsigned int nr_bitmap_blocks; unsigned int nr_bitmap_blocks;
unsigned int nr_map_blocks; unsigned int nr_map_blocks;
...@@ -1161,7 +1162,10 @@ static int dmz_init_zones(struct dmz_metadata *zmd) ...@@ -1161,7 +1162,10 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
/* Init */ /* Init */
zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3; zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT; zmd->zone_nr_bitmap_blocks =
max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks,
DMZ_BLOCK_SIZE_BITS);
/* Allocate zone array */ /* Allocate zone array */
zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
...@@ -1956,7 +1960,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, ...@@ -1956,7 +1960,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
dmz_release_mblock(zmd, to_mblk); dmz_release_mblock(zmd, to_mblk);
dmz_release_mblock(zmd, from_mblk); dmz_release_mblock(zmd, from_mblk);
chunk_block += DMZ_BLOCK_SIZE_BITS; chunk_block += zmd->zone_bits_per_mblk;
} }
to_zone->weight = from_zone->weight; to_zone->weight = from_zone->weight;
...@@ -2017,7 +2021,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, ...@@ -2017,7 +2021,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Set bits */ /* Set bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS; bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits); count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
if (count) { if (count) {
...@@ -2096,7 +2100,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, ...@@ -2096,7 +2100,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Clear bits */ /* Clear bits */
bit = chunk_block & DMZ_BLOCK_MASK_BITS; bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
count = dmz_clear_bits((unsigned long *)mblk->data, count = dmz_clear_bits((unsigned long *)mblk->data,
bit, nr_bits); bit, nr_bits);
...@@ -2156,6 +2160,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, ...@@ -2156,6 +2160,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
{ {
struct dmz_mblock *mblk; struct dmz_mblock *mblk;
unsigned int bit, set_bit, nr_bits; unsigned int bit, set_bit, nr_bits;
unsigned int zone_bits = zmd->zone_bits_per_mblk;
unsigned long *bitmap; unsigned long *bitmap;
int n = 0; int n = 0;
...@@ -2170,15 +2175,15 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, ...@@ -2170,15 +2175,15 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
/* Get offset */ /* Get offset */
bitmap = (unsigned long *) mblk->data; bitmap = (unsigned long *) mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS; bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); nr_bits = min(nr_blocks, zone_bits - bit);
if (set) if (set)
set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); set_bit = find_next_bit(bitmap, zone_bits, bit);
else else
set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
dmz_release_mblock(zmd, mblk); dmz_release_mblock(zmd, mblk);
n += set_bit - bit; n += set_bit - bit;
if (set_bit < DMZ_BLOCK_SIZE_BITS) if (set_bit < zone_bits)
break; break;
nr_blocks -= nr_bits; nr_blocks -= nr_bits;
...@@ -2281,7 +2286,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone) ...@@ -2281,7 +2286,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
/* Count bits in this block */ /* Count bits in this block */
bitmap = mblk->data; bitmap = mblk->data;
bit = chunk_block & DMZ_BLOCK_MASK_BITS; bit = chunk_block & DMZ_BLOCK_MASK_BITS;
nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
n += dmz_count_bits(bitmap, bit, nr_bits); n += dmz_count_bits(bitmap, bit, nr_bits);
dmz_release_mblock(zmd, mblk); dmz_release_mblock(zmd, mblk);
......
...@@ -1859,6 +1859,7 @@ static void dm_init_normal_md_queue(struct mapped_device *md) ...@@ -1859,6 +1859,7 @@ static void dm_init_normal_md_queue(struct mapped_device *md)
/* /*
* Initialize aspects of queue that aren't relevant for blk-mq * Initialize aspects of queue that aren't relevant for blk-mq
*/ */
md->queue->backing_dev_info->congested_data = md;
md->queue->backing_dev_info->congested_fn = dm_any_congested; md->queue->backing_dev_info->congested_fn = dm_any_congested;
} }
...@@ -1949,7 +1950,12 @@ static struct mapped_device *alloc_dev(int minor) ...@@ -1949,7 +1950,12 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->queue) if (!md->queue)
goto bad; goto bad;
md->queue->queuedata = md; md->queue->queuedata = md;
md->queue->backing_dev_info->congested_data = md; /*
* default to bio-based required ->make_request_fn until DM
* table is loaded and md->type established. If request-based
* table is loaded: blk-mq will override accordingly.
*/
blk_queue_make_request(md->queue, dm_make_request);
md->disk = alloc_disk_node(1, md->numa_node_id); md->disk = alloc_disk_node(1, md->numa_node_id);
if (!md->disk) if (!md->disk)
...@@ -2264,7 +2270,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) ...@@ -2264,7 +2270,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
case DM_TYPE_DAX_BIO_BASED: case DM_TYPE_DAX_BIO_BASED:
case DM_TYPE_NVME_BIO_BASED: case DM_TYPE_NVME_BIO_BASED:
dm_init_normal_md_queue(md); dm_init_normal_md_queue(md);
blk_queue_make_request(md->queue, dm_make_request);
break; break;
case DM_TYPE_NONE: case DM_TYPE_NONE:
WARN_ON_ONCE(true); WARN_ON_ONCE(true);
......
...@@ -380,6 +380,33 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, ...@@ -380,6 +380,33 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
return -ENOSPC; return -ENOSPC;
} }
/*
 * Find a block that is free in new_ll and also unreferenced in old_ll.
 *
 * Starting at 'begin', repeatedly asks sm_ll_find_free_block() for a free
 * block in new_ll (searching up to new_ll->nr_blocks) and then checks the
 * candidate against old_ll.  A candidate is accepted when it lies at or
 * beyond old_ll->nr_blocks, or when sm_ll_lookup() reports a zero count for
 * it in old_ll.  Otherwise the search resumes from the block after the
 * rejected candidate.
 *
 * On success returns 0 with the chosen block stored in *b.  A non-zero
 * result from sm_ll_find_free_block() or sm_ll_lookup() is returned as-is;
 * *b may have been written by then and should not be relied upon.
 *
 * NOTE(review): 'end' is accepted but never read here; the upper bound of
 * the search is always new_ll->nr_blocks.  Confirm that this is intended.
 */
int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
				 dm_block_t begin, dm_block_t end, dm_block_t *b)
{
	for (;;) {
		uint32_t old_count;
		int r;

		r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b);
		if (r)
			return r;

		/* A block outside the old map's range is not looked up there. */
		if (*b >= old_ll->nr_blocks)
			return 0;

		/* Double check this block wasn't used in the old transaction. */
		r = sm_ll_lookup(old_ll, *b, &old_count);
		if (r)
			return r;

		if (!old_count)
			return 0;

		/* Still referenced in the old map: skip past it and retry. */
		begin = *b + 1;
	}
}
static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
int (*mutator)(void *context, uint32_t old, uint32_t *new), int (*mutator)(void *context, uint32_t old, uint32_t *new),
void *context, enum allocation_event *ev) void *context, enum allocation_event *ev)
......
...@@ -109,6 +109,8 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result); ...@@ -109,6 +109,8 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result); int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
dm_block_t end, dm_block_t *result); dm_block_t end, dm_block_t *result);
int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
dm_block_t begin, dm_block_t end, dm_block_t *result);
int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev); int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
......
...@@ -167,8 +167,10 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) ...@@ -167,8 +167,10 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
enum allocation_event ev; enum allocation_event ev;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm); struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
/* FIXME: we should loop round a couple of times */ /*
r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b); * Any block we allocate has to be free in both the old and current ll.
*/
r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b);
if (r) if (r)
return r; return r;
......
...@@ -448,7 +448,10 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) ...@@ -448,7 +448,10 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
enum allocation_event ev; enum allocation_event ev;
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b); /*
* Any block we allocate has to be free in both the old and current ll.
*/
r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b);
if (r) if (r)
return r; return r;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment