Commit eeee2827 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.5/dm-changes' of...

Merge tag 'for-5.5/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - Fix DM core to disallow stacking request-based DM on partitions.

 - Fix DM raid target to properly resync raidset even if bitmap needed
   additional pages.

 - Fix DM crypt performance regression due to use of WQ_HIGHPRI for the
   IO and crypt workqueues.

 - Fix DM integrity metadata layout that was aligned on 128K boundary
   rather than the intended 4K boundary (removes 124K of wasted space
   for each metadata block).

 - Improve the DM thin, cache and clone targets to use spin_lock_irq
   rather than spin_lock_irqsave where possible.

 - Fix DM thin single thread performance that was lost due to needless
   workqueue wakeups.

 - Fix DM zoned target performance that was lost due to excessive
   backing device checks.

 - Add ability to trigger write failure with the DM dust test target.

 - Fix whitespace indentation in drivers/md/Kconfig.

 - Various smalls fixes and cleanups (e.g. use struct_size, fix
   uninitialized variable, variable renames, etc).

* tag 'for-5.5/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (22 commits)
  Revert "dm crypt: use WQ_HIGHPRI for the IO and crypt workqueues"
  dm: Fix Kconfig indentation
  dm thin: wakeup worker only when deferred bios exist
  dm integrity: fix excessive alignment of metadata runs
  dm raid: Remove unnecessary negation of a shift in raid10_format_to_md_layout
  dm zoned: reduce overhead of backing device checks
  dm dust: add limited write failure mode
  dm dust: change ret to r in dust_map_read and dust_map
  dm dust: change result vars to r
  dm cache: replace spin_lock_irqsave with spin_lock_irq
  dm bio prison: replace spin_lock_irqsave with spin_lock_irq
  dm thin: replace spin_lock_irqsave with spin_lock_irq
  dm clone: add bucket_lock_irq/bucket_unlock_irq helpers
  dm clone: replace spin_lock_irqsave with spin_lock_irq
  dm writecache: handle REQ_FUA
  dm writecache: fix uninitialized variable warning
  dm stripe: use struct_size() in kmalloc()
  dm raid: streamline rs_get_progress() and its raid_status() caller side
  dm raid: simplify rs_setup_recovery call chain
  dm raid: to ensure resynchronization, perform raid set grow in preresume
  ...
parents 7e5192b9 f612b213
......@@ -177,6 +177,11 @@ bitmap_flush_interval:number
The bitmap flush interval in milliseconds. The metadata buffers
are synchronized when this interval expires.
fix_padding
Use a smaller padding of the tag area that is more
space-efficient. If this option is not present, large padding is
used - that is for compatibility with older kernels.
The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
be changed when reloading the target (load an inactive table and swap the
......
......@@ -417,3 +417,5 @@ Version History
deadlock/potential data corruption. Update superblock when
specific devices are requested via rebuild. Fix RAID leg
rebuild errors.
1.15.0 Fix size extensions not being synchronized in case of new MD bitmap
pages allocated; also fix those not occuring after previous reductions
......@@ -150,11 +150,10 @@ static int bio_detain(struct dm_bio_prison *prison,
struct dm_bio_prison_cell **cell_result)
{
int r;
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
spin_lock_irq(&prison->lock);
r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
spin_unlock_irqrestore(&prison->lock, flags);
spin_unlock_irq(&prison->lock);
return r;
}
......@@ -198,11 +197,9 @@ void dm_cell_release(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell,
struct bio_list *bios)
{
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
spin_lock_irq(&prison->lock);
__cell_release(prison, cell, bios);
spin_unlock_irqrestore(&prison->lock, flags);
spin_unlock_irq(&prison->lock);
}
EXPORT_SYMBOL_GPL(dm_cell_release);
......@@ -250,12 +247,10 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
void *context,
struct dm_bio_prison_cell *cell)
{
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
spin_lock_irq(&prison->lock);
visit_fn(context, cell);
rb_erase(&cell->node, &prison->cells);
spin_unlock_irqrestore(&prison->lock, flags);
spin_unlock_irq(&prison->lock);
}
EXPORT_SYMBOL_GPL(dm_cell_visit_release);
......@@ -275,11 +270,10 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell)
{
int r;
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
spin_lock_irq(&prison->lock);
r = __promote_or_release(prison, cell);
spin_unlock_irqrestore(&prison->lock, flags);
spin_unlock_irq(&prison->lock);
return r;
}
......@@ -379,10 +373,9 @@ EXPORT_SYMBOL_GPL(dm_deferred_entry_dec);
int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work)
{
int r = 1;
unsigned long flags;
unsigned next_entry;
spin_lock_irqsave(&ds->lock, flags);
spin_lock_irq(&ds->lock);
if ((ds->sweeper == ds->current_entry) &&
!ds->entries[ds->current_entry].count)
r = 0;
......@@ -392,7 +385,7 @@ int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work)
if (!ds->entries[next_entry].count)
ds->current_entry = next_entry;
}
spin_unlock_irqrestore(&ds->lock, flags);
spin_unlock_irq(&ds->lock);
return r;
}
......
......@@ -177,11 +177,10 @@ bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 **cell_result)
{
int r;
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
spin_lock_irq(&prison->lock);
r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result);
spin_unlock_irqrestore(&prison->lock, flags);
spin_unlock_irq(&prison->lock);
return r;
}
......@@ -261,11 +260,10 @@ int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 **cell_result)
{
int r;
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
spin_lock_irq(&prison->lock);
r = __lock(prison, key, lock_level, cell_prealloc, cell_result);
spin_unlock_irqrestore(&prison->lock, flags);
spin_unlock_irq(&prison->lock);
return r;
}
......@@ -285,11 +283,9 @@ void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison,
struct dm_bio_prison_cell_v2 *cell,
struct work_struct *continuation)
{
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
spin_lock_irq(&prison->lock);
__quiesce(prison, cell, continuation);
spin_unlock_irqrestore(&prison->lock, flags);
spin_unlock_irq(&prison->lock);
}
EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2);
......@@ -309,11 +305,10 @@ int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison,
unsigned new_lock_level)
{
int r;
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
spin_lock_irq(&prison->lock);
r = __promote(prison, cell, new_lock_level);
spin_unlock_irqrestore(&prison->lock, flags);
spin_unlock_irq(&prison->lock);
return r;
}
......@@ -342,11 +337,10 @@ bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison,
struct bio_list *bios)
{
bool r;
unsigned long flags;
spin_lock_irqsave(&prison->lock, flags);
spin_lock_irq(&prison->lock);
r = __unlock(prison, cell, bios);
spin_unlock_irqrestore(&prison->lock, flags);
spin_unlock_irq(&prison->lock);
return r;
}
......
......@@ -74,22 +74,19 @@ static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
bool r;
unsigned long flags;
spin_lock_irqsave(&iot->lock, flags);
spin_lock_irq(&iot->lock);
r = __iot_idle_for(iot, jifs);
spin_unlock_irqrestore(&iot->lock, flags);
spin_unlock_irq(&iot->lock);
return r;
}
static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
unsigned long flags;
spin_lock_irqsave(&iot->lock, flags);
spin_lock_irq(&iot->lock);
iot->in_flight += len;
spin_unlock_irqrestore(&iot->lock, flags);
spin_unlock_irq(&iot->lock);
}
static void __iot_io_end(struct io_tracker *iot, sector_t len)
......@@ -172,7 +169,6 @@ static void __commit(struct work_struct *_ws)
{
struct batcher *b = container_of(_ws, struct batcher, commit_work);
blk_status_t r;
unsigned long flags;
struct list_head work_items;
struct work_struct *ws, *tmp;
struct continuation *k;
......@@ -186,12 +182,12 @@ static void __commit(struct work_struct *_ws)
* We have to grab these before the commit_op to avoid a race
* condition.
*/
spin_lock_irqsave(&b->lock, flags);
spin_lock_irq(&b->lock);
list_splice_init(&b->work_items, &work_items);
bio_list_merge(&bios, &b->bios);
bio_list_init(&b->bios);
b->commit_scheduled = false;
spin_unlock_irqrestore(&b->lock, flags);
spin_unlock_irq(&b->lock);
r = b->commit_op(b->commit_context);
......@@ -238,13 +234,12 @@ static void async_commit(struct batcher *b)
static void continue_after_commit(struct batcher *b, struct continuation *k)
{
unsigned long flags;
bool commit_scheduled;
spin_lock_irqsave(&b->lock, flags);
spin_lock_irq(&b->lock);
commit_scheduled = b->commit_scheduled;
list_add_tail(&k->ws.entry, &b->work_items);
spin_unlock_irqrestore(&b->lock, flags);
spin_unlock_irq(&b->lock);
if (commit_scheduled)
async_commit(b);
......@@ -255,13 +250,12 @@ static void continue_after_commit(struct batcher *b, struct continuation *k)
*/
static void issue_after_commit(struct batcher *b, struct bio *bio)
{
unsigned long flags;
bool commit_scheduled;
spin_lock_irqsave(&b->lock, flags);
spin_lock_irq(&b->lock);
commit_scheduled = b->commit_scheduled;
bio_list_add(&b->bios, bio);
spin_unlock_irqrestore(&b->lock, flags);
spin_unlock_irq(&b->lock);
if (commit_scheduled)
async_commit(b);
......@@ -273,12 +267,11 @@ static void issue_after_commit(struct batcher *b, struct bio *bio)
static void schedule_commit(struct batcher *b)
{
bool immediate;
unsigned long flags;
spin_lock_irqsave(&b->lock, flags);
spin_lock_irq(&b->lock);
immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
b->commit_scheduled = true;
spin_unlock_irqrestore(&b->lock, flags);
spin_unlock_irq(&b->lock);
if (immediate)
async_commit(b);
......@@ -630,23 +623,19 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio)
static void defer_bio(struct cache *cache, struct bio *bio)
{
unsigned long flags;
spin_lock_irqsave(&cache->lock, flags);
spin_lock_irq(&cache->lock);
bio_list_add(&cache->deferred_bios, bio);
spin_unlock_irqrestore(&cache->lock, flags);
spin_unlock_irq(&cache->lock);
wake_deferred_bio_worker(cache);
}
static void defer_bios(struct cache *cache, struct bio_list *bios)
{
unsigned long flags;
spin_lock_irqsave(&cache->lock, flags);
spin_lock_irq(&cache->lock);
bio_list_merge(&cache->deferred_bios, bios);
bio_list_init(bios);
spin_unlock_irqrestore(&cache->lock, flags);
spin_unlock_irq(&cache->lock);
wake_deferred_bio_worker(cache);
}
......@@ -756,33 +745,27 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
static void set_discard(struct cache *cache, dm_dblock_t b)
{
unsigned long flags;
BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
atomic_inc(&cache->stats.discard_count);
spin_lock_irqsave(&cache->lock, flags);
spin_lock_irq(&cache->lock);
set_bit(from_dblock(b), cache->discard_bitset);
spin_unlock_irqrestore(&cache->lock, flags);
spin_unlock_irq(&cache->lock);
}
static void clear_discard(struct cache *cache, dm_dblock_t b)
{
unsigned long flags;
spin_lock_irqsave(&cache->lock, flags);
spin_lock_irq(&cache->lock);
clear_bit(from_dblock(b), cache->discard_bitset);
spin_unlock_irqrestore(&cache->lock, flags);
spin_unlock_irq(&cache->lock);
}
static bool is_discarded(struct cache *cache, dm_dblock_t b)
{
int r;
unsigned long flags;
spin_lock_irqsave(&cache->lock, flags);
spin_lock_irq(&cache->lock);
r = test_bit(from_dblock(b), cache->discard_bitset);
spin_unlock_irqrestore(&cache->lock, flags);
spin_unlock_irq(&cache->lock);
return r;
}
......@@ -790,12 +773,10 @@ static bool is_discarded(struct cache *cache, dm_dblock_t b)
static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
{
int r;
unsigned long flags;
spin_lock_irqsave(&cache->lock, flags);
spin_lock_irq(&cache->lock);
r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
cache->discard_bitset);
spin_unlock_irqrestore(&cache->lock, flags);
spin_unlock_irq(&cache->lock);
return r;
}
......@@ -827,17 +808,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio,
static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
unsigned long flags;
struct per_bio_data *pb;
spin_lock_irqsave(&cache->lock, flags);
spin_lock_irq(&cache->lock);
if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
bio_op(bio) != REQ_OP_DISCARD) {
pb = get_per_bio_data(bio);
pb->tick = true;
cache->need_tick_bio = false;
}
spin_unlock_irqrestore(&cache->lock, flags);
spin_unlock_irq(&cache->lock);
}
static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
......@@ -1889,17 +1869,16 @@ static void process_deferred_bios(struct work_struct *ws)
{
struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
unsigned long flags;
bool commit_needed = false;
struct bio_list bios;
struct bio *bio;
bio_list_init(&bios);
spin_lock_irqsave(&cache->lock, flags);
spin_lock_irq(&cache->lock);
bio_list_merge(&bios, &cache->deferred_bios);
bio_list_init(&cache->deferred_bios);
spin_unlock_irqrestore(&cache->lock, flags);
spin_unlock_irq(&cache->lock);
while ((bio = bio_list_pop(&bios))) {
if (bio->bi_opf & REQ_PREFLUSH)
......
......@@ -712,7 +712,7 @@ static int __metadata_commit(struct dm_clone_metadata *cmd)
static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
{
int r;
unsigned long word, flags;
unsigned long word;
word = 0;
do {
......@@ -736,9 +736,9 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
return r;
/* Update the changed flag */
spin_lock_irqsave(&cmd->bitmap_lock, flags);
spin_lock_irq(&cmd->bitmap_lock);
dmap->changed = 0;
spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
spin_unlock_irq(&cmd->bitmap_lock);
return 0;
}
......@@ -746,7 +746,6 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
{
int r = -EPERM;
unsigned long flags;
struct dirty_map *dmap, *next_dmap;
down_write(&cmd->lock);
......@@ -770,9 +769,9 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
}
/* Swap dirty bitmaps */
spin_lock_irqsave(&cmd->bitmap_lock, flags);
spin_lock_irq(&cmd->bitmap_lock);
cmd->current_dmap = next_dmap;
spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
spin_unlock_irq(&cmd->bitmap_lock);
/*
* No one is accessing the old dirty bitmap anymore, so we can flush
......@@ -817,9 +816,9 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
{
int r = 0;
struct dirty_map *dmap;
unsigned long word, region_nr, flags;
unsigned long word, region_nr;
spin_lock_irqsave(&cmd->bitmap_lock, flags);
spin_lock_irq(&cmd->bitmap_lock);
if (cmd->read_only) {
r = -EPERM;
......@@ -836,7 +835,7 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
}
}
out:
spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
spin_unlock_irq(&cmd->bitmap_lock);
return r;
}
......@@ -903,13 +902,11 @@ int dm_clone_metadata_abort(struct dm_clone_metadata *cmd)
void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd)
{
unsigned long flags;
down_write(&cmd->lock);
spin_lock_irqsave(&cmd->bitmap_lock, flags);
spin_lock_irq(&cmd->bitmap_lock);
cmd->read_only = 1;
spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
spin_unlock_irq(&cmd->bitmap_lock);
if (!cmd->fail_io)
dm_bm_set_read_only(cmd->bm);
......@@ -919,13 +916,11 @@ void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd)
void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd)
{
unsigned long flags;
down_write(&cmd->lock);
spin_lock_irqsave(&cmd->bitmap_lock, flags);
spin_lock_irq(&cmd->bitmap_lock);
cmd->read_only = 0;
spin_unlock_irqrestore(&cmd->bitmap_lock, flags);
spin_unlock_irq(&cmd->bitmap_lock);
if (!cmd->fail_io)
dm_bm_set_read_write(cmd->bm);
......
......@@ -44,7 +44,9 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re
* @start: Starting region number
* @nr_regions: Number of regions in the range
*
* This function doesn't block, so it's safe to call it from interrupt context.
* This function doesn't block, but since it uses spin_lock_irq()/spin_unlock_irq()
* it's NOT safe to call it from any context where interrupts are disabled, e.g.,
* from interrupt context.
*/
int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
unsigned long nr_regions);
......
......@@ -332,8 +332,6 @@ static void submit_bios(struct bio_list *bios)
*/
static void issue_bio(struct clone *clone, struct bio *bio)
{
unsigned long flags;
if (!bio_triggers_commit(clone, bio)) {
generic_make_request(bio);
return;
......@@ -352,9 +350,9 @@ static void issue_bio(struct clone *clone, struct bio *bio)
* Batch together any bios that trigger commits and then issue a single
* commit for them in process_deferred_flush_bios().
*/
spin_lock_irqsave(&clone->lock, flags);
spin_lock_irq(&clone->lock);
bio_list_add(&clone->deferred_flush_bios, bio);
spin_unlock_irqrestore(&clone->lock, flags);
spin_unlock_irq(&clone->lock);
wake_worker(clone);
}
......@@ -469,7 +467,7 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ
static void process_discard_bio(struct clone *clone, struct bio *bio)
{
unsigned long rs, re, flags;
unsigned long rs, re;
bio_region_range(clone, bio, &rs, &re);
BUG_ON(re > clone->nr_regions);
......@@ -501,9 +499,9 @@ static void process_discard_bio(struct clone *clone, struct bio *bio)
/*
* Defer discard processing.
*/
spin_lock_irqsave(&clone->lock, flags);
spin_lock_irq(&clone->lock);
bio_list_add(&clone->deferred_discard_bios, bio);
spin_unlock_irqrestore(&clone->lock, flags);
spin_unlock_irq(&clone->lock);
wake_worker(clone);
}
......@@ -554,6 +552,12 @@ struct hash_table_bucket {
#define bucket_unlock_irqrestore(bucket, flags) \
spin_unlock_irqrestore(&(bucket)->lock, flags)
#define bucket_lock_irq(bucket) \
spin_lock_irq(&(bucket)->lock)
#define bucket_unlock_irq(bucket) \
spin_unlock_irq(&(bucket)->lock)
static int hash_table_init(struct clone *clone)
{
unsigned int i, sz;
......@@ -851,7 +855,6 @@ static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio
*/
static void hydrate_bio_region(struct clone *clone, struct bio *bio)
{
unsigned long flags;
unsigned long region_nr;
struct hash_table_bucket *bucket;
struct dm_clone_region_hydration *hd, *hd2;
......@@ -859,19 +862,19 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio)
region_nr = bio_to_region(clone, bio);
bucket = get_hash_table_bucket(clone, region_nr);
bucket_lock_irqsave(bucket, flags);
bucket_lock_irq(bucket);
hd = __hash_find(bucket, region_nr);
if (hd) {
/* Someone else is hydrating the region */
bio_list_add(&hd->deferred_bios, bio);
bucket_unlock_irqrestore(bucket, flags);
bucket_unlock_irq(bucket);
return;
}
if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
/* The region has been hydrated */
bucket_unlock_irqrestore(bucket, flags);
bucket_unlock_irq(bucket);
issue_bio(clone, bio);
return;
}
......@@ -880,16 +883,16 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio)
* We must allocate a hydration descriptor and start the hydration of
* the corresponding region.
*/
bucket_unlock_irqrestore(bucket, flags);
bucket_unlock_irq(bucket);
hd = alloc_hydration(clone);
hydration_init(hd, region_nr);
bucket_lock_irqsave(bucket, flags);
bucket_lock_irq(bucket);
/* Check if the region has been hydrated in the meantime. */
if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
bucket_unlock_irqrestore(bucket, flags);
bucket_unlock_irq(bucket);
free_hydration(hd);
issue_bio(clone, bio);
return;
......@@ -899,7 +902,7 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio)
if (hd2 != hd) {
/* Someone else started the region's hydration. */
bio_list_add(&hd2->deferred_bios, bio);
bucket_unlock_irqrestore(bucket, flags);
bucket_unlock_irq(bucket);
free_hydration(hd);
return;
}
......@@ -911,7 +914,7 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio)
*/
if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
hlist_del(&hd->h);
bucket_unlock_irqrestore(bucket, flags);
bucket_unlock_irq(bucket);
free_hydration(hd);
bio_io_error(bio);
return;
......@@ -925,11 +928,11 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio)
* to the destination device.
*/
if (is_overwrite_bio(clone, bio)) {
bucket_unlock_irqrestore(bucket, flags);
bucket_unlock_irq(bucket);
hydration_overwrite(hd, bio);
} else {
bio_list_add(&hd->deferred_bios, bio);
bucket_unlock_irqrestore(bucket, flags);
bucket_unlock_irq(bucket);
hydration_copy(hd, 1);
}
}
......@@ -996,7 +999,6 @@ static unsigned long __start_next_hydration(struct clone *clone,
unsigned long offset,
struct batch_info *batch)
{
unsigned long flags;
struct hash_table_bucket *bucket;
struct dm_clone_region_hydration *hd;
unsigned long nr_regions = clone->nr_regions;
......@@ -1010,13 +1012,13 @@ static unsigned long __start_next_hydration(struct clone *clone,
break;
bucket = get_hash_table_bucket(clone, offset);
bucket_lock_irqsave(bucket, flags);
bucket_lock_irq(bucket);
if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
!__hash_find(bucket, offset)) {
hydration_init(hd, offset);
__insert_region_hydration(bucket, hd);
bucket_unlock_irqrestore(bucket, flags);
bucket_unlock_irq(bucket);
/* Batch hydration */
__batch_hydration(batch, hd);
......@@ -1024,7 +1026,7 @@ static unsigned long __start_next_hydration(struct clone *clone,
return (offset + 1);
}
bucket_unlock_irqrestore(bucket, flags);
bucket_unlock_irq(bucket);
} while (++offset < nr_regions);
......@@ -1140,13 +1142,13 @@ static void process_deferred_discards(struct clone *clone)
int r = -EPERM;
struct bio *bio;
struct blk_plug plug;
unsigned long rs, re, flags;
unsigned long rs, re;
struct bio_list discards = BIO_EMPTY_LIST;
spin_lock_irqsave(&clone->lock, flags);
spin_lock_irq(&clone->lock);
bio_list_merge(&discards, &clone->deferred_discard_bios);
bio_list_init(&clone->deferred_discard_bios);
spin_unlock_irqrestore(&clone->lock, flags);
spin_unlock_irq(&clone->lock);
if (bio_list_empty(&discards))
return;
......@@ -1176,13 +1178,12 @@ static void process_deferred_discards(struct clone *clone)
static void process_deferred_bios(struct clone *clone)
{
unsigned long flags;
struct bio_list bios = BIO_EMPTY_LIST;
spin_lock_irqsave(&clone->lock, flags);
spin_lock_irq(&clone->lock);
bio_list_merge(&bios, &clone->deferred_bios);
bio_list_init(&clone->deferred_bios);
spin_unlock_irqrestore(&clone->lock, flags);
spin_unlock_irq(&clone->lock);
if (bio_list_empty(&bios))
return;
......@@ -1193,7 +1194,6 @@ static void process_deferred_bios(struct clone *clone)
static void process_deferred_flush_bios(struct clone *clone)
{
struct bio *bio;
unsigned long flags;
struct bio_list bios = BIO_EMPTY_LIST;
struct bio_list bio_completions = BIO_EMPTY_LIST;
......@@ -1201,13 +1201,13 @@ static void process_deferred_flush_bios(struct clone *clone)
* If there are any deferred flush bios, we must commit the metadata
* before issuing them or signaling their completion.
*/
spin_lock_irqsave(&clone->lock, flags);
spin_lock_irq(&clone->lock);
bio_list_merge(&bios, &clone->deferred_flush_bios);
bio_list_init(&clone->deferred_flush_bios);
bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
bio_list_init(&clone->deferred_flush_completions);
spin_unlock_irqrestore(&clone->lock, flags);
spin_unlock_irq(&clone->lock);
if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
!(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
......
......@@ -2700,21 +2700,18 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ret = -ENOMEM;
cc->io_queue = alloc_workqueue("kcryptd_io/%s",
WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
1, devname);
cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
cc->crypt_queue = alloc_workqueue("kcryptd/%s",
WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
1, devname);
else
cc->crypt_queue = alloc_workqueue("kcryptd/%s",
WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
num_online_cpus(), devname);
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
......
......@@ -17,6 +17,7 @@
struct badblock {
struct rb_node node;
sector_t bb;
unsigned char wr_fail_cnt;
};
struct dust_device {
......@@ -101,7 +102,8 @@ static int dust_remove_block(struct dust_device *dd, unsigned long long block)
return 0;
}
static int dust_add_block(struct dust_device *dd, unsigned long long block)
static int dust_add_block(struct dust_device *dd, unsigned long long block,
unsigned char wr_fail_cnt)
{
struct badblock *bblock;
unsigned long flags;
......@@ -115,6 +117,7 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block)
spin_lock_irqsave(&dd->dust_lock, flags);
bblock->bb = block;
bblock->wr_fail_cnt = wr_fail_cnt;
if (!dust_rb_insert(&dd->badblocklist, bblock)) {
if (!dd->quiet_mode) {
DMERR("%s: block %llu already in badblocklist",
......@@ -126,8 +129,10 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block)
}
dd->badblock_count++;
if (!dd->quiet_mode)
DMINFO("%s: badblock added at block %llu", __func__, block);
if (!dd->quiet_mode) {
DMINFO("%s: badblock added at block %llu with write fail count %hhu",
__func__, block, wr_fail_cnt);
}
spin_unlock_irqrestore(&dd->dust_lock, flags);
return 0;
......@@ -163,22 +168,27 @@ static int dust_map_read(struct dust_device *dd, sector_t thisblock,
bool fail_read_on_bb)
{
unsigned long flags;
int ret = DM_MAPIO_REMAPPED;
int r = DM_MAPIO_REMAPPED;
if (fail_read_on_bb) {
thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags);
ret = __dust_map_read(dd, thisblock);
r = __dust_map_read(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags);
}
return ret;
return r;
}
static void __dust_map_write(struct dust_device *dd, sector_t thisblock)
static int __dust_map_write(struct dust_device *dd, sector_t thisblock)
{
struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock);
if (bblk && bblk->wr_fail_cnt > 0) {
bblk->wr_fail_cnt--;
return DM_MAPIO_KILL;
}
if (bblk) {
rb_erase(&bblk->node, &dd->badblocklist);
dd->badblock_count--;
......@@ -189,37 +199,40 @@ static void __dust_map_write(struct dust_device *dd, sector_t thisblock)
(unsigned long long)thisblock);
}
}
return DM_MAPIO_REMAPPED;
}
static int dust_map_write(struct dust_device *dd, sector_t thisblock,
bool fail_read_on_bb)
{
unsigned long flags;
int ret = DM_MAPIO_REMAPPED;
if (fail_read_on_bb) {
thisblock >>= dd->sect_per_block_shift;
spin_lock_irqsave(&dd->dust_lock, flags);
__dust_map_write(dd, thisblock);
ret = __dust_map_write(dd, thisblock);
spin_unlock_irqrestore(&dd->dust_lock, flags);
}
return DM_MAPIO_REMAPPED;
return ret;
}
static int dust_map(struct dm_target *ti, struct bio *bio)
{
struct dust_device *dd = ti->private;
int ret;
int r;
bio_set_dev(bio, dd->dev->bdev);
bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
if (bio_data_dir(bio) == READ)
ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
r = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
else
ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
r = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
return ret;
return r;
}
static bool __dust_clear_badblocks(struct rb_root *tree,
......@@ -375,8 +388,10 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
struct dust_device *dd = ti->private;
sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT;
bool invalid_msg = false;
int result = -EINVAL;
int r = -EINVAL;
unsigned long long tmp, block;
unsigned char wr_fail_cnt;
unsigned int tmp_ui;
unsigned long flags;
char dummy;
......@@ -388,45 +403,69 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
} else if (!strcasecmp(argv[0], "disable")) {
DMINFO("disabling read failures on bad sectors");
dd->fail_read_on_bb = false;
result = 0;
r = 0;
} else if (!strcasecmp(argv[0], "enable")) {
DMINFO("enabling read failures on bad sectors");
dd->fail_read_on_bb = true;
result = 0;
r = 0;
} else if (!strcasecmp(argv[0], "countbadblocks")) {
spin_lock_irqsave(&dd->dust_lock, flags);
DMINFO("countbadblocks: %llu badblock(s) found",
dd->badblock_count);
spin_unlock_irqrestore(&dd->dust_lock, flags);
result = 0;
r = 0;
} else if (!strcasecmp(argv[0], "clearbadblocks")) {
result = dust_clear_badblocks(dd);
r = dust_clear_badblocks(dd);
} else if (!strcasecmp(argv[0], "quiet")) {
if (!dd->quiet_mode)
dd->quiet_mode = true;
else
dd->quiet_mode = false;
result = 0;
r = 0;
} else {
invalid_msg = true;
}
} else if (argc == 2) {
if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
return result;
return r;
block = tmp;
sector_div(size, dd->sect_per_block);
if (block > size) {
DMERR("selected block value out of range");
return result;
return r;
}
if (!strcasecmp(argv[0], "addbadblock"))
result = dust_add_block(dd, block);
r = dust_add_block(dd, block, 0);
else if (!strcasecmp(argv[0], "removebadblock"))
result = dust_remove_block(dd, block);
r = dust_remove_block(dd, block);
else if (!strcasecmp(argv[0], "queryblock"))
result = dust_query_block(dd, block);
r = dust_query_block(dd, block);
else
invalid_msg = true;
} else if (argc == 3) {
if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
return r;
if (sscanf(argv[2], "%u%c", &tmp_ui, &dummy) != 1)
return r;
block = tmp;
if (tmp_ui > 255) {
DMERR("selected write fail count out of range");
return r;
}
wr_fail_cnt = tmp_ui;
sector_div(size, dd->sect_per_block);
if (block > size) {
DMERR("selected block value out of range");
return r;
}
if (!strcasecmp(argv[0], "addbadblock"))
r = dust_add_block(dd, block, wr_fail_cnt);
else
invalid_msg = true;
......@@ -436,7 +475,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
if (invalid_msg)
DMERR("unrecognized message '%s' received", argv[0]);
return result;
return r;
}
static void dust_status(struct dm_target *ti, status_type_t type,
......@@ -499,12 +538,12 @@ static struct target_type dust_target = {
static int __init dm_dust_init(void)
{
int result = dm_register_target(&dust_target);
int r = dm_register_target(&dust_target);
if (result < 0)
DMERR("dm_register_target failed %d", result);
if (r < 0)
DMERR("dm_register_target failed %d", r);
return result;
return r;
}
static void __exit dm_dust_exit(void)
......
......@@ -53,6 +53,7 @@
#define SB_VERSION_1 1
#define SB_VERSION_2 2
#define SB_VERSION_3 3
#define SB_VERSION_4 4
#define SB_SECTORS 8
#define MAX_SECTORS_PER_BLOCK 8
......@@ -73,6 +74,7 @@ struct superblock {
#define SB_FLAG_HAVE_JOURNAL_MAC 0x1
#define SB_FLAG_RECALCULATING 0x2
#define SB_FLAG_DIRTY_BITMAP 0x4
#define SB_FLAG_FIXED_PADDING 0x8
#define JOURNAL_ENTRY_ROUNDUP 8
......@@ -250,6 +252,7 @@ struct dm_integrity_c {
bool journal_uptodate;
bool just_formatted;
bool recalculate_flag;
bool fix_padding;
struct alg_spec internal_hash_alg;
struct alg_spec journal_crypt_alg;
......@@ -463,7 +466,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
static void sb_set_version(struct dm_integrity_c *ic)
{
if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING))
ic->sb->version = SB_VERSION_4;
else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
ic->sb->version = SB_VERSION_3;
else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
ic->sb->version = SB_VERSION_2;
......@@ -2955,6 +2960,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
arg_count += !!ic->internal_hash_alg.alg_string;
arg_count += !!ic->journal_crypt_alg.alg_string;
arg_count += !!ic->journal_mac_alg.alg_string;
arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0;
DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
ic->tag_size, ic->mode, arg_count);
if (ic->meta_dev)
......@@ -2974,6 +2980,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
}
if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0)
DMEMIT(" fix_padding");
#define EMIT_ALG(a, n) \
do { \
......@@ -3042,8 +3050,14 @@ static int calculate_device_limits(struct dm_integrity_c *ic)
if (!ic->meta_dev) {
sector_t last_sector, last_area, last_offset;
ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
(__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT;
/* we have to maintain excessive padding for compatibility with existing volumes */
__u64 metadata_run_padding =
ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING) ?
(__u64)(METADATA_PADDING_SECTORS << SECTOR_SHIFT) :
(__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS);
ic->metadata_run = round_up((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block),
metadata_run_padding) >> SECTOR_SHIFT;
if (!(ic->metadata_run & (ic->metadata_run - 1)))
ic->log2_metadata_run = __ffs(ic->metadata_run);
else
......@@ -3086,6 +3100,8 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec
journal_sections = 1;
if (!ic->meta_dev) {
if (ic->fix_padding)
ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING);
ic->sb->journal_sections = cpu_to_le32(journal_sections);
if (!interleave_sectors)
interleave_sectors = DEFAULT_INTERLEAVE_SECTORS;
......@@ -3725,6 +3741,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
} else if (!strcmp(opt_string, "recalculate")) {
ic->recalculate_flag = true;
} else if (!strcmp(opt_string, "fix_padding")) {
ic->fix_padding = true;
} else {
r = -EINVAL;
ti->error = "Invalid argument";
......@@ -3867,7 +3885,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
should_write_sb = true;
}
if (!ic->sb->version || ic->sb->version > SB_VERSION_3) {
if (!ic->sb->version || ic->sb->version > SB_VERSION_4) {
r = -EINVAL;
ti->error = "Unknown version";
goto bad;
......@@ -4182,7 +4200,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
static struct target_type integrity_target = {
.name = "integrity",
.version = {1, 3, 0},
.version = {1, 4, 0},
.module = THIS_MODULE,
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
.ctr = dm_integrity_ctr,
......
......@@ -209,6 +209,7 @@ struct raid_dev {
#define RT_FLAG_RS_SUSPENDED 5
#define RT_FLAG_RS_IN_SYNC 6
#define RT_FLAG_RS_RESYNCING 7
#define RT_FLAG_RS_GROW 8
/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
......@@ -241,6 +242,9 @@ struct raid_set {
struct raid_type *raid_type;
struct dm_target_callbacks callbacks;
sector_t array_sectors;
sector_t dev_sectors;
/* Optional raid4/5/6 journal device */
struct journal_dev {
struct dm_dev *dev;
......@@ -616,7 +620,6 @@ static int raid10_format_to_md_layout(struct raid_set *rs,
} else if (algorithm == ALGORITHM_RAID10_FAR) {
f = copies;
r = !RAID10_OFFSET;
if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags))
r |= RAID10_USE_FAR_SETS;
......@@ -1615,13 +1618,12 @@ static int _check_data_dev_sectors(struct raid_set *rs)
}
/* Calculate the sectors per device and per array used for @rs */
static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev)
{
int delta_disks;
unsigned int data_stripes;
sector_t array_sectors = sectors, dev_sectors = sectors;
struct mddev *mddev = &rs->md;
struct md_rdev *rdev;
sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len;
if (use_mddev) {
delta_disks = mddev->delta_disks;
......@@ -1656,12 +1658,9 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
/* Striped layouts */
array_sectors = (data_stripes + delta_disks) * dev_sectors;
rdev_for_each(rdev, mddev)
if (!test_bit(Journal, &rdev->flags))
rdev->sectors = dev_sectors;
mddev->array_sectors = array_sectors;
mddev->dev_sectors = dev_sectors;
rs_set_rdev_sectors(rs);
return _check_data_dev_sectors(rs);
bad:
......@@ -1670,7 +1669,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
}
/* Setup recovery on @rs */
static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
{
/* raid0 does not recover */
if (rs_is_raid0(rs))
......@@ -1691,22 +1690,6 @@ static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
? MaxSector : dev_sectors;
}
/* Setup recovery on @rs based on raid type, device size and 'nosync' flag */
static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
{
if (!dev_sectors)
/* New raid set or 'sync' flag provided */
__rs_setup_recovery(rs, 0);
else if (dev_sectors == MaxSector)
/* Prevent recovery */
__rs_setup_recovery(rs, MaxSector);
else if (__rdev_sectors(rs) < dev_sectors)
/* Grown raid set */
__rs_setup_recovery(rs, __rdev_sectors(rs));
else
__rs_setup_recovery(rs, MaxSector);
}
static void do_table_event(struct work_struct *ws)
{
struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
......@@ -2474,7 +2457,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
return -EINVAL;
}
/* Enable bitmap creation for RAID levels != 0 */
/* Enable bitmap creation on @rs unless no metadevs or raid0 or journaled raid4/5/6 set. */
mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096);
mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
......@@ -2911,7 +2894,7 @@ static int rs_setup_reshape(struct raid_set *rs)
/* Remove disk(s) */
} else if (rs->delta_disks < 0) {
r = rs_set_dev_and_array_sectors(rs, true);
r = rs_set_dev_and_array_sectors(rs, rs->ti->len, true);
mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */
/* Change layout and/or chunk size */
......@@ -3008,7 +2991,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bool resize = false;
struct raid_type *rt;
unsigned int num_raid_params, num_raid_devs;
sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors;
sector_t sb_array_sectors, rdev_sectors, reshape_sectors;
struct raid_set *rs = NULL;
const char *arg;
struct rs_layout rs_layout;
......@@ -3067,11 +3050,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
*
* Any existing superblock will overwrite the array and device sizes
*/
r = rs_set_dev_and_array_sectors(rs, false);
r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
if (r)
goto bad;
calculated_dev_sectors = rs->md.dev_sectors;
/* Memorize just calculated, potentially larger sizes to grow the raid set in preresume */
rs->array_sectors = rs->md.array_sectors;
rs->dev_sectors = rs->md.dev_sectors;
/*
* Backup any new raid set level, layout, ...
......@@ -3084,6 +3069,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
/* All in-core metadata now as of current superblocks after calling analyse_superblocks() */
sb_array_sectors = rs->md.array_sectors;
rdev_sectors = __rdev_sectors(rs);
if (!rdev_sectors) {
ti->error = "Invalid rdev size";
......@@ -3093,8 +3080,11 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
reshape_sectors = _get_reshape_sectors(rs);
if (calculated_dev_sectors != rdev_sectors)
resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors);
if (rs->dev_sectors != rdev_sectors) {
resize = (rs->dev_sectors != rdev_sectors - reshape_sectors);
if (rs->dev_sectors > rdev_sectors - reshape_sectors)
set_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
}
INIT_WORK(&rs->md.event_work, do_table_event);
ti->private = rs;
......@@ -3121,13 +3111,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
rs_set_new(rs);
} else if (rs_is_recovering(rs)) {
/* Rebuild particular devices */
if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
rs_setup_recovery(rs, MaxSector);
}
/* A recovering raid set may be resized */
; /* skip setup rs */
goto size_check;
} else if (rs_is_reshaping(rs)) {
/* Have to reject size change request during reshape */
if (resize) {
......@@ -3171,6 +3156,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
rs_setup_recovery(rs, MaxSector);
rs_set_new(rs);
} else if (rs_reshape_requested(rs)) {
/* Only request grow on raid set size extensions, not on reshapes. */
clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
/*
* No need to check for 'ongoing' takeover here, because takeover
* is an instant operation as oposed to an ongoing reshape.
......@@ -3201,13 +3189,31 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
rs_set_cur(rs);
} else {
size_check:
/* May not set recovery when a device rebuild is requested */
if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) {
clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags);
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
rs_setup_recovery(rs, MaxSector);
} else if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) {
/*
* Set raid set to current size, i.e. size as of
* superblocks to grow to larger size in preresume.
*/
r = rs_set_dev_and_array_sectors(rs, sb_array_sectors, false);
if (r)
goto bad;
rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors);
} else {
/* This is no size change or it is shrinking, update size and record in superblocks */
r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
if (r)
goto bad;
if (sb_array_sectors > rs->array_sectors)
set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags);
} else
rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ?
0 : (resize ? calculated_dev_sectors : MaxSector));
}
rs_set_cur(rs);
}
......@@ -3406,10 +3412,9 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev)
/* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */
static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
sector_t resync_max_sectors)
enum sync_state state, sector_t resync_max_sectors)
{
sector_t r;
enum sync_state state;
struct mddev *mddev = &rs->md;
clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
......@@ -3420,8 +3425,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
} else {
state = decipher_sync_action(mddev, recovery);
if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery))
r = mddev->recovery_cp;
else
......@@ -3439,18 +3442,14 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
/*
* In case we are recovering, the array is not in sync
* and health chars should show the recovering legs.
*
* Already retrieved recovery offset from curr_resync_completed above.
*/
;
else if (state == st_resync)
/*
* If "resync" is occurring, the raid set
* is or may be out of sync hence the health
* characters shall be 'a'.
*/
set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
else if (state == st_reshape)
else if (state == st_resync || state == st_reshape)
/*
* If "reshape" is occurring, the raid set
* If "resync/reshape" is occurring, the raid set
* is or may be out of sync hence the health
* characters shall be 'a'.
*/
......@@ -3464,22 +3463,22 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
*/
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
else {
struct md_rdev *rdev;
else if (test_bit(MD_RECOVERY_NEEDED, &recovery))
/*
* We are idle and recovery is needed, prevent 'A' chars race
* caused by components still set to in-sync by constructor.
*/
if (test_bit(MD_RECOVERY_NEEDED, &recovery))
set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags);
else {
/*
* The raid set may be doing an initial sync, or it may
* be rebuilding individual components. If all the
* devices are In_sync, then it is the raid set that is
* being initialized.
* We are idle and the raid set may be doing an initial
* sync, or it may be rebuilding individual components.
* If all the devices are In_sync, then it is the raid set
* that is being initialized.
*/
struct md_rdev *rdev;
set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags);
rdev_for_each(rdev, mddev)
if (!test_bit(Journal, &rdev->flags) &&
......@@ -3512,7 +3511,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
unsigned int rebuild_disks;
unsigned int write_mostly_params = 0;
sector_t progress, resync_max_sectors, resync_mismatches;
const char *sync_action;
enum sync_state state;
struct raid_type *rt;
switch (type) {
......@@ -3526,14 +3525,14 @@ static void raid_status(struct dm_target *ti, status_type_t type,
/* Access most recent mddev properties for status output */
smp_rmb();
recovery = rs->md.recovery;
/* Get sensible max sectors even if raid set not yet started */
resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ?
mddev->resync_max_sectors : mddev->dev_sectors;
progress = rs_get_progress(rs, recovery, resync_max_sectors);
recovery = rs->md.recovery;
state = decipher_sync_action(mddev, recovery);
progress = rs_get_progress(rs, recovery, state, resync_max_sectors);
resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ?
atomic64_read(&mddev->resync_mismatches) : 0;
sync_action = sync_str(decipher_sync_action(&rs->md, recovery));
/* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
for (i = 0; i < rs->raid_disks; i++)
......@@ -3561,7 +3560,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
* See Documentation/admin-guide/device-mapper/dm-raid.rst for
* information on each of these states.
*/
DMEMIT(" %s", sync_action);
DMEMIT(" %s", sync_str(state));
/*
* v1.5.0+:
......@@ -3955,11 +3954,22 @@ static int raid_preresume(struct dm_target *ti)
if (r)
return r;
/* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */
/* We are extending the raid set size, adjust mddev/md_rdev sizes and set capacity. */
if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) {
mddev->array_sectors = rs->array_sectors;
mddev->dev_sectors = rs->dev_sectors;
rs_set_rdev_sectors(rs);
rs_set_capacity(rs);
}
/* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) or grown device size */
if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap &&
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) {
r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors,
to_bytes(rs->requested_bitmap_chunk_sectors), 0);
(test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) ||
(rs->requested_bitmap_chunk_sectors &&
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;
r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0);
if (r)
DMERR("Failed to resize bitmap");
}
......@@ -3968,8 +3978,10 @@ static int raid_preresume(struct dm_target *ti)
/* Be prepared for mddev_resume() in raid_resume() */
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
mddev->resync_min = mddev->recovery_cp;
if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags))
mddev->resync_max_sectors = mddev->dev_sectors;
}
/* Check for any reshape request unless new raid set */
......@@ -4017,7 +4029,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
.version = {1, 14, 0},
.version = {1, 15, 0},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
......
......@@ -55,19 +55,6 @@ static void trigger_event(struct work_struct *work)
dm_table_event(sc->ti->table);
}
static inline struct stripe_c *alloc_context(unsigned int stripes)
{
size_t len;
if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
stripes))
return NULL;
len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
return kmalloc(len, GFP_KERNEL);
}
/*
* Parse a single <dev> <sector> pair
*/
......@@ -142,7 +129,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -EINVAL;
}
sc = alloc_context(stripes);
sc = kmalloc(struct_size(sc, stripe, stripes), GFP_KERNEL);
if (!sc) {
ti->error = "Memory allocation for striped context "
"failed";
......
......@@ -918,21 +918,15 @@ bool dm_table_supports_dax(struct dm_table *t,
static bool dm_table_does_not_support_partial_completion(struct dm_table *t);
struct verify_rq_based_data {
unsigned sq_count;
unsigned mq_count;
};
static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev,
static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
struct verify_rq_based_data *v = data;
struct block_device *bdev = dev->bdev;
struct request_queue *q = bdev_get_queue(bdev);
if (queue_is_mq(q))
v->mq_count++;
else
v->sq_count++;
/* request-based cannot stack on partitions! */
if (bdev != bdev->bd_contains)
return false;
return queue_is_mq(q);
}
......@@ -941,7 +935,6 @@ static int dm_table_determine_type(struct dm_table *t)
{
unsigned i;
unsigned bio_based = 0, request_based = 0, hybrid = 0;
struct verify_rq_based_data v = {.sq_count = 0, .mq_count = 0};
struct dm_target *tgt;
struct list_head *devices = dm_table_get_devices(t);
enum dm_queue_mode live_md_type = dm_get_md_type(t->md);
......@@ -1045,14 +1038,10 @@ static int dm_table_determine_type(struct dm_table *t)
/* Non-request-stackable devices can't be used for request-based dm */
if (!tgt->type->iterate_devices ||
!tgt->type->iterate_devices(tgt, device_is_rq_based, &v)) {
!tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) {
DMERR("table load rejected: including non-request-stackable devices");
return -EINVAL;
}
if (v.sq_count > 0) {
DMERR("table load rejected: not all devices are blk-mq request-stackable");
return -EINVAL;
}
return 0;
}
......
......@@ -609,13 +609,12 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
blk_status_t error)
{
struct bio_list bios;
unsigned long flags;
bio_list_init(&bios);
spin_lock_irqsave(&tc->lock, flags);
spin_lock_irq(&tc->lock);
__merge_bio_list(&bios, master);
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
error_bio_list(&bios, error);
}
......@@ -623,15 +622,14 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
static void requeue_deferred_cells(struct thin_c *tc)
{
struct pool *pool = tc->pool;
unsigned long flags;
struct list_head cells;
struct dm_bio_prison_cell *cell, *tmp;
INIT_LIST_HEAD(&cells);
spin_lock_irqsave(&tc->lock, flags);
spin_lock_irq(&tc->lock);
list_splice_init(&tc->deferred_cells, &cells);
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
list_for_each_entry_safe(cell, tmp, &cells, user_list)
cell_requeue(pool, cell);
......@@ -640,14 +638,13 @@ static void requeue_deferred_cells(struct thin_c *tc)
static void requeue_io(struct thin_c *tc)
{
struct bio_list bios;
unsigned long flags;
bio_list_init(&bios);
spin_lock_irqsave(&tc->lock, flags);
spin_lock_irq(&tc->lock);
__merge_bio_list(&bios, &tc->deferred_bio_list);
__merge_bio_list(&bios, &tc->retry_on_resume_list);
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
error_bio_list(&bios, BLK_STS_DM_REQUEUE);
requeue_deferred_cells(tc);
......@@ -756,7 +753,6 @@ static void inc_all_io_entry(struct pool *pool, struct bio *bio)
static void issue(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
unsigned long flags;
if (!bio_triggers_commit(tc, bio)) {
generic_make_request(bio);
......@@ -777,9 +773,9 @@ static void issue(struct thin_c *tc, struct bio *bio)
* Batch together any bios that trigger commits and then issue a
* single commit for them in process_deferred_bios().
*/
spin_lock_irqsave(&pool->lock, flags);
spin_lock_irq(&pool->lock);
bio_list_add(&pool->deferred_flush_bios, bio);
spin_unlock_irqrestore(&pool->lock, flags);
spin_unlock_irq(&pool->lock);
}
static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
......@@ -886,11 +882,14 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c
{
struct pool *pool = tc->pool;
unsigned long flags;
int has_work;
spin_lock_irqsave(&tc->lock, flags);
cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
has_work = !bio_list_empty(&tc->deferred_bio_list);
spin_unlock_irqrestore(&tc->lock, flags);
if (has_work)
wake_worker(pool);
}
......@@ -960,7 +959,6 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
unsigned long flags;
/*
* If the bio has the REQ_FUA flag set we must commit the metadata
......@@ -985,9 +983,9 @@ static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
* Batch together any bios that trigger commits and then issue a
* single commit for them in process_deferred_bios().
*/
spin_lock_irqsave(&pool->lock, flags);
spin_lock_irq(&pool->lock);
bio_list_add(&pool->deferred_flush_completions, bio);
spin_unlock_irqrestore(&pool->lock, flags);
spin_unlock_irq(&pool->lock);
}
static void process_prepared_mapping(struct dm_thin_new_mapping *m)
......@@ -1226,14 +1224,13 @@ static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
static void process_prepared(struct pool *pool, struct list_head *head,
process_mapping_fn *fn)
{
unsigned long flags;
struct list_head maps;
struct dm_thin_new_mapping *m, *tmp;
INIT_LIST_HEAD(&maps);
spin_lock_irqsave(&pool->lock, flags);
spin_lock_irq(&pool->lock);
list_splice_init(head, &maps);
spin_unlock_irqrestore(&pool->lock, flags);
spin_unlock_irq(&pool->lock);
list_for_each_entry_safe(m, tmp, &maps, list)
(*fn)(m);
......@@ -1510,14 +1507,12 @@ static int commit(struct pool *pool)
static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
{
unsigned long flags;
if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
DMWARN("%s: reached low water mark for data device: sending event.",
dm_device_name(pool->pool_md));
spin_lock_irqsave(&pool->lock, flags);
spin_lock_irq(&pool->lock);
pool->low_water_triggered = true;
spin_unlock_irqrestore(&pool->lock, flags);
spin_unlock_irq(&pool->lock);
dm_table_event(pool->ti->table);
}
}
......@@ -1593,11 +1588,10 @@ static void retry_on_resume(struct bio *bio)
{
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct thin_c *tc = h->tc;
unsigned long flags;
spin_lock_irqsave(&tc->lock, flags);
spin_lock_irq(&tc->lock);
bio_list_add(&tc->retry_on_resume_list, bio);
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
}
static blk_status_t should_error_unserviceable_bio(struct pool *pool)
......@@ -2170,7 +2164,6 @@ static void __sort_thin_deferred_bios(struct thin_c *tc)
static void process_thin_deferred_bios(struct thin_c *tc)
{
struct pool *pool = tc->pool;
unsigned long flags;
struct bio *bio;
struct bio_list bios;
struct blk_plug plug;
......@@ -2184,10 +2177,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
bio_list_init(&bios);
spin_lock_irqsave(&tc->lock, flags);
spin_lock_irq(&tc->lock);
if (bio_list_empty(&tc->deferred_bio_list)) {
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
return;
}
......@@ -2196,7 +2189,7 @@ static void process_thin_deferred_bios(struct thin_c *tc)
bio_list_merge(&bios, &tc->deferred_bio_list);
bio_list_init(&tc->deferred_bio_list);
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bios))) {
......@@ -2206,10 +2199,10 @@ static void process_thin_deferred_bios(struct thin_c *tc)
* prepared mappings to process.
*/
if (ensure_next_mapping(pool)) {
spin_lock_irqsave(&tc->lock, flags);
spin_lock_irq(&tc->lock);
bio_list_add(&tc->deferred_bio_list, bio);
bio_list_merge(&tc->deferred_bio_list, &bios);
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
break;
}
......@@ -2264,16 +2257,15 @@ static unsigned sort_cells(struct pool *pool, struct list_head *cells)
static void process_thin_deferred_cells(struct thin_c *tc)
{
struct pool *pool = tc->pool;
unsigned long flags;
struct list_head cells;
struct dm_bio_prison_cell *cell;
unsigned i, j, count;
INIT_LIST_HEAD(&cells);
spin_lock_irqsave(&tc->lock, flags);
spin_lock_irq(&tc->lock);
list_splice_init(&tc->deferred_cells, &cells);
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
if (list_empty(&cells))
return;
......@@ -2294,9 +2286,9 @@ static void process_thin_deferred_cells(struct thin_c *tc)
for (j = i; j < count; j++)
list_add(&pool->cell_sort_array[j]->user_list, &cells);
spin_lock_irqsave(&tc->lock, flags);
spin_lock_irq(&tc->lock);
list_splice(&cells, &tc->deferred_cells);
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
return;
}
......@@ -2349,7 +2341,6 @@ static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
static void process_deferred_bios(struct pool *pool)
{
unsigned long flags;
struct bio *bio;
struct bio_list bios, bio_completions;
struct thin_c *tc;
......@@ -2368,13 +2359,13 @@ static void process_deferred_bios(struct pool *pool)
bio_list_init(&bios);
bio_list_init(&bio_completions);
spin_lock_irqsave(&pool->lock, flags);
spin_lock_irq(&pool->lock);
bio_list_merge(&bios, &pool->deferred_flush_bios);
bio_list_init(&pool->deferred_flush_bios);
bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
bio_list_init(&pool->deferred_flush_completions);
spin_unlock_irqrestore(&pool->lock, flags);
spin_unlock_irq(&pool->lock);
if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
!(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
......@@ -2657,12 +2648,11 @@ static void metadata_operation_failed(struct pool *pool, const char *op, int r)
*/
static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
{
unsigned long flags;
struct pool *pool = tc->pool;
spin_lock_irqsave(&tc->lock, flags);
spin_lock_irq(&tc->lock);
bio_list_add(&tc->deferred_bio_list, bio);
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
wake_worker(pool);
}
......@@ -2678,13 +2668,12 @@ static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
unsigned long flags;
struct pool *pool = tc->pool;
throttle_lock(&pool->throttle);
spin_lock_irqsave(&tc->lock, flags);
spin_lock_irq(&tc->lock);
list_add_tail(&cell->user_list, &tc->deferred_cells);
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
throttle_unlock(&pool->throttle);
wake_worker(pool);
......@@ -2810,15 +2799,14 @@ static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
static void requeue_bios(struct pool *pool)
{
unsigned long flags;
struct thin_c *tc;
rcu_read_lock();
list_for_each_entry_rcu(tc, &pool->active_thins, list) {
spin_lock_irqsave(&tc->lock, flags);
spin_lock_irq(&tc->lock);
bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
bio_list_init(&tc->retry_on_resume_list);
spin_unlock_irqrestore(&tc->lock, flags);
spin_unlock_irq(&tc->lock);
}
rcu_read_unlock();
}
......@@ -3412,15 +3400,14 @@ static int pool_map(struct dm_target *ti, struct bio *bio)
int r;
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
unsigned long flags;
/*
* As this is a singleton target, ti->begin is always zero.
*/
spin_lock_irqsave(&pool->lock, flags);
spin_lock_irq(&pool->lock);
bio_set_dev(bio, pt->data_dev->bdev);
r = DM_MAPIO_REMAPPED;
spin_unlock_irqrestore(&pool->lock, flags);
spin_unlock_irq(&pool->lock);
return r;
}
......@@ -3591,7 +3578,6 @@ static void pool_resume(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
unsigned long flags;
/*
* Must requeue active_thins' bios and then resume
......@@ -3600,10 +3586,10 @@ static void pool_resume(struct dm_target *ti)
requeue_bios(pool);
pool_resume_active_thins(pool);
spin_lock_irqsave(&pool->lock, flags);
spin_lock_irq(&pool->lock);
pool->low_water_triggered = false;
pool->suspended = false;
spin_unlock_irqrestore(&pool->lock, flags);
spin_unlock_irq(&pool->lock);
do_waker(&pool->waker.work);
}
......@@ -3612,11 +3598,10 @@ static void pool_presuspend(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
unsigned long flags;
spin_lock_irqsave(&pool->lock, flags);
spin_lock_irq(&pool->lock);
pool->suspended = true;
spin_unlock_irqrestore(&pool->lock, flags);
spin_unlock_irq(&pool->lock);
pool_suspend_active_thins(pool);
}
......@@ -3625,13 +3610,12 @@ static void pool_presuspend_undo(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
unsigned long flags;
pool_resume_active_thins(pool);
spin_lock_irqsave(&pool->lock, flags);
spin_lock_irq(&pool->lock);
pool->suspended = false;
spin_unlock_irqrestore(&pool->lock, flags);
spin_unlock_irq(&pool->lock);
}
static void pool_postsuspend(struct dm_target *ti)
......@@ -4110,11 +4094,10 @@ static void thin_put(struct thin_c *tc)
static void thin_dtr(struct dm_target *ti)
{
struct thin_c *tc = ti->private;
unsigned long flags;
spin_lock_irqsave(&tc->pool->lock, flags);
spin_lock_irq(&tc->pool->lock);
list_del_rcu(&tc->list);
spin_unlock_irqrestore(&tc->pool->lock, flags);
spin_unlock_irq(&tc->pool->lock);
synchronize_rcu();
thin_put(tc);
......@@ -4150,7 +4133,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
struct thin_c *tc;
struct dm_dev *pool_dev, *origin_dev;
struct mapped_device *pool_md;
unsigned long flags;
mutex_lock(&dm_thin_pool_table.mutex);
......@@ -4244,9 +4226,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
mutex_unlock(&dm_thin_pool_table.mutex);
spin_lock_irqsave(&tc->pool->lock, flags);
spin_lock_irq(&tc->pool->lock);
if (tc->pool->suspended) {
spin_unlock_irqrestore(&tc->pool->lock, flags);
spin_unlock_irq(&tc->pool->lock);
mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
ti->error = "Unable to activate thin device while pool is suspended";
r = -EINVAL;
......@@ -4255,7 +4237,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
refcount_set(&tc->refcount, 1);
init_completion(&tc->can_destroy);
list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
spin_unlock_irqrestore(&tc->pool->lock, flags);
spin_unlock_irq(&tc->pool->lock);
/*
* This synchronize_rcu() call is needed here otherwise we risk a
* wake_worker() call finding no bios to process (because the newly
......
......@@ -1218,7 +1218,8 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
}
} while (bio->bi_iter.bi_size);
if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks))
if (unlikely(bio->bi_opf & REQ_FUA ||
wc->uncommitted_blocks >= wc->autocommit_blocks))
writecache_flush(wc);
else
writecache_schedule_autocommit(wc);
......@@ -1561,7 +1562,7 @@ static void writecache_writeback(struct work_struct *work)
{
struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
struct blk_plug plug;
struct wc_entry *f, *g, *e = NULL;
struct wc_entry *f, *uninitialized_var(g), *e = NULL;
struct rb_node *node, *next_node;
struct list_head skipped;
struct writeback_list wbl;
......
......@@ -554,6 +554,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
dmz_release_mblock(zmd, mblk);
dmz_check_bdev(zmd->dev);
return ERR_PTR(-EIO);
}
......@@ -625,6 +626,8 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
ret = submit_bio_wait(bio);
bio_put(bio);
if (ret)
dmz_check_bdev(zmd->dev);
return ret;
}
......@@ -691,6 +694,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
TASK_UNINTERRUPTIBLE);
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
clear_bit(DMZ_META_ERROR, &mblk->state);
dmz_check_bdev(zmd->dev);
ret = -EIO;
}
nr_mblks_submitted--;
......@@ -768,7 +772,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
/* If there are no dirty metadata blocks, just flush the device cache */
if (list_empty(&write_list)) {
ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
goto out;
goto err;
}
/*
......@@ -778,7 +782,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
ret = dmz_log_dirty_mblocks(zmd, &write_list);
if (ret)
goto out;
goto err;
/*
* The log is on disk. It is now safe to update in place
......@@ -786,11 +790,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
*/
ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
if (ret)
goto out;
goto err;
ret = dmz_write_sb(zmd, zmd->mblk_primary);
if (ret)
goto out;
goto err;
while (!list_empty(&write_list)) {
mblk = list_first_entry(&write_list, struct dmz_mblock, link);
......@@ -805,16 +809,20 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
zmd->sb_gen++;
out:
if (ret && !list_empty(&write_list)) {
spin_lock(&zmd->mblk_lock);
list_splice(&write_list, &zmd->mblk_dirty_list);
spin_unlock(&zmd->mblk_lock);
}
dmz_unlock_flush(zmd);
up_write(&zmd->mblk_sem);
return ret;
err:
if (!list_empty(&write_list)) {
spin_lock(&zmd->mblk_lock);
list_splice(&write_list, &zmd->mblk_dirty_list);
spin_unlock(&zmd->mblk_lock);
}
if (!dmz_check_bdev(zmd->dev))
ret = -EIO;
goto out;
}
/*
......@@ -1221,6 +1229,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
if (ret < 0) {
dmz_dev_err(zmd->dev, "Get zone %u report failed",
dmz_id(zmd, zone));
dmz_check_bdev(zmd->dev);
return ret;
}
......
......@@ -82,6 +82,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
dmz_id(zmd, zone), (unsigned long long)wp_block,
(unsigned long long)block, nr_blocks, ret);
dmz_check_bdev(zrc->dev);
return ret;
}
......@@ -489,12 +490,7 @@ static void dmz_reclaim_work(struct work_struct *work)
ret = dmz_do_reclaim(zrc);
if (ret) {
dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
if (ret == -EIO)
/*
* LLD might be performing some error handling sequence
* at the underlying device. To not interfere, do not
* attempt to schedule the next reclaim run immediately.
*/
if (!dmz_check_bdev(zrc->dev))
return;
}
......
......@@ -80,6 +80,8 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
bio->bi_status = status;
if (bio->bi_status != BLK_STS_OK)
bioctx->target->dev->flags |= DMZ_CHECK_BDEV;
if (refcount_dec_and_test(&bioctx->ref)) {
struct dm_zone *zone = bioctx->zone;
......@@ -565,29 +567,49 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
}
/*
* Check the backing device availability. If it's on the way out,
* Check if the backing device is being removed. If it's on the way out,
* start failing I/O. Reclaim and metadata components also call this
* function to cleanly abort operation in the event of such failure.
*/
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
{
struct gendisk *disk;
if (dmz_dev->flags & DMZ_BDEV_DYING)
return true;
if (dmz_dev->flags & DMZ_CHECK_BDEV)
return !dmz_check_bdev(dmz_dev);
if (!(dmz_dev->flags & DMZ_BDEV_DYING)) {
disk = dmz_dev->bdev->bd_disk;
if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
dmz_dev_warn(dmz_dev, "Backing device queue dying");
dmz_dev->flags |= DMZ_BDEV_DYING;
} else if (disk->fops->check_events) {
if (disk->fops->check_events(disk, 0) &
DISK_EVENT_MEDIA_CHANGE) {
}
return dmz_dev->flags & DMZ_BDEV_DYING;
}
/*
* Check the backing device availability. This detects such events as
* backing device going offline due to errors, media removals, etc.
* This check is less efficient than dmz_bdev_is_dying() and should
* only be performed as a part of error handling.
*/
bool dmz_check_bdev(struct dmz_dev *dmz_dev)
{
struct gendisk *disk;
dmz_dev->flags &= ~DMZ_CHECK_BDEV;
if (dmz_bdev_is_dying(dmz_dev))
return false;
disk = dmz_dev->bdev->bd_disk;
if (disk->fops->check_events &&
disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
dmz_dev_warn(dmz_dev, "Backing device offline");
dmz_dev->flags |= DMZ_BDEV_DYING;
}
}
}
return dmz_dev->flags & DMZ_BDEV_DYING;
return !(dmz_dev->flags & DMZ_BDEV_DYING);
}
/*
......@@ -902,8 +924,8 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
struct dmz_target *dmz = ti->private;
if (dmz_bdev_is_dying(dmz->dev))
return -ENODEV;
if (!dmz_check_bdev(dmz->dev))
return -EIO;
*bdev = dmz->dev->bdev;
......
......@@ -72,6 +72,7 @@ struct dmz_dev {
/* Device flags. */
#define DMZ_BDEV_DYING (1 << 0)
#define DMZ_CHECK_BDEV (2 << 0)
/*
* Zone descriptor.
......@@ -255,5 +256,6 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
* Functions defined in dm-zoned-target.c
*/
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
bool dmz_check_bdev(struct dmz_dev *dmz_dev);
#endif /* DM_ZONED_H */
......@@ -608,9 +608,6 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
*/
#define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz))
#define dm_array_too_big(fixed, obj, num) \
((num) > (UINT_MAX - (fixed)) / (obj))
/*
* Sector offset taken relative to the start of the target instead of
* relative to the start of the device.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment