Commit 1731a474 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-4.13/dm-fixes' of...

Merge tag 'for-4.13/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper fixes from Mike Snitzer:

 - a few DM integrity fixes that improve performance. One that address
   inefficiencies in the on-disk journal device layout. Another that
   makes use of the block layer's on-stack plugging when writing the
   journal.

 - a dm-bufio fix for the blk_status_t conversion that went in during
   the merge window.

 - a few DM raid fixes that address correctness when suspending the
   device and a validation fix for validation that occurs during device
   activation.

 - a couple DM zoned target fixes. Important one being the fix to not
   use GFP_KERNEL in the IO path due to concerns about deadlock in
   low-memory conditions (e.g. swap over a DM zoned device, etc).

 - a DM DAX device fix to make sure dm_dax_flush() is called if the
   underlying DAX device is operating as a write cache.

* tag 'for-4.13/dm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm, dax: Make sure dm_dax_flush() is called if device supports it
  dm verity fec: fix GFP flags used with mempool_alloc()
  dm zoned: use GFP_NOIO in I/O path
  dm zoned: remove test for impossible REQ_OP_FLUSH conditions
  dm raid: bump target version
  dm raid: avoid mddev->suspended access
  dm raid: fix activation check in validate_raid_redundancy()
  dm raid: remove WARN_ON() in raid10_md_layout_to_format()
  dm bufio: fix error code in dm_bufio_write_dirty_buffers()
  dm integrity: test for corrupted disk format during table load
  dm integrity: WARN_ON if variables representing journal usage get out of sync
  dm integrity: use plugging when writing the journal
  dm integrity: fix inefficient allocation of journal space
parents 0fa8dc42 273752c9
......@@ -343,3 +343,4 @@ Version History
1.11.0 Fix table line argument order
(wrong raid10_copies/raid10_format sequence)
1.11.1 Add raid4/5/6 journal write-back support via journal_mode option
1.12.1 fix for MD deadlock between mddev_suspend() and md_write_start() available
......@@ -278,6 +278,12 @@ void dax_write_cache(struct dax_device *dax_dev, bool wc)
}
EXPORT_SYMBOL_GPL(dax_write_cache);
bool dax_write_cache_enabled(struct dax_device *dax_dev)
{
return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache_enabled);
bool dax_alive(struct dax_device *dax_dev)
{
lockdep_assert_held(&dax_srcu);
......
......@@ -1258,8 +1258,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
*/
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
blk_status_t a;
int f;
int a, f;
unsigned long buffers_processed = 0;
struct dm_buffer *b, *tmp;
......
......@@ -1587,16 +1587,18 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
if (likely(ic->mode == 'J')) {
if (dio->write) {
unsigned next_entry, i, pos;
unsigned ws, we;
unsigned ws, we, range_sectors;
dio->range.n_sectors = min(dio->range.n_sectors, ic->free_sectors);
dio->range.n_sectors = min(dio->range.n_sectors,
ic->free_sectors << ic->sb->log2_sectors_per_block);
if (unlikely(!dio->range.n_sectors))
goto sleep;
ic->free_sectors -= dio->range.n_sectors;
range_sectors = dio->range.n_sectors >> ic->sb->log2_sectors_per_block;
ic->free_sectors -= range_sectors;
journal_section = ic->free_section;
journal_entry = ic->free_section_entry;
next_entry = ic->free_section_entry + dio->range.n_sectors;
next_entry = ic->free_section_entry + range_sectors;
ic->free_section_entry = next_entry % ic->journal_section_entries;
ic->free_section += next_entry / ic->journal_section_entries;
ic->n_uncommitted_sections += next_entry / ic->journal_section_entries;
......@@ -1727,6 +1729,8 @@ static void pad_uncommitted(struct dm_integrity_c *ic)
wraparound_section(ic, &ic->free_section);
ic->n_uncommitted_sections++;
}
WARN_ON(ic->journal_sections * ic->journal_section_entries !=
(ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors);
}
static void integrity_commit(struct work_struct *w)
......@@ -1821,6 +1825,9 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
{
unsigned i, j, n;
struct journal_completion comp;
struct blk_plug plug;
blk_start_plug(&plug);
comp.ic = ic;
comp.in_flight = (atomic_t)ATOMIC_INIT(1);
......@@ -1945,6 +1952,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
dm_bufio_write_dirty_buffers_async(ic->bufio);
blk_finish_plug(&plug);
complete_journal_op(&comp);
wait_for_completion_io(&comp.comp);
......@@ -3019,6 +3028,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->error = "Block size doesn't match the information in superblock";
goto bad;
}
if (!le32_to_cpu(ic->sb->journal_sections)) {
r = -EINVAL;
ti->error = "Corrupted superblock, journal_sections is 0";
goto bad;
}
/* make sure that ti->max_io_len doesn't overflow */
if (ic->sb->log2_interleave_sectors < MIN_LOG2_INTERLEAVE_SECTORS ||
ic->sb->log2_interleave_sectors > MAX_LOG2_INTERLEAVE_SECTORS) {
......
......@@ -208,6 +208,7 @@ struct raid_dev {
#define RT_FLAG_RS_BITMAP_LOADED 2
#define RT_FLAG_UPDATE_SBS 3
#define RT_FLAG_RESHAPE_RS 4
#define RT_FLAG_RS_SUSPENDED 5
/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
......@@ -564,9 +565,10 @@ static const char *raid10_md_layout_to_format(int layout)
if (__raid10_near_copies(layout) > 1)
return "near";
WARN_ON(__raid10_far_copies(layout) < 2);
if (__raid10_far_copies(layout) > 1)
return "far";
return "far";
return "unknown";
}
/* Return md raid10 algorithm for @name */
......@@ -2540,11 +2542,6 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (!freshest)
return 0;
if (validate_raid_redundancy(rs)) {
rs->ti->error = "Insufficient redundancy to activate array";
return -EINVAL;
}
/*
* Validation of the freshest device provides the source of
* validation for the remaining devices.
......@@ -2553,6 +2550,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (super_validate(rs, freshest))
return -EINVAL;
if (validate_raid_redundancy(rs)) {
rs->ti->error = "Insufficient redundancy to activate array";
return -EINVAL;
}
rdev_for_each(rdev, mddev)
if (!test_bit(Journal, &rdev->flags) &&
rdev != freshest &&
......@@ -3168,6 +3170,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
mddev_suspend(&rs->md);
set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
if (rs_is_raid456(rs)) {
......@@ -3625,7 +3628,7 @@ static void raid_postsuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
if (!rs->md.suspended)
if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
mddev_suspend(&rs->md);
rs->md.ro = 1;
......@@ -3759,7 +3762,7 @@ static int rs_start_reshape(struct raid_set *rs)
return r;
/* Need to be resumed to be able to start reshape, recovery is frozen until raid_resume() though */
if (mddev->suspended)
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
mddev_resume(mddev);
/*
......@@ -3786,8 +3789,8 @@ static int rs_start_reshape(struct raid_set *rs)
}
/* Suspend because a resume will happen in raid_resume() */
if (!mddev->suspended)
mddev_suspend(mddev);
set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
mddev_suspend(mddev);
/*
* Now reshape got set up, update superblocks to
......@@ -3883,13 +3886,13 @@ static void raid_resume(struct dm_target *ti)
if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->suspended)
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
mddev_resume(mddev);
}
static struct target_type raid_target = {
.name = "raid",
.version = {1, 11, 1},
.version = {1, 12, 1},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
......
......@@ -20,6 +20,7 @@
#include <linux/atomic.h>
#include <linux/blk-mq.h>
#include <linux/mount.h>
#include <linux/dax.h>
#define DM_MSG_PREFIX "table"
......@@ -1630,6 +1631,37 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
return false;
}
static int device_dax_write_cache_enabled(struct dm_target *ti,
struct dm_dev *dev, sector_t start,
sector_t len, void *data)
{
struct dax_device *dax_dev = dev->dax_dev;
if (!dax_dev)
return false;
if (dax_write_cache_enabled(dax_dev))
return true;
return false;
}
static int dm_table_supports_dax_write_cache(struct dm_table *t)
{
struct dm_target *ti;
unsigned i;
for (i = 0; i < dm_table_get_num_targets(t); i++) {
ti = dm_table_get_target(t, i);
if (ti->type->iterate_devices &&
ti->type->iterate_devices(ti,
device_dax_write_cache_enabled, NULL))
return true;
}
return false;
}
static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
......@@ -1785,6 +1817,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
blk_queue_write_cache(q, wc, fua);
if (dm_table_supports_dax_write_cache(t))
dax_write_cache(t->md->dax_dev, true);
/* Ensure that all underlying devices are non-rotational. */
if (dm_table_all_devices_attribute(t, device_is_nonrot))
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
......
......@@ -308,19 +308,14 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
{
unsigned n;
if (!fio->rs) {
fio->rs = mempool_alloc(v->fec->rs_pool, 0);
if (unlikely(!fio->rs)) {
DMERR("failed to allocate RS");
return -ENOMEM;
}
}
if (!fio->rs)
fio->rs = mempool_alloc(v->fec->rs_pool, GFP_NOIO);
fec_for_each_prealloc_buffer(n) {
if (fio->bufs[n])
continue;
fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO);
fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOWAIT);
if (unlikely(!fio->bufs[n])) {
DMERR("failed to allocate FEC buffer");
return -ENOMEM;
......@@ -332,22 +327,16 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
if (fio->bufs[n])
continue;
fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO);
fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOWAIT);
/* we can manage with even one buffer if necessary */
if (unlikely(!fio->bufs[n]))
break;
}
fio->nbufs = n;
if (!fio->output) {
if (!fio->output)
fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO);
if (!fio->output) {
DMERR("failed to allocate FEC page");
return -ENOMEM;
}
}
return 0;
}
......
......@@ -624,7 +624,7 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
if (ret == 0)
ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL);
ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
return ret;
}
......@@ -658,7 +658,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
/* Flush drive cache (this will also sync data) */
if (ret == 0)
ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL);
ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
return ret;
}
......@@ -722,7 +722,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
/* If there are no dirty metadata blocks, just flush the device cache */
if (list_empty(&write_list)) {
ret = blkdev_issue_flush(zmd->dev->bdev, GFP_KERNEL, NULL);
ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
goto out;
}
......@@ -927,7 +927,7 @@ static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
(zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
}
page = alloc_page(GFP_KERNEL);
page = alloc_page(GFP_NOIO);
if (!page)
return -ENOMEM;
......@@ -1183,7 +1183,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
/* Get zone information from disk */
ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
&blkz, &nr_blkz, GFP_KERNEL);
&blkz, &nr_blkz, GFP_NOIO);
if (ret) {
dmz_dev_err(zmd->dev, "Get zone %u report failed",
dmz_id(zmd, zone));
......@@ -1257,7 +1257,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
ret = blkdev_reset_zones(dev->bdev,
dmz_start_sect(zmd, zone),
dev->zone_nr_sectors, GFP_KERNEL);
dev->zone_nr_sectors, GFP_NOIO);
if (ret) {
dmz_dev_err(dev, "Reset zone %u failed %d",
dmz_id(zmd, zone), ret);
......
......@@ -75,7 +75,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
nr_blocks = block - wp_block;
ret = blkdev_issue_zeroout(zrc->dev->bdev,
dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
dmz_blk2sect(nr_blocks), GFP_NOFS, false);
dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
if (ret) {
dmz_dev_err(zrc->dev,
"Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
......
......@@ -541,7 +541,7 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
int ret;
/* Create a new chunk work */
cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOFS);
cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
if (!cw)
goto out;
......@@ -588,7 +588,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
bio->bi_bdev = dev->bdev;
if (!nr_sectors && (bio_op(bio) != REQ_OP_FLUSH) && (bio_op(bio) != REQ_OP_WRITE))
if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
return DM_MAPIO_REMAPPED;
/* The BIO should be block aligned */
......@@ -603,7 +603,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
bioctx->status = BLK_STS_OK;
/* Set the BIO pending in the flush list */
if (bio_op(bio) == REQ_OP_FLUSH || (!nr_sectors && bio_op(bio) == REQ_OP_WRITE)) {
if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
spin_lock(&dmz->flush_lock);
bio_list_add(&dmz->flush_list, bio);
spin_unlock(&dmz->flush_lock);
......@@ -785,7 +785,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
/* Chunk BIO work */
mutex_init(&dmz->chunk_lock);
INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOFS);
INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_KERNEL);
dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
0, dev->name);
if (!dmz->chunk_wq) {
......
......@@ -87,6 +87,7 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t size);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
bool dax_write_cache_enabled(struct dax_device *dax_dev);
/*
* We use lowest available bit in exceptional entry for locking, one bit for
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment