Commit d37977f0 authored by Jens Axboe's avatar Jens Axboe

Merge tag 'md-6.9-20240306' of...

Merge tag 'md-6.9-20240306' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.9/block

Pull MD atomic queue limits changes from Song.

* tag 'md-6.9-20240306' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md:
  block: remove disk_stack_limits
  md: remove mddev->queue
  md: don't initialize queue limits
  md/raid10: use the atomic queue limit update APIs
  md/raid5: use the atomic queue limit update APIs
  md/raid1: use the atomic queue limit update APIs
  md/raid0: use the atomic queue limit update APIs
  md: add queue limit helpers
  md: add a mddev_is_dm helper
  md: add a mddev_add_trace_msg helper
  md: add a mddev_trace_remap helper
parents 34a2cf3f dd27a84b
......@@ -916,30 +916,6 @@ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
}
EXPORT_SYMBOL_GPL(queue_limits_stack_bdev);
/**
* disk_stack_limits - adjust queue limits for stacked drivers
* @disk: MD/DM gendisk (top)
* @bdev: the underlying block device (bottom)
* @offset: offset to beginning of data within component device
*
* Description:
* Merges the limits for a top level gendisk and a bottom level
* block_device.
*/
void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
sector_t offset)
{
struct request_queue *t = disk->queue;
if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits,
get_start_sect(bdev) + (offset >> 9)) < 0)
pr_notice("%s: Warning: Device %pg is misaligned\n",
disk->disk_name, bdev);
disk_update_readahead(disk);
}
EXPORT_SYMBOL(disk_stack_limits);
/**
* blk_queue_update_dma_pad - update pad mask
* @q: the request queue for the device
......
......@@ -1046,8 +1046,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
if (dirty || need_write) {
if (!writing) {
md_bitmap_wait_writes(bitmap);
if (bitmap->mddev->queue)
blk_add_trace_msg(bitmap->mddev->queue,
mddev_add_trace_msg(bitmap->mddev,
"md bitmap_unplug");
}
clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
......@@ -1319,9 +1318,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
}
bitmap->allclean = 1;
if (bitmap->mddev->queue)
blk_add_trace_msg(bitmap->mddev->queue,
"md bitmap_daemon_work");
mddev_add_trace_msg(bitmap->mddev, "md bitmap_daemon_work");
/* Any file-page which is PENDING now needs to be written.
* So set NEEDWRITE now, then after we make any last-minute changes
......
......@@ -65,7 +65,6 @@
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>
#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"
......@@ -2411,7 +2410,7 @@ int md_integrity_register(struct mddev *mddev)
if (list_empty(&mddev->disks))
return 0; /* nothing to do */
if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk))
return 0; /* shouldn't register, or already is */
rdev_for_each(rdev, mddev) {
/* skip spares and non-functional disks */
......@@ -2464,7 +2463,7 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
struct blk_integrity *bi_mddev;
if (!mddev->gendisk)
if (mddev_is_dm(mddev))
return 0;
bi_mddev = blk_get_integrity(mddev->gendisk);
......@@ -2857,8 +2856,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
mdname(mddev), mddev->in_sync);
if (mddev->queue)
blk_add_trace_msg(mddev->queue, "md md_update_sb");
mddev_add_trace_msg(mddev, "md md_update_sb");
rewrite:
md_bitmap_update_sb(mddev->bitmap);
rdev_for_each(rdev, mddev) {
......@@ -4166,7 +4164,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->in_sync = 1;
del_timer_sync(&mddev->safemode_timer);
}
blk_set_stacking_limits(&mddev->queue->limits);
pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
if (!mddev->thread)
......@@ -5753,6 +5750,51 @@ static const struct kobj_type md_ktype = {
int mdp_major = 0;
/* stack the limit for all rdevs into lim */
void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim)
{
struct md_rdev *rdev;
rdev_for_each(rdev, mddev) {
queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
mddev->gendisk->disk_name);
}
}
EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
/* apply the extra stacking limits from a new rdev into mddev */
int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
struct queue_limits lim;
if (mddev_is_dm(mddev))
return 0;
lim = queue_limits_start_update(mddev->gendisk->queue);
queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
mddev->gendisk->disk_name);
return queue_limits_commit_update(mddev->gendisk->queue, &lim);
}
EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
/* update the optimal I/O size after a reshape */
void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
{
struct queue_limits lim;
if (mddev_is_dm(mddev))
return;
/* don't bother updating io_opt if we can't suspend the array */
if (mddev_suspend(mddev, false) < 0)
return;
lim = queue_limits_start_update(mddev->gendisk->queue);
lim.io_opt = lim.io_min * nr_stripes;
queue_limits_commit_update(mddev->gendisk->queue, &lim);
mddev_resume(mddev);
}
EXPORT_SYMBOL_GPL(mddev_update_io_opt);
static void mddev_delayed_delete(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
......@@ -5835,9 +5877,7 @@ struct mddev *md_alloc(dev_t dev, char *name)
disk->fops = &md_fops;
disk->private_data = mddev;
mddev->queue = disk->queue;
blk_set_stacking_limits(&mddev->queue->limits);
blk_queue_write_cache(mddev->queue, true, true);
blk_queue_write_cache(disk->queue, true, true);
disk->events |= DISK_EVENT_MEDIA_CHANGE;
mddev->gendisk = disk;
error = add_disk(disk);
......@@ -5979,7 +6019,7 @@ int md_run(struct mddev *mddev)
invalidate_bdev(rdev->bdev);
if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
mddev->ro = MD_RDONLY;
if (mddev->gendisk)
if (!mddev_is_dm(mddev))
set_disk_ro(mddev->gendisk, 1);
}
......@@ -6141,7 +6181,8 @@ int md_run(struct mddev *mddev)
}
}
if (mddev->queue) {
if (!mddev_is_dm(mddev)) {
struct request_queue *q = mddev->gendisk->queue;
bool nonrot = true;
rdev_for_each(rdev, mddev) {
......@@ -6153,14 +6194,14 @@ int md_run(struct mddev *mddev)
if (mddev->degraded)
nonrot = false;
if (nonrot)
blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
else
blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue);
blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
/* Set the NOWAIT flags if all underlying devices support it */
if (nowait)
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
}
if (pers->sync_request) {
if (mddev->kobj.sd &&
......@@ -6406,8 +6447,10 @@ static void mddev_detach(struct mddev *mddev)
mddev->pers->quiesce(mddev, 0);
}
md_unregister_thread(mddev, &mddev->thread);
if (mddev->queue)
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
/* the unplug fn references 'conf' */
if (!mddev_is_dm(mddev))
blk_sync_queue(mddev->gendisk->queue);
}
static void __md_stop(struct mddev *mddev)
......@@ -7125,7 +7168,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
if (!bdev_nowait(rdev->bdev)) {
pr_info("%s: Disabling nowait because %pg does not support nowait\n",
mdname(mddev), rdev->bdev);
blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue);
}
/*
* Kick recovery, maybe this spare has to be added to the
......@@ -7362,11 +7405,10 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
if (!rv) {
if (mddev_is_clustered(mddev))
md_cluster_ops->update_size(mddev, old_dev_sectors);
else if (mddev->queue) {
else if (!mddev_is_dm(mddev))
set_capacity_and_notify(mddev->gendisk,
mddev->array_sectors);
}
}
return rv;
}
......@@ -8686,10 +8728,7 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
bio_chain(discard_bio, bio);
bio_clone_blkg_association(discard_bio, bio);
if (mddev->gendisk)
trace_block_bio_remap(discard_bio,
disk_devt(mddev->gendisk),
bio->bi_iter.bi_sector);
mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector);
submit_bio_noacct(discard_bio);
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
......@@ -9182,7 +9221,7 @@ void md_do_sync(struct md_thread *thread)
mddev->delta_disks > 0 &&
mddev->pers->finish_reshape &&
mddev->pers->size &&
mddev->queue) {
!mddev_is_dm(mddev)) {
mddev_lock_nointr(mddev);
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
mddev_unlock(mddev);
......
......@@ -18,6 +18,7 @@
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <trace/events/block.h>
#include "md-cluster.h"
#define MaxSector (~(sector_t)0)
......@@ -479,7 +480,6 @@ struct mddev {
struct timer_list safemode_timer;
struct percpu_ref writes_pending;
int sync_checkers; /* # of threads checking writes_pending */
struct request_queue *queue; /* for plugging ... */
struct bitmap *bitmap; /* the bitmap for the device */
struct {
......@@ -868,7 +868,7 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio
{
if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
!bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
mddev->queue->limits.max_write_zeroes_sectors = 0;
mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0;
}
static inline int mddev_suspend_and_lock(struct mddev *mddev)
......@@ -907,7 +907,31 @@ void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
int do_md_run(struct mddev *mddev);
void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim);
int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev);
void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes);
extern const struct block_device_operations md_fops;
/*
* MD devices can be used undeneath by DM, in which case ->gendisk is NULL.
*/
static inline bool mddev_is_dm(struct mddev *mddev)
{
return !mddev->gendisk;
}
static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
sector_t sector)
{
if (!mddev_is_dm(mddev))
trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector);
}
#define mddev_add_trace_msg(mddev, fmt, args...) \
do { \
if (!mddev_is_dm(mddev)) \
blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \
} while (0)
#endif /* _MD_MD_H */
......@@ -379,6 +379,19 @@ static void raid0_free(struct mddev *mddev, void *priv)
free_conf(mddev, conf);
}
static int raid0_set_limits(struct mddev *mddev)
{
struct queue_limits lim;
blk_set_stacking_limits(&lim);
lim.max_hw_sectors = mddev->chunk_sectors;
lim.max_write_zeroes_sectors = mddev->chunk_sectors;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * mddev->raid_disks;
mddev_stack_rdev_limits(mddev, &lim);
return queue_limits_set(mddev->gendisk->queue, &lim);
}
static int raid0_run(struct mddev *mddev)
{
struct r0conf *conf;
......@@ -399,20 +412,10 @@ static int raid0_run(struct mddev *mddev)
mddev->private = conf;
}
conf = mddev->private;
if (mddev->queue) {
struct md_rdev *rdev;
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
(mddev->chunk_sectors << 9) * mddev->raid_disks);
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
}
if (!mddev_is_dm(mddev)) {
ret = raid0_set_limits(mddev);
if (ret)
goto out_free_conf;
}
/* calculate array device size */
......@@ -426,8 +429,10 @@ static int raid0_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret)
goto out_free_conf;
return 0;
out_free_conf:
free_conf(mddev, conf);
return ret;
}
......@@ -578,10 +583,7 @@ static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;
if (mddev->gendisk)
trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
bio_sector);
mddev_trace_remap(mddev, bio, bio_sector);
mddev_check_write_zeroes(mddev, bio);
submit_bio_noacct(bio);
}
......
......@@ -46,9 +46,6 @@
static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#define raid1_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
#define RAID_1_10_NAME "raid1"
#include "raid1-10.c"
......@@ -1196,7 +1193,7 @@ static void freeze_array(struct r1conf *conf, int extra)
*/
spin_lock_irq(&conf->resync_lock);
conf->array_frozen = 1;
raid1_log(conf->mddev, "wait freeze");
mddev_add_trace_msg(conf->mddev, "raid1 wait freeze");
wait_event_lock_irq_cmd(
conf->wait_barrier,
get_unqueued_pending(conf) == extra,
......@@ -1385,7 +1382,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
* Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind'
*/
raid1_log(mddev, "wait behind writes");
mddev_add_trace_msg(mddev, "raid1 wait behind writes");
wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0);
}
......@@ -1418,11 +1415,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
test_bit(R1BIO_FailFast, &r1_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r1_bio;
if (mddev->gendisk)
trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
r1_bio->sector);
mddev_trace_remap(mddev, read_bio, r1_bio->sector);
submit_bio_noacct(read_bio);
}
......@@ -1572,7 +1565,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
bio_wouldblock_error(bio);
return;
}
raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked",
blocked_rdev->raid_disk);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf, bio->bi_iter.bi_sector, false);
goto retry_write;
......@@ -1655,10 +1649,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);
if (mddev->gendisk)
trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
r1_bio->sector);
mddev_trace_remap(mddev, mbio, r1_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_bdev = (void *)rdev;
if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
......@@ -1935,12 +1926,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors + mirror;
if (!p->rdev) {
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
err = mddev_stack_new_rdev(mddev, rdev);
if (err)
return err;
raid1_add_conf(conf, rdev, mirror, false);
err = 0;
/* As all devices are equivalent, we don't need a full recovery
* if this was recently any drive of the array
*/
......@@ -3204,12 +3194,21 @@ static struct r1conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
static int raid1_set_limits(struct mddev *mddev)
{
struct queue_limits lim;
blk_set_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
mddev_stack_rdev_limits(mddev, &lim);
return queue_limits_set(mddev->gendisk->queue, &lim);
}
static void raid1_free(struct mddev *mddev, void *priv);
static int raid1_run(struct mddev *mddev)
{
struct r1conf *conf;
int i;
struct md_rdev *rdev;
int ret;
if (mddev->level != 1) {
......@@ -3236,14 +3235,10 @@ static int raid1_run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
if (mddev->queue)
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
continue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
if (!mddev_is_dm(mddev)) {
ret = raid1_set_limits(mddev);
if (ret)
goto abort;
}
mddev->degraded = 0;
......
......@@ -76,9 +76,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);
#define raid10_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
#include "raid1-10.c"
#define NULL_CMD
......@@ -1019,7 +1016,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
ret = false;
} else {
conf->nr_waiting++;
raid10_log(conf->mddev, "wait barrier");
mddev_add_trace_msg(conf->mddev, "raid10 wait barrier");
wait_event_barrier(conf, stop_waiting_barrier(conf));
conf->nr_waiting--;
}
......@@ -1138,7 +1135,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
bio_wouldblock_error(bio);
return false;
}
raid10_log(conf->mddev, "wait reshape");
mddev_add_trace_msg(conf->mddev, "raid10 wait reshape");
wait_event(conf->wait_barrier,
conf->reshape_progress <= bio->bi_iter.bi_sector ||
conf->reshape_progress >= bio->bi_iter.bi_sector +
......@@ -1235,10 +1232,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
test_bit(R10BIO_FailFast, &r10_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r10_bio;
if (mddev->gendisk)
trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
mddev_trace_remap(mddev, read_bio, r10_bio->sector);
submit_bio_noacct(read_bio);
return;
}
......@@ -1274,10 +1268,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
&& enough(conf, devnum))
mbio->bi_opf |= MD_FAILFAST;
mbio->bi_private = r10_bio;
if (conf->mddev->gendisk)
trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
r10_bio->sector);
mddev_trace_remap(mddev, mbio, r10_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_bdev = (void *)rdev;
......@@ -1342,7 +1333,8 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
if (unlikely(blocked_rdev)) {
/* Have to wait for this device to get unblocked, then retry */
allow_barrier(conf);
raid10_log(conf->mddev, "%s wait rdev %d blocked",
mddev_add_trace_msg(conf->mddev,
"raid10 %s wait rdev %d blocked",
__func__, blocked_rdev->raid_disk);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf, false);
......@@ -1398,7 +1390,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
bio_wouldblock_error(bio);
return;
}
raid10_log(conf->mddev, "wait reshape metadata");
mddev_add_trace_msg(conf->mddev,
"raid10 wait reshape metadata");
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
......@@ -2113,10 +2106,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
continue;
}
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
err = mddev_stack_new_rdev(mddev, rdev);
if (err)
return err;
p->head_position = 0;
p->recovery_disabled = mddev->recovery_disabled - 1;
rdev->raid_disk = mirror;
......@@ -2132,10 +2124,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = repl_slot;
err = 0;
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
err = mddev_stack_new_rdev(mddev, rdev);
if (err)
return err;
conf->fullsync = 1;
WRITE_ONCE(p->replacement, rdev);
}
......@@ -3976,14 +3967,26 @@ static struct r10conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
static void raid10_set_io_opt(struct r10conf *conf)
static unsigned int raid10_nr_stripes(struct r10conf *conf)
{
int raid_disks = conf->geo.raid_disks;
unsigned int raid_disks = conf->geo.raid_disks;
if (conf->geo.raid_disks % conf->geo.near_copies)
return raid_disks;
return raid_disks / conf->geo.near_copies;
}
if (!(conf->geo.raid_disks % conf->geo.near_copies))
raid_disks /= conf->geo.near_copies;
blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
raid_disks);
static int raid10_set_queue_limits(struct mddev *mddev)
{
struct r10conf *conf = mddev->private;
struct queue_limits lim;
blk_set_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
mddev_stack_rdev_limits(mddev, &lim);
return queue_limits_set(mddev->gendisk->queue, &lim);
}
static int raid10_run(struct mddev *mddev)
......@@ -3995,6 +3998,7 @@ static int raid10_run(struct mddev *mddev)
sector_t size;
sector_t min_offset_diff = 0;
int first = 1;
int ret = -EIO;
if (mddev->private == NULL) {
conf = setup_conf(mddev);
......@@ -4021,12 +4025,6 @@ static int raid10_run(struct mddev *mddev)
}
}
if (mddev->queue) {
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
raid10_set_io_opt(conf);
}
rdev_for_each(rdev, mddev) {
long long diff;
......@@ -4055,14 +4053,16 @@ static int raid10_run(struct mddev *mddev)
if (first || diff < min_offset_diff)
min_offset_diff = diff;
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk->head_position = 0;
first = 0;
}
if (!mddev_is_dm(conf->mddev)) {
ret = raid10_set_queue_limits(mddev);
if (ret)
goto out_free_conf;
}
/* need to check that every block has at least one working mirror */
if (!enough(conf, -1)) {
pr_err("md/raid10:%s: not enough operational mirrors.\n",
......@@ -4163,7 +4163,7 @@ static int raid10_run(struct mddev *mddev)
raid10_free_conf(conf);
mddev->private = NULL;
out:
return -EIO;
return ret;
}
static void raid10_free(struct mddev *mddev, void *priv)
......@@ -4940,8 +4940,7 @@ static void end_reshape(struct r10conf *conf)
conf->reshape_safe = MaxSector;
spin_unlock_irq(&conf->device_lock);
if (conf->mddev->queue)
raid10_set_io_opt(conf);
mddev_update_io_opt(conf->mddev, raid10_nr_stripes(conf));
conf->fullsync = 0;
}
......
......@@ -1393,7 +1393,8 @@ int ppl_init_log(struct r5conf *conf)
ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
ppl_conf->block_size = 512;
} else {
ppl_conf->block_size = queue_logical_block_size(mddev->queue);
ppl_conf->block_size =
queue_logical_block_size(mddev->gendisk->queue);
}
for (i = 0; i < ppl_conf->count; i++) {
......
......@@ -1295,10 +1295,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
if (conf->mddev->gendisk)
trace_block_bio_remap(bi,
disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
mddev_trace_remap(conf->mddev, bi, sh->dev[i].sector);
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, bi);
else
......@@ -1342,10 +1339,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
*/
if (op == REQ_OP_DISCARD)
rbi->bi_vcnt = 0;
if (conf->mddev->gendisk)
trace_block_bio_remap(rbi,
disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
mddev_trace_remap(conf->mddev, rbi, sh->dev[i].sector);
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, rbi);
else
......@@ -2422,12 +2416,12 @@ static int grow_stripes(struct r5conf *conf, int num)
size_t namelen = sizeof(conf->cache_name[0]);
int devs = max(conf->raid_disks, conf->previous_raid_disks);
if (conf->mddev->gendisk)
if (mddev_is_dm(conf->mddev))
snprintf(conf->cache_name[0], namelen,
"raid%d-%s", conf->level, mdname(conf->mddev));
"raid%d-%p", conf->level, conf->mddev);
else
snprintf(conf->cache_name[0], namelen,
"raid%d-%p", conf->level, conf->mddev);
"raid%d-%s", conf->level, mdname(conf->mddev));
snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
conf->active_name = 0;
......@@ -4201,10 +4195,9 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
if (conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue,
"raid5 rmw %llu %d",
(unsigned long long)sh->sector, rmw);
mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d",
sh->sector, rmw);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_InJournal, &dev->flags) &&
......@@ -4281,10 +4274,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_DELAYED, &sh->state);
}
}
if (rcw && conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
(unsigned long long)sh->sector,
rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
if (rcw && !mddev_is_dm(conf->mddev))
blk_add_trace_msg(conf->mddev->gendisk->queue,
"raid5 rcw %llu %d %d %d",
(unsigned long long)sh->sector, rcw, qread,
test_bit(STRIPE_DELAYED, &sh->state));
}
if (rcw > disks && rmw > disks &&
......@@ -5523,9 +5517,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
spin_unlock_irq(&conf->device_lock);
}
if (mddev->gendisk)
trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
raid_bio->bi_iter.bi_sector);
mddev_trace_remap(mddev, align_bio, raid_bio->bi_iter.bi_sector);
submit_bio_noacct(align_bio);
return 1;
}
......@@ -5694,8 +5686,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
}
release_inactive_stripe_list(conf, cb->temp_inactive_list,
NR_STRIPE_HASH_LOCKS);
if (mddev->queue)
trace_block_unplug(mddev->queue, cnt, !from_schedule);
if (!mddev_is_dm(mddev))
trace_block_unplug(mddev->gendisk->queue, cnt, !from_schedule);
kfree(cb);
}
......@@ -7098,7 +7090,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
if (!conf)
err = -ENODEV;
else if (new != conf->skip_copy) {
struct request_queue *q = mddev->queue;
struct request_queue *q = mddev->gendisk->queue;
conf->skip_copy = new;
if (new)
......@@ -7700,10 +7692,65 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
return 0;
}
static void raid5_set_io_opt(struct r5conf *conf)
static int raid5_set_limits(struct mddev *mddev)
{
blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
(conf->raid_disks - conf->max_degraded));
struct r5conf *conf = mddev->private;
struct queue_limits lim;
int data_disks, stripe;
struct md_rdev *rdev;
/*
* The read-ahead size must cover two whole stripes, which is
* 2 * (datadisks) * chunksize where 'n' is the number of raid devices.
*/
data_disks = conf->previous_raid_disks - conf->max_degraded;
/*
* We can only discard a whole stripe. It doesn't make sense to
* discard data disk but write parity disk
*/
stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
blk_set_stacking_limits(&lim);
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
lim.raid_partial_stripes_expensive = 1;
lim.discard_granularity = stripe;
lim.max_write_zeroes_sectors = 0;
mddev_stack_rdev_limits(mddev, &lim);
rdev_for_each(rdev, mddev)
queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset,
mddev->gendisk->disk_name);
/*
* Zeroing is required for discard, otherwise data could be lost.
*
* Consider a scenario: discard a stripe (the stripe could be
* inconsistent if discard_zeroes_data is 0); write one disk of the
* stripe (the stripe could be inconsistent again depending on which
* disks are used to calculate parity); the disk is broken; The stripe
* data of this disk is lost.
*
* We only allow DISCARD if the sysadmin has confirmed that only safe
* devices are in use by setting a module parameter. A better idea
* might be to turn DISCARD into WRITE_ZEROES requests, as that is
* required to be safe.
*/
if (!devices_handle_discard_safely ||
lim.max_discard_sectors < (stripe >> 9) ||
lim.discard_granularity < stripe)
lim.max_hw_discard_sectors = 0;
/*
* Requests require having a bitmap for each stripe.
* Limit the max sectors based on this.
*/
lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf);
/* No restrictions on the number of segments in the request */
lim.max_segments = USHRT_MAX;
return queue_limits_set(mddev->gendisk->queue, &lim);
}
static int raid5_run(struct mddev *mddev)
......@@ -7716,6 +7763,7 @@ static int raid5_run(struct mddev *mddev)
int i;
long long min_offset_diff = 0;
int first = 1;
int ret = -EIO;
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
......@@ -7968,66 +8016,10 @@ static int raid5_run(struct mddev *mddev)
mdname(mddev));
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
if (mddev->queue) {
int chunk_size;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
*/
int data_disks = conf->previous_raid_disks - conf->max_degraded;
int stripe = data_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
raid5_set_io_opt(conf);
mddev->queue->limits.raid_partial_stripes_expensive = 1;
/*
* We can only discard a whole stripe. It doesn't make sense to
* discard data disk but write parity disk
*/
stripe = stripe * PAGE_SIZE;
stripe = roundup_pow_of_two(stripe);
mddev->queue->limits.discard_granularity = stripe;
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
}
/*
* zeroing is required, otherwise data
* could be lost. Consider a scenario: discard a stripe
* (the stripe could be inconsistent if
* discard_zeroes_data is 0); write one disk of the
* stripe (the stripe could be inconsistent again
* depending on which disks are used to calculate
* parity); the disk is broken; The stripe data of this
* disk is lost.
*
* We only allow DISCARD if the sysadmin has confirmed that
* only safe devices are in use by setting a module parameter.
* A better idea might be to turn DISCARD into WRITE_ZEROES
* requests, as that is required to be safe.
*/
if (!devices_handle_discard_safely ||
mddev->queue->limits.max_discard_sectors < (stripe >> 9) ||
mddev->queue->limits.discard_granularity < stripe)
blk_queue_max_discard_sectors(mddev->queue, 0);
/*
* Requests require having a bitmap for each stripe.
* Limit the max sectors based on this.
*/
blk_queue_max_hw_sectors(mddev->queue,
RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
/* No restrictions on the number of segments in the request */
blk_queue_max_segments(mddev->queue, USHRT_MAX);
if (!mddev_is_dm(mddev)) {
ret = raid5_set_limits(mddev);
if (ret)
goto abort;
}
if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
......@@ -8040,7 +8032,7 @@ static int raid5_run(struct mddev *mddev)
free_conf(conf);
mddev->private = NULL;
pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
return -EIO;
return ret;
}
static void raid5_free(struct mddev *mddev, void *priv)
......@@ -8572,8 +8564,8 @@ static void end_reshape(struct r5conf *conf)
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
if (conf->mddev->queue)
raid5_set_io_opt(conf);
mddev_update_io_opt(conf->mddev,
conf->raid_disks - conf->max_degraded);
}
}
......
......@@ -926,8 +926,6 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t offset);
void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
sector_t offset, const char *pfx);
extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
sector_t offset);
extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment