Commit 0c9f4ac8 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - Add a partscan attribute in sysfs, fixing an issue with systemd
   relying on an internal interface that went away.

 - Attempt #2 at making long running discards interruptible. The
   previous attempt went into 6.9, but we ended up mostly reverting it
   as it had issues.

 - Remove old ida_simple API in bcache

 - Support for zoned write plugging, greatly improving the performance
   on zoned devices.

 - Remove the old throttle low interface, which has been experimental
   since 2017 and never made it beyond that and isn't being used.

 - Remove page->index debugging checks in brd, as it hasn't caught
   anything and prepares us for removing in struct page.

 - MD pull request from Song

 - Don't schedule block workers on isolated CPUs

* tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux: (84 commits)
  blk-throttle: delay initialization until configuration
  blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW
  block: fix that util can be greater than 100%
  block: support to account io_ticks precisely
  block: add plug while submitting IO
  bcache: fix variable length array abuse in btree_iter
  bcache: Remove usage of the deprecated ida_simple_xx() API
  md: Revert "md: Fix overflow in is_mddev_idle"
  blk-lib: check for kill signal in ioctl BLKDISCARD
  block: add a bio_await_chain helper
  block: add a blk_alloc_discard_bio helper
  block: add a bio_chain_and_submit helper
  block: move discard checks into the ioctl handler
  block: remove the discard_granularity check in __blkdev_issue_discard
  block/ioctl: prefer different overflow check
  null_blk: Fix the WARNING: modpost: missing MODULE_DESCRIPTION()
  block: fix and simplify blkdevparts= cmdline parsing
  block: refine the EOF check in blkdev_iomap_begin
  block: add a partscan sysfs attribute for disks
  block: add a disk_has_partscan helper
  ...
parents 9961a785 a3166c51
......@@ -101,6 +101,16 @@ Description:
devices that support receiving integrity metadata.
What: /sys/block/<disk>/partscan
Date: May 2024
Contact: Christoph Hellwig <hch@lst.de>
Description:
The /sys/block/<disk>/partscan files reports if partition
scanning is enabled for the disk. It returns "1" if partition
scanning is enabled, or "0" if not. The value type is a 32-bit
unsigned integer, but only "0" and "1" are valid values.
What: /sys/block/<disk>/<partition>/alignment_offset
Date: April 2009
Contact: Martin K. Petersen <martin.petersen@oracle.com>
......@@ -584,18 +594,6 @@ Description:
the data. If no such restriction exists, this file will contain
'0'. This file is writable for testing purposes.
What: /sys/block/<disk>/queue/throttle_sample_time
Date: March 2017
Contact: linux-block@vger.kernel.org
Description:
[RW] This is the time window that blk-throttle samples data, in
millisecond. blk-throttle makes decision based on the
samplings. Lower time means cgroups have more smooth throughput,
but higher CPU overhead. This exists only when
CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
What: /sys/block/<disk>/queue/virt_boundary_mask
Date: April 2021
Contact: linux-block@vger.kernel.org
......
......@@ -76,7 +76,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
CONFIG_MODVERSIONS=y
CONFIG_BLK_DEV_ZONED=y
CONFIG_BLK_DEV_THROTTLING=y
CONFIG_BLK_DEV_THROTTLING_LOW=y
CONFIG_BLK_WBT=y
CONFIG_BLK_CGROUP_IOLATENCY=y
CONFIG_BLK_CGROUP_FC_APPID=y
......
......@@ -100,7 +100,6 @@ config BLK_DEV_WRITE_MOUNTED
config BLK_DEV_ZONED
bool "Zoned block device support"
select MQ_IOSCHED_DEADLINE
help
Block layer zoned block device support. This option enables
support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
......@@ -120,17 +119,6 @@ config BLK_DEV_THROTTLING
See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
config BLK_DEV_THROTTLING_LOW
bool "Block throttling .low limit interface support (EXPERIMENTAL)"
depends on BLK_DEV_THROTTLING
help
Add .low limit interface for block throttling. The low limit is a best
effort limit to prioritize cgroups. Depending on the setting, the limit
can be used to protect cgroups in terms of bandwidth/iops and better
utilize disk resource.
Note, this is an experimental interface and could be changed someday.
config BLK_WBT
bool "Enable support for block device writeback throttling"
help
......@@ -198,10 +186,6 @@ config BLK_DEBUG_FS
Unless you are building a kernel for a tiny system, you should
say Y here.
config BLK_DEBUG_FS_ZONED
bool
default BLK_DEBUG_FS && BLK_DEV_ZONED
config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
depends on KEYS
......
......@@ -33,7 +33,6 @@ obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
......
......@@ -345,18 +345,29 @@ void bio_chain(struct bio *bio, struct bio *parent)
}
EXPORT_SYMBOL(bio_chain);
struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
/**
* bio_chain_and_submit - submit a bio after chaining it to another one
* @prev: bio to chain and submit
* @new: bio to chain to
*
* If @prev is non-NULL, chain it to @new and submit it.
*
* Return: @new.
*/
struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new)
{
struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
if (bio) {
bio_chain(bio, new);
submit_bio(bio);
if (prev) {
bio_chain(prev, new);
submit_bio(prev);
}
return new;
}
struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
{
return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp));
}
EXPORT_SYMBOL_GPL(blk_next_bio);
static void bio_alloc_rescue(struct work_struct *work)
......@@ -1384,6 +1395,26 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
static void bio_wait_end_io(struct bio *bio)
{
complete(bio->bi_private);
bio_put(bio);
}
/*
* bio_await_chain - ends @bio and waits for every chained bio to complete
*/
void bio_await_chain(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
bio->bi_private = &done;
bio->bi_end_io = bio_wait_end_io;
bio_endio(bio);
blk_wait_io(&done);
}
void __bio_advance(struct bio *bio, unsigned bytes)
{
if (bio_integrity(bio))
......@@ -1576,6 +1607,8 @@ void bio_endio(struct bio *bio)
if (!bio_integrity_endio(bio))
return;
blk_zone_bio_endio(bio);
rq_qos_done_bio(bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
......@@ -1596,7 +1629,6 @@ void bio_endio(struct bio *bio)
goto again;
}
blk_throtl_bio_endio(bio);
/* release cgroup info */
bio_uninit(bio);
if (bio->bi_end_io)
......
......@@ -9,25 +9,19 @@ int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
{
int i, ret;
for (i = 0; i < BLKG_RWSTAT_NR; i++) {
ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
if (ret) {
while (--i >= 0)
percpu_counter_destroy(&rwstat->cpu_cnt[i]);
ret = percpu_counter_init_many(rwstat->cpu_cnt, 0, gfp, BLKG_RWSTAT_NR);
if (ret)
return ret;
}
for (i = 0; i < BLKG_RWSTAT_NR; i++)
atomic64_set(&rwstat->aux_cnt[i], 0);
}
return 0;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_init);
void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
{
int i;
for (i = 0; i < BLKG_RWSTAT_NR; i++)
percpu_counter_destroy(&rwstat->cpu_cnt[i]);
percpu_counter_destroy_many(rwstat->cpu_cnt, BLKG_RWSTAT_NR);
}
EXPORT_SYMBOL_GPL(blkg_rwstat_exit);
......
......@@ -218,8 +218,7 @@ static void blkg_async_bio_workfn(struct work_struct *work)
/* as long as there are pending bios, @blkg can't go away */
spin_lock(&blkg->async_bio_lock);
bio_list_merge(&bios, &blkg->async_bios);
bio_list_init(&blkg->async_bios);
bio_list_merge_init(&bios, &blkg->async_bios);
spin_unlock(&blkg->async_bio_lock);
/* start plug only when bio_list contains at least 2 bios */
......@@ -1444,14 +1443,8 @@ int blkcg_init_disk(struct gendisk *disk)
if (ret)
goto err_destroy_all;
ret = blk_throtl_init(disk);
if (ret)
goto err_ioprio_exit;
return 0;
err_ioprio_exit:
blk_ioprio_exit(disk);
err_destroy_all:
blkg_destroy_all(disk);
return ret;
......
......@@ -591,8 +591,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_NOTSUPP;
/* The bio sector must point to the start of a sequential zone */
if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
!bio_zone_is_seq(bio))
if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
return BLK_STS_IOERR;
/*
......@@ -604,7 +603,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_IOERR;
/* Make sure the BIO is small enough and will not get split */
if (nr_sectors > q->limits.max_zone_append_sectors)
if (nr_sectors > queue_max_zone_append_sectors(q))
return BLK_STS_IOERR;
bio->bi_opf |= REQ_NOMERGE;
......@@ -649,11 +648,13 @@ static void __submit_bio(struct bio *bio)
static void __submit_bio_noacct(struct bio *bio)
{
struct bio_list bio_list_on_stack[2];
struct blk_plug plug;
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack[0]);
current->bio_list = bio_list_on_stack;
blk_start_plug(&plug);
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
......@@ -687,19 +688,23 @@ static void __submit_bio_noacct(struct bio *bio)
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
} while ((bio = bio_list_pop(&bio_list_on_stack[0])));
blk_finish_plug(&plug);
current->bio_list = NULL;
}
static void __submit_bio_noacct_mq(struct bio *bio)
{
struct bio_list bio_list[2] = { };
struct blk_plug plug;
current->bio_list = bio_list;
blk_start_plug(&plug);
do {
__submit_bio(bio);
} while ((bio = bio_list_pop(&bio_list[0])));
blk_finish_plug(&plug);
current->bio_list = NULL;
}
......@@ -910,12 +915,6 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
/*
* As the requests that require a zone lock are not plugged in the
* first place, directly accessing the plug instead of using
* blk_mq_plug() should not have any consequences during flushing for
* zoned devices.
*/
blk_flush_plug(current->plug, false);
/*
......@@ -987,10 +986,11 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end)
unsigned long stamp;
again:
stamp = READ_ONCE(part->bd_stamp);
if (unlikely(time_after(now, stamp))) {
if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
__part_stat_add(part, io_ticks, end ? now - stamp : 1);
}
if (unlikely(time_after(now, stamp)) &&
likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
(end || part_in_flight(part)))
__part_stat_add(part, io_ticks, now - stamp);
if (part->bd_partno) {
part = bdev_whole(part);
goto again;
......
......@@ -130,6 +130,8 @@ static void blk_flush_restore_request(struct request *rq)
* original @rq->bio. Restore it.
*/
rq->bio = rq->biotail;
if (rq->bio)
rq->__sector = rq->bio->bi_iter.bi_sector;
/* make @rq a normal request */
rq->rq_flags &= ~RQF_FLUSH_SEQ;
......
......@@ -35,51 +35,39 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT;
}
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
struct bio *blk_alloc_discard_bio(struct block_device *bdev,
sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask)
{
struct bio *bio = *biop;
sector_t bs_mask;
if (bdev_read_only(bdev))
return -EPERM;
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
/* In case the discard granularity isn't set by buggy device driver */
if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) {
pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n",
bdev);
return -EOPNOTSUPP;
}
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
if (!nr_sects)
return -EINVAL;
while (nr_sects) {
sector_t req_sects =
min(nr_sects, bio_discard_limit(bdev, sector));
sector_t bio_sects = min(*nr_sects, bio_discard_limit(bdev, *sector));
struct bio *bio;
bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio->bi_iter.bi_size = req_sects << 9;
sector += req_sects;
nr_sects -= req_sects;
if (!bio_sects)
return NULL;
bio = bio_alloc(bdev, 0, REQ_OP_DISCARD, gfp_mask);
if (!bio)
return NULL;
bio->bi_iter.bi_sector = *sector;
bio->bi_iter.bi_size = bio_sects << SECTOR_SHIFT;
*sector += bio_sects;
*nr_sects -= bio_sects;
/*
* We can loop for a long time in here, if someone does
* full device discards (like mkfs). Be nice and allow
* us to schedule out to avoid softlocking if preempt
* is disabled.
* We can loop for a long time in here if someone does full device
* discards (like mkfs). Be nice and allow us to schedule out to avoid
* softlocking if preempt is disabled.
*/
cond_resched();
}
return bio;
}
*biop = bio;
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
{
struct bio *bio;
while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
gfp_mask)))
*biop = bio_chain_and_submit(*biop, bio);
return 0;
}
EXPORT_SYMBOL(__blkdev_issue_discard);
......
......@@ -377,6 +377,7 @@ struct bio *__bio_split_to_limits(struct bio *bio,
blkcg_bio_issue_init(split);
bio_chain(split, bio);
trace_block_split(split, bio->bi_iter.bi_sector);
WARN_ON_ONCE(bio_zone_write_plugging(bio));
submit_bio_noacct(bio);
return split;
}
......@@ -779,6 +780,8 @@ static void blk_account_io_merge_request(struct request *req)
if (blk_do_io_stat(req)) {
part_stat_lock();
part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
part_stat_local_dec(req->part,
in_flight[op_is_write(req_op(req))]);
part_stat_unlock();
}
}
......@@ -972,13 +975,7 @@ static void blk_account_io_merge_bio(struct request *req)
part_stat_unlock();
}
enum bio_merge_status {
BIO_MERGE_OK,
BIO_MERGE_NONE,
BIO_MERGE_FAILED,
};
static enum bio_merge_status bio_attempt_back_merge(struct request *req,
enum bio_merge_status bio_attempt_back_merge(struct request *req,
struct bio *bio, unsigned int nr_segs)
{
const blk_opf_t ff = bio_failfast(bio);
......@@ -994,6 +991,9 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req,
blk_update_mixed_merge(req, bio, false);
if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
blk_zone_write_plug_bio_merged(bio);
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
......@@ -1009,6 +1009,14 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
{
const blk_opf_t ff = bio_failfast(bio);
/*
* A front merge for writes to sequential zones of a zoned block device
* can happen only if the user submitted writes out of order. Do not
* merge such write to let it fail.
*/
if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
return BIO_MERGE_FAILED;
if (!ll_front_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
......@@ -1107,10 +1115,9 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
struct blk_plug *plug;
struct blk_plug *plug = current->plug;
struct request *rq;
plug = blk_mq_plug(bio);
if (!plug || rq_list_empty(plug->mq_list))
return false;
......
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2017 Western Digital Corporation or its affiliates.
*/
#include <linux/blkdev.h>
#include "blk-mq-debugfs.h"
int queue_zone_wlock_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
unsigned int i;
if (!q->disk->seq_zones_wlock)
return 0;
for (i = 0; i < q->disk->nr_zones; i++)
if (test_bit(i, q->disk->seq_zones_wlock))
seq_printf(m, "%u\n", i);
return 0;
}
......@@ -160,7 +160,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
{ "pm_only", 0600, queue_pm_only_show, NULL },
{ "state", 0600, queue_state_show, queue_state_write },
{ "zone_wlock", 0400, queue_zone_wlock_show, NULL },
{ "zone_wplugs", 0400, queue_zone_wplugs_show, NULL },
{ },
};
......@@ -256,7 +256,6 @@ static const char *const rqf_name[] = {
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(TIMED_OUT),
RQF_NAME(RESV),
};
......
......@@ -83,10 +83,10 @@ static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
}
#endif
#ifdef CONFIG_BLK_DEBUG_FS_ZONED
int queue_zone_wlock_show(void *data, struct seq_file *m);
#if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
int queue_zone_wplugs_show(void *data, struct seq_file *m);
#else
static inline int queue_zone_wlock_show(void *data, struct seq_file *m)
static inline int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
return 0;
}
......
This diff is collapsed.
......@@ -365,37 +365,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
qmap->mq_map[cpu] = 0;
}
/*
* blk_mq_plug() - Get caller context plug
* @bio : the bio being submitted by the caller context
*
* Plugging, by design, may delay the insertion of BIOs into the elevator in
* order to increase BIO merging opportunities. This however can cause BIO
* insertion order to change from the order in which submit_bio() is being
* executed in the case of multiple contexts concurrently issuing BIOs to a
* device, even if these context are synchronized to tightly control BIO issuing
* order. While this is not a problem with regular block devices, this ordering
* change can cause write BIO failures with zoned block devices as these
* require sequential write patterns to zones. Prevent this from happening by
* ignoring the plug state of a BIO issuing context if it is for a zoned block
* device and the BIO to plug is a write operation.
*
* Return current->plug if the bio can be plugged and NULL otherwise
*/
static inline struct blk_plug *blk_mq_plug( struct bio *bio)
{
/* Zoned block device write operation case: do not plug the BIO */
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio)))
return NULL;
/*
* For regular block devices or read operations, use the context plug
* which may be NULL if blk_start_plug() was not executed.
*/
return current->plug;
}
/* Free all requests on the list */
static inline void blk_mq_free_requests(struct list_head *list)
{
......
......@@ -411,24 +411,32 @@ EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
* blk_queue_max_zone_append_sectors - set max sectors for a single zone append
* @q: the request queue for the device
* @max_zone_append_sectors: maximum number of sectors to write per command
*
* Sets the maximum number of sectors allowed for zone append commands. If
* Specifying 0 for @max_zone_append_sectors indicates that the queue does
* not natively support zone append operations and that the block layer must
* emulate these operations using regular writes.
**/
void blk_queue_max_zone_append_sectors(struct request_queue *q,
unsigned int max_zone_append_sectors)
{
unsigned int max_sectors;
unsigned int max_sectors = 0;
if (WARN_ON(!blk_queue_is_zoned(q)))
return;
max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
if (max_zone_append_sectors) {
max_sectors = min(q->limits.max_hw_sectors,
max_zone_append_sectors);
max_sectors = min(q->limits.chunk_sectors, max_sectors);
/*
* Signal eventual driver bugs resulting in the max_zone_append sectors limit
* being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
* or the max_hw_sectors limit not set.
* Signal eventual driver bugs resulting in the max_zone_append
* sectors limit being 0 due to the chunk_sectors limit (zone
* size) not set or the max_hw_sectors limit not set.
*/
WARN_ON(!max_sectors);
WARN_ON_ONCE(!max_sectors);
}
q->limits.max_zone_append_sectors = max_sectors;
}
......@@ -755,8 +763,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
t->max_zone_append_sectors = min(t->max_zone_append_sectors,
b->max_zone_append_sectors);
t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
queue_limits_max_zone_append_sectors(b));
t->bounce = max(t->bounce, b->bounce);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
......@@ -1043,22 +1051,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
/**
* blk_queue_required_elevator_features - Set a queue required elevator features
* @q: the request queue for the target device
* @features: Required elevator features OR'ed together
*
* Tell the block layer that for the device controlled through @q, only the
* only elevators that can be used are those that implement at least the set of
* features specified by @features.
*/
void blk_queue_required_elevator_features(struct request_queue *q,
unsigned int features)
{
q->required_elevator_features = features;
}
EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
/**
* blk_queue_can_use_dma_map_merging - configure queue for merging segments.
* @q: the request queue for the device
......
......@@ -57,9 +57,6 @@ void blk_stat_add(struct request *rq, u64 now)
value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE)
blk_throtl_stat_add(rq, value);
rcu_read_lock();
cpu = get_cpu();
list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
......
......@@ -224,7 +224,7 @@ static ssize_t queue_zone_write_granularity_show(struct request_queue *q,
static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
{
unsigned long long max_sectors = q->limits.max_zone_append_sectors;
unsigned long long max_sectors = queue_max_zone_append_sectors(q);
return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT);
}
......@@ -516,10 +516,6 @@ QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
#endif
/* legacy alias for logical_block_size: */
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
.attr = {.name = "hw_sector_size", .mode = 0444 },
......@@ -640,9 +636,6 @@ static struct attribute *queue_attrs[] = {
&queue_fua_entry.attr,
&queue_dax_entry.attr,
&queue_poll_delay_entry.attr,
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
&blk_throtl_sample_time_entry.attr,
#endif
&queue_virt_boundary_mask_entry.attr,
&queue_dma_alignment_entry.attr,
NULL,
......@@ -814,7 +807,6 @@ int blk_register_queue(struct gendisk *disk)
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(disk);
blk_throtl_register(disk);
/* Now everything is ready and send out KOBJ_ADD uevent */
kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
......
This diff is collapsed.
......@@ -58,12 +58,6 @@ enum tg_state_flags {
THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */
};
enum {
LIMIT_LOW,
LIMIT_MAX,
LIMIT_CNT,
};
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
......@@ -102,14 +96,14 @@ struct throtl_grp {
bool has_rules_iops[2];
/* internally used bytes per second rate limits */
uint64_t bps[2][LIMIT_CNT];
uint64_t bps[2];
/* user configured bps limits */
uint64_t bps_conf[2][LIMIT_CNT];
uint64_t bps_conf[2];
/* internally used IOPS limits */
unsigned int iops[2][LIMIT_CNT];
unsigned int iops[2];
/* user configured IOPS limits */
unsigned int iops_conf[2][LIMIT_CNT];
unsigned int iops_conf[2];
/* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
......@@ -132,22 +126,10 @@ struct throtl_grp {
unsigned long last_check_time;
unsigned long latency_target; /* us */
unsigned long latency_target_conf; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
unsigned long last_finish_time; /* ns / 1024 */
unsigned long checked_last_finish_time; /* ns / 1024 */
unsigned long avg_idletime; /* ns / 1024 */
unsigned long idletime_threshold; /* us */
unsigned long idletime_threshold_conf; /* us */
unsigned int bio_cnt; /* total bios */
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
......@@ -168,23 +150,33 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
* Internal throttling interface
*/
#ifndef CONFIG_BLK_DEV_THROTTLING
static inline int blk_throtl_init(struct gendisk *disk) { return 0; }
static inline void blk_throtl_exit(struct gendisk *disk) { }
static inline void blk_throtl_register(struct gendisk *disk) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
#else /* CONFIG_BLK_DEV_THROTTLING */
int blk_throtl_init(struct gendisk *disk);
void blk_throtl_exit(struct gendisk *disk);
void blk_throtl_register(struct gendisk *disk);
bool __blk_throtl_bio(struct bio *bio);
void blk_throtl_cancel_bios(struct gendisk *disk);
static inline bool blk_throtl_activated(struct request_queue *q)
{
return q->td != NULL;
}
static inline bool blk_should_throtl(struct bio *bio)
{
struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
struct throtl_grp *tg;
int rw = bio_data_dir(bio);
/*
* This is called under bio_queue_enter(), and it's synchronized with
* the activation of blk-throtl, which is protected by
* blk_mq_freeze_queue().
*/
if (!blk_throtl_activated(bio->bi_bdev->bd_queue))
return false;
tg = blkg_to_tg(bio->bi_blkg);
if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
bio_set_flag(bio, BIO_CGROUP_ACCT);
......
This diff is collapsed.
......@@ -38,6 +38,7 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
void blk_queue_start_drain(struct request_queue *q);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
void submit_bio_noacct_nocheck(struct bio *bio);
void bio_await_chain(struct bio *bio);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
......@@ -269,6 +270,14 @@ static inline void bio_integrity_free(struct bio *bio)
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
enum bio_merge_status {
BIO_MERGE_OK,
BIO_MERGE_NONE,
BIO_MERGE_FAILED,
};
enum bio_merge_status bio_attempt_back_merge(struct request *req,
struct bio *bio, unsigned int nr_segs);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
......@@ -357,6 +366,7 @@ static inline bool blk_do_io_stat(struct request *rq)
}
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
unsigned int part_in_flight(struct block_device *part);
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
......@@ -378,17 +388,6 @@ static inline void ioc_clear_queue(struct request_queue *q)
}
#endif /* CONFIG_BLK_ICQ */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
const char *page, size_t count);
extern void blk_throtl_bio_endio(struct bio *bio);
extern void blk_throtl_stat_add(struct request *rq, u64 time);
#else
static inline void blk_throtl_bio_endio(struct bio *bio) { }
static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
#endif
struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
static inline bool blk_queue_may_bounce(struct request_queue *q)
......@@ -407,13 +406,85 @@ static inline struct bio *blk_queue_bounce(struct bio *bio,
}
#ifdef CONFIG_BLK_DEV_ZONED
void disk_free_zone_bitmaps(struct gendisk *disk);
void disk_init_zone_resources(struct gendisk *disk);
void disk_free_zone_resources(struct gendisk *disk);
static inline bool bio_zone_write_plugging(struct bio *bio)
{
return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
}
static inline bool bio_is_zone_append(struct bio *bio)
{
return bio_op(bio) == REQ_OP_ZONE_APPEND ||
bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
}
void blk_zone_write_plug_bio_merged(struct bio *bio);
void blk_zone_write_plug_init_request(struct request *rq);
static inline void blk_zone_update_request_bio(struct request *rq,
struct bio *bio)
{
/*
* For zone append requests, the request sector indicates the location
* at which the BIO data was written. Return this value to the BIO
* issuer through the BIO iter sector.
* For plugged zone writes, which include emulated zone append, we need
* the original BIO sector so that blk_zone_write_plug_bio_endio() can
* lookup the zone write plug.
*/
if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
bio->bi_iter.bi_sector = rq->__sector;
}
void blk_zone_write_plug_bio_endio(struct bio *bio);
static inline void blk_zone_bio_endio(struct bio *bio)
{
/*
* For write BIOs to zoned devices, signal the completion of the BIO so
* that the next write BIO can be submitted by zone write plugging.
*/
if (bio_zone_write_plugging(bio))
blk_zone_write_plug_bio_endio(bio);
}
void blk_zone_write_plug_finish_request(struct request *rq);
static inline void blk_zone_finish_request(struct request *rq)
{
if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING)
blk_zone_write_plug_finish_request(rq);
}
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
unsigned long arg);
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
#else /* CONFIG_BLK_DEV_ZONED */
static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
static inline void disk_init_zone_resources(struct gendisk *disk)
{
}
static inline void disk_free_zone_resources(struct gendisk *disk)
{
}
static inline bool bio_zone_write_plugging(struct bio *bio)
{
return false;
}
static inline bool bio_is_zone_append(struct bio *bio)
{
return false;
}
static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
{
}
static inline void blk_zone_write_plug_init_request(struct request *rq)
{
}
static inline void blk_zone_update_request_bio(struct request *rq,
struct bio *bio)
{
}
static inline void blk_zone_bio_endio(struct bio *bio)
{
}
static inline void blk_zone_finish_request(struct request *rq)
{
}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
unsigned int cmd, unsigned long arg)
{
......
......@@ -83,13 +83,6 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
static inline bool elv_support_features(struct request_queue *q,
const struct elevator_type *e)
{
return (q->required_elevator_features & e->elevator_features) ==
q->required_elevator_features;
}
/**
* elevator_match - Check whether @e's name or alias matches @name
* @e: Scheduler to test
......@@ -120,7 +113,7 @@ static struct elevator_type *elevator_find_get(struct request_queue *q,
spin_lock(&elv_list_lock);
e = __elevator_find(name);
if (e && (!elv_support_features(q, e) || !elevator_tryget(e)))
if (e && (!elevator_tryget(e)))
e = NULL;
spin_unlock(&elv_list_lock);
return e;
......@@ -580,34 +573,8 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
}
/*
* Get the first elevator providing the features required by the request queue.
* Default to "none" if no matching elevator is found.
*/
static struct elevator_type *elevator_get_by_features(struct request_queue *q)
{
struct elevator_type *e, *found = NULL;
spin_lock(&elv_list_lock);
list_for_each_entry(e, &elv_list, list) {
if (elv_support_features(q, e)) {
found = e;
break;
}
}
if (found && !elevator_tryget(found))
found = NULL;
spin_unlock(&elv_list_lock);
return found;
}
/*
* For a device queue that has no required features, use the default elevator
* settings. Otherwise, use the first elevator available matching the required
* features. If no suitable elevator is find or if the chosen elevator
* initialization fails, fall back to the "none" elevator (no elevator).
* Use the default elevator settings. If the chosen elevator initialization
* fails, fall back to the "none" elevator (no elevator).
*/
void elevator_init_mq(struct request_queue *q)
{
......@@ -622,10 +589,7 @@ void elevator_init_mq(struct request_queue *q)
if (unlikely(q->elevator))
return;
if (!q->required_elevator_features)
e = elevator_get_default(q);
else
e = elevator_get_by_features(q);
if (!e)
return;
......@@ -781,7 +745,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
list_for_each_entry(e, &elv_list, list) {
if (e == cur)
len += sprintf(name+len, "[%s] ", e->elevator_name);
else if (elv_support_features(q, e))
else
len += sprintf(name+len, "%s ", e->elevator_name);
}
spin_unlock(&elv_list_lock);
......
......@@ -74,7 +74,6 @@ struct elevator_type
struct elv_fs_entry *elevator_attrs;
const char *elevator_name;
const char *elevator_alias;
const unsigned int elevator_features;
struct module *elevator_owner;
#ifdef CONFIG_BLK_DEBUG_FS
const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
......
......@@ -44,18 +44,15 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
#define DIO_INLINE_BIO_VECS 4
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct iov_iter *iter, unsigned int nr_pages)
struct iov_iter *iter, struct block_device *bdev,
unsigned int nr_pages)
{
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
......@@ -161,9 +158,8 @@ static void blkdev_bio_end_io(struct bio *bio)
}
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
unsigned int nr_pages)
struct block_device *bdev, unsigned int nr_pages)
{
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
......@@ -172,9 +168,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos;
int ret = 0;
if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
......@@ -302,9 +295,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
struct iov_iter *iter,
struct block_device *bdev,
unsigned int nr_pages)
{
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
bool is_read = iov_iter_rw(iter) == READ;
blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
......@@ -312,9 +305,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
loff_t pos = iocb->ki_pos;
int ret = 0;
if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
......@@ -368,18 +358,23 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
return -EINVAL;
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
return __blkdev_direct_IO_async(iocb, iter, nr_pages);
return __blkdev_direct_IO_simple(iocb, iter, bdev,
nr_pages);
return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
}
return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
}
static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
......@@ -390,7 +385,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->bdev = bdev;
iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
if (iomap->offset >= isize)
if (offset >= isize)
return -EIO;
iomap->type = IOMAP_MAPPED;
iomap->addr = iomap->offset;
......
......@@ -118,7 +118,7 @@ static void part_stat_read_all(struct block_device *part,
}
}
static unsigned int part_in_flight(struct block_device *part)
unsigned int part_in_flight(struct block_device *part)
{
unsigned int inflight = 0;
int cpu;
......@@ -345,9 +345,7 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
struct file *file;
int ret = 0;
if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
return -EINVAL;
if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
if (!disk_has_partscan(disk))
return -EINVAL;
if (disk->open_partitions)
return -EBUSY;
......@@ -503,8 +501,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
goto out_unregister_bdi;
/* Make sure the first partition scan will be proceed */
if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) &&
!test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
if (get_capacity(disk) && disk_has_partscan(disk))
set_bit(GD_NEED_PART_SCAN, &disk->state);
bdev_add(disk->part0, ddev->devt);
......@@ -954,15 +951,10 @@ ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev_get_queue(bdev);
struct disk_stats stat;
unsigned int inflight;
if (queue_is_mq(q))
inflight = blk_mq_in_flight(q, bdev);
else
inflight = part_in_flight(bdev);
if (inflight) {
part_stat_lock();
update_io_ticks(bdev, jiffies, true);
......@@ -1047,6 +1039,12 @@ static ssize_t diskseq_show(struct device *dev,
return sprintf(buf, "%llu\n", disk->diskseq);
}
static ssize_t partscan_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
}
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
......@@ -1060,6 +1058,7 @@ static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
......@@ -1106,6 +1105,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_events_async.attr,
&dev_attr_events_poll_msecs.attr,
&dev_attr_diskseq.attr,
&dev_attr_partscan.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
......@@ -1182,7 +1182,7 @@ static void disk_release(struct device *dev)
disk_release_events(disk);
kfree(disk->random);
disk_free_zone_bitmaps(disk);
disk_free_zone_resources(disk);
xa_destroy(&disk->part_tbl);
disk->queue->disk = NULL;
......@@ -1251,11 +1251,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
xa_for_each(&gp->part_tbl, idx, hd) {
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
if (queue_is_mq(gp->queue))
inflight = blk_mq_in_flight(gp->queue, hd);
else
inflight = part_in_flight(hd);
inflight = part_in_flight(hd);
if (inflight) {
part_stat_lock();
update_io_ticks(hd, jiffies, true);
......@@ -1364,6 +1361,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (blkcg_init_disk(disk))
goto out_erase_part0;
disk_init_zone_resources(disk);
rand_initialize_disk(disk);
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
......
......@@ -33,7 +33,7 @@ static int blkpg_do_ioctl(struct block_device *bdev,
if (op == BLKPG_DEL_PARTITION)
return bdev_del_partition(disk, p.pno);
if (p.start < 0 || p.length <= 0 || p.start + p.length < 0)
if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start)
return -EINVAL;
/* Check that the partition is aligned to the block size */
if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev)))
......@@ -95,9 +95,12 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
unsigned long arg)
{
uint64_t range[2];
uint64_t start, len, end;
unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
struct inode *inode = bdev->bd_inode;
uint64_t range[2], start, len, end;
struct bio *prev = NULL, *bio;
sector_t sector, nr_sects;
struct blk_plug plug;
int err;
if (!(mode & BLK_OPEN_WRITE))
......@@ -105,6 +108,8 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (bdev_read_only(bdev))
return -EPERM;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
......@@ -112,9 +117,9 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
start = range[0];
len = range[1];
if (start & 511)
if (!len)
return -EINVAL;
if (len & 511)
if ((start | len) & bs_mask)
return -EINVAL;
if (check_add_overflow(start, len, &end) ||
......@@ -125,7 +130,32 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err)
goto fail;
err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
sector = start >> SECTOR_SHIFT;
nr_sects = len >> SECTOR_SHIFT;
blk_start_plug(&plug);
while (1) {
if (fatal_signal_pending(current)) {
if (prev)
bio_await_chain(prev);
err = -EINTR;
goto out_unplug;
}
bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
GFP_KERNEL);
if (!bio)
break;
prev = bio_chain_and_submit(prev, bio);
}
if (prev) {
err = submit_bio_wait(prev);
if (err == -EOPNOTSUPP)
err = 0;
bio_put(prev);
}
out_unplug:
blk_finish_plug(&plug);
fail:
filemap_invalidate_unlock(inode->i_mapping);
return err;
......
......@@ -102,7 +102,6 @@ struct deadline_data {
int prio_aging_expire;
spinlock_t lock;
spinlock_t zone_lock;
};
/* Maps an I/O priority class to a deadline scheduler priority. */
......@@ -129,36 +128,7 @@ static u8 dd_rq_ioclass(struct request *rq)
}
/*
* get the request before `rq' in sector-sorted order
*/
static inline struct request *
deadline_earlier_request(struct request *rq)
{
struct rb_node *node = rb_prev(&rq->rb_node);
if (node)
return rb_entry_rq(node);
return NULL;
}
/*
* get the request after `rq' in sector-sorted order
*/
static inline struct request *
deadline_latter_request(struct request *rq)
{
struct rb_node *node = rb_next(&rq->rb_node);
if (node)
return rb_entry_rq(node);
return NULL;
}
/*
* Return the first request for which blk_rq_pos() >= @pos. For zoned devices,
* return the first request after the start of the zone containing @pos.
* Return the first request for which blk_rq_pos() >= @pos.
*/
static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
enum dd_data_dir data_dir, sector_t pos)
......@@ -170,14 +140,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
return NULL;
rq = rb_entry_rq(node);
/*
* A zoned write may have been requeued with a starting position that
* is below that of the most recently dispatched request. Hence, for
* zoned writes, start searching from the start of a zone.
*/
if (blk_rq_is_seq_zoned_write(rq))
pos = round_down(pos, rq->q->limits.chunk_sectors);
while (node) {
rq = rb_entry_rq(node);
if (blk_rq_pos(rq) >= pos) {
......@@ -308,36 +270,6 @@ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
return time_is_before_eq_jiffies((unsigned long)rq->fifo_time);
}
/*
* Check if rq has a sequential request preceding it.
*/
static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
{
struct request *prev = deadline_earlier_request(rq);
if (!prev)
return false;
return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
}
/*
* Skip all write requests that are sequential from @rq, even if we cross
* a zone boundary.
*/
static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
struct request *rq)
{
sector_t pos = blk_rq_pos(rq);
do {
pos += blk_rq_sectors(rq);
rq = deadline_latter_request(rq);
} while (rq && blk_rq_pos(rq) == pos);
return rq;
}
/*
* For the specified data direction, return the next request to
* dispatch using arrival ordered lists.
......@@ -346,40 +278,10 @@ static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
struct request *rq, *rb_rq, *next;
unsigned long flags;
if (list_empty(&per_prio->fifo_list[data_dir]))
return NULL;
rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone. For some HDDs, breaking a sequential
* write stream can lead to lower throughput, so make sure to preserve
* sequential write streams, even if that stream crosses into the next
* zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE],
queuelist) {
/* Check whether a prior request exists for the same zone. */
rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq));
if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq))
rq = rb_rq;
if (blk_req_can_dispatch_to_zone(rq) &&
(blk_queue_nonrot(rq->q) ||
!deadline_is_seq_write(dd, rq)))
goto out;
}
rq = NULL;
out:
spin_unlock_irqrestore(&dd->zone_lock, flags);
return rq;
return rq_entry_fifo(per_prio->fifo_list[data_dir].next);
}
/*
......@@ -390,36 +292,8 @@ static struct request *
deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
struct request *rq;
unsigned long flags;
rq = deadline_from_pos(per_prio, data_dir,
return deadline_from_pos(per_prio, data_dir,
per_prio->latest_pos[data_dir]);
if (!rq)
return NULL;
if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
* an unlocked target zone. For some HDDs, breaking a sequential
* write stream can lead to lower throughput, so make sure to preserve
* sequential write streams, even if that stream crosses into the next
* zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
while (rq) {
if (blk_req_can_dispatch_to_zone(rq))
break;
if (blk_queue_nonrot(rq->q))
rq = deadline_latter_request(rq);
else
rq = deadline_skip_seq_writes(dd, rq);
}
spin_unlock_irqrestore(&dd->zone_lock, flags);
return rq;
}
/*
......@@ -525,10 +399,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
rq = next_rq;
}
/*
* For a zoned block device, if we only have writes queued and none of
* them can be dispatched, rq will be NULL.
*/
if (!rq)
return NULL;
......@@ -549,10 +419,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
prio = ioprio_class_to_prio[ioprio_class];
dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
dd->per_prio[prio].stats.dispatched++;
/*
* If the request needs its target zone locked, do it.
*/
blk_req_zone_write_lock(rq);
rq->rq_flags |= RQF_STARTED;
return rq;
}
......@@ -722,7 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
dd->fifo_batch = fifo_batch;
dd->prio_aging_expire = prio_aging_expire;
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
......@@ -804,12 +669,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
lockdep_assert_held(&dd->lock);
/*
* This may be a requeue of a write request that has locked its
* target zone. If it is the case, this releases the zone lock.
*/
blk_req_zone_write_unlock(rq);
prio = ioprio_class_to_prio[ioprio_class];
per_prio = &dd->per_prio[prio];
if (!rq->elv.priv[0]) {
......@@ -841,18 +700,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
insert_before = &per_prio->fifo_list[data_dir];
#ifdef CONFIG_BLK_DEV_ZONED
/*
* Insert zoned writes such that requests are sorted by
* position per zone.
*/
if (blk_rq_is_seq_zoned_write(rq)) {
struct request *rq2 = deadline_latter_request(rq);
if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq))
insert_before = &rq2->queuelist;
}
#endif
list_add_tail(&rq->queuelist, insert_before);
}
}
......@@ -887,33 +734,8 @@ static void dd_prepare_request(struct request *rq)
rq->elv.priv[0] = NULL;
}
static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
enum dd_prio p;
for (p = 0; p <= DD_PRIO_MAX; p++)
if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
return true;
return false;
}
/*
* Callback from inside blk_mq_free_request().
*
* For zoned block devices, write unlock the target zone of
* completed write requests. Do this while holding the zone lock
* spinlock so that the zone is never unlocked while deadline_fifo_request()
* or deadline_next_request() are executing. This function is called for
* all requests, whether or not these requests complete successfully.
*
* For a zoned block device, __dd_dispatch_request() may have stopped
* dispatching requests if all the queued requests are write requests directed
* at zones that are already locked due to on-going write requests. To ensure
* write request dispatch progress in this case, mark the queue as needing a
* restart to ensure that the queue is run again after completion of the
* request and zones being unlocked.
*/
static void dd_finish_request(struct request *rq)
{
......@@ -928,21 +750,8 @@ static void dd_finish_request(struct request *rq)
* called dd_insert_requests(). Skip requests that bypassed I/O
* scheduling. See also blk_mq_request_bypass_insert().
*/
if (!rq->elv.priv[0])
return;
if (rq->elv.priv[0])
atomic_inc(&per_prio->stats.completed);
if (blk_queue_is_zoned(q)) {
unsigned long flags;
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
spin_unlock_irqrestore(&dd->zone_lock, flags);
if (dd_has_write_work(rq->mq_hctx))
blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
}
}
static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
......@@ -1266,7 +1075,6 @@ static struct elevator_type mq_deadline = {
.elevator_attrs = deadline_attrs,
.elevator_name = "mq-deadline",
.elevator_alias = "deadline",
.elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");
......
......@@ -70,8 +70,8 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
}
if (*partdef == '(') {
int length;
char *next = strchr(++partdef, ')');
partdef++;
char *next = strsep(&partdef, ")");
if (!next) {
pr_warn("cmdline partition format is invalid.");
......@@ -79,11 +79,7 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
goto fail;
}
length = min_t(int, next - partdef,
sizeof(new_subpart->name) - 1);
strscpy(new_subpart->name, partdef, length);
partdef = ++next;
strscpy(new_subpart->name, next, sizeof(new_subpart->name));
} else
new_subpart->name[0] = '\0';
......@@ -117,14 +113,12 @@ static void free_subpart(struct cmdline_parts *parts)
}
}
static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
static int parse_parts(struct cmdline_parts **parts, char *bdevdef)
{
int ret = -EINVAL;
char *next;
int length;
struct cmdline_subpart **next_subpart;
struct cmdline_parts *newparts;
char buf[BDEVNAME_SIZE + 32 + 4];
*parts = NULL;
......@@ -132,28 +126,19 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
if (!newparts)
return -ENOMEM;
next = strchr(bdevdef, ':');
next = strsep(&bdevdef, ":");
if (!next) {
pr_warn("cmdline partition has no block device.");
goto fail;
}
length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1);
strscpy(newparts->name, bdevdef, length);
strscpy(newparts->name, next, sizeof(newparts->name));
newparts->nr_subparts = 0;
next_subpart = &newparts->subpart;
while (next && *(++next)) {
bdevdef = next;
next = strchr(bdevdef, ',');
length = (!next) ? (sizeof(buf) - 1) :
min_t(int, next - bdevdef, sizeof(buf) - 1);
strscpy(buf, bdevdef, length);
ret = parse_subpart(next_subpart, buf);
while ((next = strsep(&bdevdef, ","))) {
ret = parse_subpart(next_subpart, next);
if (ret)
goto fail;
......@@ -199,24 +184,17 @@ static int cmdline_parts_parse(struct cmdline_parts **parts,
*parts = NULL;
next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
if (!buf)
return -ENOMEM;
next_parts = parts;
while (next && *pbuf) {
next = strchr(pbuf, ';');
if (next)
*next = '\0';
ret = parse_parts(next_parts, pbuf);
while ((next = strsep(&pbuf, ";"))) {
ret = parse_parts(next_parts, next);
if (ret)
goto fail;
if (next)
pbuf = ++next;
next_parts = &(*next_parts)->next_parts;
}
......@@ -250,7 +228,6 @@ static struct cmdline_parts *bdev_parts;
static int add_part(int slot, struct cmdline_subpart *subpart,
struct parsed_partitions *state)
{
int label_min;
struct partition_meta_info *info;
char tmp[sizeof(info->volname) + 4];
......@@ -262,9 +239,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart,
info = &state->parts[slot].info;
label_min = min_t(int, sizeof(info->volname) - 1,
sizeof(subpart->name));
strscpy(info->volname, subpart->name, label_min);
strscpy(info->volname, subpart->name, sizeof(info->volname));
snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
......
......@@ -573,10 +573,7 @@ static int blk_add_partitions(struct gendisk *disk)
struct parsed_partitions *state;
int ret = -EAGAIN, p;
if (disk->flags & GENHD_FL_NO_PART)
return 0;
if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
if (!disk_has_partscan(disk))
return 0;
state = check_partition(disk);
......
This diff is collapsed.
This diff is collapsed.
......@@ -82,6 +82,7 @@ struct nullb_device {
unsigned int zone_nr_conv; /* number of conventional zones */
unsigned int zone_max_open; /* max number of open zones */
unsigned int zone_max_active; /* max number of active zones */
unsigned int zone_append_max_sectors; /* Max sectors per zone append command */
unsigned int submit_queues; /* number of submission queues */
unsigned int prev_submit_queues; /* number of submission queues before change */
unsigned int poll_queues; /* number of IOPOLL submission queues */
......@@ -104,6 +105,7 @@ struct nullb_device {
bool no_sched; /* no IO scheduler for the device */
bool shared_tags; /* share tag set between devices for blk-mq */
bool shared_tag_bitmap; /* use hostwide shared tags */
bool fua; /* Support FUA */
};
struct nullb {
......
This diff is collapsed.
......@@ -221,7 +221,7 @@ static int ublk_get_nr_zones(const struct ublk_device *ub)
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
{
return blk_revalidate_disk_zones(ub->ub_disk, NULL);
return blk_revalidate_disk_zones(ub->ub_disk);
}
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
......@@ -249,8 +249,7 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
blk_queue_required_elevator_features(ub->ub_disk->queue,
ELEVATOR_F_ZBD_SEQ_WRITE);
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
}
......
......@@ -1543,7 +1543,7 @@ static int virtblk_probe(struct virtio_device *vdev)
*/
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) {
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue);
err = blk_revalidate_disk_zones(vblk->disk, NULL);
err = blk_revalidate_disk_zones(vblk->disk);
if (err)
goto out_cleanup_disk;
}
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment