Commit 8494bcf5 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-3.20/drivers' of git://git.kernel.dk/linux-block

Pull block driver changes from Jens Axboe:
 "This contains:

   - The 4k/partition fixes for brd from Boaz/Matthew.

   - A few xen front/back block fixes from David Vrabel and Roger Pau
     Monne.

   - Floppy changes from Takashi, cleaning the device file creation.

   - Switching libata to use the new blk-mq tagging policy, removing
     code (and a suboptimal implementation) from libata.  This will
     throw you a merge conflict, since a bug in the original libata
     tagging code was fixed since this code was branched.  Trivial.
     From Shaohua.

   - Conversion of loop to blk-mq, from Ming Lei.

   - Cleanup of the io_schedule() handling in bsg from Peter Zijlstra.
     He claims it improves on unreadable code, which will cost him a
     beer.

   - Maintainer update or NDB, now handled by Markus Pargmann.

   - NVMe:
        - Optimization from me that avoids a kmalloc/kfree per IO for
          smaller (<= 8KB) IO. This cuts about 1% of high IOPS CPU
          overhead.
        - Removal of (now) dead RCU code, a relic from before NVMe was
          converted to blk-mq"

* 'for-3.20/drivers' of git://git.kernel.dk/linux-block:
  xen-blkback: default to X86_32 ABI on x86
  xen-blkfront: fix accounting of reqs when migrating
  xen-blkback,xen-blkfront: add myself as maintainer
  block: Simplify bsg complete all
  floppy: Avoid manual call of device_create_file()
  NVMe: avoid kmalloc/kfree for smaller IO
  MAINTAINERS: Update NBD maintainer
  libata: make sata_sil24 use fifo tag allocator
  libata: move sas ata tag allocation to libata-scsi.c
  libata: use blk taging
  NVMe: within nvme_free_queues(), delete RCU sychro/deferred free
  null_blk: suppress invalid partition info
  brd: Request from fdisk 4k alignment
  brd: Fix all partitions BUGs
  axonram: Fix bug in direct_access
  loop: add blk-mq.h include
  block: loop: don't handle REQ_FUA explicitly
  block: loop: introduce lo_discard() and lo_req_flush()
  block: loop: say goodby to bio
  block: loop: improve performance via blk-mq
parents 3e12cefb b042a3ca
......@@ -6642,9 +6642,10 @@ F: include/uapi/linux/netrom.h
F: net/netrom/
NETWORK BLOCK DEVICE (NBD)
M: Paul Clements <Paul.Clements@steeleye.com>
M: Markus Pargmann <mpa@pengutronix.de>
S: Maintained
L: nbd-general@lists.sourceforge.net
T: git git://git.pengutronix.de/git/mpa/linux-nbd.git
F: Documentation/blockdev/nbd.txt
F: drivers/block/nbd.c
F: include/linux/nbd.h
......@@ -10690,6 +10691,7 @@ F: drivers/pci/*xen*
XEN BLOCK SUBSYSTEM
M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
M: Roger Pau Monné <roger.pau@citrix.com>
L: xen-devel@lists.xenproject.org (moderated for non-subscribers)
S: Supported
F: drivers/block/xen-blkback/*
......
......@@ -147,7 +147,7 @@ axon_ram_direct_access(struct block_device *device, sector_t sector,
loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT;
*kaddr = (void *)(bank->ph_addr + offset);
*pfn = virt_to_phys(kaddr) >> PAGE_SHIFT;
*pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT;
return bank->size - offset;
}
......
......@@ -136,42 +136,6 @@ static inline struct hlist_head *bsg_dev_idx_hash(int index)
return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
}
static int bsg_io_schedule(struct bsg_device *bd)
{
DEFINE_WAIT(wait);
int ret = 0;
spin_lock_irq(&bd->lock);
BUG_ON(bd->done_cmds > bd->queued_cmds);
/*
* -ENOSPC or -ENODATA? I'm going for -ENODATA, meaning "I have no
* work to do", even though we return -ENOSPC after this same test
* during bsg_write() -- there, it means our buffer can't have more
* bsg_commands added to it, thus has no space left.
*/
if (bd->done_cmds == bd->queued_cmds) {
ret = -ENODATA;
goto unlock;
}
if (!test_bit(BSG_F_BLOCK, &bd->flags)) {
ret = -EAGAIN;
goto unlock;
}
prepare_to_wait(&bd->wq_done, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&bd->lock);
io_schedule();
finish_wait(&bd->wq_done, &wait);
return ret;
unlock:
spin_unlock_irq(&bd->lock);
return ret;
}
static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
struct sg_io_v4 *hdr, struct bsg_device *bd,
fmode_t has_write_perm)
......@@ -482,6 +446,30 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
return ret;
}
static bool bsg_complete(struct bsg_device *bd)
{
bool ret = false;
bool spin;
do {
spin_lock_irq(&bd->lock);
BUG_ON(bd->done_cmds > bd->queued_cmds);
/*
* All commands consumed.
*/
if (bd->done_cmds == bd->queued_cmds)
ret = true;
spin = !test_bit(BSG_F_BLOCK, &bd->flags);
spin_unlock_irq(&bd->lock);
} while (!ret && spin);
return ret;
}
static int bsg_complete_all_commands(struct bsg_device *bd)
{
struct bsg_command *bc;
......@@ -492,17 +480,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd)
/*
* wait for all commands to complete
*/
ret = 0;
do {
ret = bsg_io_schedule(bd);
/*
* look for -ENODATA specifically -- we'll sometimes get
* -ERESTARTSYS when we've taken a signal, but we can't
* return until we're done freeing the queue, so ignore
* it. The signal will get handled when we're done freeing
* the bsg_device.
*/
} while (ret != -ENODATA);
io_wait_event(bd->wq_done, bsg_complete(bd));
/*
* discard done commands
......
......@@ -1585,8 +1585,6 @@ unsigned ata_exec_internal_sg(struct ata_device *dev,
else
tag = 0;
if (test_and_set_bit(tag, &ap->qc_allocated))
BUG();
qc = __ata_qc_from_tag(ap, tag);
qc->tag = tag;
......@@ -4722,69 +4720,36 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
}
/**
* ata_qc_new - Request an available ATA command, for queueing
* @ap: target port
*
* Some ATA host controllers may implement a queue depth which is less
* than ATA_MAX_QUEUE. So we shouldn't allocate a tag which is beyond
* the hardware limitation.
* ata_qc_new_init - Request an available ATA command, and initialize it
* @dev: Device from whom we request an available command structure
*
* LOCKING:
* None.
*/
static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag)
{
struct ata_queued_cmd *qc = NULL;
unsigned int max_queue = ap->host->n_tags;
unsigned int i, tag;
struct ata_port *ap = dev->link->ap;
struct ata_queued_cmd *qc;
/* no command while frozen */
if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
return NULL;
for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) {
if (ap->flags & ATA_FLAG_LOWTAG)
tag = i;
else
tag = tag < max_queue ? tag : 0;
/* the last tag is reserved for internal command. */
if (tag == ATA_TAG_INTERNAL)
continue;
if (!test_and_set_bit(tag, &ap->qc_allocated)) {
qc = __ata_qc_from_tag(ap, tag);
qc->tag = tag;
ap->last_tag = tag;
break;
}
/* libsas case */
if (!ap->scsi_host) {
tag = ata_sas_allocate_tag(ap);
if (tag < 0)
return NULL;
}
return qc;
}
/**
* ata_qc_new_init - Request an available ATA command, and initialize it
* @dev: Device from whom we request an available command structure
*
* LOCKING:
* None.
*/
struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev)
{
struct ata_port *ap = dev->link->ap;
struct ata_queued_cmd *qc;
qc = ata_qc_new(ap);
if (qc) {
qc->scsicmd = NULL;
qc->ap = ap;
qc->dev = dev;
qc = __ata_qc_from_tag(ap, tag);
qc->tag = tag;
qc->scsicmd = NULL;
qc->ap = ap;
qc->dev = dev;
ata_qc_reinit(qc);
}
ata_qc_reinit(qc);
return qc;
}
......@@ -4811,7 +4776,8 @@ void ata_qc_free(struct ata_queued_cmd *qc)
tag = qc->tag;
if (likely(ata_tag_valid(tag))) {
qc->tag = ATA_TAG_POISON;
clear_bit(tag, &ap->qc_allocated);
if (!ap->scsi_host)
ata_sas_free_tag(tag, ap);
}
}
......
......@@ -756,7 +756,7 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev,
{
struct ata_queued_cmd *qc;
qc = ata_qc_new_init(dev);
qc = ata_qc_new_init(dev, cmd->request->tag);
if (qc) {
qc->scsicmd = cmd;
qc->scsidone = cmd->scsi_done;
......@@ -3668,6 +3668,9 @@ int ata_scsi_add_hosts(struct ata_host *host, struct scsi_host_template *sht)
*/
shost->max_host_blocked = 1;
if (scsi_init_shared_tag_map(shost, host->n_tags))
goto err_add;
rc = scsi_add_host_with_dma(ap->scsi_host,
&ap->tdev, ap->host->dev);
if (rc)
......@@ -4230,3 +4233,31 @@ int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap)
return rc;
}
EXPORT_SYMBOL_GPL(ata_sas_queuecmd);
int ata_sas_allocate_tag(struct ata_port *ap)
{
unsigned int max_queue = ap->host->n_tags;
unsigned int i, tag;
for (i = 0, tag = ap->sas_last_tag + 1; i < max_queue; i++, tag++) {
if (ap->flags & ATA_FLAG_LOWTAG)
tag = 1;
else
tag = tag < max_queue ? tag : 0;
/* the last tag is reserved for internal command. */
if (tag == ATA_TAG_INTERNAL)
continue;
if (!test_and_set_bit(tag, &ap->sas_tag_allocated)) {
ap->sas_last_tag = tag;
return tag;
}
}
return -1;
}
void ata_sas_free_tag(unsigned int tag, struct ata_port *ap)
{
clear_bit(tag, &ap->sas_tag_allocated);
}
......@@ -63,7 +63,7 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev);
extern void ata_force_cbl(struct ata_port *ap);
extern u64 ata_tf_to_lba(const struct ata_taskfile *tf);
extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf);
extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev);
extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag);
extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev,
u64 block, u32 n_block, unsigned int tf_flags,
unsigned int tag);
......@@ -144,6 +144,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work);
extern int ata_bus_probe(struct ata_port *ap);
extern int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
unsigned int id, u64 lun);
int ata_sas_allocate_tag(struct ata_port *ap);
void ata_sas_free_tag(unsigned int tag, struct ata_port *ap);
/* libata-eh.c */
......
......@@ -388,6 +388,7 @@ static struct scsi_host_template sil24_sht = {
.can_queue = SIL24_MAX_CMDS,
.sg_tablesize = SIL24_MAX_SGE,
.dma_boundary = ATA_DMA_BOUNDARY,
.tag_alloc_policy = BLK_TAG_ALLOC_FIFO,
};
static struct ata_port_operations sil24_ops = {
......
......@@ -438,19 +438,18 @@ static const struct block_device_operations brd_fops = {
/*
* And now the modules code and kernel interface.
*/
static int rd_nr;
int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
static int max_part;
static int part_shift;
static int part_show = 0;
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, S_IRUGO);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, int, S_IRUGO);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
static int max_part = 1;
module_param(max_part, int, S_IRUGO);
MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
module_param(part_show, int, S_IRUGO);
MODULE_PARM_DESC(part_show, "Control RAM disk visibility in /proc/partitions");
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");
......@@ -487,25 +486,33 @@ static struct brd_device *brd_alloc(int i)
brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
if (!brd->brd_queue)
goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
/* This is so fdisk will align partitions on 4k, because of
* direct_access API needing 4k alignment, returning a PFN
* (This is only a problem on very small devices <= 4M,
* otherwise fdisk will align on 1M. Regardless this call
* is harmless)
*/
blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
brd->brd_queue->limits.discard_zeroes_data = 1;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
disk = brd->brd_disk = alloc_disk(1 << part_shift);
disk = brd->brd_disk = alloc_disk(max_part);
if (!disk)
goto out_free_queue;
disk->major = RAMDISK_MAJOR;
disk->first_minor = i << part_shift;
disk->first_minor = i * max_part;
disk->fops = &brd_fops;
disk->private_data = brd;
disk->queue = brd->brd_queue;
if (!part_show)
disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "ram%d", i);
set_capacity(disk, rd_size * 2);
......@@ -527,10 +534,11 @@ static void brd_free(struct brd_device *brd)
kfree(brd);
}
static struct brd_device *brd_init_one(int i)
static struct brd_device *brd_init_one(int i, bool *new)
{
struct brd_device *brd;
*new = false;
list_for_each_entry(brd, &brd_devices, brd_list) {
if (brd->brd_number == i)
goto out;
......@@ -541,6 +549,7 @@ static struct brd_device *brd_init_one(int i)
add_disk(brd->brd_disk);
list_add_tail(&brd->brd_list, &brd_devices);
}
*new = true;
out:
return brd;
}
......@@ -556,70 +565,46 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
struct brd_device *brd;
struct kobject *kobj;
bool new;
mutex_lock(&brd_devices_mutex);
brd = brd_init_one(MINOR(dev) >> part_shift);
brd = brd_init_one(MINOR(dev) / max_part, &new);
kobj = brd ? get_disk(brd->brd_disk) : NULL;
mutex_unlock(&brd_devices_mutex);
*part = 0;
if (new)
*part = 0;
return kobj;
}
static int __init brd_init(void)
{
int i, nr;
unsigned long range;
struct brd_device *brd, *next;
int i;
/*
* brd module now has a feature to instantiate underlying device
* structure on-demand, provided that there is an access dev node.
* However, this will not work well with user space tool that doesn't
* know about such "feature". In order to not break any existing
* tool, we do the following:
*
* (1) if rd_nr is specified, create that many upfront, and this
* also becomes a hard limit.
* (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
* (default 16) rd device on module load, user can further
* extend brd device by create dev node themselves and have
* kernel automatically instantiate actual device on-demand.
* (1) if rd_nr is specified, create that many upfront. else
* it defaults to CONFIG_BLK_DEV_RAM_COUNT
* (2) User can further extend brd devices by create dev node themselves
* and have kernel automatically instantiate actual device
* on-demand. Example:
* mknod /path/devnod_name b 1 X # 1 is the rd major
* fdisk -l /path/devnod_name
* If (X / max_part) was not already created it will be created
* dynamically.
*/
part_shift = 0;
if (max_part > 0) {
part_shift = fls(max_part);
/*
* Adjust max_part according to part_shift as it is exported
* to user space so that user can decide correct minor number
* if [s]he want to create more devices.
*
* Note that -1 is required because partition 0 is reserved
* for the whole disk.
*/
max_part = (1UL << part_shift) - 1;
}
if ((1UL << part_shift) > DISK_MAX_PARTS)
return -EINVAL;
if (rd_nr > 1UL << (MINORBITS - part_shift))
return -EINVAL;
if (rd_nr) {
nr = rd_nr;
range = rd_nr << part_shift;
} else {
nr = CONFIG_BLK_DEV_RAM_COUNT;
range = 1UL << MINORBITS;
}
if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
return -EIO;
for (i = 0; i < nr; i++) {
if (unlikely(!max_part))
max_part = 1;
for (i = 0; i < rd_nr; i++) {
brd = brd_alloc(i);
if (!brd)
goto out_free;
......@@ -631,10 +616,10 @@ static int __init brd_init(void)
list_for_each_entry(brd, &brd_devices, brd_list)
add_disk(brd->brd_disk);
blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
THIS_MODULE, brd_probe, NULL, NULL);
printk(KERN_INFO "brd: module loaded\n");
pr_info("brd: module loaded\n");
return 0;
out_free:
......@@ -644,21 +629,21 @@ static int __init brd_init(void)
}
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
pr_info("brd: module NOT loaded !!!\n");
return -ENOMEM;
}
static void __exit brd_exit(void)
{
unsigned long range;
struct brd_device *brd, *next;
range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS;
list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
brd_del_one(brd);
blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
pr_info("brd: module unloaded\n");
}
module_init(brd_init);
......
......@@ -4112,6 +4112,13 @@ static ssize_t floppy_cmos_show(struct device *dev,
static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
static struct attribute *floppy_dev_attrs[] = {
&dev_attr_cmos.attr,
NULL
};
ATTRIBUTE_GROUPS(floppy_dev);
static void floppy_device_release(struct device *dev)
{
}
......@@ -4324,16 +4331,12 @@ static int __init do_floppy_init(void)
floppy_device[drive].name = floppy_device_name;
floppy_device[drive].id = drive;
floppy_device[drive].dev.release = floppy_device_release;
floppy_device[drive].dev.groups = floppy_dev_groups;
err = platform_device_register(&floppy_device[drive]);
if (err)
goto out_remove_drives;
err = device_create_file(&floppy_device[drive].dev,
&dev_attr_cmos);
if (err)
goto out_unreg_platform_dev;
/* to be cleaned up... */
disks[drive]->private_data = (void *)(long)drive;
disks[drive]->flags |= GENHD_FL_REMOVABLE;
......@@ -4343,13 +4346,10 @@ static int __init do_floppy_init(void)
return 0;
out_unreg_platform_dev:
platform_device_unregister(&floppy_device[drive]);
out_remove_drives:
while (drive--) {
if (floppy_available(drive)) {
del_gendisk(disks[drive]);
device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
platform_device_unregister(&floppy_device[drive]);
}
}
......@@ -4594,7 +4594,6 @@ static void __exit floppy_module_exit(void)
if (floppy_available(drive)) {
del_gendisk(disks[drive]);
device_remove_file(&floppy_device[drive].dev, &dev_attr_cmos);
platform_device_unregister(&floppy_device[drive]);
}
blk_cleanup_queue(disks[drive]->queue);
......
This diff is collapsed.
......@@ -11,8 +11,10 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <uapi/linux/loop.h>
/* Possible states of device */
......@@ -52,19 +54,23 @@ struct loop_device {
gfp_t old_gfp_mask;
spinlock_t lo_lock;
struct bio_list lo_bio_list;
unsigned int lo_bio_count;
struct list_head write_cmd_head;
struct work_struct write_work;
bool write_started;
int lo_state;
struct mutex lo_ctl_mutex;
struct task_struct *lo_thread;
wait_queue_head_t lo_event;
/* wait queue for incoming requests */
wait_queue_head_t lo_req_wait;
struct request_queue *lo_queue;
struct blk_mq_tag_set tag_set;
struct gendisk *lo_disk;
};
struct loop_cmd {
struct work_struct read_work;
struct request *rq;
struct list_head list;
};
/* Support for loadable transfer modules */
struct loop_func_table {
int number; /* filter type */
......
......@@ -579,7 +579,7 @@ static int null_add_dev(void)
sector_div(size, bs);
set_capacity(disk, size);
disk->flags |= GENHD_FL_EXT_DEVT;
disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->major = null_major;
disk->first_minor = nullb->index;
disk->fops = &null_fops;
......
......@@ -144,8 +144,37 @@ struct nvme_cmd_info {
void *ctx;
int aborted;
struct nvme_queue *nvmeq;
struct nvme_iod iod[0];
};
/*
* Max size of iod being embedded in the request payload
*/
#define NVME_INT_PAGES 2
#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size)
/*
* Will slightly overestimate the number of pages needed. This is OK
* as it only leads to a small amount of wasted memory for the lifetime of
* the I/O.
*/
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}
static unsigned int nvme_cmd_size(struct nvme_dev *dev)
{
unsigned int ret = sizeof(struct nvme_cmd_info);
ret += sizeof(struct nvme_iod);
ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
return ret;
}
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
......@@ -218,6 +247,19 @@ static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
}
static void *iod_get_private(struct nvme_iod *iod)
{
return (void *) (iod->private & ~0x1UL);
}
/*
* If bit 0 is set, the iod is embedded in the request payload.
*/
static bool iod_should_kfree(struct nvme_iod *iod)
{
return (iod->private & 0x01) == 0;
}
/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
......@@ -361,35 +403,53 @@ static __le64 **iod_list(struct nvme_iod *iod)
return ((void *)iod) + iod->offset;
}
/*
* Will slightly overestimate the number of pages needed. This is OK
* as it only leads to a small amount of wasted memory for the lifetime of
* the I/O.
*/
static int nvme_npages(unsigned size, struct nvme_dev *dev)
static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
unsigned nseg, unsigned long private)
{
unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
return DIV_ROUND_UP(8 * nprps, dev->page_size - 8);
iod->private = private;
iod->offset = offsetof(struct nvme_iod, sg[nseg]);
iod->npages = -1;
iod->length = nbytes;
iod->nents = 0;
}
static struct nvme_iod *
nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp)
__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
unsigned long priv, gfp_t gfp)
{
struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
sizeof(__le64 *) * nvme_npages(nbytes, dev) +
sizeof(__le64 *) * nvme_npages(bytes, dev) +
sizeof(struct scatterlist) * nseg, gfp);
if (iod) {
iod->offset = offsetof(struct nvme_iod, sg[nseg]);
iod->npages = -1;
iod->length = nbytes;
iod->nents = 0;
iod->first_dma = 0ULL;
}
if (iod)
iod_init(iod, bytes, nseg, priv);
return iod;
}
static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
gfp_t gfp)
{
unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
sizeof(struct nvme_dsm_range);
unsigned long mask = 0;
struct nvme_iod *iod;
if (rq->nr_phys_segments <= NVME_INT_PAGES &&
size <= NVME_INT_BYTES(dev)) {
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
iod = cmd->iod;
mask = 0x01;
iod_init(iod, size, rq->nr_phys_segments,
(unsigned long) rq | 0x01);
return iod;
}
return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
(unsigned long) rq, gfp);
}
void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
const int last_prp = dev->page_size / 8 - 1;
......@@ -405,7 +465,9 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
prp_dma = next_prp_dma;
}
kfree(iod);
if (iod_should_kfree(iod))
kfree(iod);
}
static int nvme_error_status(u16 status)
......@@ -424,7 +486,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
struct nvme_completion *cqe)
{
struct nvme_iod *iod = ctx;
struct request *req = iod->private;
struct request *req = iod_get_private(iod);
struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
u16 status = le16_to_cpup(&cqe->status) >> 1;
......@@ -585,7 +647,7 @@ static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
struct nvme_ns *ns)
{
struct request *req = iod->private;
struct request *req = iod_get_private(iod);
struct nvme_command *cmnd;
u16 control = 0;
u32 dsmgmt = 0;
......@@ -626,17 +688,12 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *req = bd->rq;
struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
struct nvme_iod *iod;
int psegs = req->nr_phys_segments;
enum dma_data_direction dma_dir;
unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) :
sizeof(struct nvme_dsm_range);
iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC);
iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
if (!iod)
return BLK_MQ_RQ_QUEUE_BUSY;
iod->private = req;
if (req->cmd_flags & REQ_DISCARD) {
void *range;
/*
......@@ -651,10 +708,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
goto retry_cmd;
iod_list(iod)[0] = (__le64 *)range;
iod->npages = 0;
} else if (psegs) {
} else if (req->nr_phys_segments) {
dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
sg_init_table(iod->sg, psegs);
sg_init_table(iod->sg, req->nr_phys_segments);
iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
if (!iod->nents)
goto error_cmd;
......@@ -1137,21 +1194,14 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
LLIST_HEAD(q_list);
struct nvme_queue *nvmeq, *next;
struct llist_node *entry;
int i;
for (i = dev->queue_count - 1; i >= lowest; i--) {
struct nvme_queue *nvmeq = dev->queues[i];
llist_add(&nvmeq->node, &q_list);
dev->queue_count--;
dev->queues[i] = NULL;
}
synchronize_rcu();
entry = llist_del_all(&q_list);
llist_for_each_entry_safe(nvmeq, next, entry, node)
nvme_free_queue(nvmeq);
}
}
/**
......@@ -1408,7 +1458,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info);
dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
dev->admin_tagset.driver_data = dev;
if (blk_mq_alloc_tag_set(&dev->admin_tagset))
......@@ -1522,7 +1572,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
}
err = -ENOMEM;
iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL);
iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
if (!iod)
goto put_pages;
......@@ -2148,7 +2198,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
dev->tagset.cmd_size = sizeof(struct nvme_cmd_info);
dev->tagset.cmd_size = nvme_cmd_size(dev);
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
......
......@@ -214,6 +214,15 @@ enum blkif_protocol {
BLKIF_PROTOCOL_X86_64 = 3,
};
/*
* Default protocol if the frontend doesn't specify one.
*/
#ifdef CONFIG_X86
# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
#else
# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
#endif
struct xen_vbd {
/* What the domain refers to this vbd as. */
blkif_vdev_t handle;
......
......@@ -868,11 +868,11 @@ static int connect_ring(struct backend_info *be)
return err;
}
be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
"%63s", protocol, NULL);
if (err)
strcpy(protocol, "unspecified, assuming native");
strcpy(protocol, "unspecified, assuming default");
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
......
......@@ -1511,7 +1511,7 @@ static int blkif_recover(struct blkfront_info *info)
merge_bio.tail = copy[i].request->biotail;
bio_list_merge(&bio_list, &merge_bio);
copy[i].request->bio = NULL;
blk_put_request(copy[i].request);
blk_end_request_all(copy[i].request, 0);
}
kfree(copy);
......@@ -1534,7 +1534,7 @@ static int blkif_recover(struct blkfront_info *info)
req->bio = NULL;
if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
pr_alert("diskcache flush request found!\n");
__blk_put_request(info->rq, req);
__blk_end_request_all(req, 0);
}
spin_unlock_irq(&info->io_lock);
......
......@@ -823,10 +823,10 @@ struct ata_port {
unsigned int cbl; /* cable type; ATA_CBL_xxx */
struct ata_queued_cmd qcmd[ATA_MAX_QUEUE];
unsigned long qc_allocated;
unsigned long sas_tag_allocated; /* for sas tag allocation only */
unsigned int qc_active;
int nr_active_links; /* #links with active qcs */
unsigned int last_tag; /* track next tag hw expects */
unsigned int sas_last_tag; /* track next tag hw expects */
struct ata_link link; /* host default link */
struct ata_link *slave_link; /* see ata_slave_link_init() */
......@@ -1352,6 +1352,7 @@ extern struct device_attribute *ata_common_sdev_attrs[];
.ioctl = ata_scsi_ioctl, \
.queuecommand = ata_scsi_queuecmd, \
.can_queue = ATA_DEF_QUEUE, \
.tag_alloc_policy = BLK_TAG_ALLOC_RR, \
.this_id = ATA_SHT_THIS_ID, \
.cmd_per_lun = ATA_SHT_CMD_PER_LUN, \
.emulated = ATA_SHT_EMULATED, \
......
......@@ -132,13 +132,12 @@ struct nvme_ns {
* allocated to store the PRP list.
*/
struct nvme_iod {
void *private; /* For the use of the submitter of the I/O */
unsigned long private; /* For the use of the submitter of the I/O */
int npages; /* In the PRP list. 0 means small pool in use */
int offset; /* Of PRP list */
int nents; /* Used in scatterlist */
int length; /* Of data, in bytes */
dma_addr_t first_dma;
struct list_head node;
struct scatterlist sg[0];
};
......
......@@ -267,6 +267,21 @@ do { \
__wait_event(wq, condition); \
} while (0)
#define __io_wait_event(wq, condition) \
(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
io_schedule())
/*
* io_wait_event() -- like wait_event() but with io_schedule()
*/
#define io_wait_event(wq, condition) \
do { \
might_sleep(); \
if (condition) \
break; \
__io_wait_event(wq, condition); \
} while (0)
#define __wait_event_freezable(wq, condition) \
___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \
schedule(); try_to_freeze())
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment