Commit 736a2dd2 authored by Linus Torvalds

Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux

Pull virtio & lguest updates from Rusty Russell:
 "Lots of virtio work which wasn't quite ready for last merge window.

  Plus I dived into lguest again, reworking the pagetable code so we can
  move the switcher page: our fixmaps sometimes take more than 2MB now..."

Ugh.  Annoying conflicts with the tcm_vhost -> vhost_scsi rename.
Hopefully correctly resolved.

* tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (57 commits)
  caif_virtio: Remove bouncing email addresses
  lguest: improve code readability in lg_cpu_start.
  virtio-net: fill only rx queues which are being used
  lguest: map Switcher below fixmap.
  lguest: cache last cpu we ran on.
  lguest: map Switcher text whenever we allocate a new pagetable.
  lguest: don't share Switcher PTE pages between guests.
  lguest: expost switcher_pages array (as lg_switcher_pages).
  lguest: extract shadow PTE walking / allocating.
  lguest: make check_gpte et. al return bool.
  lguest: assume Switcher text is a single page.
  lguest: rename switcher_page to switcher_pages.
  lguest: remove RESERVE_MEM constant.
  lguest: check vaddr not pgd for Switcher protection.
  lguest: prepare to make SWITCHER_ADDR a variable.
  virtio: console: replace EMFILE with EBUSY for already-open port
  virtio-scsi: reset virtqueue affinity when doing cpu hotplug
  virtio-scsi: introduce multiqueue support
  virtio-scsi: push vq lock/unlock into virtscsi_vq_done
  virtio-scsi: pass struct virtio_scsi to virtqueue completion function
  ...
parents 0b2e3b6b 01d779a1
@@ -6,6 +6,3 @@ kvm/
 	- Kernel Virtual Machine.  See also http://linux-kvm.org
 uml/
 	- User Mode Linux, builds/runs Linux kernel as a userspace program.
-virtio.txt
-	- Text version of draft virtio spec.
-	  See http://ozlabs.org/~rusty/virtio-spec
This source diff could not be displayed because it is too large.
@@ -8743,6 +8743,7 @@ F:	drivers/virtio/
 F:	drivers/net/virtio_net.c
 F:	drivers/block/virtio_blk.c
 F:	include/linux/virtio_*.h
+F:	include/uapi/linux/virtio_*.h

 VIRTIO HOST (VHOST)
 M:	"Michael S. Tsirkin" <mst@redhat.com>
......
@@ -11,18 +11,11 @@
 #define GUEST_PL 1

-/* Every guest maps the core switcher code. */
-#define SHARED_SWITCHER_PAGES \
-	DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
-/* Pages for switcher itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
-
-/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page).  */
-#ifdef CONFIG_X86_PAE
-#define SWITCHER_ADDR 0xFFE00000
-#else
-#define SWITCHER_ADDR 0xFFC00000
-#endif
+/* Page for Switcher text itself, then two pages per cpu */
+#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
+
+/* Where we map the Switcher, in both Host and Guest. */
+extern unsigned long switcher_addr;

 /* Found in switcher.S */
 extern unsigned long default_idt_entries[];
......
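The lguest header change above drops the fixed SWITCHER_ADDR (0xFFC00000, or 0xFFE00000 under PAE) in favour of a runtime switcher_addr, and the reservation shrinks to one page of Switcher text plus two pages per possible CPU. A minimal standalone sketch of that arithmetic follows; the FIXADDR_START value and CPU count below are made-up illustration values, not taken from the patch.

	#include <stdio.h>

	#define PAGE_SIZE     4096UL
	#define FIXADDR_START 0xfffff000UL	/* hypothetical value for illustration */

	int main(void)
	{
		unsigned long nr_cpu_ids = 4;	/* hypothetical CPU count */
		/* One page of Switcher text, then two pages per possible CPU. */
		unsigned long total_switcher_pages = 1 + 2 * nr_cpu_ids;
		/* Placed just below the fixmap, leaving room for one guard page. */
		unsigned long switcher_addr =
			FIXADDR_START - (total_switcher_pages + 1) * PAGE_SIZE;

		printf("Switcher text page: %#lx\n", switcher_addr);
		printf("First per-cpu page: %#lx\n", switcher_addr + PAGE_SIZE);
		return 0;
	}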
@@ -110,7 +110,7 @@ int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
 		if (!sg)
 			sg = sglist;
 		else {
-			sg->page_link &= ~0x02;
+			sg_unmark_end(sg);
 			sg = sg_next(sg);
 		}
......
@@ -143,7 +143,7 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
 			 * termination bit to avoid doing a full
 			 * sg_init_table() in drivers for each command.
 			 */
-			(*sg)->page_link &= ~0x02;
+			sg_unmark_end(*sg);
 			*sg = sg_next(*sg);
 		}
......
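Both block-layer hunks above replace open-coded clearing of bit 0x02 in sg->page_link with the sg_unmark_end() helper. The toy program below models the flag semantics (bit 0 marks a chained entry, bit 1 marks the list end) to show what the helper hides; it is a stand-in model, not the kernel's struct scatterlist.

	#include <stdio.h>

	/* Toy stand-in for struct scatterlist's page_link flag bits. */
	struct sg_entry { unsigned long page_link; };

	static void sg_mark_end(struct sg_entry *sg)
	{
		sg->page_link |= 0x02;	/* set the termination bit */
		sg->page_link &= ~0x01;	/* clear any chain bit */
	}

	static void sg_unmark_end(struct sg_entry *sg)
	{
		sg->page_link &= ~0x02;	/* clear the termination bit again */
	}

	int main(void)
	{
		struct sg_entry sg = { .page_link = 0 };

		sg_mark_end(&sg);
		printf("marked end:   page_link = %#lx\n", sg.page_link);
		sg_unmark_end(&sg);
		printf("unmarked end: page_link = %#lx\n", sg.page_link);
		return 0;
	}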
@@ -124,7 +124,7 @@ obj-$(CONFIG_PPC_PS3)	+= ps3/
 obj-$(CONFIG_OF)		+= of/
 obj-$(CONFIG_SSB)		+= ssb/
 obj-$(CONFIG_BCMA)		+= bcma/
-obj-$(CONFIG_VHOST_NET)	+= vhost/
+obj-$(CONFIG_VHOST_RING)	+= vhost/
 obj-$(CONFIG_VLYNQ)	+= vlynq/
 obj-$(CONFIG_STAGING)	+= staging/
 obj-y			+= platform/
......
...@@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, ...@@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
return vbr; return vbr;
} }
static void virtblk_add_buf_wait(struct virtio_blk *vblk, static int __virtblk_add_req(struct virtqueue *vq,
struct virtblk_req *vbr, struct virtblk_req *vbr,
unsigned long out, struct scatterlist *data_sg,
unsigned long in) bool have_data)
{ {
DEFINE_WAIT(wait); struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
unsigned int num_out = 0, num_in = 0;
int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
for (;;) { sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
prepare_to_wait_exclusive(&vblk->queue_wait, &wait, sgs[num_out++] = &hdr;
TASK_UNINTERRUPTIBLE);
spin_lock_irq(vblk->disk->queue->queue_lock); /*
if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, * If this is a packet command we need a couple of additional headers.
GFP_ATOMIC) < 0) { * Behind the normal outhdr we put a segment with the scsi command
spin_unlock_irq(vblk->disk->queue->queue_lock); * block, and before the normal inhdr we put the sense data and the
io_schedule(); * inhdr with additional status information.
} else { */
virtqueue_kick(vblk->vq); if (type == VIRTIO_BLK_T_SCSI_CMD) {
spin_unlock_irq(vblk->disk->queue->queue_lock); sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
break; sgs[num_out++] = &cmd;
} }
if (have_data) {
if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
sgs[num_out++] = data_sg;
else
sgs[num_out + num_in++] = data_sg;
} }
finish_wait(&vblk->queue_wait, &wait); if (type == VIRTIO_BLK_T_SCSI_CMD) {
sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
sgs[num_out + num_in++] = &sense;
sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
sgs[num_out + num_in++] = &inhdr;
}
sg_init_one(&status, &vbr->status, sizeof(vbr->status));
sgs[num_out + num_in++] = &status;
return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
} }
static inline void virtblk_add_req(struct virtblk_req *vbr, static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
unsigned int out, unsigned int in)
{ {
struct virtio_blk *vblk = vbr->vblk; struct virtio_blk *vblk = vbr->vblk;
DEFINE_WAIT(wait);
int ret;
spin_lock_irq(vblk->disk->queue->queue_lock); spin_lock_irq(vblk->disk->queue->queue_lock);
if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr, while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
GFP_ATOMIC) < 0)) { have_data)) < 0)) {
prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
TASK_UNINTERRUPTIBLE);
spin_unlock_irq(vblk->disk->queue->queue_lock); spin_unlock_irq(vblk->disk->queue->queue_lock);
virtblk_add_buf_wait(vblk, vbr, out, in); io_schedule();
return; spin_lock_irq(vblk->disk->queue->queue_lock);
finish_wait(&vblk->queue_wait, &wait);
} }
virtqueue_kick(vblk->vq); virtqueue_kick(vblk->vq);
spin_unlock_irq(vblk->disk->queue->queue_lock); spin_unlock_irq(vblk->disk->queue->queue_lock);
} }
static int virtblk_bio_send_flush(struct virtblk_req *vbr) static void virtblk_bio_send_flush(struct virtblk_req *vbr)
{ {
unsigned int out = 0, in = 0;
vbr->flags |= VBLK_IS_FLUSH; vbr->flags |= VBLK_IS_FLUSH;
vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
vbr->out_hdr.sector = 0; vbr->out_hdr.sector = 0;
vbr->out_hdr.ioprio = 0; vbr->out_hdr.ioprio = 0;
sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
virtblk_add_req(vbr, out, in);
return 0; virtblk_add_req(vbr, false);
} }
static int virtblk_bio_send_data(struct virtblk_req *vbr) static void virtblk_bio_send_data(struct virtblk_req *vbr)
{ {
struct virtio_blk *vblk = vbr->vblk; struct virtio_blk *vblk = vbr->vblk;
unsigned int num, out = 0, in = 0;
struct bio *bio = vbr->bio; struct bio *bio = vbr->bio;
bool have_data;
vbr->flags &= ~VBLK_IS_FLUSH; vbr->flags &= ~VBLK_IS_FLUSH;
vbr->out_hdr.type = 0; vbr->out_hdr.type = 0;
vbr->out_hdr.sector = bio->bi_sector; vbr->out_hdr.sector = bio->bi_sector;
vbr->out_hdr.ioprio = bio_prio(bio); vbr->out_hdr.ioprio = bio_prio(bio);
sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
have_data = true;
num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out); if (bio->bi_rw & REQ_WRITE)
sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
sizeof(vbr->status));
if (num) {
if (bio->bi_rw & REQ_WRITE) {
vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
out += num; else
} else {
vbr->out_hdr.type |= VIRTIO_BLK_T_IN; vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
in += num; } else
} have_data = false;
}
virtblk_add_req(vbr, out, in); virtblk_add_req(vbr, have_data);
return 0;
} }
static void virtblk_bio_send_data_work(struct work_struct *work) static void virtblk_bio_send_data_work(struct work_struct *work)
...@@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq) ...@@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq)
static bool do_req(struct request_queue *q, struct virtio_blk *vblk, static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
struct request *req) struct request *req)
{ {
unsigned long num, out = 0, in = 0; unsigned int num;
struct virtblk_req *vbr; struct virtblk_req *vbr;
vbr = virtblk_alloc_req(vblk, GFP_ATOMIC); vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
...@@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, ...@@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
} }
} }
sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); num = blk_rq_map_sg(q, vbr->req, vblk->sg);
/*
* If this is a packet command we need a couple of additional headers.
* Behind the normal outhdr we put a segment with the scsi command
* block, and before the normal inhdr we put the sense data and the
* inhdr with additional status information before the normal inhdr.
*/
if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
sizeof(vbr->in_hdr));
}
sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
sizeof(vbr->status));
if (num) { if (num) {
if (rq_data_dir(vbr->req) == WRITE) { if (rq_data_dir(vbr->req) == WRITE)
vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
out += num; else
} else {
vbr->out_hdr.type |= VIRTIO_BLK_T_IN; vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
in += num;
}
} }
if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) {
GFP_ATOMIC) < 0) {
mempool_free(vbr, vblk->pool); mempool_free(vbr, vblk->pool);
return false; return false;
} }
...@@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work) ...@@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
struct virtio_device *vdev = vblk->vdev; struct virtio_device *vdev = vblk->vdev;
struct request_queue *q = vblk->disk->queue; struct request_queue *q = vblk->disk->queue;
char cap_str_2[10], cap_str_10[10]; char cap_str_2[10], cap_str_10[10];
char *envp[] = { "RESIZE=1", NULL };
u64 capacity, size; u64 capacity, size;
mutex_lock(&vblk->config_lock); mutex_lock(&vblk->config_lock);
...@@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work) ...@@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
set_capacity(vblk->disk, capacity); set_capacity(vblk->disk, capacity);
revalidate_disk(vblk->disk); revalidate_disk(vblk->disk);
kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
done: done:
mutex_unlock(&vblk->config_lock); mutex_unlock(&vblk->config_lock);
} }
......
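The virtio_blk conversion above is the clearest example of the new virtqueue_add_sgs() API: instead of packing header, data and status into one scatterlist table and passing out/in counts to virtqueue_add_buf(), the driver hands over an array of scatterlist pointers, counted separately for device-readable and device-writable groups. A rough kernel-context sketch of that shape (vq, vbr, buf, len and is_write are assumed locals, not code from the patch):

	struct scatterlist hdr, data, status, *sgs[3];
	unsigned int num_out = 0, num_in = 0;
	int err;

	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
	sgs[num_out++] = &hdr;			/* device reads the request header */

	sg_init_one(&data, buf, len);
	if (is_write)
		sgs[num_out++] = &data;		/* device reads the payload */
	else
		sgs[num_out + num_in++] = &data;	/* device writes the payload */

	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
	sgs[num_out + num_in++] = &status;	/* device writes the status byte */

	err = virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);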
@@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size)
 	sg_init_one(&sg, buf, size);

 	/* There should always be room for one buffer. */
-	if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0)
+	if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0)
 		BUG();

 	virtqueue_kick(vq);
......
@@ -78,8 +78,8 @@ struct ports_driver_data {
 };
 static struct ports_driver_data pdrvdata;

-DEFINE_SPINLOCK(pdrvdata_lock);
-DECLARE_COMPLETION(early_console_added);
+static DEFINE_SPINLOCK(pdrvdata_lock);
+static DECLARE_COMPLETION(early_console_added);

 /* This struct holds information that's relevant only for console ports */
 struct console {
@@ -503,7 +503,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf)
 	sg_init_one(sg, buf->buf, buf->size);

-	ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC);
+	ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC);
 	virtqueue_kick(vq);
 	if (!ret)
 		ret = vq->num_free;
@@ -572,7 +572,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
 	sg_init_one(sg, &cpkt, sizeof(cpkt));

 	spin_lock(&portdev->c_ovq_lock);
-	if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) {
+	if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) {
 		virtqueue_kick(vq);
 		while (!virtqueue_get_buf(vq, &len))
 			cpu_relax();
@@ -622,7 +622,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
 	reclaim_consumed_buffers(port);

-	err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC);
+	err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC);

 	/* Tell Host to go! */
 	virtqueue_kick(out_vq);
@@ -1040,7 +1040,7 @@ static int port_fops_open(struct inode *inode, struct file *filp)
 	spin_lock_irq(&port->inbuf_lock);
 	if (port->guest_connected) {
 		spin_unlock_irq(&port->inbuf_lock);
-		ret = -EMFILE;
+		ret = -EBUSY;
 		goto out;
 	}
@@ -1202,7 +1202,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
 	return hvc_instantiate(0, 0, &hv_ops);
 }

-int init_port_console(struct port *port)
+static int init_port_console(struct port *port)
 {
 	int ret;
......
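The console and virtio-rng hunks show the companion API change: the old virtqueue_add_buf(vq, sg, out, in, data, gfp), where one of the two counts was always zero at these call sites, becomes direction-specific virtqueue_add_inbuf()/virtqueue_add_outbuf() calls. A before/after sketch (vq, sg, nents, buf and err are placeholder locals):

	/* Before: both counts passed, one of them always zero here. */
	err = virtqueue_add_buf(vq, sg, 0, nents, buf, GFP_ATOMIC);	/* device writes */
	err = virtqueue_add_buf(vq, sg, nents, 0, buf, GFP_ATOMIC);	/* device reads */

	/* After: the direction is in the helper's name, only one count remains. */
	err = virtqueue_add_inbuf(vq, sg, nents, buf, GFP_ATOMIC);	/* device writes */
	err = virtqueue_add_outbuf(vq, sg, nents, buf, GFP_ATOMIC);	/* device reads */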
@@ -5,10 +5,9 @@ config LGUEST
 	---help---
 	  This is a very simple module which allows you to run
 	  multiple instances of the same Linux kernel, using the
-	  "lguest" command found in the Documentation/virtual/lguest
-	  directory.
+	  "lguest" command found in the tools/lguest directory.

 	  Note that "lguest" is pronounced to rhyme with "fell quest",
-	  not "rustyvisor". See Documentation/virtual/lguest/lguest.txt.
+	  not "rustyvisor". See tools/lguest/lguest.txt.

 	  If unsure, say N.  If curious, say M.  If masochistic, say Y.
...@@ -20,9 +20,9 @@ ...@@ -20,9 +20,9 @@
#include <asm/asm-offsets.h> #include <asm/asm-offsets.h>
#include "lg.h" #include "lg.h"
unsigned long switcher_addr;
struct page **lg_switcher_pages;
static struct vm_struct *switcher_vma; static struct vm_struct *switcher_vma;
static struct page **switcher_page;
/* This One Big lock protects all inter-guest data structures. */ /* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock); DEFINE_MUTEX(lguest_lock);
...@@ -52,13 +52,21 @@ static __init int map_switcher(void) ...@@ -52,13 +52,21 @@ static __init int map_switcher(void)
* easy. * easy.
*/ */
/* We assume Switcher text fits into a single page. */
if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
end_switcher_text - start_switcher_text);
return -EINVAL;
}
/* /*
* We allocate an array of struct page pointers. map_vm_area() wants * We allocate an array of struct page pointers. map_vm_area() wants
* this, rather than just an array of pages. * this, rather than just an array of pages.
*/ */
switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
GFP_KERNEL); * TOTAL_SWITCHER_PAGES,
if (!switcher_page) { GFP_KERNEL);
if (!lg_switcher_pages) {
err = -ENOMEM; err = -ENOMEM;
goto out; goto out;
} }
...@@ -68,32 +76,29 @@ static __init int map_switcher(void) ...@@ -68,32 +76,29 @@ static __init int map_switcher(void)
* so we make sure they're zeroed. * so we make sure they're zeroed.
*/ */
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (!switcher_page[i]) { if (!lg_switcher_pages[i]) {
err = -ENOMEM; err = -ENOMEM;
goto free_some_pages; goto free_some_pages;
} }
} }
/* /*
* First we check that the Switcher won't overlap the fixmap area at * We place the Switcher underneath the fixmap area, which is the
* the top of memory. It's currently nowhere near, but it could have * highest virtual address we can get. This is important, since we
* very strange effects if it ever happened. * tell the Guest it can't access this memory, so we want its ceiling
* as high as possible.
*/ */
if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
err = -ENOMEM;
printk("lguest: mapping switcher would thwack fixmap\n");
goto free_pages;
}
/* /*
* Now we reserve the "virtual memory area" we want: 0xFFC00000 * Now we reserve the "virtual memory area" we want. We might
* (SWITCHER_ADDR). We might not get it in theory, but in practice * not get it in theory, but in practice it's worked so far.
* it's worked so far. The end address needs +1 because __get_vm_area * The end address needs +1 because __get_vm_area allocates an
* allocates an extra guard page, so we need space for that. * extra guard page, so we need space for that.
*/ */
switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR VM_ALLOC, switcher_addr, switcher_addr
+ (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
if (!switcher_vma) { if (!switcher_vma) {
err = -ENOMEM; err = -ENOMEM;
...@@ -103,12 +108,12 @@ static __init int map_switcher(void) ...@@ -103,12 +108,12 @@ static __init int map_switcher(void)
/* /*
* This code actually sets up the pages we've allocated to appear at * This code actually sets up the pages we've allocated to appear at
* SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the * switcher_addr. map_vm_area() takes the vma we allocated above, the
* kind of pages we're mapping (kernel pages), and a pointer to our * kind of pages we're mapping (kernel pages), and a pointer to our
* array of struct pages. It increments that pointer, but we don't * array of struct pages. It increments that pointer, but we don't
* care. * care.
*/ */
pagep = switcher_page; pagep = lg_switcher_pages;
err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
if (err) { if (err) {
printk("lguest: map_vm_area failed: %i\n", err); printk("lguest: map_vm_area failed: %i\n", err);
...@@ -133,8 +138,8 @@ static __init int map_switcher(void) ...@@ -133,8 +138,8 @@ static __init int map_switcher(void)
i = TOTAL_SWITCHER_PAGES; i = TOTAL_SWITCHER_PAGES;
free_some_pages: free_some_pages:
for (--i; i >= 0; i--) for (--i; i >= 0; i--)
__free_pages(switcher_page[i], 0); __free_pages(lg_switcher_pages[i], 0);
kfree(switcher_page); kfree(lg_switcher_pages);
out: out:
return err; return err;
} }
...@@ -149,8 +154,8 @@ static void unmap_switcher(void) ...@@ -149,8 +154,8 @@ static void unmap_switcher(void)
vunmap(switcher_vma->addr); vunmap(switcher_vma->addr);
/* Now we just need to free the pages we copied the switcher into */ /* Now we just need to free the pages we copied the switcher into */
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
__free_pages(switcher_page[i], 0); __free_pages(lg_switcher_pages[i], 0);
kfree(switcher_page); kfree(lg_switcher_pages);
} }
/*H:032 /*H:032
...@@ -323,15 +328,10 @@ static int __init init(void) ...@@ -323,15 +328,10 @@ static int __init init(void)
if (err) if (err)
goto out; goto out;
/* Now we set up the pagetable implementation for the Guests. */
err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
if (err)
goto unmap;
/* We might need to reserve an interrupt vector. */ /* We might need to reserve an interrupt vector. */
err = init_interrupts(); err = init_interrupts();
if (err) if (err)
goto free_pgtables; goto unmap;
/* /dev/lguest needs to be registered. */ /* /dev/lguest needs to be registered. */
err = lguest_device_init(); err = lguest_device_init();
...@@ -346,8 +346,6 @@ static int __init init(void) ...@@ -346,8 +346,6 @@ static int __init init(void)
free_interrupts: free_interrupts:
free_interrupts(); free_interrupts();
free_pgtables:
free_pagetables();
unmap: unmap:
unmap_switcher(); unmap_switcher();
out: out:
...@@ -359,7 +357,6 @@ static void __exit fini(void) ...@@ -359,7 +357,6 @@ static void __exit fini(void)
{ {
lguest_device_remove(); lguest_device_remove();
free_interrupts(); free_interrupts();
free_pagetables();
unmap_switcher(); unmap_switcher();
lguest_arch_host_fini(); lguest_arch_host_fini();
......
@@ -14,11 +14,10 @@
 #include <asm/lguest.h>

-void free_pagetables(void);
-int init_pagetables(struct page **switcher_page, unsigned int pages);
-
 struct pgdir {
 	unsigned long gpgdir;
+	bool switcher_mapped;
+	int last_host_cpu;
 	pgd_t *pgdir;
 };
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg,
 		       unsigned long addr, unsigned long len);
 void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
 void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
+extern struct page **lg_switcher_pages;

 /*H:035
  * Using memory-copy operations like that is usually inconvient, so we
......
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
  */
 static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 {
-	/* We have a limited number the number of CPUs in the lguest struct. */
+	/* We have a limited number of CPUs in the lguest struct. */
 	if (id >= ARRAY_SIZE(cpu->lg->cpus))
 		return -EINVAL;

 	/* Set up this CPU's id, and pointer back to the lguest struct. */
 	cpu->id = id;
-	cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
+	cpu->lg = container_of(cpu, struct lguest, cpus[id]);
 	cpu->lg->nr_cpus++;

 	/* Each CPU has a timer it can set. */
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 	if (!cpu->regs_page)
 		return -ENOMEM;

-	/* We actually put the registers at the bottom of the page. */
+	/* We actually put the registers at the end of the page. */
 	cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);

 	/*
......
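The lg_cpu_start() fix above is a tidy container_of idiom: rather than stepping the pointer back by id entries and naming cpus[0], the new form names cpus[id] directly and lets the offset calculation do the work. A standalone illustration with toy structs (the names mirror the lguest ones but are local to this example); like the kernel macro, it relies on the GCC extension that allows offsetof with a non-constant array index:

	#include <stdio.h>
	#include <stddef.h>

	/* Classic container_of; the non-constant index in "member" is a GCC extension. */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct lg_cpu { unsigned int id; };
	struct lguest { int nr_cpus; struct lg_cpu cpus[2]; };

	int main(void)
	{
		struct lguest lg = { .nr_cpus = 0 };
		unsigned int id = 1;
		struct lg_cpu *cpu = &lg.cpus[id];

		/* New form from the patch: name the exact member we point into. */
		struct lguest *a = container_of(cpu, struct lguest, cpus[id]);
		/* Old form: step back to cpus[0] first, then use that member. */
		struct lguest *b = container_of((cpu - id), struct lguest, cpus[0]);

		printf("new: %p  old: %p  struct: %p\n",
		       (void *)a, (void *)b, (void *)&lg);
		return 0;
	}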
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* converted Guest pages when running the Guest. * converted Guest pages when running the Guest.
:*/ :*/
/* Copyright (C) Rusty Russell IBM Corporation 2006. /* Copyright (C) Rusty Russell IBM Corporation 2013.
* GPL v2 and any later version */ * GPL v2 and any later version */
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/gfp.h> #include <linux/gfp.h>
...@@ -62,22 +62,11 @@ ...@@ -62,22 +62,11 @@
* will need the last pmd entry of the last pmd page. * will need the last pmd entry of the last pmd page.
*/ */
#ifdef CONFIG_X86_PAE #ifdef CONFIG_X86_PAE
#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
#define RESERVE_MEM 2U
#define CHECK_GPGD_MASK _PAGE_PRESENT #define CHECK_GPGD_MASK _PAGE_PRESENT
#else #else
#define RESERVE_MEM 4U
#define CHECK_GPGD_MASK _PAGE_TABLE #define CHECK_GPGD_MASK _PAGE_TABLE
#endif #endif
/*
* We actually need a separate PTE page for each CPU. Remember that after the
* Switcher code itself comes two pages for each CPU, and we don't want this
* CPU's guest to see the pages of any other CPU.
*/
static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
/*H:320 /*H:320
* The page table code is curly enough to need helper functions to keep it * The page table code is curly enough to need helper functions to keep it
* clear and clean. The kernel itself provides many of them; one advantage * clear and clean. The kernel itself provides many of them; one advantage
...@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) ...@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{ {
unsigned int index = pgd_index(vaddr); unsigned int index = pgd_index(vaddr);
#ifndef CONFIG_X86_PAE
/* We kill any Guest trying to touch the Switcher addresses. */
if (index >= SWITCHER_PGD_INDEX) {
kill_guest(cpu, "attempt to access switcher pages");
index = 0;
}
#endif
/* Return a pointer index'th pgd entry for the i'th page table. */ /* Return a pointer index'th pgd entry for the i'th page table. */
return &cpu->lg->pgdirs[i].pgdir[index]; return &cpu->lg->pgdirs[i].pgdir[index];
} }
...@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) ...@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
unsigned int index = pmd_index(vaddr); unsigned int index = pmd_index(vaddr);
pmd_t *page; pmd_t *page;
/* We kill any Guest trying to touch the Switcher addresses. */
if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
index >= SWITCHER_PMD_INDEX) {
kill_guest(cpu, "attempt to access switcher pages");
index = 0;
}
/* You should never call this if the PGD entry wasn't valid */ /* You should never call this if the PGD entry wasn't valid */
BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
page = __va(pgd_pfn(spgd) << PAGE_SHIFT); page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
...@@ -275,122 +250,177 @@ static void release_pte(pte_t pte) ...@@ -275,122 +250,177 @@ static void release_pte(pte_t pte)
} }
/*:*/ /*:*/
static void check_gpte(struct lg_cpu *cpu, pte_t gpte) static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
{ {
if ((pte_flags(gpte) & _PAGE_PSE) || if ((pte_flags(gpte) & _PAGE_PSE) ||
pte_pfn(gpte) >= cpu->lg->pfn_limit) pte_pfn(gpte) >= cpu->lg->pfn_limit) {
kill_guest(cpu, "bad page table entry"); kill_guest(cpu, "bad page table entry");
return false;
}
return true;
} }
static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{ {
if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
(pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
kill_guest(cpu, "bad page directory entry"); kill_guest(cpu, "bad page directory entry");
return false;
}
return true;
} }
#ifdef CONFIG_X86_PAE #ifdef CONFIG_X86_PAE
static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
{ {
if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
(pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
kill_guest(cpu, "bad page middle directory entry"); kill_guest(cpu, "bad page middle directory entry");
return false;
}
return true;
} }
#endif #endif
/*H:330 /*H:331
* (i) Looking up a page table entry when the Guest faults. * This is the core routine to walk the shadow page tables and find the page
* * table entry for a specific address.
* We saw this call in run_guest(): when we see a page fault in the Guest, we
* come here. That's because we only set up the shadow page tables lazily as
* they're needed, so we get page faults all the time and quietly fix them up
* and return to the Guest without it knowing.
* *
* If we fixed up the fault (ie. we mapped the address), this routine returns * If allocate is set, then we allocate any missing levels, setting the flags
* true. Otherwise, it was a real fault and we need to tell the Guest. * on the new page directory and mid-level directories using the arguments
* (which are copied from the Guest's page table entries).
*/ */
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
int pgd_flags, int pmd_flags)
{ {
pgd_t gpgd;
pgd_t *spgd; pgd_t *spgd;
unsigned long gpte_ptr;
pte_t gpte;
pte_t *spte;
/* Mid level for PAE. */ /* Mid level for PAE. */
#ifdef CONFIG_X86_PAE #ifdef CONFIG_X86_PAE
pmd_t *spmd; pmd_t *spmd;
pmd_t gpmd;
#endif #endif
/* First step: get the top-level Guest page table entry. */ /* Get top level entry. */
if (unlikely(cpu->linear_pages)) {
/* Faking up a linear mapping. */
gpgd = __pgd(CHECK_GPGD_MASK);
} else {
gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
/* Toplevel not present? We can't map it in. */
if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
return false;
}
/* Now look at the matching shadow entry. */
spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */ /* No shadow entry: allocate a new shadow PTE page. */
unsigned long ptepage = get_zeroed_page(GFP_KERNEL); unsigned long ptepage;
/* If they didn't want us to allocate anything, stop. */
if (!allocate)
return NULL;
ptepage = get_zeroed_page(GFP_KERNEL);
/* /*
* This is not really the Guest's fault, but killing it is * This is not really the Guest's fault, but killing it is
* simple for this corner case. * simple for this corner case.
*/ */
if (!ptepage) { if (!ptepage) {
kill_guest(cpu, "out of memory allocating pte page"); kill_guest(cpu, "out of memory allocating pte page");
return false; return NULL;
} }
/* We check that the Guest pgd is OK. */
check_gpgd(cpu, gpgd);
/* /*
* And we copy the flags to the shadow PGD entry. The page * And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated. * number in the shadow PGD is the page we just allocated.
*/ */
set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
} }
/*
* Intel's Physical Address Extension actually uses three levels of
* page tables, so we need to look in the mid-level.
*/
#ifdef CONFIG_X86_PAE #ifdef CONFIG_X86_PAE
if (unlikely(cpu->linear_pages)) { /* Now look at the mid-level shadow entry. */
/* Faking up a linear mapping. */
gpmd = __pmd(_PAGE_TABLE);
} else {
gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
/* Middle level not present? We can't map it in. */
if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
return false;
}
/* Now look at the matching shadow entry. */
spmd = spmd_addr(cpu, *spgd, vaddr); spmd = spmd_addr(cpu, *spgd, vaddr);
if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */ /* No shadow entry: allocate a new shadow PTE page. */
unsigned long ptepage = get_zeroed_page(GFP_KERNEL); unsigned long ptepage;
/* If they didn't want us to allocate anything, stop. */
if (!allocate)
return NULL;
ptepage = get_zeroed_page(GFP_KERNEL);
/* /*
* This is not really the Guest's fault, but killing it is * This is not really the Guest's fault, but killing it is
* simple for this corner case. * simple for this corner case.
*/ */
if (!ptepage) { if (!ptepage) {
kill_guest(cpu, "out of memory allocating pte page"); kill_guest(cpu, "out of memory allocating pmd page");
return false; return NULL;
} }
/* We check that the Guest pmd is OK. */
check_gpmd(cpu, gpmd);
/* /*
* And we copy the flags to the shadow PMD entry. The page * And we copy the flags to the shadow PMD entry. The page
* number in the shadow PMD is the page we just allocated. * number in the shadow PMD is the page we just allocated.
*/ */
set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
}
#endif
/* Get the pointer to the shadow PTE entry we're going to set. */
return spte_addr(cpu, *spgd, vaddr);
}
/*H:330
* (i) Looking up a page table entry when the Guest faults.
*
* We saw this call in run_guest(): when we see a page fault in the Guest, we
* come here. That's because we only set up the shadow page tables lazily as
* they're needed, so we get page faults all the time and quietly fix them up
* and return to the Guest without it knowing.
*
* If we fixed up the fault (ie. we mapped the address), this routine returns
* true. Otherwise, it was a real fault and we need to tell the Guest.
*/
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
{
unsigned long gpte_ptr;
pte_t gpte;
pte_t *spte;
pmd_t gpmd;
pgd_t gpgd;
/* We never demand page the Switcher, so trying is a mistake. */
if (vaddr >= switcher_addr)
return false;
/* First step: get the top-level Guest page table entry. */
if (unlikely(cpu->linear_pages)) {
/* Faking up a linear mapping. */
gpgd = __pgd(CHECK_GPGD_MASK);
} else {
gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
/* Toplevel not present? We can't map it in. */
if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
return false;
/*
* This kills the Guest if it has weird flags or tries to
* refer to a "physical" address outside the bounds.
*/
if (!check_gpgd(cpu, gpgd))
return false;
}
/* This "mid-level" entry is only used for non-linear, PAE mode. */
gpmd = __pmd(_PAGE_TABLE);
#ifdef CONFIG_X86_PAE
if (likely(!cpu->linear_pages)) {
gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
/* Middle level not present? We can't map it in. */
if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
return false;
/*
* This kills the Guest if it has weird flags or tries to
* refer to a "physical" address outside the bounds.
*/
if (!check_gpmd(cpu, gpmd))
return false;
} }
/* /*
...@@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) ...@@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
* Check that the Guest PTE flags are OK, and the page number is below * Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit (ie. not mapping the Launcher binary). * the pfn_limit (ie. not mapping the Launcher binary).
*/ */
check_gpte(cpu, gpte); if (!check_gpte(cpu, gpte))
return false;
/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
gpte = pte_mkyoung(gpte); gpte = pte_mkyoung(gpte);
...@@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) ...@@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
gpte = pte_mkdirty(gpte); gpte = pte_mkdirty(gpte);
/* Get the pointer to the shadow PTE entry we're going to set. */ /* Get the pointer to the shadow PTE entry we're going to set. */
spte = spte_addr(cpu, *spgd, vaddr); spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
if (!spte)
return false;
/* /*
* If there was a valid shadow PTE entry here before, we release it. * If there was a valid shadow PTE entry here before, we release it.
...@@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) ...@@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
*/ */
static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{ {
pgd_t *spgd; pte_t *spte;
unsigned long flags; unsigned long flags;
#ifdef CONFIG_X86_PAE /* You can't put your stack in the Switcher! */
pmd_t *spmd; if (vaddr >= switcher_addr)
#endif
/* Look at the current top level entry: is it present? */
spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
return false; return false;
#ifdef CONFIG_X86_PAE /* If there's no shadow PTE, it's not writable. */
spmd = spmd_addr(cpu, *spgd, vaddr); spte = find_spte(cpu, vaddr, false, 0, 0);
if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) if (!spte)
return false; return false;
#endif
/* /*
* Check the flags on the pte entry itself: it must be present and * Check the flags on the pte entry itself: it must be present and
* writable. * writable.
*/ */
flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); flags = pte_flags(*spte);
return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
} }
...@@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, ...@@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
int *blank_pgdir) int *blank_pgdir)
{ {
unsigned int next; unsigned int next;
#ifdef CONFIG_X86_PAE
pmd_t *pmd_table;
#endif
/* /*
* We pick one entry at random to throw out. Choosing the Least * We pick one entry at random to throw out. Choosing the Least
...@@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, ...@@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
if (!cpu->lg->pgdirs[next].pgdir) if (!cpu->lg->pgdirs[next].pgdir)
next = cpu->cpu_pgd; next = cpu->cpu_pgd;
else { else {
#ifdef CONFIG_X86_PAE
/* /*
* In PAE mode, allocate a pmd page and populate the * This is a blank page, so there are no kernel
* last pgd entry. * mappings: caller must map the stack!
*/ */
pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
if (!pmd_table) {
free_page((long)cpu->lg->pgdirs[next].pgdir);
set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
next = cpu->cpu_pgd;
} else {
set_pgd(cpu->lg->pgdirs[next].pgdir +
SWITCHER_PGD_INDEX,
__pgd(__pa(pmd_table) | _PAGE_PRESENT));
/*
* This is a blank page, so there are no kernel
* mappings: caller must map the stack!
*/
*blank_pgdir = 1;
}
#else
*blank_pgdir = 1; *blank_pgdir = 1;
#endif
} }
} }
/* Record which Guest toplevel this shadows. */ /* Record which Guest toplevel this shadows. */
...@@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, ...@@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
/* Release all the non-kernel mappings. */ /* Release all the non-kernel mappings. */
flush_user_mappings(cpu->lg, next); flush_user_mappings(cpu->lg, next);
/* This hasn't run on any CPU at all. */
cpu->lg->pgdirs[next].last_host_cpu = -1;
return next; return next;
} }
/*H:501
* We do need the Switcher code mapped at all times, so we allocate that
* part of the Guest page table here. We map the Switcher code immediately,
* but defer mapping of the guest register page and IDT/LDT etc page until
* just before we run the guest in map_switcher_in_guest().
*
* We *could* do this setup in map_switcher_in_guest(), but at that point
* we've interrupts disabled, and allocating pages like that is fraught: we
* can't sleep if we need to free up some memory.
*/
static bool allocate_switcher_mapping(struct lg_cpu *cpu)
{
int i;
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
CHECK_GPGD_MASK, _PAGE_TABLE);
if (!pte)
return false;
/*
* Map the switcher page if not already there. It might
* already be there because we call allocate_switcher_mapping()
* in guest_set_pgd() just in case it did discard our Switcher
* mapping, but it probably didn't.
*/
if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
/* Get a reference to the Switcher page. */
get_page(lg_switcher_pages[0]);
/* Create a read-only, exectuable, kernel-style PTE */
set_pte(pte,
mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
}
}
cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
return true;
}
/*H:470 /*H:470
* Finally, a routine which throws away everything: all PGD entries in all * Finally, a routine which throws away everything: all PGD entries in all
* the shadow page tables, including the Guest's kernel mappings. This is used * the shadow page tables, including the Guest's kernel mappings. This is used
...@@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg) ...@@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg)
unsigned int i, j; unsigned int i, j;
/* Every shadow pagetable this Guest has */ /* Every shadow pagetable this Guest has */
for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
if (lg->pgdirs[i].pgdir) { if (!lg->pgdirs[i].pgdir)
#ifdef CONFIG_X86_PAE continue;
pgd_t *spgd;
pmd_t *pmdpage; /* Every PGD entry. */
unsigned int k; for (j = 0; j < PTRS_PER_PGD; j++)
release_pgd(lg->pgdirs[i].pgdir + j);
/* Get the last pmd page. */ lg->pgdirs[i].switcher_mapped = false;
spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; lg->pgdirs[i].last_host_cpu = -1;
pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); }
/*
* And release the pmd entries of that pmd page,
* except for the switcher pmd.
*/
for (k = 0; k < SWITCHER_PMD_INDEX; k++)
release_pmd(&pmdpage[k]);
#endif
/* Every PGD entry except the Switcher at the top */
for (j = 0; j < SWITCHER_PGD_INDEX; j++)
release_pgd(lg->pgdirs[i].pgdir + j);
}
} }
/* /*
...@@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) ...@@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
release_all_pagetables(cpu->lg); release_all_pagetables(cpu->lg);
/* We need the Guest kernel stack mapped again. */ /* We need the Guest kernel stack mapped again. */
pin_stack_pages(cpu); pin_stack_pages(cpu);
/* And we need Switcher allocated. */
if (!allocate_switcher_mapping(cpu))
kill_guest(cpu, "Cannot populate switcher mapping");
} }
/*H:430 /*H:430
...@@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) ...@@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
newpgdir = new_pgdir(cpu, pgtable, &repin); newpgdir = new_pgdir(cpu, pgtable, &repin);
/* Change the current pgd index to the new one. */ /* Change the current pgd index to the new one. */
cpu->cpu_pgd = newpgdir; cpu->cpu_pgd = newpgdir;
/* If it was completely blank, we map in the Guest kernel stack */ /*
* If it was completely blank, we map in the Guest kernel stack and
* the Switcher.
*/
if (repin) if (repin)
pin_stack_pages(cpu); pin_stack_pages(cpu);
if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
if (!allocate_switcher_mapping(cpu))
kill_guest(cpu, "Cannot populate switcher mapping");
}
} }
/*:*/ /*:*/
...@@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, ...@@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
* micro-benchmark. * micro-benchmark.
*/ */
if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
check_gpte(cpu, gpte); if (!check_gpte(cpu, gpte))
return;
set_pte(spte, set_pte(spte,
gpte_to_spte(cpu, gpte, gpte_to_spte(cpu, gpte,
pte_flags(gpte) & _PAGE_DIRTY)); pte_flags(gpte) & _PAGE_DIRTY));
...@@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, ...@@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
void guest_set_pte(struct lg_cpu *cpu, void guest_set_pte(struct lg_cpu *cpu,
unsigned long gpgdir, unsigned long vaddr, pte_t gpte) unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{ {
/* We don't let you remap the Switcher; we need it to get back! */
if (vaddr >= switcher_addr) {
kill_guest(cpu, "attempt to set pte into Switcher pages");
return;
}
/* /*
* Kernel mappings must be changed on all top levels. Slow, but doesn't * Kernel mappings must be changed on all top levels. Slow, but doesn't
* happen often. * happen often.
...@@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) ...@@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{ {
int pgdir; int pgdir;
if (idx >= SWITCHER_PGD_INDEX) if (idx > PTRS_PER_PGD) {
kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
idx, PTRS_PER_PGD);
return; return;
}
/* If they're talking about a page table we have a shadow for... */ /* If they're talking about a page table we have a shadow for... */
pgdir = find_pgdir(lg, gpgdir); pgdir = find_pgdir(lg, gpgdir);
if (pgdir < ARRAY_SIZE(lg->pgdirs)) if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
/* ... throw it away. */ /* ... throw it away. */
release_pgd(lg->pgdirs[pgdir].pgdir + idx); release_pgd(lg->pgdirs[pgdir].pgdir + idx);
/* That might have been the Switcher mapping, remap it. */
if (!allocate_switcher_mapping(&lg->cpus[0])) {
kill_guest(&lg->cpus[0],
"Cannot populate switcher mapping");
}
}
} }
#ifdef CONFIG_X86_PAE #ifdef CONFIG_X86_PAE
...@@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) ...@@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
* we will populate on future faults. The Guest doesn't have any actual * we will populate on future faults. The Guest doesn't have any actual
* pagetables yet, so we set linear_pages to tell demand_page() to fake it * pagetables yet, so we set linear_pages to tell demand_page() to fake it
* for the moment. * for the moment.
*
* We do need the Switcher to be mapped at all times, so we allocate that
* part of the Guest page table here.
*/ */
int init_guest_pagetable(struct lguest *lg) int init_guest_pagetable(struct lguest *lg)
{ {
...@@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg) ...@@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg)
/* We start with a linear mapping until the initialize. */ /* We start with a linear mapping until the initialize. */
cpu->linear_pages = true; cpu->linear_pages = true;
/* Allocate the page tables for the Switcher. */
if (!allocate_switcher_mapping(cpu)) {
release_all_pagetables(lg);
return -ENOMEM;
}
return 0; return 0;
} }
/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lg_cpu *cpu) void page_table_guest_data_init(struct lg_cpu *cpu)
{ {
/*
* We tell the Guest that it can't use the virtual addresses
* used by the Switcher. This trick is equivalent to 4GB -
* switcher_addr.
*/
u32 top = ~switcher_addr + 1;
/* We get the kernel address: above this is all kernel memory. */ /* We get the kernel address: above this is all kernel memory. */
if (get_user(cpu->lg->kernel_address, if (get_user(cpu->lg->kernel_address,
&cpu->lg->lguest_data->kernel_address) &cpu->lg->lguest_data->kernel_address)
/* /*
* We tell the Guest that it can't use the top 2 or 4 MB * We tell the Guest that it can't use the top virtual
* of virtual addresses used by the Switcher. * addresses (used by the Switcher).
*/ */
|| put_user(RESERVE_MEM * 1024 * 1024, || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
&cpu->lg->lguest_data->reserve_mem)) {
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
return; return;
} }
...@@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu) ...@@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
* "pgd_index(lg->kernel_address)". This assumes it won't hit the * "pgd_index(lg->kernel_address)". This assumes it won't hit the
* Switcher mappings, so check that now. * Switcher mappings, so check that now.
*/ */
#ifdef CONFIG_X86_PAE if (cpu->lg->kernel_address >= switcher_addr)
if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
#else
if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
#endif
kill_guest(cpu, "bad kernel address %#lx", kill_guest(cpu, "bad kernel address %#lx",
cpu->lg->kernel_address); cpu->lg->kernel_address);
} }
...@@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg) ...@@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg)
free_page((long)lg->pgdirs[i].pgdir); free_page((long)lg->pgdirs[i].pgdir);
} }
/*H:480 /*H:481
* (vi) Mapping the Switcher when the Guest is about to run. * This clears the Switcher mappings for cpu #i.
*
* The Switcher and the two pages for this CPU need to be visible in the
* Guest (and not the pages for other CPUs). We have the appropriate PTE pages
* for each CPU already set up, we just need to hook them in now we know which
* Guest is about to run on this CPU.
*/ */
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
{ {
pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages); unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
pte_t regs_pte; pte_t *pte;
#ifdef CONFIG_X86_PAE /* Clear the mappings for both pages. */
pmd_t switcher_pmd; pte = find_spte(cpu, base, false, 0, 0);
pmd_t *pmd_table; release_pte(*pte);
set_pte(pte, __pte(0));
switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
PAGE_KERNEL_EXEC);
/* Figure out where the pmd page is, by reading the PGD, and converting
* it to a virtual address. */
pmd_table = __va(pgd_pfn(cpu->lg->
pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
<< PAGE_SHIFT);
/* Now write it into the shadow page table. */
set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
#else
pgd_t switcher_pgd;
/* pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
* Make the last PGD entry for this Guest point to the Switcher's PTE release_pte(*pte);
* page for this CPU (with appropriate flags). set_pte(pte, __pte(0));
*/
switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
#endif
/*
* We also change the Switcher PTE page. When we're running the Guest,
* we want the Guest's "regs" page to appear where the first Switcher
* page for this CPU is. This is an optimization: when the Switcher
* saves the Guest registers, it saves them into the first page of this
* CPU's "struct lguest_pages": if we make sure the Guest's register
* page is already mapped there, we don't have to copy them out
* again.
*/
regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
-	set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
-}
-/*:*/
-
-static void free_switcher_pte_pages(void)
-{
-	unsigned int i;
-
-	for_each_possible_cpu(i)
-		free_page((long)switcher_pte_page(i));
-}
-
-/*H:520
- * Setting up the Switcher PTE page for given CPU is fairly easy, given
- * the CPU number and the "struct page"s for the Switcher code itself.
- *
- * Currently the Switcher is less than a page long, so "pages" is always 1.
- */
-static __init void populate_switcher_pte_page(unsigned int cpu,
-					      struct page *switcher_page[],
-					      unsigned int pages)
-{
-	unsigned int i;
-	pte_t *pte = switcher_pte_page(cpu);
-
-	/* The first entries are easy: they map the Switcher code. */
-	for (i = 0; i < pages; i++) {
-		set_pte(&pte[i], mk_pte(switcher_page[i],
-				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
-	}
-
-	/* The only other thing we map is this CPU's pair of pages. */
-	i = pages + cpu*2;
-
-	/* First page (Guest registers) is writable from the Guest */
-	set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
-			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
-
-	/*
-	 * The second page contains the "struct lguest_ro_state", and is
-	 * read-only.
-	 */
-	set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
-			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
-}
+}
+
+/*H:480
+ * (vi) Mapping the Switcher when the Guest is about to run.
+ *
+ * The Switcher and the two pages for this CPU need to be visible in the Guest
+ * (and not the pages for other CPUs).
+ *
+ * The pages for the pagetables have all been allocated before: we just need
+ * to make sure the actual PTEs are up-to-date for the CPU we're about to run
+ * on.
+ */
+void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
+{
+	unsigned long base;
+	struct page *percpu_switcher_page, *regs_page;
+	pte_t *pte;
+	struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
+
+	/* Switcher page should always be mapped by now! */
+	BUG_ON(!pgdir->switcher_mapped);
+
+	/*
+	 * Remember that we have two pages for each Host CPU, so we can run a
+	 * Guest on each CPU without them interfering.  We need to make sure
+	 * those pages are mapped correctly in the Guest, but since we usually
+	 * run on the same CPU, we cache that, and only update the mappings
+	 * when we move.
+	 */
+	if (pgdir->last_host_cpu == raw_smp_processor_id())
+		return;
+
+	/* -1 means unknown so we remove everything. */
+	if (pgdir->last_host_cpu == -1) {
+		unsigned int i;
+
+		for_each_possible_cpu(i)
+			remove_switcher_percpu_map(cpu, i);
+	} else {
+		/* We know exactly what CPU mapping to remove. */
+		remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
+	}
+
+	/*
+	 * When we're running the Guest, we want the Guest's "regs" page to
+	 * appear where the first Switcher page for this CPU is.  This is an
+	 * optimization: when the Switcher saves the Guest registers, it saves
+	 * them into the first page of this CPU's "struct lguest_pages": if we
+	 * make sure the Guest's register page is already mapped there, we
+	 * don't have to copy them out again.
+	 */
+	/* Find the shadow PTE for this regs page. */
+	base = switcher_addr + PAGE_SIZE
+		+ raw_smp_processor_id() * sizeof(struct lguest_pages);
+	pte = find_spte(cpu, base, false, 0, 0);
+	regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
+	get_page(regs_page);
+	set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
+
+	/*
+	 * We map the second page of the struct lguest_pages read-only in
+	 * the Guest: the IDT, GDT and other things it's not supposed to
+	 * change.
+	 */
+	pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
+	percpu_switcher_page
+		= lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
+	get_page(percpu_switcher_page);
+	set_pte(pte, mk_pte(percpu_switcher_page,
+			    __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
+
+	pgdir->last_host_cpu = raw_smp_processor_id();
+}
 
-/*
+/*H:490
  * We've made it through the page table code.  Perhaps our tired brains are
  * still processing the details, or perhaps we're simply glad it's over.
  *
@@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
  *
  * There is just one file remaining in the Host.
  */
-
-/*H:510
- * At boot or module load time, init_pagetables() allocates and populates
- * the Switcher PTE page for each CPU.
- */
-__init int init_pagetables(struct page **switcher_page, unsigned int pages)
-{
-	unsigned int i;
-
-	for_each_possible_cpu(i) {
-		switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
-		if (!switcher_pte_page(i)) {
-			free_switcher_pte_pages();
-			return -ENOMEM;
-		}
-		populate_switcher_pte_page(i, switcher_page, pages);
-	}
-	return 0;
-}
-/*:*/
-
-/* Cleaning up simply involves freeing the PTE page for each CPU. */
-void free_pagetables(void)
-{
-	free_switcher_pte_pages();
-}
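The page_tables.c hunk above and the core.c hunk below both rely on the reworked Switcher layout: one Switcher text page at switcher_addr, then two pages (one struct lguest_pages) per Host CPU. A minimal userspace sketch of that address arithmetic follows; the base address, page size and CPU count are illustrative assumptions, not values taken from the patch:

	#include <stdio.h>

	#define PAGE_SIZE	4096UL
	#define NR_CPUS_DEMO	4

	/* Stand-in for struct lguest_pages: a regs page plus a read-only page. */
	struct lguest_pages_demo {
		unsigned char regs[PAGE_SIZE];
		unsigned char ro_state[PAGE_SIZE];
	};

	int main(void)
	{
		unsigned long switcher_addr = 0xffc00000UL;	/* assumed placement */
		unsigned int cpu;

		for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++) {
			unsigned long base = switcher_addr + PAGE_SIZE
					     + cpu * sizeof(struct lguest_pages_demo);
			printf("cpu%u: regs page %#lx, ro page %#lx\n",
			       cpu, base, base + PAGE_SIZE);
		}
		return 0;
	}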
@@ -59,14 +59,13 @@ static struct {
 /* Offset from where switcher.S was compiled to where we've copied it */
 static unsigned long switcher_offset(void)
 {
-	return SWITCHER_ADDR - (unsigned long)start_switcher_text;
+	return switcher_addr - (unsigned long)start_switcher_text;
 }
 
-/* This cpu's struct lguest_pages. */
+/* This cpu's struct lguest_pages (after the Switcher text page) */
 static struct lguest_pages *lguest_pages(unsigned int cpu)
 {
-	return &(((struct lguest_pages *)
-		(SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+	return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
 }
 
 static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
......
@@ -40,3 +40,17 @@ config CAIF_HSI
 	The caif low level driver for CAIF over HSI.
 	Be aware that if you enable this then you also need to
 	enable a low-level HSI driver.
+
+config CAIF_VIRTIO
+	tristate "CAIF virtio transport driver"
+	depends on CAIF
+	select VHOST_RING
+	select VIRTIO
+	select GENERIC_ALLOCATOR
+	default n
+	---help---
+	  The caif driver for CAIF over Virtio.
+
+if CAIF_VIRTIO
+source "drivers/vhost/Kconfig"
+endif
@@ -9,3 +9,6 @@ obj-$(CONFIG_CAIF_SPI_SLAVE) += cfspi_slave.o
 
 # HSI interface
 obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
+
+# Virtio interface
+obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o
/*
* Copyright (C) ST-Ericsson AB 2013
* Authors: Vicram Arv
* Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no>
* Sjur Brendeland
* License terms: GNU General Public License (GPL) version 2
*/
#include <linux/module.h>
#include <linux/if_arp.h>
#include <linux/virtio.h>
#include <linux/vringh.h>
#include <linux/debugfs.h>
#include <linux/spinlock.h>
#include <linux/genalloc.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/virtio_ids.h>
#include <linux/virtio_caif.h>
#include <linux/virtio_ring.h>
#include <linux/dma-mapping.h>
#include <net/caif/caif_dev.h>
#include <linux/virtio_config.h>
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Vicram Arv");
MODULE_AUTHOR("Sjur Brendeland");
MODULE_DESCRIPTION("Virtio CAIF Driver");
/* NAPI schedule quota */
#define CFV_DEFAULT_QUOTA 32
/* Defaults used if virtio config space is unavailable */
#define CFV_DEF_MTU_SIZE 4096
#define CFV_DEF_HEADROOM 32
#define CFV_DEF_TAILROOM 32
/* Required IP header alignment */
#define IP_HDR_ALIGN 4
/* struct cfv_napi_context - NAPI context info
* @riov: IOV holding data read from the ring. Note that riov may
* still hold data when cfv_rx_poll() returns.
* @head: Last descriptor ID we received from vringh_getdesc_kern.
* We use this to put descriptor back on the used ring. USHRT_MAX is
* used to indicate invalid head-id.
*/
struct cfv_napi_context {
struct vringh_kiov riov;
unsigned short head;
};
/* struct cfv_stats - statistics for debugfs
* @rx_napi_complete: Number of NAPI completions (RX)
* @rx_napi_resched: Number of calls where the full quota was used (RX)
* @rx_nomem: Number of SKB alloc failures (RX)
* @rx_kicks: Number of RX kicks
* @tx_full_ring: Number times TX ring was full
* @tx_no_mem: Number of times TX went out of memory
* @tx_flow_on: Number of flow on (TX)
* @tx_kicks: Number of TX kicks
*/
struct cfv_stats {
u32 rx_napi_complete;
u32 rx_napi_resched;
u32 rx_nomem;
u32 rx_kicks;
u32 tx_full_ring;
u32 tx_no_mem;
u32 tx_flow_on;
u32 tx_kicks;
};
/* struct cfv_info - Caif Virtio control structure
* @cfdev: caif common header
* @vdev: Associated virtio device
* @vr_rx: rx/downlink host vring
* @vq_tx: tx/uplink virtqueue
* @ndev: CAIF link layer device
* @watermark_tx: indicates number of free descriptors we need
* to reopen the tx-queues after overload.
* @tx_lock: protects vq_tx from concurrent use
* @tx_release_tasklet: Tasklet for freeing consumed TX buffers
* @napi: Napi context used in cfv_rx_poll()
* @ctx: Context data used in cfv_rx_poll()
* @tx_hr: transmit headroom
* @rx_hr: receive headroom
* @tx_tr: transmit tail room
* @rx_tr: receive tail room
* @mtu: transmit max size
* @mru: receive max size
* @allocsz: size of dma memory reserved for TX buffers
* @alloc_addr: virtual address to dma memory for TX buffers
* @alloc_dma: dma address to dma memory for TX buffers
* @genpool: Gen Pool used for allocating TX buffers
* @reserved_mem: Pointer to memory reserve allocated from genpool
* @reserved_size: Size of memory reserve allocated from genpool
* @stats: Statistics exposed in sysfs
* @debugfs: Debugfs dentry for statistic counters
*/
struct cfv_info {
struct caif_dev_common cfdev;
struct virtio_device *vdev;
struct vringh *vr_rx;
struct virtqueue *vq_tx;
struct net_device *ndev;
unsigned int watermark_tx;
/* Protect access to vq_tx */
spinlock_t tx_lock;
struct tasklet_struct tx_release_tasklet;
struct napi_struct napi;
struct cfv_napi_context ctx;
u16 tx_hr;
u16 rx_hr;
u16 tx_tr;
u16 rx_tr;
u32 mtu;
u32 mru;
size_t allocsz;
void *alloc_addr;
dma_addr_t alloc_dma;
struct gen_pool *genpool;
unsigned long reserved_mem;
size_t reserved_size;
struct cfv_stats stats;
struct dentry *debugfs;
};
/* struct buf_info - maintains transmit buffer data handle
* @size: size of transmit buffer
* @vaddr: virtual address mapping to allocated memory area
*/
struct buf_info {
size_t size;
u8 *vaddr;
};
/* Called from virtio device, in IRQ context */
static void cfv_release_cb(struct virtqueue *vq_tx)
{
struct cfv_info *cfv = vq_tx->vdev->priv;
++cfv->stats.tx_kicks;
tasklet_schedule(&cfv->tx_release_tasklet);
}
static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info)
{
if (!buf_info)
return;
gen_pool_free(cfv->genpool, (unsigned long) buf_info->vaddr,
buf_info->size);
kfree(buf_info);
}
/* This is invoked whenever the remote processor has completed processing
 * a TX msg we just sent, and has put the buffer back on the used ring.
*/
static void cfv_release_used_buf(struct virtqueue *vq_tx)
{
struct cfv_info *cfv = vq_tx->vdev->priv;
unsigned long flags;
BUG_ON(vq_tx != cfv->vq_tx);
for (;;) {
unsigned int len;
struct buf_info *buf_info;
/* Get used buffer from used ring to recycle used descriptors */
spin_lock_irqsave(&cfv->tx_lock, flags);
buf_info = virtqueue_get_buf(vq_tx, &len);
spin_unlock_irqrestore(&cfv->tx_lock, flags);
/* Stop looping if there are no more buffers to free */
if (!buf_info)
break;
free_buf_info(cfv, buf_info);
/* watermark_tx indicates if we previously stopped the tx
* queues. If we have enough free slots in the virtio ring,
* re-establish the memory reserve and open up the tx queues.
*/
if (cfv->vq_tx->num_free <= cfv->watermark_tx)
continue;
/* Re-establish memory reserve */
if (cfv->reserved_mem == 0 && cfv->genpool)
cfv->reserved_mem =
gen_pool_alloc(cfv->genpool,
cfv->reserved_size);
/* Open up the tx queues */
if (cfv->reserved_mem) {
cfv->watermark_tx =
virtqueue_get_vring_size(cfv->vq_tx);
netif_tx_wake_all_queues(cfv->ndev);
/* Buffers are recycled in cfv_netdev_tx, so
* disable notifications when queues are opened.
*/
virtqueue_disable_cb(cfv->vq_tx);
++cfv->stats.tx_flow_on;
} else {
/* if no memory reserve, wait for more free slots */
WARN_ON(cfv->watermark_tx >
virtqueue_get_vring_size(cfv->vq_tx));
cfv->watermark_tx +=
virtqueue_get_vring_size(cfv->vq_tx) / 4;
}
}
}

/* Allocate a SKB and copy packet data to it */
static struct sk_buff *cfv_alloc_and_copy_skb(int *err,
struct cfv_info *cfv,
u8 *frm, u32 frm_len)
{
struct sk_buff *skb;
u32 cfpkt_len, pad_len;
*err = 0;
/* Verify the frame length against the MRU and the down-link head/tail room */
if (frm_len > cfv->mru || frm_len <= cfv->rx_hr + cfv->rx_tr) {
netdev_err(cfv->ndev,
"Invalid frmlen:%u mtu:%u hr:%d tr:%d\n",
frm_len, cfv->mru, cfv->rx_hr,
cfv->rx_tr);
*err = -EPROTO;
return NULL;
}
cfpkt_len = frm_len - (cfv->rx_hr + cfv->rx_tr);
pad_len = (unsigned long)(frm + cfv->rx_hr) & (IP_HDR_ALIGN - 1);
skb = netdev_alloc_skb(cfv->ndev, frm_len + pad_len);
if (!skb) {
*err = -ENOMEM;
return NULL;
}
skb_reserve(skb, cfv->rx_hr + pad_len);
memcpy(skb_put(skb, cfpkt_len), frm + cfv->rx_hr, cfpkt_len);
return skb;
}
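/*
 * The pad_len used above is simply the misalignment of (frame address +
 * receive headroom) relative to IP_HDR_ALIGN.  A standalone restatement of
 * that arithmetic, for illustration only; it is not used by the driver:
 */
static inline u32 cfv_demo_rx_pad(unsigned long frm, u16 rx_hr)
{
	return (frm + rx_hr) & (IP_HDR_ALIGN - 1);
}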
/* Get packets from the host vring */
static int cfv_rx_poll(struct napi_struct *napi, int quota)
{
struct cfv_info *cfv = container_of(napi, struct cfv_info, napi);
int rxcnt = 0;
int err = 0;
void *buf;
struct sk_buff *skb;
struct vringh_kiov *riov = &cfv->ctx.riov;
unsigned int skb_len;
again:
do {
skb = NULL;
/* Put the previous iovec back on the used ring and
* fetch a new iovec if we have processed all elements.
*/
if (riov->i == riov->used) {
if (cfv->ctx.head != USHRT_MAX) {
vringh_complete_kern(cfv->vr_rx,
cfv->ctx.head,
0);
cfv->ctx.head = USHRT_MAX;
}
err = vringh_getdesc_kern(
cfv->vr_rx,
riov,
NULL,
&cfv->ctx.head,
GFP_ATOMIC);
if (err <= 0)
goto exit;
}
buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base);
/* TODO: Add check on valid buffer address */
skb = cfv_alloc_and_copy_skb(&err, cfv, buf,
riov->iov[riov->i].iov_len);
if (unlikely(err))
goto exit;
/* Push received packet up the stack. */
skb_len = skb->len;
skb->protocol = htons(ETH_P_CAIF);
skb_reset_mac_header(skb);
skb->dev = cfv->ndev;
err = netif_receive_skb(skb);
if (unlikely(err)) {
++cfv->ndev->stats.rx_dropped;
} else {
++cfv->ndev->stats.rx_packets;
cfv->ndev->stats.rx_bytes += skb_len;
}
++riov->i;
++rxcnt;
} while (rxcnt < quota);
++cfv->stats.rx_napi_resched;
goto out;
exit:
switch (err) {
case 0:
++cfv->stats.rx_napi_complete;
/* Really out of packets? (stolen from virtio_net) */
napi_complete(napi);
if (unlikely(!vringh_notify_enable_kern(cfv->vr_rx)) &&
napi_schedule_prep(napi)) {
vringh_notify_disable_kern(cfv->vr_rx);
__napi_schedule(napi);
goto again;
}
break;
case -ENOMEM:
++cfv->stats.rx_nomem;
dev_kfree_skb(skb);
/* Stop NAPI poll on OOM, we hope to be polled later */
napi_complete(napi);
vringh_notify_enable_kern(cfv->vr_rx);
break;
default:
/* We're doomed, any modem fault is fatal */
netdev_warn(cfv->ndev, "Bad ring, disable device\n");
cfv->ndev->stats.rx_dropped = riov->used - riov->i;
napi_complete(napi);
vringh_notify_disable_kern(cfv->vr_rx);
netif_carrier_off(cfv->ndev);
break;
}
out:
if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0)
vringh_notify(cfv->vr_rx);
return rxcnt;
}
static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx)
{
struct cfv_info *cfv = vdev->priv;
++cfv->stats.rx_kicks;
vringh_notify_disable_kern(cfv->vr_rx);
napi_schedule(&cfv->napi);
}
static void cfv_destroy_genpool(struct cfv_info *cfv)
{
if (cfv->alloc_addr)
dma_free_coherent(cfv->vdev->dev.parent->parent,
cfv->allocsz, cfv->alloc_addr,
cfv->alloc_dma);
if (!cfv->genpool)
return;
gen_pool_free(cfv->genpool, cfv->reserved_mem,
cfv->reserved_size);
gen_pool_destroy(cfv->genpool);
cfv->genpool = NULL;
}
static int cfv_create_genpool(struct cfv_info *cfv)
{
int err;
/* dma_alloc can only allocate whole pages, and we need a more
* fine-grained allocation so we use genpool. We ask for space needed
* by IP and a full ring. If the dma allocation fails we retry with a
* smaller allocation size.
*/
err = -ENOMEM;
cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) *
(ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10;
if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu)
return -EINVAL;
for (;;) {
if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) {
netdev_info(cfv->ndev, "Not enough device memory\n");
return -ENOMEM;
}
cfv->alloc_addr = dma_alloc_coherent(
cfv->vdev->dev.parent->parent,
cfv->allocsz, &cfv->alloc_dma,
GFP_ATOMIC);
if (cfv->alloc_addr)
break;
cfv->allocsz = (cfv->allocsz * 3) >> 2;
}
netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n",
cfv->allocsz);
/* Allocate on 128-byte boundaries (1 << 7) */
cfv->genpool = gen_pool_create(7, -1);
if (!cfv->genpool)
goto err;
err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr,
(phys_addr_t)virt_to_phys(cfv->alloc_addr),
cfv->allocsz, -1);
if (err)
goto err;
/* Reserve some memory for low memory situations. If we hit the roof
* in the memory pool, we stop TX flow and release the reserve.
*/
cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu;
cfv->reserved_mem = gen_pool_alloc(cfv->genpool,
cfv->reserved_size);
if (!cfv->reserved_mem) {
err = -ENOMEM;
goto err;
}
cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx);
return 0;
err:
cfv_destroy_genpool(cfv);
return err;
}
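/*
 * A minimal sketch of the pattern used by cfv_create_genpool() above: back a
 * gen_pool allocator with one coherent DMA block so TX buffers can be carved
 * out at a finer granularity than whole pages.  The helper below is purely
 * illustrative (names, sizes and error handling are assumptions) and is not
 * used by the driver.
 */
static struct gen_pool *cfv_demo_make_pool(struct device *dev, size_t size,
					   void **vaddr, dma_addr_t *daddr)
{
	struct gen_pool *pool;

	*vaddr = dma_alloc_coherent(dev, size, daddr, GFP_KERNEL);
	if (!*vaddr)
		return NULL;

	/* 128-byte minimum allocation granularity, any NUMA node */
	pool = gen_pool_create(7, -1);
	if (!pool)
		goto free_dma;

	if (gen_pool_add_virt(pool, (unsigned long)*vaddr,
			      virt_to_phys(*vaddr), size, -1))
		goto destroy_pool;

	return pool;

destroy_pool:
	gen_pool_destroy(pool);
free_dma:
	dma_free_coherent(dev, size, *vaddr, *daddr);
	return NULL;
}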
/* Enable the CAIF interface and allocate the memory-pool */
static int cfv_netdev_open(struct net_device *netdev)
{
struct cfv_info *cfv = netdev_priv(netdev);
if (cfv_create_genpool(cfv))
return -ENOMEM;
netif_carrier_on(netdev);
napi_enable(&cfv->napi);
/* Schedule NAPI to read any pending packets */
napi_schedule(&cfv->napi);
return 0;
}
/* Disable the CAIF interface and free the memory-pool */
static int cfv_netdev_close(struct net_device *netdev)
{
struct cfv_info *cfv = netdev_priv(netdev);
unsigned long flags;
struct buf_info *buf_info;
/* Disable interrupts, queues and NAPI polling */
netif_carrier_off(netdev);
virtqueue_disable_cb(cfv->vq_tx);
vringh_notify_disable_kern(cfv->vr_rx);
napi_disable(&cfv->napi);
/* Release any TX buffers on both the used and available rings */
cfv_release_used_buf(cfv->vq_tx);
spin_lock_irqsave(&cfv->tx_lock, flags);
while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx)))
free_buf_info(cfv, buf_info);
spin_unlock_irqrestore(&cfv->tx_lock, flags);
/* Release all dma allocated memory and destroy the pool */
cfv_destroy_genpool(cfv);
return 0;
}
/* Allocate a buffer in dma-memory and copy skb to it */
static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv,
struct sk_buff *skb,
struct scatterlist *sg)
{
struct caif_payload_info *info = (void *)&skb->cb;
struct buf_info *buf_info = NULL;
u8 pad_len, hdr_ofs;
if (!cfv->genpool)
goto err;
if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) {
netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n",
cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu);
goto err;
}
buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC);
if (unlikely(!buf_info))
goto err;
/* Make the IP header aligned in the buffer */
hdr_ofs = cfv->tx_hr + info->hdr_len;
pad_len = hdr_ofs & (IP_HDR_ALIGN - 1);
buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len;
/* allocate dma memory buffer */
buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size);
if (unlikely(!buf_info->vaddr))
goto err;
/* copy skbuf contents to send buffer */
skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len);
sg_init_one(sg, buf_info->vaddr + pad_len,
skb->len + cfv->tx_hr + cfv->rx_hr);
return buf_info;
err:
kfree(buf_info);
return NULL;
}
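/*
 * Rough layout of one TX buffer carved out of the genpool by the function
 * above (sketch only; pad_len is (tx_hr + CAIF header length) masked to
 * IP_HDR_ALIGN, exactly as computed there):
 *
 *	vaddr:	| tx_hr | pad | payload copied from the skb | tx_tr |
 *
 * which gives the reservation size requested from gen_pool_alloc().  The
 * helper below only illustrates that arithmetic and is not used by the driver.
 */
static inline size_t cfv_demo_tx_bufsz(u16 tx_hr, u16 tx_tr, u8 pad_len,
				       unsigned int payload_len)
{
	return tx_hr + payload_len + tx_tr + pad_len;
}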
/* Put the CAIF packet on the virtio ring and kick the receiver */
static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev)
{
struct cfv_info *cfv = netdev_priv(netdev);
struct buf_info *buf_info;
struct scatterlist sg;
unsigned long flags;
bool flow_off = false;
int ret;
/* garbage collect released buffers */
cfv_release_used_buf(cfv->vq_tx);
spin_lock_irqsave(&cfv->tx_lock, flags);
/* The flow-off check takes the number of cpus into account to make sure
* the virtqueue will not be overfilled under any possible SMP conditions.
*
* Flow-on is triggered when sufficient buffers are freed
*/
if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) {
flow_off = true;
cfv->stats.tx_full_ring++;
}
/* If we run out of memory, we release the memory reserve and retry
* allocation.
*/
buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
if (unlikely(!buf_info)) {
cfv->stats.tx_no_mem++;
flow_off = true;
if (cfv->reserved_mem && cfv->genpool) {
gen_pool_free(cfv->genpool, cfv->reserved_mem,
cfv->reserved_size);
cfv->reserved_mem = 0;
buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
}
}
if (unlikely(flow_off)) {
/* Turn flow on when a 1/4 of the descriptors are released */
cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4;
/* Enable notifications of recycled TX buffers */
virtqueue_enable_cb(cfv->vq_tx);
netif_tx_stop_all_queues(netdev);
}
if (unlikely(!buf_info)) {
/* If the memory reserve does its job, this shouldn't happen */
netdev_warn(cfv->ndev, "Out of gen_pool memory\n");
goto err;
}
ret = virtqueue_add_outbuf(cfv->vq_tx, &sg, 1, buf_info, GFP_ATOMIC);
if (unlikely((ret < 0))) {
/* If flow control works, this shouldn't happen */
netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n",
ret);
goto err;
}
/* update netdev statistics */
cfv->ndev->stats.tx_packets++;
cfv->ndev->stats.tx_bytes += skb->len;
spin_unlock_irqrestore(&cfv->tx_lock, flags);
/* tell the remote processor it has a pending message to read */
virtqueue_kick(cfv->vq_tx);
dev_kfree_skb(skb);
return NETDEV_TX_OK;
err:
spin_unlock_irqrestore(&cfv->tx_lock, flags);
cfv->ndev->stats.tx_dropped++;
free_buf_info(cfv, buf_info);
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
static void cfv_tx_release_tasklet(unsigned long drv)
{
struct cfv_info *cfv = (struct cfv_info *)drv;
cfv_release_used_buf(cfv->vq_tx);
}
static const struct net_device_ops cfv_netdev_ops = {
.ndo_open = cfv_netdev_open,
.ndo_stop = cfv_netdev_close,
.ndo_start_xmit = cfv_netdev_tx,
};
static void cfv_netdev_setup(struct net_device *netdev)
{
netdev->netdev_ops = &cfv_netdev_ops;
netdev->type = ARPHRD_CAIF;
netdev->tx_queue_len = 100;
netdev->flags = IFF_POINTOPOINT | IFF_NOARP;
netdev->mtu = CFV_DEF_MTU_SIZE;
netdev->destructor = free_netdev;
}
/* Create debugfs counters for the device */
static inline void debugfs_init(struct cfv_info *cfv)
{
cfv->debugfs =
debugfs_create_dir(netdev_name(cfv->ndev), NULL);
if (IS_ERR(cfv->debugfs))
return;
debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs,
&cfv->stats.rx_napi_complete);
debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs,
&cfv->stats.rx_napi_resched);
debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs,
&cfv->stats.rx_nomem);
debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs,
&cfv->stats.rx_kicks);
debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs,
&cfv->stats.tx_full_ring);
debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs,
&cfv->stats.tx_no_mem);
debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs,
&cfv->stats.tx_kicks);
debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs,
&cfv->stats.tx_flow_on);
}
/* Set up CAIF for a virtio device */
static int cfv_probe(struct virtio_device *vdev)
{
vq_callback_t *vq_cbs = cfv_release_cb;
vrh_callback_t *vrh_cbs = cfv_recv;
const char *names = "output";
const char *cfv_netdev_name = "cfvrt";
struct net_device *netdev;
struct cfv_info *cfv;
int err = -EINVAL;
netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name,
cfv_netdev_setup);
if (!netdev)
return -ENOMEM;
cfv = netdev_priv(netdev);
cfv->vdev = vdev;
cfv->ndev = netdev;
spin_lock_init(&cfv->tx_lock);
/* Get the RX virtio ring. This is a "host side vring". */
err = -ENODEV;
if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs)
goto err;
err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs);
if (err)
goto err;
/* Get the TX virtio ring. This is a "guest side vring". */
err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names);
if (err)
goto err;
/* Get the CAIF configuration from virtio config space, if available */
#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \
((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \
&_var, \
FIELD_SIZEOF(struct virtio_caif_transf_config, _f)))
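/*
 * For a single field, the macro above expands to roughly the following
 * (illustration only; the field layout comes from struct
 * virtio_caif_transf_config in linux/virtio_caif.h):
 *
 *	vdev->config->get(vdev,
 *			  offsetof(struct virtio_caif_transf_config, headroom),
 *			  &cfv->tx_hr,
 *			  FIELD_SIZEOF(struct virtio_caif_transf_config,
 *				       headroom));
 */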
if (vdev->config->get) {
GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom);
GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom);
GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom);
GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom);
GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu);
GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu);
} else {
cfv->tx_hr = CFV_DEF_HEADROOM;
cfv->rx_hr = CFV_DEF_HEADROOM;
cfv->tx_tr = CFV_DEF_TAILROOM;
cfv->rx_tr = CFV_DEF_TAILROOM;
cfv->mtu = CFV_DEF_MTU_SIZE;
cfv->mru = CFV_DEF_MTU_SIZE;
}
netdev->needed_headroom = cfv->tx_hr;
netdev->needed_tailroom = cfv->tx_tr;
/* Disable buffer release interrupts unless we have stopped TX queues */
virtqueue_disable_cb(cfv->vq_tx);
netdev->mtu = cfv->mtu - cfv->tx_tr;
vdev->priv = cfv;
/* Initialize NAPI poll context data */
vringh_kiov_init(&cfv->ctx.riov, NULL, 0);
cfv->ctx.head = USHRT_MAX;
netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA);
tasklet_init(&cfv->tx_release_tasklet,
cfv_tx_release_tasklet,
(unsigned long)cfv);
/* Carrier is off until netdevice is opened */
netif_carrier_off(netdev);
/* register Netdev */
err = register_netdev(netdev);
if (err) {
dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err);
goto err;
}
debugfs_init(cfv);
return 0;
err:
netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err);
if (cfv->vr_rx)
vdev->vringh_config->del_vrhs(cfv->vdev);
if (cfv->vdev)
vdev->config->del_vqs(cfv->vdev);
free_netdev(netdev);
return err;
}
static void cfv_remove(struct virtio_device *vdev)
{
struct cfv_info *cfv = vdev->priv;
rtnl_lock();
dev_close(cfv->ndev);
rtnl_unlock();
tasklet_kill(&cfv->tx_release_tasklet);
debugfs_remove_recursive(cfv->debugfs);
vringh_kiov_cleanup(&cfv->ctx.riov);
vdev->config->reset(vdev);
vdev->vringh_config->del_vrhs(cfv->vdev);
cfv->vr_rx = NULL;
vdev->config->del_vqs(cfv->vdev);
unregister_netdev(cfv->ndev);
}
static struct virtio_device_id id_table[] = {
{ VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID },
{ 0 },
};
static unsigned int features[] = {
};
static struct virtio_driver caif_virtio_driver = {
.feature_table = features,
.feature_table_size = ARRAY_SIZE(features),
.driver.name = KBUILD_MODNAME,
.driver.owner = THIS_MODULE,
.id_table = id_table,
.probe = cfv_probe,
.remove = cfv_remove,
};
module_virtio_driver(caif_virtio_driver);
MODULE_DEVICE_TABLE(virtio, id_table);
...@@ -39,7 +39,6 @@ module_param(gso, bool, 0444); ...@@ -39,7 +39,6 @@ module_param(gso, bool, 0444);
#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN 128 #define GOOD_COPY_LEN 128
#define VIRTNET_SEND_COMMAND_SG_MAX 2
#define VIRTNET_DRIVER_VERSION "1.0.0" #define VIRTNET_DRIVER_VERSION "1.0.0"
struct virtnet_stats { struct virtnet_stats {
...@@ -444,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp) ...@@ -444,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
skb_to_sgvec(skb, rq->sg + 1, 0, skb->len); skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp); err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
if (err < 0) if (err < 0)
dev_kfree_skb(skb); dev_kfree_skb(skb);
...@@ -489,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp) ...@@ -489,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
/* chain first in list head */ /* chain first in list head */
first->private = (unsigned long)list; first->private = (unsigned long)list;
err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2, err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
first, gfp); first, gfp);
if (err < 0) if (err < 0)
give_pages(rq, first); give_pages(rq, first);
...@@ -508,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp) ...@@ -508,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
sg_init_one(rq->sg, page_address(page), PAGE_SIZE); sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp); err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp);
if (err < 0) if (err < 0)
give_pages(rq, page); give_pages(rq, page);
...@@ -582,7 +581,7 @@ static void refill_work(struct work_struct *work) ...@@ -582,7 +581,7 @@ static void refill_work(struct work_struct *work)
bool still_empty; bool still_empty;
int i; int i;
for (i = 0; i < vi->max_queue_pairs; i++) { for (i = 0; i < vi->curr_queue_pairs; i++) {
struct receive_queue *rq = &vi->rq[i]; struct receive_queue *rq = &vi->rq[i];
napi_disable(&rq->napi); napi_disable(&rq->napi);
...@@ -637,7 +636,7 @@ static int virtnet_open(struct net_device *dev) ...@@ -637,7 +636,7 @@ static int virtnet_open(struct net_device *dev)
struct virtnet_info *vi = netdev_priv(dev); struct virtnet_info *vi = netdev_priv(dev);
int i; int i;
for (i = 0; i < vi->max_queue_pairs; i++) { for (i = 0; i < vi->curr_queue_pairs; i++) {
/* Make sure we have some buffers: if oom use wq. */ /* Make sure we have some buffers: if oom use wq. */
if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0); schedule_delayed_work(&vi->refill, 0);
...@@ -711,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) ...@@ -711,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr); sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1; num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
return virtqueue_add_buf(sq->vq, sq->sg, num_sg, return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
0, skb, GFP_ATOMIC);
} }
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
...@@ -767,32 +765,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -767,32 +765,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
* never fail unless improperly formated. * never fail unless improperly formated.
*/ */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd, static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
struct scatterlist *data, int out, int in) struct scatterlist *out,
struct scatterlist *in)
{ {
struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2]; struct scatterlist *sgs[4], hdr, stat;
struct virtio_net_ctrl_hdr ctrl; struct virtio_net_ctrl_hdr ctrl;
virtio_net_ctrl_ack status = ~0; virtio_net_ctrl_ack status = ~0;
unsigned int tmp; unsigned out_num = 0, in_num = 0, tmp;
int i;
/* Caller should know better */ /* Caller should know better */
BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) || BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
(out + in > VIRTNET_SEND_COMMAND_SG_MAX));
out++; /* Add header */
in++; /* Add return status */
ctrl.class = class; ctrl.class = class;
ctrl.cmd = cmd; ctrl.cmd = cmd;
/* Add header */
sg_init_one(&hdr, &ctrl, sizeof(ctrl));
sgs[out_num++] = &hdr;
sg_init_table(sg, out + in); if (out)
sgs[out_num++] = out;
if (in)
sgs[out_num + in_num++] = in;
sg_set_buf(&sg[0], &ctrl, sizeof(ctrl)); /* Add return status. */
for_each_sg(data, s, out + in - 2, i) sg_init_one(&stat, &status, sizeof(status));
sg_set_buf(&sg[i + 1], sg_virt(s), s->length); sgs[out_num + in_num++] = &stat;
sg_set_buf(&sg[out + in - 1], &status, sizeof(status));
BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0); BUG_ON(out_num + in_num > ARRAY_SIZE(sgs));
BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC)
< 0);
virtqueue_kick(vi->cvq); virtqueue_kick(vi->cvq);
...@@ -821,7 +822,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p) ...@@ -821,7 +822,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
sg_init_one(&sg, addr->sa_data, dev->addr_len); sg_init_one(&sg, addr->sa_data, dev->addr_len);
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
VIRTIO_NET_CTRL_MAC_ADDR_SET, VIRTIO_NET_CTRL_MAC_ADDR_SET,
&sg, 1, 0)) { &sg, NULL)) {
dev_warn(&vdev->dev, dev_warn(&vdev->dev,
"Failed to set mac address by vq command.\n"); "Failed to set mac address by vq command.\n");
return -EINVAL; return -EINVAL;
...@@ -889,8 +890,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi) ...@@ -889,8 +890,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi)
{ {
rtnl_lock(); rtnl_lock();
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, NULL))
0, 0))
dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
rtnl_unlock(); rtnl_unlock();
} }
...@@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) ...@@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
struct scatterlist sg; struct scatterlist sg;
struct virtio_net_ctrl_mq s; struct virtio_net_ctrl_mq s;
struct net_device *dev = vi->dev; struct net_device *dev = vi->dev;
int i;
if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
return 0; return 0;
...@@ -908,12 +909,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) ...@@ -908,12 +909,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
sg_init_one(&sg, &s, sizeof(s)); sg_init_one(&sg, &s, sizeof(s));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){ VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, NULL)) {
dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
queue_pairs); queue_pairs);
return -EINVAL; return -EINVAL;
} else } else {
for (i = vi->curr_queue_pairs; i < queue_pairs; i++)
if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
vi->curr_queue_pairs = queue_pairs; vi->curr_queue_pairs = queue_pairs;
}
return 0; return 0;
} }
...@@ -955,7 +960,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) ...@@ -955,7 +960,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
VIRTIO_NET_CTRL_RX_PROMISC, VIRTIO_NET_CTRL_RX_PROMISC,
sg, 1, 0)) sg, NULL))
dev_warn(&dev->dev, "Failed to %sable promisc mode.\n", dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
promisc ? "en" : "dis"); promisc ? "en" : "dis");
...@@ -963,7 +968,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) ...@@ -963,7 +968,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
VIRTIO_NET_CTRL_RX_ALLMULTI, VIRTIO_NET_CTRL_RX_ALLMULTI,
sg, 1, 0)) sg, NULL))
dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n", dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
allmulti ? "en" : "dis"); allmulti ? "en" : "dis");
...@@ -1000,7 +1005,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) ...@@ -1000,7 +1005,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
VIRTIO_NET_CTRL_MAC_TABLE_SET, VIRTIO_NET_CTRL_MAC_TABLE_SET,
sg, 2, 0)) sg, NULL))
dev_warn(&dev->dev, "Failed to set MAC fitler table.\n"); dev_warn(&dev->dev, "Failed to set MAC fitler table.\n");
kfree(buf); kfree(buf);
...@@ -1015,7 +1020,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev, ...@@ -1015,7 +1020,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev,
sg_init_one(&sg, &vid, sizeof(vid)); sg_init_one(&sg, &vid, sizeof(vid));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0)) VIRTIO_NET_CTRL_VLAN_ADD, &sg, NULL))
dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid); dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
return 0; return 0;
} }
...@@ -1029,7 +1034,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev, ...@@ -1029,7 +1034,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
sg_init_one(&sg, &vid, sizeof(vid)); sg_init_one(&sg, &vid, sizeof(vid));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN, if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0)) VIRTIO_NET_CTRL_VLAN_DEL, &sg, NULL))
dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid); dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
return 0; return 0;
} }
...@@ -1570,7 +1575,7 @@ static int virtnet_probe(struct virtio_device *vdev) ...@@ -1570,7 +1575,7 @@ static int virtnet_probe(struct virtio_device *vdev)
} }
/* Last of all, set up some receive buffers. */ /* Last of all, set up some receive buffers. */
for (i = 0; i < vi->max_queue_pairs; i++) { for (i = 0; i < vi->curr_queue_pairs; i++) {
try_fill_recv(&vi->rq[i], GFP_KERNEL); try_fill_recv(&vi->rq[i], GFP_KERNEL);
/* If we didn't even get one input buffer, we're useless. */ /* If we didn't even get one input buffer, we're useless. */
...@@ -1694,7 +1699,7 @@ static int virtnet_restore(struct virtio_device *vdev) ...@@ -1694,7 +1699,7 @@ static int virtnet_restore(struct virtio_device *vdev)
netif_device_attach(vi->dev); netif_device_attach(vi->dev);
for (i = 0; i < vi->max_queue_pairs; i++) for (i = 0; i < vi->curr_queue_pairs; i++)
if (!try_fill_recv(&vi->rq[i], GFP_KERNEL)) if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0); schedule_delayed_work(&vi->refill, 0);
......
@@ -757,14 +757,14 @@ int rpmsg_send_offchannel_raw(struct rpmsg_channel *rpdev, u32 src, u32 dst,
 	mutex_lock(&vrp->tx_lock);
 
 	/* add message to the remote processor's virtqueue */
-	err = virtqueue_add_buf(vrp->svq, &sg, 1, 0, msg, GFP_KERNEL);
+	err = virtqueue_add_outbuf(vrp->svq, &sg, 1, msg, GFP_KERNEL);
 	if (err) {
 		/*
 		 * need to reclaim the buffer here, otherwise it's lost
 		 * (memory won't leak, but rpmsg won't use it again for TX).
 		 * this will wait for a buffer management overhaul.
 		 */
-		dev_err(dev, "virtqueue_add_buf failed: %d\n", err);
+		dev_err(dev, "virtqueue_add_outbuf failed: %d\n", err);
 		goto out;
 	}
@@ -839,7 +839,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq)
 	sg_init_one(&sg, msg, RPMSG_BUF_SIZE);
 
 	/* add the buffer back to the remote processor's virtqueue */
-	err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, msg, GFP_KERNEL);
+	err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL);
 	if (err < 0) {
 		dev_err(dev, "failed to add a virtqueue buffer: %d\n", err);
 		return;
@@ -972,7 +972,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
 		sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE);
 
-		err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, cpu_addr,
+		err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr,
 					GFP_KERNEL);
 		WARN_ON(err); /* sanity check; this can't really happen */
 	}
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
* *
*/ */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h> #include <linux/module.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/mempool.h> #include <linux/mempool.h>
...@@ -20,12 +22,14 @@ ...@@ -20,12 +22,14 @@
#include <linux/virtio_ids.h> #include <linux/virtio_ids.h>
#include <linux/virtio_config.h> #include <linux/virtio_config.h>
#include <linux/virtio_scsi.h> #include <linux/virtio_scsi.h>
#include <linux/cpu.h>
#include <scsi/scsi_host.h> #include <scsi/scsi_host.h>
#include <scsi/scsi_device.h> #include <scsi/scsi_device.h>
#include <scsi/scsi_cmnd.h> #include <scsi/scsi_cmnd.h>
#define VIRTIO_SCSI_MEMPOOL_SZ 64 #define VIRTIO_SCSI_MEMPOOL_SZ 64
#define VIRTIO_SCSI_EVENT_LEN 8 #define VIRTIO_SCSI_EVENT_LEN 8
#define VIRTIO_SCSI_VQ_BASE 2
/* Command queue element */ /* Command queue element */
struct virtio_scsi_cmd { struct virtio_scsi_cmd {
...@@ -57,27 +61,61 @@ struct virtio_scsi_vq { ...@@ -57,27 +61,61 @@ struct virtio_scsi_vq {
struct virtqueue *vq; struct virtqueue *vq;
}; };
/* Per-target queue state */ /*
* Per-target queue state.
*
* This struct holds the data needed by the queue steering policy. When a
* target is sent multiple requests, we need to drive them to the same queue so
* that FIFO processing order is kept. However, if a target was idle, we can
* choose a queue arbitrarily. In this case the queue is chosen according to
* the current VCPU, so the driver expects the number of request queues to be
* equal to the number of VCPUs. This makes it easy and fast to select the
* queue, and also lets the driver optimize the IRQ affinity for the virtqueues
* (each virtqueue's affinity is set to the CPU that "owns" the queue).
*
* An interesting effect of this policy is that only writes to req_vq need to
* take the tgt_lock. Reads can be done outside the lock because:
*
* - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1.
* In that case, no other CPU is reading req_vq: even if they were in
* virtscsi_queuecommand_multi, they would be spinning on tgt_lock.
*
* - reads of req_vq only occur when the target is not idle (reqs != 0).
* A CPU that enters virtscsi_queuecommand_multi will not modify req_vq.
*
* Similarly, decrements of reqs are never concurrent with writes of req_vq.
* Thus they can happen outside the tgt_lock, provided of course we make reqs
* an atomic_t.
*/
struct virtio_scsi_target_state { struct virtio_scsi_target_state {
/* Protects sg. Lock hierarchy is tgt_lock -> vq_lock. */ /* This spinlock never held at the same time as vq_lock. */
spinlock_t tgt_lock; spinlock_t tgt_lock;
/* For sglist construction when adding commands to the virtqueue. */ /* Count of outstanding requests. */
struct scatterlist sg[]; atomic_t reqs;
/* Currently active virtqueue for requests sent to this target. */
struct virtio_scsi_vq *req_vq;
}; };
/* Driver instance state */ /* Driver instance state */
struct virtio_scsi { struct virtio_scsi {
struct virtio_device *vdev; struct virtio_device *vdev;
struct virtio_scsi_vq ctrl_vq;
struct virtio_scsi_vq event_vq;
struct virtio_scsi_vq req_vq;
/* Get some buffers ready for event vq */ /* Get some buffers ready for event vq */
struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN]; struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
struct virtio_scsi_target_state *tgt[]; u32 num_queues;
/* If the affinity hint is set for virtqueues */
bool affinity_hint_set;
/* CPU hotplug notifier */
struct notifier_block nb;
struct virtio_scsi_vq ctrl_vq;
struct virtio_scsi_vq event_vq;
struct virtio_scsi_vq req_vqs[];
}; };
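/*
 * A condensed sketch of the steering rule described in the comment above
 * struct virtio_scsi_target_state; the driver's real implementation is
 * virtscsi_pick_vq() further down.  This version only illustrates the idea
 * that a busy target keeps its queue while an idle target follows the
 * current CPU, and it is not part of the driver.
 */
static struct virtio_scsi_vq *virtscsi_demo_pick_vq(struct virtio_scsi *vscsi,
				struct virtio_scsi_target_state *tgt)
{
	struct virtio_scsi_vq *vq;
	unsigned long flags;

	spin_lock_irqsave(&tgt->tgt_lock, flags);
	if (atomic_inc_return(&tgt->reqs) > 1)
		vq = tgt->req_vq;	/* keep FIFO order for a busy target */
	else
		vq = tgt->req_vq =	/* idle target: pick this CPU's queue */
			&vscsi->req_vqs[smp_processor_id() % vscsi->num_queues];
	spin_unlock_irqrestore(&tgt->tgt_lock, flags);

	return vq;
}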
static struct kmem_cache *virtscsi_cmd_cache; static struct kmem_cache *virtscsi_cmd_cache;
...@@ -107,11 +145,13 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid) ...@@ -107,11 +145,13 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
* *
* Called with vq_lock held. * Called with vq_lock held.
*/ */
static void virtscsi_complete_cmd(void *buf) static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
{ {
struct virtio_scsi_cmd *cmd = buf; struct virtio_scsi_cmd *cmd = buf;
struct scsi_cmnd *sc = cmd->sc; struct scsi_cmnd *sc = cmd->sc;
struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd; struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
struct virtio_scsi_target_state *tgt =
scsi_target(sc->device)->hostdata;
dev_dbg(&sc->device->sdev_gendev, dev_dbg(&sc->device->sdev_gendev,
"cmd %p response %u status %#02x sense_len %u\n", "cmd %p response %u status %#02x sense_len %u\n",
...@@ -166,32 +206,71 @@ static void virtscsi_complete_cmd(void *buf) ...@@ -166,32 +206,71 @@ static void virtscsi_complete_cmd(void *buf)
mempool_free(cmd, virtscsi_cmd_pool); mempool_free(cmd, virtscsi_cmd_pool);
sc->scsi_done(sc); sc->scsi_done(sc);
atomic_dec(&tgt->reqs);
} }
static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf)) static void virtscsi_vq_done(struct virtio_scsi *vscsi,
struct virtio_scsi_vq *virtscsi_vq,
void (*fn)(struct virtio_scsi *vscsi, void *buf))
{ {
void *buf; void *buf;
unsigned int len; unsigned int len;
unsigned long flags;
struct virtqueue *vq = virtscsi_vq->vq;
spin_lock_irqsave(&virtscsi_vq->vq_lock, flags);
do { do {
virtqueue_disable_cb(vq); virtqueue_disable_cb(vq);
while ((buf = virtqueue_get_buf(vq, &len)) != NULL) while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
fn(buf); fn(vscsi, buf);
} while (!virtqueue_enable_cb(vq)); } while (!virtqueue_enable_cb(vq));
spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags);
} }
static void virtscsi_req_done(struct virtqueue *vq) static void virtscsi_req_done(struct virtqueue *vq)
{ {
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh); struct virtio_scsi *vscsi = shost_priv(sh);
unsigned long flags; int index = vq->index - VIRTIO_SCSI_VQ_BASE;
struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index];
spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags); /*
virtscsi_vq_done(vq, virtscsi_complete_cmd); * Read req_vq before decrementing the reqs field in
spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags); * virtscsi_complete_cmd.
*
* With barriers:
*
* CPU #0 virtscsi_queuecommand_multi (CPU #1)
* ------------------------------------------------------------
* lock vq_lock
* read req_vq
* read reqs (reqs = 1)
* write reqs (reqs = 0)
* increment reqs (reqs = 1)
* write req_vq
*
* Possible reordering without barriers:
*
* CPU #0 virtscsi_queuecommand_multi (CPU #1)
* ------------------------------------------------------------
* lock vq_lock
* read reqs (reqs = 1)
* write reqs (reqs = 0)
* increment reqs (reqs = 1)
* write req_vq
* read (wrong) req_vq
*
* We do not need a full smp_rmb, because req_vq is required to get
* to tgt->reqs: tgt is &vscsi->tgt[sc->device->id], where sc is stored
* in the virtqueue as the user token.
*/
smp_read_barrier_depends();
virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd);
}; };
static void virtscsi_complete_free(void *buf) static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
{ {
struct virtio_scsi_cmd *cmd = buf; struct virtio_scsi_cmd *cmd = buf;
...@@ -205,11 +284,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq) ...@@ -205,11 +284,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
{ {
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh); struct virtio_scsi *vscsi = shost_priv(sh);
unsigned long flags;
spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags); virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free);
virtscsi_vq_done(vq, virtscsi_complete_free);
spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags);
}; };
static int virtscsi_kick_event(struct virtio_scsi *vscsi, static int virtscsi_kick_event(struct virtio_scsi *vscsi,
...@@ -223,8 +299,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi, ...@@ -223,8 +299,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi,
spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node, err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node,
GFP_ATOMIC); GFP_ATOMIC);
if (!err) if (!err)
virtqueue_kick(vscsi->event_vq.vq); virtqueue_kick(vscsi->event_vq.vq);
...@@ -254,7 +330,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi) ...@@ -254,7 +330,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi)
} }
static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi, static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi,
struct virtio_scsi_event *event) struct virtio_scsi_event *event)
{ {
struct scsi_device *sdev; struct scsi_device *sdev;
struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev); struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
...@@ -332,7 +408,7 @@ static void virtscsi_handle_event(struct work_struct *work) ...@@ -332,7 +408,7 @@ static void virtscsi_handle_event(struct work_struct *work)
virtscsi_kick_event(vscsi, event_node); virtscsi_kick_event(vscsi, event_node);
} }
static void virtscsi_complete_event(void *buf) static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf)
{ {
struct virtio_scsi_event_node *event_node = buf; struct virtio_scsi_event_node *event_node = buf;
...@@ -344,82 +420,65 @@ static void virtscsi_event_done(struct virtqueue *vq) ...@@ -344,82 +420,65 @@ static void virtscsi_event_done(struct virtqueue *vq)
{ {
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev); struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh); struct virtio_scsi *vscsi = shost_priv(sh);
unsigned long flags;
spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event);
virtscsi_vq_done(vq, virtscsi_complete_event);
spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
}; };
static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
struct scsi_data_buffer *sdb)
{
struct sg_table *table = &sdb->table;
struct scatterlist *sg_elem;
unsigned int idx = *p_idx;
int i;
for_each_sg(table->sgl, sg_elem, table->nents, i)
sg[idx++] = *sg_elem;
*p_idx = idx;
}
/** /**
* virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue
* @vscsi : virtio_scsi state * @vq : the struct virtqueue we're talking about
* @cmd : command structure * @cmd : command structure
* @out_num : number of read-only elements
* @in_num : number of write-only elements
* @req_size : size of the request buffer * @req_size : size of the request buffer
* @resp_size : size of the response buffer * @resp_size : size of the response buffer
* * @gfp : flags to use for memory allocations
* Called with tgt_lock held.
*/ */
static void virtscsi_map_cmd(struct virtio_scsi_target_state *tgt, static int virtscsi_add_cmd(struct virtqueue *vq,
struct virtio_scsi_cmd *cmd, struct virtio_scsi_cmd *cmd,
unsigned *out_num, unsigned *in_num, size_t req_size, size_t resp_size, gfp_t gfp)
size_t req_size, size_t resp_size)
{ {
struct scsi_cmnd *sc = cmd->sc; struct scsi_cmnd *sc = cmd->sc;
struct scatterlist *sg = tgt->sg; struct scatterlist *sgs[4], req, resp;
unsigned int idx = 0; struct sg_table *out, *in;
unsigned out_num = 0, in_num = 0;
out = in = NULL;
if (sc && sc->sc_data_direction != DMA_NONE) {
if (sc->sc_data_direction != DMA_FROM_DEVICE)
out = &scsi_out(sc)->table;
if (sc->sc_data_direction != DMA_TO_DEVICE)
in = &scsi_in(sc)->table;
}
/* Request header. */ /* Request header. */
sg_set_buf(&sg[idx++], &cmd->req, req_size); sg_init_one(&req, &cmd->req, req_size);
sgs[out_num++] = &req;
/* Data-out buffer. */ /* Data-out buffer. */
if (sc && sc->sc_data_direction != DMA_FROM_DEVICE) if (out)
virtscsi_map_sgl(sg, &idx, scsi_out(sc)); sgs[out_num++] = out->sgl;
*out_num = idx;
/* Response header. */ /* Response header. */
sg_set_buf(&sg[idx++], &cmd->resp, resp_size); sg_init_one(&resp, &cmd->resp, resp_size);
sgs[out_num + in_num++] = &resp;
/* Data-in buffer */ /* Data-in buffer */
if (sc && sc->sc_data_direction != DMA_TO_DEVICE) if (in)
virtscsi_map_sgl(sg, &idx, scsi_in(sc)); sgs[out_num + in_num++] = in->sgl;
*in_num = idx - *out_num; return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, gfp);
} }
static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq,
struct virtio_scsi_vq *vq,
struct virtio_scsi_cmd *cmd, struct virtio_scsi_cmd *cmd,
size_t req_size, size_t resp_size, gfp_t gfp) size_t req_size, size_t resp_size, gfp_t gfp)
{ {
unsigned int out_num, in_num;
unsigned long flags; unsigned long flags;
int err; int err;
bool needs_kick = false; bool needs_kick = false;
spin_lock_irqsave(&tgt->tgt_lock, flags); spin_lock_irqsave(&vq->vq_lock, flags);
virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size); err = virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size, gfp);
spin_lock(&vq->vq_lock);
err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp);
spin_unlock(&tgt->tgt_lock);
if (!err) if (!err)
needs_kick = virtqueue_kick_prepare(vq->vq); needs_kick = virtqueue_kick_prepare(vq->vq);
...@@ -430,10 +489,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt, ...@@ -430,10 +489,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
return err; return err;
} }
static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
struct virtio_scsi_vq *req_vq,
struct scsi_cmnd *sc)
{ {
struct virtio_scsi *vscsi = shost_priv(sh);
struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
struct virtio_scsi_cmd *cmd; struct virtio_scsi_cmd *cmd;
int ret; int ret;
...@@ -467,7 +526,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) ...@@ -467,7 +526,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE); BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len); memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd, if (virtscsi_kick_cmd(req_vq, cmd,
sizeof cmd->req.cmd, sizeof cmd->resp.cmd, sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
GFP_ATOMIC) == 0) GFP_ATOMIC) == 0)
ret = 0; ret = 0;
...@@ -478,14 +537,62 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) ...@@ -478,14 +537,62 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
return ret; return ret;
} }
static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
struct scsi_cmnd *sc)
{
struct virtio_scsi *vscsi = shost_priv(sh);
struct virtio_scsi_target_state *tgt =
scsi_target(sc->device)->hostdata;
atomic_inc(&tgt->reqs);
return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
}
static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
struct virtio_scsi_target_state *tgt)
{
struct virtio_scsi_vq *vq;
unsigned long flags;
u32 queue_num;
spin_lock_irqsave(&tgt->tgt_lock, flags);
/*
* The memory barrier after atomic_inc_return matches
* the smp_read_barrier_depends() in virtscsi_req_done.
*/
if (atomic_inc_return(&tgt->reqs) > 1)
vq = ACCESS_ONCE(tgt->req_vq);
else {
queue_num = smp_processor_id();
while (unlikely(queue_num >= vscsi->num_queues))
queue_num -= vscsi->num_queues;
tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
}
spin_unlock_irqrestore(&tgt->tgt_lock, flags);
return vq;
}
static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
struct scsi_cmnd *sc)
{
struct virtio_scsi *vscsi = shost_priv(sh);
struct virtio_scsi_target_state *tgt =
scsi_target(sc->device)->hostdata;
struct virtio_scsi_vq *req_vq = virtscsi_pick_vq(vscsi, tgt);
return virtscsi_queuecommand(vscsi, req_vq, sc);
}
static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd) static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
{ {
DECLARE_COMPLETION_ONSTACK(comp); DECLARE_COMPLETION_ONSTACK(comp);
struct virtio_scsi_target_state *tgt = vscsi->tgt[cmd->sc->device->id];
int ret = FAILED; int ret = FAILED;
cmd->comp = &comp; cmd->comp = &comp;
if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd, if (virtscsi_kick_cmd(&vscsi->ctrl_vq, cmd,
sizeof cmd->req.tmf, sizeof cmd->resp.tmf, sizeof cmd->req.tmf, sizeof cmd->resp.tmf,
GFP_NOIO) < 0) GFP_NOIO) < 0)
goto out; goto out;
...@@ -547,18 +654,57 @@ static int virtscsi_abort(struct scsi_cmnd *sc) ...@@ -547,18 +654,57 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
return virtscsi_tmf(vscsi, cmd); return virtscsi_tmf(vscsi, cmd);
} }
static struct scsi_host_template virtscsi_host_template = { static int virtscsi_target_alloc(struct scsi_target *starget)
{
struct virtio_scsi_target_state *tgt =
kmalloc(sizeof(*tgt), GFP_KERNEL);
if (!tgt)
return -ENOMEM;
spin_lock_init(&tgt->tgt_lock);
atomic_set(&tgt->reqs, 0);
tgt->req_vq = NULL;
starget->hostdata = tgt;
return 0;
}
static void virtscsi_target_destroy(struct scsi_target *starget)
{
struct virtio_scsi_target_state *tgt = starget->hostdata;
kfree(tgt);
}
static struct scsi_host_template virtscsi_host_template_single = {
.module = THIS_MODULE,
.name = "Virtio SCSI HBA",
.proc_name = "virtio_scsi",
.this_id = -1,
.queuecommand = virtscsi_queuecommand_single,
.eh_abort_handler = virtscsi_abort,
.eh_device_reset_handler = virtscsi_device_reset,
.can_queue = 1024,
.dma_boundary = UINT_MAX,
.use_clustering = ENABLE_CLUSTERING,
.target_alloc = virtscsi_target_alloc,
.target_destroy = virtscsi_target_destroy,
};
static struct scsi_host_template virtscsi_host_template_multi = {
.module = THIS_MODULE, .module = THIS_MODULE,
.name = "Virtio SCSI HBA", .name = "Virtio SCSI HBA",
.proc_name = "virtio_scsi", .proc_name = "virtio_scsi",
.queuecommand = virtscsi_queuecommand,
.this_id = -1, .this_id = -1,
.queuecommand = virtscsi_queuecommand_multi,
.eh_abort_handler = virtscsi_abort, .eh_abort_handler = virtscsi_abort,
.eh_device_reset_handler = virtscsi_device_reset, .eh_device_reset_handler = virtscsi_device_reset,
.can_queue = 1024, .can_queue = 1024,
.dma_boundary = UINT_MAX, .dma_boundary = UINT_MAX,
.use_clustering = ENABLE_CLUSTERING, .use_clustering = ENABLE_CLUSTERING,
.target_alloc = virtscsi_target_alloc,
.target_destroy = virtscsi_target_destroy,
}; };
#define virtscsi_config_get(vdev, fld) \ #define virtscsi_config_get(vdev, fld) \
...@@ -578,29 +724,69 @@ static struct scsi_host_template virtscsi_host_template = { ...@@ -578,29 +724,69 @@ static struct scsi_host_template virtscsi_host_template = {
&__val, sizeof(__val)); \ &__val, sizeof(__val)); \
}) })
static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq, static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
struct virtqueue *vq)
{ {
spin_lock_init(&virtscsi_vq->vq_lock); int i;
virtscsi_vq->vq = vq; int cpu;
/* In multiqueue mode, when the number of cpus is equal
* to the number of request queues, we let the queues
* be private to one cpu by setting the affinity hint
* to eliminate the contention.
*/
if ((vscsi->num_queues == 1 ||
vscsi->num_queues != num_online_cpus()) && affinity) {
if (vscsi->affinity_hint_set)
affinity = false;
else
return;
}
if (affinity) {
i = 0;
for_each_online_cpu(cpu) {
virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu);
i++;
}
vscsi->affinity_hint_set = true;
} else {
for (i = 0; i < vscsi->num_queues - VIRTIO_SCSI_VQ_BASE; i++)
virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1);
vscsi->affinity_hint_set = false;
}
} }
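The hint above only pays off if the submission path then sticks to the queue pinned to the CPU it runs on. The driver's real selection logic is per-target (note tgt->req_vq and the reqs counter in virtscsi_target_alloc above); purely as an illustration of why per-CPU affinity removes cross-CPU contention, a hypothetical per-CPU pick could look like this sketch (virtscsi_pick_vq_sketch is not a real function in this driver):

/*
 * Illustrative sketch only -- not the driver's actual queue selection.
 * Assumes the caller runs with preemption disabled (e.g. under a lock),
 * so smp_processor_id() is stable.
 */
static struct virtio_scsi_vq *virtscsi_pick_vq_sketch(struct virtio_scsi *vscsi)
{
	u32 queue = smp_processor_id() % vscsi->num_queues;

	/* With affinity hints set, this queue's interrupt fires on this CPU. */
	return &vscsi->req_vqs[queue];
}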
static struct virtio_scsi_target_state *virtscsi_alloc_tgt( static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
struct virtio_device *vdev, int sg_elems)
{ {
struct virtio_scsi_target_state *tgt; get_online_cpus();
gfp_t gfp_mask = GFP_KERNEL; __virtscsi_set_affinity(vscsi, affinity);
put_online_cpus();
/* We need extra sg elements at head and tail. */ }
tgt = kmalloc(sizeof(*tgt) + sizeof(tgt->sg[0]) * (sg_elems + 2),
gfp_mask);
if (!tgt) static int virtscsi_cpu_callback(struct notifier_block *nfb,
return NULL; unsigned long action, void *hcpu)
{
struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
switch(action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
__virtscsi_set_affinity(vscsi, true);
break;
default:
break;
}
return NOTIFY_OK;
}
spin_lock_init(&tgt->tgt_lock); static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
sg_init_table(tgt->sg, sg_elems + 2); struct virtqueue *vq)
return tgt; {
spin_lock_init(&virtscsi_vq->vq_lock);
virtscsi_vq->vq = vq;
} }
static void virtscsi_scan(struct virtio_device *vdev) static void virtscsi_scan(struct virtio_device *vdev)
...@@ -614,46 +800,56 @@ static void virtscsi_remove_vqs(struct virtio_device *vdev) ...@@ -614,46 +800,56 @@ static void virtscsi_remove_vqs(struct virtio_device *vdev)
{ {
struct Scsi_Host *sh = virtio_scsi_host(vdev); struct Scsi_Host *sh = virtio_scsi_host(vdev);
struct virtio_scsi *vscsi = shost_priv(sh); struct virtio_scsi *vscsi = shost_priv(sh);
u32 i, num_targets;
virtscsi_set_affinity(vscsi, false);
/* Stop all the virtqueues. */ /* Stop all the virtqueues. */
vdev->config->reset(vdev); vdev->config->reset(vdev);
num_targets = sh->max_id;
for (i = 0; i < num_targets; i++) {
kfree(vscsi->tgt[i]);
vscsi->tgt[i] = NULL;
}
vdev->config->del_vqs(vdev); vdev->config->del_vqs(vdev);
} }
static int virtscsi_init(struct virtio_device *vdev, static int virtscsi_init(struct virtio_device *vdev,
struct virtio_scsi *vscsi, int num_targets) struct virtio_scsi *vscsi)
{ {
int err; int err;
struct virtqueue *vqs[3]; u32 i;
u32 i, sg_elems; u32 num_vqs;
vq_callback_t **callbacks;
const char **names;
struct virtqueue **vqs;
num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE;
vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL);
callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL);
names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL);
if (!callbacks || !vqs || !names) {
err = -ENOMEM;
goto out;
}
vq_callback_t *callbacks[] = { callbacks[0] = virtscsi_ctrl_done;
virtscsi_ctrl_done, callbacks[1] = virtscsi_event_done;
virtscsi_event_done, names[0] = "control";
virtscsi_req_done names[1] = "event";
}; for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) {
const char *names[] = { callbacks[i] = virtscsi_req_done;
"control", names[i] = "request";
"event", }
"request"
};
/* Discover virtqueues and write information to configuration. */ /* Discover virtqueues and write information to configuration. */
err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names); err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
if (err) if (err)
return err; goto out;
virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]); virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]);
virtscsi_init_vq(&vscsi->event_vq, vqs[1]); virtscsi_init_vq(&vscsi->event_vq, vqs[1]);
virtscsi_init_vq(&vscsi->req_vq, vqs[2]); for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++)
virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE],
vqs[i]);
virtscsi_set_affinity(vscsi, true);
virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE); virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE); virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
...@@ -661,19 +857,12 @@ static int virtscsi_init(struct virtio_device *vdev, ...@@ -661,19 +857,12 @@ static int virtscsi_init(struct virtio_device *vdev,
if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
virtscsi_kick_event_all(vscsi); virtscsi_kick_event_all(vscsi);
/* We need to know how many segments before we allocate. */
sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1;
for (i = 0; i < num_targets; i++) {
vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems);
if (!vscsi->tgt[i]) {
err = -ENOMEM;
goto out;
}
}
err = 0; err = 0;
out: out:
kfree(names);
kfree(callbacks);
kfree(vqs);
if (err) if (err)
virtscsi_remove_vqs(vdev); virtscsi_remove_vqs(vdev);
return err; return err;
...@@ -686,13 +875,21 @@ static int virtscsi_probe(struct virtio_device *vdev) ...@@ -686,13 +875,21 @@ static int virtscsi_probe(struct virtio_device *vdev)
int err; int err;
u32 sg_elems, num_targets; u32 sg_elems, num_targets;
u32 cmd_per_lun; u32 cmd_per_lun;
u32 num_queues;
struct scsi_host_template *hostt;
/* We need to know how many queues before we allocate. */
num_queues = virtscsi_config_get(vdev, num_queues) ? : 1;
/* Allocate memory and link the structs together. */
num_targets = virtscsi_config_get(vdev, max_target) + 1; num_targets = virtscsi_config_get(vdev, max_target) + 1;
shost = scsi_host_alloc(&virtscsi_host_template,
sizeof(*vscsi)
+ num_targets * sizeof(struct virtio_scsi_target_state));
if (num_queues == 1)
hostt = &virtscsi_host_template_single;
else
hostt = &virtscsi_host_template_multi;
shost = scsi_host_alloc(hostt,
sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues);
if (!shost) if (!shost)
return -ENOMEM; return -ENOMEM;
...@@ -700,12 +897,20 @@ static int virtscsi_probe(struct virtio_device *vdev) ...@@ -700,12 +897,20 @@ static int virtscsi_probe(struct virtio_device *vdev)
shost->sg_tablesize = sg_elems; shost->sg_tablesize = sg_elems;
vscsi = shost_priv(shost); vscsi = shost_priv(shost);
vscsi->vdev = vdev; vscsi->vdev = vdev;
vscsi->num_queues = num_queues;
vdev->priv = shost; vdev->priv = shost;
err = virtscsi_init(vdev, vscsi, num_targets); err = virtscsi_init(vdev, vscsi);
if (err) if (err)
goto virtscsi_init_failed; goto virtscsi_init_failed;
vscsi->nb.notifier_call = &virtscsi_cpu_callback;
err = register_hotcpu_notifier(&vscsi->nb);
if (err) {
pr_err("registering cpu notifier failed\n");
goto scsi_add_host_failed;
}
cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1; cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue); shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF; shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
...@@ -743,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev) ...@@ -743,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev)
scsi_remove_host(shost); scsi_remove_host(shost);
unregister_hotcpu_notifier(&vscsi->nb);
virtscsi_remove_vqs(vdev); virtscsi_remove_vqs(vdev);
scsi_host_put(shost); scsi_host_put(shost);
} }
...@@ -759,7 +966,7 @@ static int virtscsi_restore(struct virtio_device *vdev) ...@@ -759,7 +966,7 @@ static int virtscsi_restore(struct virtio_device *vdev)
struct Scsi_Host *sh = virtio_scsi_host(vdev); struct Scsi_Host *sh = virtio_scsi_host(vdev);
struct virtio_scsi *vscsi = shost_priv(sh); struct virtio_scsi *vscsi = shost_priv(sh);
return virtscsi_init(vdev, vscsi, sh->max_id); return virtscsi_init(vdev, vscsi);
} }
#endif #endif
...@@ -794,8 +1001,7 @@ static int __init init(void) ...@@ -794,8 +1001,7 @@ static int __init init(void)
virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0); virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
if (!virtscsi_cmd_cache) { if (!virtscsi_cmd_cache) {
printk(KERN_ERR "kmem_cache_create() for " pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n");
"virtscsi_cmd_cache failed\n");
goto error; goto error;
} }
...@@ -804,8 +1010,7 @@ static int __init init(void) ...@@ -804,8 +1010,7 @@ static int __init init(void)
mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ, mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ,
virtscsi_cmd_cache); virtscsi_cmd_cache);
if (!virtscsi_cmd_pool) { if (!virtscsi_cmd_pool) {
printk(KERN_ERR "mempool_create() for" pr_err("mempool_create() for virtscsi_cmd_pool failed\n");
"virtscsi_cmd_pool failed\n");
goto error; goto error;
} }
ret = register_virtio_driver(&virtio_scsi_driver); ret = register_virtio_driver(&virtio_scsi_driver);
......
config VHOST_NET config VHOST_NET
tristate "Host kernel accelerator for virtio net" tristate "Host kernel accelerator for virtio net"
depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
select VHOST_RING
---help--- ---help---
This kernel module can be loaded in host kernel to accelerate This kernel module can be loaded in host kernel to accelerate
guest networking with virtio_net. Not to be confused with virtio_net guest networking with virtio_net. Not to be confused with virtio_net
...@@ -12,7 +13,14 @@ config VHOST_NET ...@@ -12,7 +13,14 @@ config VHOST_NET
config VHOST_SCSI config VHOST_SCSI
tristate "VHOST_SCSI TCM fabric driver" tristate "VHOST_SCSI TCM fabric driver"
depends on TARGET_CORE && EVENTFD && m depends on TARGET_CORE && EVENTFD && m
select VHOST_RING
default n default n
---help--- ---help---
Say M here to enable the vhost_scsi TCM fabric module Say M here to enable the vhost_scsi TCM fabric module
for use with virtio-scsi guests for use with virtio-scsi guests
config VHOST_RING
tristate
---help---
This option is selected by any driver which needs to access
the host side of a virtio ring.
...@@ -3,3 +3,5 @@ vhost_net-y := vhost.o net.o ...@@ -3,3 +3,5 @@ vhost_net-y := vhost.o net.o
obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
vhost_scsi-y := scsi.o vhost_scsi-y := scsi.o
obj-$(CONFIG_VHOST_RING) += vringh.o
...@@ -282,7 +282,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl, ...@@ -282,7 +282,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl,
return vhost_test_reset_owner(n); return vhost_test_reset_owner(n);
default: default:
mutex_lock(&n->dev.mutex); mutex_lock(&n->dev.mutex);
r = vhost_dev_ioctl(&n->dev, ioctl, arg); r = vhost_dev_ioctl(&n->dev, ioctl, argp);
if (r == -ENOIOCTLCMD)
r = vhost_vring_ioctl(&n->dev, ioctl, argp);
vhost_test_flush(n); vhost_test_flush(n);
mutex_unlock(&n->dev.mutex); mutex_unlock(&n->dev.mutex);
return r; return r;
......
/*
* Helpers for the host side of a virtio ring.
*
* Since these may be in userspace, we use (inline) accessors.
*/
#include <linux/vringh.h>
#include <linux/virtio_ring.h>
#include <linux/kernel.h>
#include <linux/ratelimit.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/export.h>
static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
{
static DEFINE_RATELIMIT_STATE(vringh_rs,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
if (__ratelimit(&vringh_rs)) {
va_list ap;
va_start(ap, fmt);
printk(KERN_NOTICE "vringh:");
vprintk(fmt, ap);
va_end(ap);
}
}
/* Returns vring->num if empty, -ve on error. */
static inline int __vringh_get_head(const struct vringh *vrh,
int (*getu16)(u16 *val, const u16 *p),
u16 *last_avail_idx)
{
u16 avail_idx, i, head;
int err;
err = getu16(&avail_idx, &vrh->vring.avail->idx);
if (err) {
vringh_bad("Failed to access avail idx at %p",
&vrh->vring.avail->idx);
return err;
}
if (*last_avail_idx == avail_idx)
return vrh->vring.num;
/* Only get avail ring entries after they have been exposed by guest. */
virtio_rmb(vrh->weak_barriers);
i = *last_avail_idx & (vrh->vring.num - 1);
err = getu16(&head, &vrh->vring.avail->ring[i]);
if (err) {
vringh_bad("Failed to read head: idx %d address %p",
*last_avail_idx, &vrh->vring.avail->ring[i]);
return err;
}
if (head >= vrh->vring.num) {
vringh_bad("Guest says index %u > %u is available",
head, vrh->vring.num);
return -EINVAL;
}
(*last_avail_idx)++;
return head;
}
/* Copy some bytes to/from the iovec. Returns num copied. */
static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
void *ptr, size_t len,
int (*xfer)(void *addr, void *ptr,
size_t len))
{
int err, done = 0;
while (len && iov->i < iov->used) {
size_t partlen;
partlen = min(iov->iov[iov->i].iov_len, len);
err = xfer(iov->iov[iov->i].iov_base, ptr, partlen);
if (err)
return err;
done += partlen;
len -= partlen;
ptr += partlen;
iov->consumed += partlen;
iov->iov[iov->i].iov_len -= partlen;
iov->iov[iov->i].iov_base += partlen;
if (!iov->iov[iov->i].iov_len) {
/* Fix up old iov element then increment. */
iov->iov[iov->i].iov_len = iov->consumed;
iov->iov[iov->i].iov_base -= iov->consumed;
iov->consumed = 0;
iov->i++;
}
}
return done;
}
/* May reduce *len if range is shorter. */
static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len,
struct vringh_range *range,
bool (*getrange)(struct vringh *,
u64, struct vringh_range *))
{
if (addr < range->start || addr > range->end_incl) {
if (!getrange(vrh, addr, range))
return false;
}
BUG_ON(addr < range->start || addr > range->end_incl);
/* To end of memory? */
if (unlikely(addr + *len == 0)) {
if (range->end_incl == -1ULL)
return true;
goto truncate;
}
/* Otherwise, don't wrap. */
if (addr + *len < addr) {
vringh_bad("Wrapping descriptor %zu@0x%llx",
*len, (unsigned long long)addr);
return false;
}
if (unlikely(addr + *len - 1 > range->end_incl))
goto truncate;
return true;
truncate:
*len = range->end_incl + 1 - addr;
return true;
}
static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len,
struct vringh_range *range,
bool (*getrange)(struct vringh *,
u64, struct vringh_range *))
{
return true;
}
/* No reason for this code to be inline. */
static int move_to_indirect(int *up_next, u16 *i, void *addr,
const struct vring_desc *desc,
struct vring_desc **descs, int *desc_max)
{
/* Indirect tables can't have indirect. */
if (*up_next != -1) {
vringh_bad("Multilevel indirect %u->%u", *up_next, *i);
return -EINVAL;
}
if (unlikely(desc->len % sizeof(struct vring_desc))) {
vringh_bad("Strange indirect len %u", desc->len);
return -EINVAL;
}
/* We will check this when we follow it! */
if (desc->flags & VRING_DESC_F_NEXT)
*up_next = desc->next;
else
*up_next = -2;
*descs = addr;
*desc_max = desc->len / sizeof(struct vring_desc);
/* Now, start at the first indirect. */
*i = 0;
return 0;
}
static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp)
{
struct kvec *new;
unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2;
if (new_num < 8)
new_num = 8;
flag = (iov->max_num & VRINGH_IOV_ALLOCATED);
if (flag)
new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp);
else {
new = kmalloc(new_num * sizeof(struct iovec), gfp);
if (new) {
memcpy(new, iov->iov,
iov->max_num * sizeof(struct iovec));
flag = VRINGH_IOV_ALLOCATED;
}
}
if (!new)
return -ENOMEM;
iov->iov = new;
iov->max_num = (new_num | flag);
return 0;
}
static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next,
struct vring_desc **descs, int *desc_max)
{
u16 i = *up_next;
*up_next = -1;
*descs = vrh->vring.desc;
*desc_max = vrh->vring.num;
return i;
}
static int slow_copy(struct vringh *vrh, void *dst, const void *src,
bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
struct vringh_range *range,
bool (*getrange)(struct vringh *vrh,
u64,
struct vringh_range *)),
bool (*getrange)(struct vringh *vrh,
u64 addr,
struct vringh_range *r),
struct vringh_range *range,
int (*copy)(void *dst, const void *src, size_t len))
{
size_t part, len = sizeof(struct vring_desc);
do {
u64 addr;
int err;
part = len;
addr = (u64)(unsigned long)src - range->offset;
if (!rcheck(vrh, addr, &part, range, getrange))
return -EINVAL;
err = copy(dst, src, part);
if (err)
return err;
dst += part;
src += part;
len -= part;
} while (len);
return 0;
}
static inline int
__vringh_iov(struct vringh *vrh, u16 i,
struct vringh_kiov *riov,
struct vringh_kiov *wiov,
bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
struct vringh_range *range,
bool (*getrange)(struct vringh *, u64,
struct vringh_range *)),
bool (*getrange)(struct vringh *, u64, struct vringh_range *),
gfp_t gfp,
int (*copy)(void *dst, const void *src, size_t len))
{
int err, count = 0, up_next, desc_max;
struct vring_desc desc, *descs;
struct vringh_range range = { -1ULL, 0 }, slowrange;
bool slow = false;
/* We start traversing vring's descriptor table. */
descs = vrh->vring.desc;
desc_max = vrh->vring.num;
up_next = -1;
if (riov)
riov->i = riov->used = 0;
else if (wiov)
wiov->i = wiov->used = 0;
else
/* You must want something! */
BUG();
for (;;) {
void *addr;
struct vringh_kiov *iov;
size_t len;
if (unlikely(slow))
err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
&slowrange, copy);
else
err = copy(&desc, &descs[i], sizeof(desc));
if (unlikely(err))
goto fail;
if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
/* Make sure it's OK, and get offset. */
len = desc.len;
if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
err = -EINVAL;
goto fail;
}
if (unlikely(len != desc.len)) {
slow = true;
/* We need to save this range to use offset */
slowrange = range;
}
addr = (void *)(long)(desc.addr + range.offset);
err = move_to_indirect(&up_next, &i, addr, &desc,
&descs, &desc_max);
if (err)
goto fail;
continue;
}
if (count++ == vrh->vring.num) {
vringh_bad("Descriptor loop in %p", descs);
err = -ELOOP;
goto fail;
}
if (desc.flags & VRING_DESC_F_WRITE)
iov = wiov;
else {
iov = riov;
if (unlikely(wiov && wiov->i)) {
vringh_bad("Readable desc %p after writable",
&descs[i]);
err = -EINVAL;
goto fail;
}
}
if (!iov) {
vringh_bad("Unexpected %s desc",
!wiov ? "writable" : "readable");
err = -EPROTO;
goto fail;
}
again:
/* Make sure it's OK, and get offset. */
len = desc.len;
if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
err = -EINVAL;
goto fail;
}
addr = (void *)(unsigned long)(desc.addr + range.offset);
if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) {
err = resize_iovec(iov, gfp);
if (err)
goto fail;
}
iov->iov[iov->used].iov_base = addr;
iov->iov[iov->used].iov_len = len;
iov->used++;
if (unlikely(len != desc.len)) {
desc.len -= len;
desc.addr += len;
goto again;
}
if (desc.flags & VRING_DESC_F_NEXT) {
i = desc.next;
} else {
/* Just in case we need to finish traversing above. */
if (unlikely(up_next > 0)) {
i = return_from_indirect(vrh, &up_next,
&descs, &desc_max);
slow = false;
} else
break;
}
if (i >= desc_max) {
vringh_bad("Chained index %u > %u", i, desc_max);
err = -EINVAL;
goto fail;
}
}
return 0;
fail:
return err;
}
static inline int __vringh_complete(struct vringh *vrh,
const struct vring_used_elem *used,
unsigned int num_used,
int (*putu16)(u16 *p, u16 val),
int (*putused)(struct vring_used_elem *dst,
const struct vring_used_elem
*src, unsigned num))
{
struct vring_used *used_ring;
int err;
u16 used_idx, off;
used_ring = vrh->vring.used;
used_idx = vrh->last_used_idx + vrh->completed;
off = used_idx % vrh->vring.num;
/* Compiler knows num_used == 1 sometimes, hence extra check */
if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
u16 part = vrh->vring.num - off;
err = putused(&used_ring->ring[off], used, part);
if (!err)
err = putused(&used_ring->ring[0], used + part,
num_used - part);
} else
err = putused(&used_ring->ring[off], used, num_used);
if (err) {
vringh_bad("Failed to write %u used entries %u at %p",
num_used, off, &used_ring->ring[off]);
return err;
}
/* Make sure buffer is written before we update index. */
virtio_wmb(vrh->weak_barriers);
err = putu16(&vrh->vring.used->idx, used_idx + num_used);
if (err) {
vringh_bad("Failed to update used index at %p",
&vrh->vring.used->idx);
return err;
}
vrh->completed += num_used;
return 0;
}
static inline int __vringh_need_notify(struct vringh *vrh,
int (*getu16)(u16 *val, const u16 *p))
{
bool notify;
u16 used_event;
int err;
/* Flush out used index update. This is paired with the
* barrier that the Guest executes when enabling
* interrupts. */
virtio_mb(vrh->weak_barriers);
/* Old-style, without event indices. */
if (!vrh->event_indices) {
u16 flags;
err = getu16(&flags, &vrh->vring.avail->flags);
if (err) {
vringh_bad("Failed to get flags at %p",
&vrh->vring.avail->flags);
return err;
}
return (!(flags & VRING_AVAIL_F_NO_INTERRUPT));
}
/* Modern: we know when other side wants to know. */
err = getu16(&used_event, &vring_used_event(&vrh->vring));
if (err) {
vringh_bad("Failed to get used event idx at %p",
&vring_used_event(&vrh->vring));
return err;
}
/* Just in case we added so many that we wrap. */
if (unlikely(vrh->completed > 0xffff))
notify = true;
else
notify = vring_need_event(used_event,
vrh->last_used_idx + vrh->completed,
vrh->last_used_idx);
vrh->last_used_idx += vrh->completed;
vrh->completed = 0;
return notify;
}
static inline bool __vringh_notify_enable(struct vringh *vrh,
int (*getu16)(u16 *val, const u16 *p),
int (*putu16)(u16 *p, u16 val))
{
u16 avail;
if (!vrh->event_indices) {
/* Old-school; update flags. */
if (putu16(&vrh->vring.used->flags, 0) != 0) {
vringh_bad("Clearing used flags %p",
&vrh->vring.used->flags);
return true;
}
} else {
if (putu16(&vring_avail_event(&vrh->vring),
vrh->last_avail_idx) != 0) {
vringh_bad("Updating avail event index %p",
&vring_avail_event(&vrh->vring));
return true;
}
}
/* They could have slipped one in as we were doing that: make
* sure it's written, then check again. */
virtio_mb(vrh->weak_barriers);
if (getu16(&avail, &vrh->vring.avail->idx) != 0) {
vringh_bad("Failed to check avail idx at %p",
&vrh->vring.avail->idx);
return true;
}
/* This is unlikely, so we just leave notifications enabled
* (if we're using event_indices, we'll only get one
* notification anyway). */
return avail == vrh->last_avail_idx;
}
static inline void __vringh_notify_disable(struct vringh *vrh,
int (*putu16)(u16 *p, u16 val))
{
if (!vrh->event_indices) {
/* Old-school; update flags. */
if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) {
vringh_bad("Setting used flags %p",
&vrh->vring.used->flags);
}
}
}
/* Userspace access helpers: in this case, addresses are really userspace. */
static inline int getu16_user(u16 *val, const u16 *p)
{
return get_user(*val, (__force u16 __user *)p);
}
static inline int putu16_user(u16 *p, u16 val)
{
return put_user(val, (__force u16 __user *)p);
}
static inline int copydesc_user(void *dst, const void *src, size_t len)
{
return copy_from_user(dst, (__force void __user *)src, len) ?
-EFAULT : 0;
}
static inline int putused_user(struct vring_used_elem *dst,
const struct vring_used_elem *src,
unsigned int num)
{
return copy_to_user((__force void __user *)dst, src,
sizeof(*dst) * num) ? -EFAULT : 0;
}
static inline int xfer_from_user(void *src, void *dst, size_t len)
{
return copy_from_user(dst, (__force void __user *)src, len) ?
-EFAULT : 0;
}
static inline int xfer_to_user(void *dst, void *src, size_t len)
{
return copy_to_user((__force void __user *)dst, src, len) ?
-EFAULT : 0;
}
/**
* vringh_init_user - initialize a vringh for a userspace vring.
* @vrh: the vringh to initialize.
* @features: the feature bits for this ring.
* @num: the number of elements.
* @weak_barriers: true if we only need memory barriers, not I/O.
 * @desc: the userspace descriptor pointer.
 * @avail: the userspace avail pointer.
 * @used: the userspace used pointer.
*
* Returns an error if num is invalid: you should check pointers
* yourself!
*/
int vringh_init_user(struct vringh *vrh, u32 features,
unsigned int num, bool weak_barriers,
struct vring_desc __user *desc,
struct vring_avail __user *avail,
struct vring_used __user *used)
{
/* Sane power of 2 please! */
if (!num || num > 0xffff || (num & (num - 1))) {
vringh_bad("Bad ring size %u", num);
return -EINVAL;
}
vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
vrh->weak_barriers = weak_barriers;
vrh->completed = 0;
vrh->last_avail_idx = 0;
vrh->last_used_idx = 0;
vrh->vring.num = num;
/* vring expects kernel addresses, but only used via accessors. */
vrh->vring.desc = (__force struct vring_desc *)desc;
vrh->vring.avail = (__force struct vring_avail *)avail;
vrh->vring.used = (__force struct vring_used *)used;
return 0;
}
EXPORT_SYMBOL(vringh_init_user);
/**
* vringh_getdesc_user - get next available descriptor from userspace ring.
* @vrh: the userspace vring.
* @riov: where to put the readable descriptors (or NULL)
* @wiov: where to put the writable descriptors (or NULL)
* @getrange: function to call to check ranges.
* @head: head index we received, for passing to vringh_complete_user().
*
* Returns 0 if there was no descriptor, 1 if there was, or -errno.
*
* Note that on error return, you can tell the difference between an
* invalid ring and a single invalid descriptor: in the former case,
* *head will be vrh->vring.num. You may be able to ignore an invalid
* descriptor, but there's not much you can do with an invalid ring.
*
* Note that you may need to clean up riov and wiov, even on error!
*/
int vringh_getdesc_user(struct vringh *vrh,
struct vringh_iov *riov,
struct vringh_iov *wiov,
bool (*getrange)(struct vringh *vrh,
u64 addr, struct vringh_range *r),
u16 *head)
{
int err;
*head = vrh->vring.num;
err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx);
if (err < 0)
return err;
/* Empty... */
if (err == vrh->vring.num)
return 0;
	/* We need the layouts to be identical for this to work */
BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov));
BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) !=
offsetof(struct vringh_iov, iov));
BUILD_BUG_ON(offsetof(struct vringh_kiov, i) !=
offsetof(struct vringh_iov, i));
BUILD_BUG_ON(offsetof(struct vringh_kiov, used) !=
offsetof(struct vringh_iov, used));
BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) !=
offsetof(struct vringh_iov, max_num));
BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
BUILD_BUG_ON(offsetof(struct iovec, iov_base) !=
offsetof(struct kvec, iov_base));
BUILD_BUG_ON(offsetof(struct iovec, iov_len) !=
offsetof(struct kvec, iov_len));
BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base)
!= sizeof(((struct kvec *)NULL)->iov_base));
BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len)
!= sizeof(((struct kvec *)NULL)->iov_len));
*head = err;
err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov,
(struct vringh_kiov *)wiov,
range_check, getrange, GFP_KERNEL, copydesc_user);
if (err)
return err;
return 1;
}
EXPORT_SYMBOL(vringh_getdesc_user);
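The getrange callback passed here feeds range_check() above, which uses the start, end_incl and offset fields of struct vringh_range to turn a guest address into something the userspace accessors can dereference. A minimal sketch of such a callback, assuming a single contiguous guest region; gpa_base, region_size and ubase are made-up values, not anything defined in this file:

/* Minimal getrange sketch; gpa_base, region_size and ubase are assumptions. */
static bool example_getrange(struct vringh *vrh, u64 addr,
			     struct vringh_range *r)
{
	const u64 gpa_base = 0x100000;		/* assumed guest base address */
	const u64 region_size = 1 << 20;	/* assumed region size */
	const u64 ubase = 0x7f0000000000ULL;	/* assumed userspace mapping */

	if (addr < gpa_base || addr >= gpa_base + region_size)
		return false;

	r->start = gpa_base;
	r->end_incl = gpa_base + region_size - 1;
	/* range_check() adds r->offset to a guest address to get a pointer. */
	r->offset = ubase - gpa_base;
	return true;
}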
/**
* vringh_iov_pull_user - copy bytes from vring_iov.
* @riov: the riov as passed to vringh_getdesc_user() (updated as we consume)
* @dst: the place to copy.
* @len: the maximum length to copy.
*
* Returns the bytes copied <= len or a negative errno.
*/
ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
{
return vringh_iov_xfer((struct vringh_kiov *)riov,
dst, len, xfer_from_user);
}
EXPORT_SYMBOL(vringh_iov_pull_user);
/**
* vringh_iov_push_user - copy bytes into vring_iov.
* @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume)
 * @src: the place to copy from.
* @len: the maximum length to copy.
*
* Returns the bytes copied <= len or a negative errno.
*/
ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
const void *src, size_t len)
{
return vringh_iov_xfer((struct vringh_kiov *)wiov,
(void *)src, len, xfer_to_user);
}
EXPORT_SYMBOL(vringh_iov_push_user);
/**
* vringh_abandon_user - we've decided not to handle the descriptor(s).
* @vrh: the vring.
 * @num: the number of descriptors to put back (ie. how many
 *	 vringh_getdesc_user() calls to undo).
 *
 * The next vringh_getdesc_user() will return the old descriptor(s) again.
*/
void vringh_abandon_user(struct vringh *vrh, unsigned int num)
{
/* We only update vring_avail_event(vr) when we want to be notified,
* so we haven't changed that yet. */
vrh->last_avail_idx -= num;
}
EXPORT_SYMBOL(vringh_abandon_user);
/**
* vringh_complete_user - we've finished with descriptor, publish it.
* @vrh: the vring.
* @head: the head as filled in by vringh_getdesc_user.
* @len: the length of data we have written.
*
* You should check vringh_need_notify_user() after one or more calls
* to this function.
*/
int vringh_complete_user(struct vringh *vrh, u16 head, u32 len)
{
struct vring_used_elem used;
used.id = head;
used.len = len;
return __vringh_complete(vrh, &used, 1, putu16_user, putused_user);
}
EXPORT_SYMBOL(vringh_complete_user);
/**
* vringh_complete_multi_user - we've finished with many descriptors.
* @vrh: the vring.
* @used: the head, length pairs.
* @num_used: the number of used elements.
*
* You should check vringh_need_notify_user() after one or more calls
* to this function.
*/
int vringh_complete_multi_user(struct vringh *vrh,
const struct vring_used_elem used[],
unsigned num_used)
{
return __vringh_complete(vrh, used, num_used,
putu16_user, putused_user);
}
EXPORT_SYMBOL(vringh_complete_multi_user);
/**
* vringh_notify_enable_user - we want to know if something changes.
* @vrh: the vring.
*
* This always enables notifications, but returns false if there are
* now more buffers available in the vring.
*/
bool vringh_notify_enable_user(struct vringh *vrh)
{
return __vringh_notify_enable(vrh, getu16_user, putu16_user);
}
EXPORT_SYMBOL(vringh_notify_enable_user);
/**
* vringh_notify_disable_user - don't tell us if something changes.
* @vrh: the vring.
*
* This is our normal running state: we disable and then only enable when
* we're going to sleep.
*/
void vringh_notify_disable_user(struct vringh *vrh)
{
__vringh_notify_disable(vrh, putu16_user);
}
EXPORT_SYMBOL(vringh_notify_disable_user);
/**
* vringh_need_notify_user - must we tell the other side about used buffers?
* @vrh: the vring we've called vringh_complete_user() on.
*
* Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
*/
int vringh_need_notify_user(struct vringh *vrh)
{
return __vringh_need_notify(vrh, getu16_user);
}
EXPORT_SYMBOL(vringh_need_notify_user);
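Putting the *_user helpers together, one service pass over a userspace ring looks roughly like the sketch below. Names are hypothetical, error handling is trimmed, vringh_iov_init() is assumed to be the iov initializer provided by linux/vringh.h, and the getrange callback is of the kind sketched after vringh_getdesc_user() above.

static int example_service_one_user(struct vringh *vrh,
				    bool (*getrange)(struct vringh *, u64,
						     struct vringh_range *))
{
	struct iovec r_iovec[8], w_iovec[8];
	struct vringh_iov riov, wiov;
	u8 request[64], status = 0;
	u16 head;
	ssize_t len;
	int err;

	vringh_iov_init(&riov, r_iovec, ARRAY_SIZE(r_iovec));
	vringh_iov_init(&wiov, w_iovec, ARRAY_SIZE(w_iovec));

	err = vringh_getdesc_user(vrh, &riov, &wiov, getrange, &head);
	if (err <= 0)		/* 0: ring empty, <0: bad ring or descriptor */
		return err;

	len = vringh_iov_pull_user(&riov, request, sizeof(request));
	if (len < 0)
		return len;

	/* ... act on the request, then push back a one-byte status ... */
	len = vringh_iov_push_user(&wiov, &status, sizeof(status));
	if (len < 0)
		return len;

	err = vringh_complete_user(vrh, head, sizeof(status));
	if (err)
		return err;

	if (vringh_need_notify_user(vrh) > 0)
		vrh->notify(vrh);	/* assumes ->notify was set up by the caller */
	return 1;
}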
/* Kernelspace access helpers. */
static inline int getu16_kern(u16 *val, const u16 *p)
{
*val = ACCESS_ONCE(*p);
return 0;
}
static inline int putu16_kern(u16 *p, u16 val)
{
ACCESS_ONCE(*p) = val;
return 0;
}
static inline int copydesc_kern(void *dst, const void *src, size_t len)
{
memcpy(dst, src, len);
return 0;
}
static inline int putused_kern(struct vring_used_elem *dst,
const struct vring_used_elem *src,
unsigned int num)
{
memcpy(dst, src, num * sizeof(*dst));
return 0;
}
static inline int xfer_kern(void *src, void *dst, size_t len)
{
memcpy(dst, src, len);
return 0;
}
/**
* vringh_init_kern - initialize a vringh for a kernelspace vring.
* @vrh: the vringh to initialize.
* @features: the feature bits for this ring.
* @num: the number of elements.
* @weak_barriers: true if we only need memory barriers, not I/O.
 * @desc: the kernel-side descriptor pointer.
 * @avail: the kernel-side avail pointer.
 * @used: the kernel-side used pointer.
*
* Returns an error if num is invalid.
*/
int vringh_init_kern(struct vringh *vrh, u32 features,
unsigned int num, bool weak_barriers,
struct vring_desc *desc,
struct vring_avail *avail,
struct vring_used *used)
{
/* Sane power of 2 please! */
if (!num || num > 0xffff || (num & (num - 1))) {
vringh_bad("Bad ring size %u", num);
return -EINVAL;
}
vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
vrh->weak_barriers = weak_barriers;
vrh->completed = 0;
vrh->last_avail_idx = 0;
vrh->last_used_idx = 0;
vrh->vring.num = num;
vrh->vring.desc = desc;
vrh->vring.avail = avail;
vrh->vring.used = used;
return 0;
}
EXPORT_SYMBOL(vringh_init_kern);
/**
* vringh_getdesc_kern - get next available descriptor from kernelspace ring.
* @vrh: the kernelspace vring.
* @riov: where to put the readable descriptors (or NULL)
* @wiov: where to put the writable descriptors (or NULL)
* @head: head index we received, for passing to vringh_complete_kern().
* @gfp: flags for allocating larger riov/wiov.
*
* Returns 0 if there was no descriptor, 1 if there was, or -errno.
*
* Note that on error return, you can tell the difference between an
* invalid ring and a single invalid descriptor: in the former case,
* *head will be vrh->vring.num. You may be able to ignore an invalid
* descriptor, but there's not much you can do with an invalid ring.
*
* Note that you may need to clean up riov and wiov, even on error!
*/
int vringh_getdesc_kern(struct vringh *vrh,
struct vringh_kiov *riov,
struct vringh_kiov *wiov,
u16 *head,
gfp_t gfp)
{
int err;
err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx);
if (err < 0)
return err;
/* Empty... */
if (err == vrh->vring.num)
return 0;
*head = err;
err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
gfp, copydesc_kern);
if (err)
return err;
return 1;
}
EXPORT_SYMBOL(vringh_getdesc_kern);
/**
* vringh_iov_pull_kern - copy bytes from vring_iov.
* @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
* @dst: the place to copy.
* @len: the maximum length to copy.
*
* Returns the bytes copied <= len or a negative errno.
*/
ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
{
return vringh_iov_xfer(riov, dst, len, xfer_kern);
}
EXPORT_SYMBOL(vringh_iov_pull_kern);
/**
* vringh_iov_push_kern - copy bytes into vring_iov.
* @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
 * @src: the place to copy from.
* @len: the maximum length to copy.
*
* Returns the bytes copied <= len or a negative errno.
*/
ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
const void *src, size_t len)
{
return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern);
}
EXPORT_SYMBOL(vringh_iov_push_kern);
/**
* vringh_abandon_kern - we've decided not to handle the descriptor(s).
* @vrh: the vring.
 * @num: the number of descriptors to put back (ie. how many
 *	 vringh_getdesc_kern() calls to undo).
 *
 * The next vringh_getdesc_kern() will return the old descriptor(s) again.
*/
void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
{
/* We only update vring_avail_event(vr) when we want to be notified,
* so we haven't changed that yet. */
vrh->last_avail_idx -= num;
}
EXPORT_SYMBOL(vringh_abandon_kern);
/**
* vringh_complete_kern - we've finished with descriptor, publish it.
* @vrh: the vring.
* @head: the head as filled in by vringh_getdesc_kern.
* @len: the length of data we have written.
*
* You should check vringh_need_notify_kern() after one or more calls
* to this function.
*/
int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len)
{
struct vring_used_elem used;
used.id = head;
used.len = len;
return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern);
}
EXPORT_SYMBOL(vringh_complete_kern);
/**
* vringh_notify_enable_kern - we want to know if something changes.
* @vrh: the vring.
*
* This always enables notifications, but returns false if there are
* now more buffers available in the vring.
*/
bool vringh_notify_enable_kern(struct vringh *vrh)
{
return __vringh_notify_enable(vrh, getu16_kern, putu16_kern);
}
EXPORT_SYMBOL(vringh_notify_enable_kern);
/**
* vringh_notify_disable_kern - don't tell us if something changes.
* @vrh: the vring.
*
* This is our normal running state: we disable and then only enable when
* we're going to sleep.
*/
void vringh_notify_disable_kern(struct vringh *vrh)
{
__vringh_notify_disable(vrh, putu16_kern);
}
EXPORT_SYMBOL(vringh_notify_disable_kern);
/**
* vringh_need_notify_kern - must we tell the other side about used buffers?
* @vrh: the vring we've called vringh_complete_kern() on.
*
* Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
*/
int vringh_need_notify_kern(struct vringh *vrh)
{
return __vringh_need_notify(vrh, getu16_kern);
}
EXPORT_SYMBOL(vringh_need_notify_kern);
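The kernel-side flow has the same shape, but uses struct vringh_kiov, needs no getrange, and takes a gfp for growing the iov arrays. A compressed sketch, with vringh_kiov_init() assumed from linux/vringh.h and all names hypothetical:

static int example_service_one_kern(struct vringh *vrh)
{
	struct kvec kv_r[4], kv_w[4];
	struct vringh_kiov riov, wiov;
	u8 buf[64], status = 0;
	u16 head;
	int err;

	vringh_kiov_init(&riov, kv_r, ARRAY_SIZE(kv_r));
	vringh_kiov_init(&wiov, kv_w, ARRAY_SIZE(kv_w));

	err = vringh_getdesc_kern(vrh, &riov, &wiov, &head, GFP_ATOMIC);
	if (err <= 0)
		return err;

	vringh_iov_pull_kern(&riov, buf, sizeof(buf));
	/* ... handle the request, then publish a one-byte status ... */
	vringh_iov_push_kern(&wiov, &status, sizeof(status));

	err = vringh_complete_kern(vrh, head, sizeof(status));
	if (!err && vringh_need_notify_kern(vrh) > 0)
		vrh->notify(vrh);
	return err ? err : 1;
}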
...@@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) ...@@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
/* We should always be able to add one buffer to an empty queue. */ /* We should always be able to add one buffer to an empty queue. */
if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
BUG(); BUG();
virtqueue_kick(vq); virtqueue_kick(vq);
...@@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb) ...@@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb)
if (!virtqueue_get_buf(vq, &len)) if (!virtqueue_get_buf(vq, &len))
return; return;
sg_init_one(&sg, vb->stats, sizeof(vb->stats)); sg_init_one(&sg, vb->stats, sizeof(vb->stats));
if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0) if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
BUG(); BUG();
virtqueue_kick(vq); virtqueue_kick(vq);
} }
...@@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb) ...@@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb)
* use it to signal us later. * use it to signal us later.
*/ */
sg_init_one(&sg, vb->stats, sizeof vb->stats); sg_init_one(&sg, vb->stats, sizeof vb->stats);
if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL) if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
< 0) < 0)
BUG(); BUG();
virtqueue_kick(vb->stats_vq); virtqueue_kick(vb->stats_vq);
......
...@@ -24,27 +24,6 @@ ...@@ -24,27 +24,6 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/hrtimer.h> #include <linux/hrtimer.h>
/* virtio guest is communicating with a virtual "device" that actually runs on
* a host processor. Memory barriers are used to control SMP effects. */
#ifdef CONFIG_SMP
/* Where possible, use SMP barriers which are more lightweight than mandatory
* barriers, because mandatory barriers control MMIO effects on accesses
* through relaxed memory I/O windows (which virtio-pci does not use). */
#define virtio_mb(vq) \
do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0)
#define virtio_rmb(vq) \
do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0)
#define virtio_wmb(vq) \
do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0)
#else
/* We must force memory ordering even if guest is UP since host could be
* running on another CPU, but SMP barriers are defined to barrier() in that
* configuration. So fall back to mandatory barriers instead. */
#define virtio_mb(vq) mb()
#define virtio_rmb(vq) rmb()
#define virtio_wmb(vq) wmb()
#endif
#ifdef DEBUG #ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */ /* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt, args...) \ #define BAD_RING(_vq, fmt, args...) \
...@@ -119,16 +98,36 @@ struct vring_virtqueue ...@@ -119,16 +98,36 @@ struct vring_virtqueue
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
static inline struct scatterlist *sg_next_chained(struct scatterlist *sg,
unsigned int *count)
{
return sg_next(sg);
}
static inline struct scatterlist *sg_next_arr(struct scatterlist *sg,
unsigned int *count)
{
if (--(*count) == 0)
return NULL;
return sg + 1;
}
/* Set up an indirect table of descriptors and add it to the queue. */ /* Set up an indirect table of descriptors and add it to the queue. */
static int vring_add_indirect(struct vring_virtqueue *vq, static inline int vring_add_indirect(struct vring_virtqueue *vq,
struct scatterlist sg[], struct scatterlist *sgs[],
unsigned int out, struct scatterlist *(*next)
unsigned int in, (struct scatterlist *, unsigned int *),
gfp_t gfp) unsigned int total_sg,
unsigned int total_out,
unsigned int total_in,
unsigned int out_sgs,
unsigned int in_sgs,
gfp_t gfp)
{ {
struct vring_desc *desc; struct vring_desc *desc;
unsigned head; unsigned head;
int i; struct scatterlist *sg;
int i, n;
/* /*
* We require lowmem mappings for the descriptors because * We require lowmem mappings for the descriptors because
...@@ -137,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq, ...@@ -137,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
*/ */
gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH); gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH);
desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp); desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
if (!desc) if (!desc)
return -ENOMEM; return -ENOMEM;
/* Transfer entries from the sg list into the indirect page */ /* Transfer entries from the sg lists into the indirect page */
for (i = 0; i < out; i++) { i = 0;
desc[i].flags = VRING_DESC_F_NEXT; for (n = 0; n < out_sgs; n++) {
desc[i].addr = sg_phys(sg); for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
desc[i].len = sg->length; desc[i].flags = VRING_DESC_F_NEXT;
desc[i].next = i+1; desc[i].addr = sg_phys(sg);
sg++; desc[i].len = sg->length;
desc[i].next = i+1;
i++;
}
} }
for (; i < (out + in); i++) { for (; n < (out_sgs + in_sgs); n++) {
desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
desc[i].addr = sg_phys(sg); desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
desc[i].len = sg->length; desc[i].addr = sg_phys(sg);
desc[i].next = i+1; desc[i].len = sg->length;
sg++; desc[i].next = i+1;
i++;
}
} }
BUG_ON(i != total_sg);
/* Last one doesn't continue. */ /* Last one doesn't continue. */
desc[i-1].flags &= ~VRING_DESC_F_NEXT; desc[i-1].flags &= ~VRING_DESC_F_NEXT;
...@@ -176,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq, ...@@ -176,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
return head; return head;
} }
/** static inline int virtqueue_add(struct virtqueue *_vq,
* virtqueue_add_buf - expose buffer to other end struct scatterlist *sgs[],
* @vq: the struct virtqueue we're talking about. struct scatterlist *(*next)
* @sg: the description of the buffer(s). (struct scatterlist *, unsigned int *),
* @out_num: the number of sg readable by other side unsigned int total_out,
* @in_num: the number of sg which are writable (after readable ones) unsigned int total_in,
* @data: the token identifying the buffer. unsigned int out_sgs,
* @gfp: how to do memory allocations (if necessary). unsigned int in_sgs,
* void *data,
* Caller must ensure we don't call this with other virtqueue operations gfp_t gfp)
* at the same time (except where noted).
*
* Returns zero or a negative error (ie. ENOSPC, ENOMEM).
*/
int virtqueue_add_buf(struct virtqueue *_vq,
struct scatterlist sg[],
unsigned int out,
unsigned int in,
void *data,
gfp_t gfp)
{ {
struct vring_virtqueue *vq = to_vvq(_vq); struct vring_virtqueue *vq = to_vvq(_vq);
unsigned int i, avail, uninitialized_var(prev); struct scatterlist *sg;
unsigned int i, n, avail, uninitialized_var(prev), total_sg;
int head; int head;
START_USE(vq); START_USE(vq);
...@@ -218,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq, ...@@ -218,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq,
} }
#endif #endif
total_sg = total_in + total_out;
/* If the host supports indirect descriptor tables, and we have multiple /* If the host supports indirect descriptor tables, and we have multiple
* buffers, then go indirect. FIXME: tune this threshold */ * buffers, then go indirect. FIXME: tune this threshold */
if (vq->indirect && (out + in) > 1 && vq->vq.num_free) { if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
head = vring_add_indirect(vq, sg, out, in, gfp); head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
total_in,
out_sgs, in_sgs, gfp);
if (likely(head >= 0)) if (likely(head >= 0))
goto add_head; goto add_head;
} }
BUG_ON(out + in > vq->vring.num); BUG_ON(total_sg > vq->vring.num);
BUG_ON(out + in == 0); BUG_ON(total_sg == 0);
if (vq->vq.num_free < out + in) { if (vq->vq.num_free < total_sg) {
pr_debug("Can't add buf len %i - avail = %i\n", pr_debug("Can't add buf len %i - avail = %i\n",
out + in, vq->vq.num_free); total_sg, vq->vq.num_free);
/* FIXME: for historical reasons, we force a notify here if /* FIXME: for historical reasons, we force a notify here if
* there are outgoing parts to the buffer. Presumably the * there are outgoing parts to the buffer. Presumably the
* host should service the ring ASAP. */ * host should service the ring ASAP. */
if (out) if (out_sgs)
vq->notify(&vq->vq); vq->notify(&vq->vq);
END_USE(vq); END_USE(vq);
return -ENOSPC; return -ENOSPC;
} }
/* We're about to use some buffers from the free list. */ /* We're about to use some buffers from the free list. */
vq->vq.num_free -= out + in; vq->vq.num_free -= total_sg;
head = vq->free_head; head = i = vq->free_head;
for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { for (n = 0; n < out_sgs; n++) {
vq->vring.desc[i].flags = VRING_DESC_F_NEXT; for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
vq->vring.desc[i].addr = sg_phys(sg); vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
vq->vring.desc[i].len = sg->length; vq->vring.desc[i].addr = sg_phys(sg);
prev = i; vq->vring.desc[i].len = sg->length;
sg++; prev = i;
i = vq->vring.desc[i].next;
}
} }
for (; in; i = vq->vring.desc[i].next, in--) { for (; n < (out_sgs + in_sgs); n++) {
vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
vq->vring.desc[i].addr = sg_phys(sg); vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
vq->vring.desc[i].len = sg->length; vq->vring.desc[i].addr = sg_phys(sg);
prev = i; vq->vring.desc[i].len = sg->length;
sg++; prev = i;
i = vq->vring.desc[i].next;
}
} }
/* Last one doesn't continue. */ /* Last one doesn't continue. */
vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
...@@ -276,7 +280,7 @@ int virtqueue_add_buf(struct virtqueue *_vq, ...@@ -276,7 +280,7 @@ int virtqueue_add_buf(struct virtqueue *_vq,
/* Descriptors and available array need to be set before we expose the /* Descriptors and available array need to be set before we expose the
* new available array entries. */ * new available array entries. */
virtio_wmb(vq); virtio_wmb(vq->weak_barriers);
vq->vring.avail->idx++; vq->vring.avail->idx++;
vq->num_added++; vq->num_added++;
...@@ -290,8 +294,121 @@ int virtqueue_add_buf(struct virtqueue *_vq, ...@@ -290,8 +294,121 @@ int virtqueue_add_buf(struct virtqueue *_vq,
return 0; return 0;
} }
/**
* virtqueue_add_buf - expose buffer to other end
* @vq: the struct virtqueue we're talking about.
* @sg: the description of the buffer(s).
 * @out: the number of sg readable by other side
 * @in: the number of sg which are writable (after readable ones)
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
* Caller must ensure we don't call this with other virtqueue operations
* at the same time (except where noted).
*
* Returns zero or a negative error (ie. ENOSPC, ENOMEM).
*/
int virtqueue_add_buf(struct virtqueue *_vq,
struct scatterlist sg[],
unsigned int out,
unsigned int in,
void *data,
gfp_t gfp)
{
struct scatterlist *sgs[2];
sgs[0] = sg;
sgs[1] = sg + out;
return virtqueue_add(_vq, sgs, sg_next_arr,
out, in, out ? 1 : 0, in ? 1 : 0, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_buf); EXPORT_SYMBOL_GPL(virtqueue_add_buf);
/**
* virtqueue_add_sgs - expose buffers to other end
* @vq: the struct virtqueue we're talking about.
* @sgs: array of terminated scatterlists.
 * @out_sgs: the number of scatterlists readable by other side
 * @in_sgs: the number of scatterlists which are writable (after readable ones)
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
* Caller must ensure we don't call this with other virtqueue operations
* at the same time (except where noted).
*
* Returns zero or a negative error (ie. ENOSPC, ENOMEM).
*/
int virtqueue_add_sgs(struct virtqueue *_vq,
struct scatterlist *sgs[],
unsigned int out_sgs,
unsigned int in_sgs,
void *data,
gfp_t gfp)
{
unsigned int i, total_out, total_in;
/* Count them first. */
for (i = total_out = total_in = 0; i < out_sgs; i++) {
struct scatterlist *sg;
for (sg = sgs[i]; sg; sg = sg_next(sg))
total_out++;
}
for (; i < out_sgs + in_sgs; i++) {
struct scatterlist *sg;
for (sg = sgs[i]; sg; sg = sg_next(sg))
total_in++;
}
return virtqueue_add(_vq, sgs, sg_next_chained,
total_out, total_in, out_sgs, in_sgs, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
/**
* virtqueue_add_outbuf - expose output buffers to other end
* @vq: the struct virtqueue we're talking about.
 * @sg: scatterlist (need not be terminated!)
 * @num: the number of entries in @sg readable by other side
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
* Caller must ensure we don't call this with other virtqueue operations
* at the same time (except where noted).
*
* Returns zero or a negative error (ie. ENOSPC, ENOMEM).
*/
int virtqueue_add_outbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp)
{
return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
/**
* virtqueue_add_inbuf - expose input buffers to other end
* @vq: the struct virtqueue we're talking about.
 * @sg: scatterlist (need not be terminated!)
 * @num: the number of entries in @sg writable by other side
* @data: the token identifying the buffer.
* @gfp: how to do memory allocations (if necessary).
*
* Caller must ensure we don't call this with other virtqueue operations
* at the same time (except where noted).
*
* Returns zero or a negative error (ie. ENOSPC, ENOMEM).
*/
int virtqueue_add_inbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp)
{
return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp);
}
EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
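virtqueue_add_outbuf() is already exercised by the balloon changes elsewhere in this series (sg_init_one, add, kick). For the mixed case, here is a hedged driver-side sketch of virtqueue_add_sgs() with one readable and one writable scatterlist; struct example_req and example_queue_req are hypothetical, and the buffers are assumed to live in DMA-able (non-stack) memory:

struct example_req {
	u8 cmd[16];	/* readable by the device */
	u8 status;	/* written by the device */
};

static int example_queue_req(struct virtqueue *vq, struct example_req *req)
{
	struct scatterlist hdr, status, *sgs[2];
	int err;

	sg_init_one(&hdr, req->cmd, sizeof(req->cmd));
	sg_init_one(&status, &req->status, sizeof(req->status));
	sgs[0] = &hdr;		/* out_sgs = 1 */
	sgs[1] = &status;	/* in_sgs = 1 */

	err = virtqueue_add_sgs(vq, sgs, 1, 1, req, GFP_ATOMIC);
	if (err)
		return err;

	virtqueue_kick(vq);
	return 0;
}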
/** /**
* virtqueue_kick_prepare - first half of split virtqueue_kick call. * virtqueue_kick_prepare - first half of split virtqueue_kick call.
* @vq: the struct virtqueue * @vq: the struct virtqueue
...@@ -312,7 +429,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) ...@@ -312,7 +429,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
START_USE(vq); START_USE(vq);
/* We need to expose available array entries before checking avail /* We need to expose available array entries before checking avail
* event. */ * event. */
virtio_mb(vq); virtio_mb(vq->weak_barriers);
old = vq->vring.avail->idx - vq->num_added; old = vq->vring.avail->idx - vq->num_added;
new = vq->vring.avail->idx; new = vq->vring.avail->idx;
...@@ -436,7 +553,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) ...@@ -436,7 +553,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
} }
/* Only get used array entries after they have been exposed by host. */ /* Only get used array entries after they have been exposed by host. */
virtio_rmb(vq); virtio_rmb(vq->weak_barriers);
last_used = (vq->last_used_idx & (vq->vring.num - 1)); last_used = (vq->last_used_idx & (vq->vring.num - 1));
i = vq->vring.used->ring[last_used].id; i = vq->vring.used->ring[last_used].id;
...@@ -460,7 +577,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) ...@@ -460,7 +577,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
* the read in the next get_buf call. */ * the read in the next get_buf call. */
if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
vring_used_event(&vq->vring) = vq->last_used_idx; vring_used_event(&vq->vring) = vq->last_used_idx;
virtio_mb(vq); virtio_mb(vq->weak_barriers);
} }
#ifdef DEBUG #ifdef DEBUG
...@@ -513,7 +630,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq) ...@@ -513,7 +630,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
* entry. Always do both to keep code simple. */ * entry. Always do both to keep code simple. */
vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
vring_used_event(&vq->vring) = vq->last_used_idx; vring_used_event(&vq->vring) = vq->last_used_idx;
virtio_mb(vq); virtio_mb(vq->weak_barriers);
if (unlikely(more_used(vq))) { if (unlikely(more_used(vq))) {
END_USE(vq); END_USE(vq);
return false; return false;
...@@ -553,7 +670,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) ...@@ -553,7 +670,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
/* TODO: tune this threshold */ /* TODO: tune this threshold */
bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4;
vring_used_event(&vq->vring) = vq->last_used_idx + bufs; vring_used_event(&vq->vring) = vq->last_used_idx + bufs;
virtio_mb(vq); virtio_mb(vq->weak_barriers);
if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) {
END_USE(vq); END_USE(vq);
return false; return false;
......
...@@ -171,6 +171,22 @@ static inline void sg_mark_end(struct scatterlist *sg) ...@@ -171,6 +171,22 @@ static inline void sg_mark_end(struct scatterlist *sg)
sg->page_link &= ~0x01; sg->page_link &= ~0x01;
} }
/**
* sg_unmark_end - Undo setting the end of the scatterlist
 * @sg: SG entry
*
* Description:
* Removes the termination marker from the given entry of the scatterlist.
*
**/
static inline void sg_unmark_end(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
#endif
sg->page_link &= ~0x02;
}
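A short sketch of why the unmark helper is needed: when two separately terminated runs in the same array are to be presented as one chain, the earlier end marker must be cleared before the combined list is terminated (a and b are assumed entry counts, example_join_runs is hypothetical):

static inline void example_join_runs(struct scatterlist sg[],
				     unsigned int a, unsigned int b)
{
	/* sg[a-1] was terminated with sg_mark_end(); drop that marker... */
	sg_unmark_end(&sg[a - 1]);
	/* ...and terminate after the combined a + b entries instead. */
	sg_mark_end(&sg[a + b - 1]);
}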
/** /**
* sg_phys - Return physical address of an sg entry * sg_phys - Return physical address of an sg entry
* @sg: SG entry * @sg: SG entry
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include <linux/device.h> #include <linux/device.h>
#include <linux/mod_devicetable.h> #include <linux/mod_devicetable.h>
#include <linux/gfp.h> #include <linux/gfp.h>
#include <linux/vringh.h>
/** /**
* virtqueue - a queue to register buffers for sending or receiving. * virtqueue - a queue to register buffers for sending or receiving.
...@@ -40,6 +41,23 @@ int virtqueue_add_buf(struct virtqueue *vq, ...@@ -40,6 +41,23 @@ int virtqueue_add_buf(struct virtqueue *vq,
void *data, void *data,
gfp_t gfp); gfp_t gfp);
int virtqueue_add_outbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp);
int virtqueue_add_inbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp);
int virtqueue_add_sgs(struct virtqueue *vq,
struct scatterlist *sgs[],
unsigned int out_sgs,
unsigned int in_sgs,
void *data,
gfp_t gfp);
void virtqueue_kick(struct virtqueue *vq); void virtqueue_kick(struct virtqueue *vq);
bool virtqueue_kick_prepare(struct virtqueue *vq); bool virtqueue_kick_prepare(struct virtqueue *vq);
...@@ -64,6 +82,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *vq); ...@@ -64,6 +82,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *vq);
* @dev: underlying device. * @dev: underlying device.
* @id: the device type identification (used to match it with a driver). * @id: the device type identification (used to match it with a driver).
* @config: the configuration ops for this device. * @config: the configuration ops for this device.
* @vringh_config: configuration ops for host vrings.
* @vqs: the list of virtqueues for this device. * @vqs: the list of virtqueues for this device.
* @features: the features supported by both driver and device. * @features: the features supported by both driver and device.
* @priv: private pointer for the driver's use. * @priv: private pointer for the driver's use.
...@@ -73,6 +92,7 @@ struct virtio_device { ...@@ -73,6 +92,7 @@ struct virtio_device {
struct device dev; struct device dev;
struct virtio_device_id id; struct virtio_device_id id;
const struct virtio_config_ops *config; const struct virtio_config_ops *config;
const struct vringh_config_ops *vringh_config;
struct list_head vqs; struct list_head vqs;
/* Note that this is a Linux set_bit-style bitmap. */ /* Note that this is a Linux set_bit-style bitmap. */
unsigned long features[1]; unsigned long features[1];
......
/*
* Copyright (C) ST-Ericsson AB 2012
* Author: Sjur Brændeland <sjur.brandeland@stericsson.com>
*
* This header is BSD licensed so
* anyone can use the definitions to implement compatible remote processors
*/
#ifndef VIRTIO_CAIF_H
#define VIRTIO_CAIF_H
#include <linux/types.h>
struct virtio_caif_transf_config {
u16 headroom;
u16 tailroom;
u32 mtu;
u8 reserved[4];
};
struct virtio_caif_config {
struct virtio_caif_transf_config uplink, downlink;
u8 reserved[8];
};
#endif
...@@ -4,6 +4,63 @@ ...@@ -4,6 +4,63 @@
#include <linux/irqreturn.h> #include <linux/irqreturn.h>
#include <uapi/linux/virtio_ring.h> #include <uapi/linux/virtio_ring.h>
/*
* Barriers in virtio are tricky. Non-SMP virtio guests can't assume
* they're not on an SMP host system, so they need to assume real
* barriers. Non-SMP virtio hosts could skip the barriers, but does
* anyone care?
*
* For virtio_pci on SMP, we don't need to order with respect to MMIO
* accesses through relaxed memory I/O windows, so smp_mb() et al are
* sufficient.
*
* For using virtio to talk to real devices (eg. other heterogeneous
* CPUs) we do need real barriers. In theory, we could be using both
* kinds of virtio, so it's a runtime decision, and the branch is
* actually quite cheap.
*/
#ifdef CONFIG_SMP
static inline void virtio_mb(bool weak_barriers)
{
if (weak_barriers)
smp_mb();
else
mb();
}
static inline void virtio_rmb(bool weak_barriers)
{
if (weak_barriers)
smp_rmb();
else
rmb();
}
static inline void virtio_wmb(bool weak_barriers)
{
if (weak_barriers)
smp_wmb();
else
wmb();
}
#else
static inline void virtio_mb(bool weak_barriers)
{
mb();
}
static inline void virtio_rmb(bool weak_barriers)
{
rmb();
}
static inline void virtio_wmb(bool weak_barriers)
{
wmb();
}
#endif
struct virtio_device; struct virtio_device;
struct virtqueue; struct virtqueue;
......
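To make the weak_barriers trade-off described above concrete, here is an illustrative publish path (not from the patch; the function and variable names are invented for the sketch): a ring entry must be visible before the avail index that exposes it, so virtio_wmb() sits between the two stores, and weak_barriers selects smp_wmb() versus a full wmb().

	/* Illustrative only: publish a descriptor head, then expose it. */
	static void publish_desc(struct vring *vring, u16 desc_head,
				 u16 *avail_shadow, bool weak_barriers)
	{
		vring->avail->ring[*avail_shadow & (vring->num - 1)] = desc_head;

		/* Ring entry must be visible before the index bump. */
		virtio_wmb(weak_barriers);

		vring->avail->idx = ++(*avail_shadow);
	}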
/*
* Linux host-side vring helpers; for when the kernel needs to access
* someone else's vring.
*
* Copyright IBM Corporation, 2013.
* Parts taken from drivers/vhost/vhost.c Copyright 2009 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Written by: Rusty Russell <rusty@rustcorp.com.au>
*/
#ifndef _LINUX_VRINGH_H
#define _LINUX_VRINGH_H
#include <uapi/linux/virtio_ring.h>
#include <linux/uio.h>
#include <linux/slab.h>
#include <asm/barrier.h>
/* virtio_ring with information needed for host access. */
struct vringh {
/* Guest publishes used event idx (note: we always do). */
bool event_indices;
/* Can we get away with weak barriers? */
bool weak_barriers;
/* Last available index we saw (ie. where we're up to). */
u16 last_avail_idx;
/* Last index we used. */
u16 last_used_idx;
/* How many descriptors we've completed since last need_notify(). */
u32 completed;
/* The vring (note: it may contain user pointers!) */
struct vring vring;
/* The function to call to notify the guest about added buffers */
void (*notify)(struct vringh *);
};
/**
* struct vringh_config_ops - ops for creating a host vring from a virtio driver
* @find_vrhs: find the host vrings and instantiate them
* vdev: the virtio_device
* nhvrs: the number of host vrings to find
* hvrs: on success, includes new host vrings
* callbacks: array of driver callbacks, for each host vring
* include a NULL entry for vqs that do not need a callback
* Returns 0 on success or error status
* @del_vrhs: free the host vrings found by find_vrhs().
*/
struct virtio_device;
typedef void vrh_callback_t(struct virtio_device *, struct vringh *);
struct vringh_config_ops {
int (*find_vrhs)(struct virtio_device *vdev, unsigned nhvrs,
struct vringh *vrhs[], vrh_callback_t *callbacks[]);
void (*del_vrhs)(struct virtio_device *vdev);
};
/* The memory the vring can access, and what offset to apply. */
struct vringh_range {
u64 start, end_incl;
u64 offset;
};
/**
* struct vringh_iov - iovec mangler.
*
* Mangles iovec in place, and restores it.
* Remaining data is iov + i, of used - i elements.
*/
struct vringh_iov {
struct iovec *iov;
size_t consumed; /* Within iov[i] */
unsigned i, used, max_num;
};
/**
* struct vringh_kiov - kvec mangler.
*
* Mangles kvec in place, and restores it.
* Remaining data is iov + i, of used - i elements.
*/
struct vringh_kiov {
struct kvec *iov;
size_t consumed; /* Within iov[i] */
unsigned i, used, max_num;
};
/* Flag on max_num to indicate we're kmalloced. */
#define VRINGH_IOV_ALLOCATED 0x8000000
/* Helpers for userspace vrings. */
int vringh_init_user(struct vringh *vrh, u32 features,
unsigned int num, bool weak_barriers,
struct vring_desc __user *desc,
struct vring_avail __user *avail,
struct vring_used __user *used);
static inline void vringh_iov_init(struct vringh_iov *iov,
struct iovec *iovec, unsigned num)
{
iov->used = iov->i = 0;
iov->consumed = 0;
iov->max_num = num;
iov->iov = iovec;
}
static inline void vringh_iov_reset(struct vringh_iov *iov)
{
iov->iov[iov->i].iov_len += iov->consumed;
iov->iov[iov->i].iov_base -= iov->consumed;
iov->consumed = 0;
iov->i = 0;
}
static inline void vringh_iov_cleanup(struct vringh_iov *iov)
{
if (iov->max_num & VRINGH_IOV_ALLOCATED)
kfree(iov->iov);
iov->max_num = iov->used = iov->i = iov->consumed = 0;
iov->iov = NULL;
}
/* Convert a descriptor into iovecs. */
int vringh_getdesc_user(struct vringh *vrh,
struct vringh_iov *riov,
struct vringh_iov *wiov,
bool (*getrange)(struct vringh *vrh,
u64 addr, struct vringh_range *r),
u16 *head);
/* Copy bytes from readable vsg, consuming it (and incrementing wiov->i). */
ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len);
/* Copy bytes into writable vsg, consuming it (and incrementing wiov->i). */
ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
const void *src, size_t len);
/* Mark a descriptor as used. */
int vringh_complete_user(struct vringh *vrh, u16 head, u32 len);
int vringh_complete_multi_user(struct vringh *vrh,
const struct vring_used_elem used[],
unsigned num_used);
/* Pretend we've never seen descriptor (for easy error handling). */
void vringh_abandon_user(struct vringh *vrh, unsigned int num);
/* Do we need to fire the eventfd to notify the other side? */
int vringh_need_notify_user(struct vringh *vrh);
bool vringh_notify_enable_user(struct vringh *vrh);
void vringh_notify_disable_user(struct vringh *vrh);
/* Helpers for kernelspace vrings. */
int vringh_init_kern(struct vringh *vrh, u32 features,
unsigned int num, bool weak_barriers,
struct vring_desc *desc,
struct vring_avail *avail,
struct vring_used *used);
static inline void vringh_kiov_init(struct vringh_kiov *kiov,
struct kvec *kvec, unsigned num)
{
kiov->used = kiov->i = 0;
kiov->consumed = 0;
kiov->max_num = num;
kiov->iov = kvec;
}
static inline void vringh_kiov_reset(struct vringh_kiov *kiov)
{
kiov->iov[kiov->i].iov_len += kiov->consumed;
kiov->iov[kiov->i].iov_base -= kiov->consumed;
kiov->consumed = 0;
kiov->i = 0;
}
static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov)
{
if (kiov->max_num & VRINGH_IOV_ALLOCATED)
kfree(kiov->iov);
kiov->max_num = kiov->used = kiov->i = kiov->consumed = 0;
kiov->iov = NULL;
}
int vringh_getdesc_kern(struct vringh *vrh,
struct vringh_kiov *riov,
struct vringh_kiov *wiov,
u16 *head,
gfp_t gfp);
ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len);
ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
const void *src, size_t len);
void vringh_abandon_kern(struct vringh *vrh, unsigned int num);
int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len);
bool vringh_notify_enable_kern(struct vringh *vrh);
void vringh_notify_disable_kern(struct vringh *vrh);
int vringh_need_notify_kern(struct vringh *vrh);
/* Notify the guest about buffers added to the used ring */
static inline void vringh_notify(struct vringh *vrh)
{
if (vrh->notify)
vrh->notify(vrh);
}
#endif /* _LINUX_VRINGH_H */
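For orientation, a hedged sketch of how a host-side user of this header might drain a kernel-addressable vring; it assumes the vringh was set up with vringh_init_kern(), and the reply handling and the name service_ring are invented for the example.

	/* Illustrative service loop using the kernelspace vringh helpers. */
	static void service_ring(struct vringh *vrh)
	{
		struct kvec rkvec[4], wkvec[4];
		struct vringh_kiov riov, wiov;
		char reply[64];
		u16 head;
		int err;

		for (;;) {
			vringh_kiov_init(&riov, rkvec, ARRAY_SIZE(rkvec));
			vringh_kiov_init(&wiov, wkvec, ARRAY_SIZE(wkvec));

			err = vringh_getdesc_kern(vrh, &riov, &wiov, &head, GFP_KERNEL);
			if (err <= 0)		/* 0: ring empty, <0: error */
				break;

			/* Pull the request bytes, push some reply bytes (made up here). */
			vringh_iov_pull_kern(&riov, reply, sizeof(reply));
			err = vringh_iov_push_kern(&wiov, reply, sizeof(reply));

			vringh_complete_kern(vrh, head, err > 0 ? err : 0);
		}

		if (vringh_need_notify_kern(vrh) > 0)
			vringh_notify(vrh);
	}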
...@@ -52,8 +52,8 @@ struct virtio_balloon_config ...@@ -52,8 +52,8 @@ struct virtio_balloon_config
#define VIRTIO_BALLOON_S_NR 6 #define VIRTIO_BALLOON_S_NR 6
struct virtio_balloon_stat { struct virtio_balloon_stat {
u16 tag; __u16 tag;
u64 val; __u64 val;
} __attribute__((packed)); } __attribute__((packed));
#endif /* _LINUX_VIRTIO_BALLOON_H */ #endif /* _LINUX_VIRTIO_BALLOON_H */
...@@ -38,5 +38,6 @@ ...@@ -38,5 +38,6 @@
#define VIRTIO_ID_SCSI 8 /* virtio scsi */ #define VIRTIO_ID_SCSI 8 /* virtio scsi */
#define VIRTIO_ID_9P 9 /* 9p virtio console */ #define VIRTIO_ID_9P 9 /* 9p virtio console */
#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */ #define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */
#define VIRTIO_ID_CAIF 12 /* Virtio caif */
#endif /* _LINUX_VIRTIO_IDS_H */ #endif /* _LINUX_VIRTIO_IDS_H */
...@@ -194,11 +194,14 @@ static int pack_sg_list(struct scatterlist *sg, int start, ...@@ -194,11 +194,14 @@ static int pack_sg_list(struct scatterlist *sg, int start,
if (s > count) if (s > count)
s = count; s = count;
BUG_ON(index > limit); BUG_ON(index > limit);
/* Make sure we don't terminate early. */
sg_unmark_end(&sg[index]);
sg_set_buf(&sg[index++], data, s); sg_set_buf(&sg[index++], data, s);
count -= s; count -= s;
data += s; data += s;
} }
if (index-start)
sg_mark_end(&sg[index - 1]);
return index-start; return index-start;
} }
...@@ -236,12 +239,17 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit, ...@@ -236,12 +239,17 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit,
s = rest_of_page(data); s = rest_of_page(data);
if (s > count) if (s > count)
s = count; s = count;
/* Make sure we don't terminate early. */
sg_unmark_end(&sg[index]);
sg_set_page(&sg[index++], pdata[i++], s, data_off); sg_set_page(&sg[index++], pdata[i++], s, data_off);
data_off = 0; data_off = 0;
data += s; data += s;
count -= s; count -= s;
nr_pages--; nr_pages--;
} }
if (index-start)
sg_mark_end(&sg[index - 1]);
return index - start; return index - start;
} }
...@@ -256,9 +264,10 @@ static int ...@@ -256,9 +264,10 @@ static int
p9_virtio_request(struct p9_client *client, struct p9_req_t *req) p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
{ {
int err; int err;
int in, out; int in, out, out_sgs, in_sgs;
unsigned long flags; unsigned long flags;
struct virtio_chan *chan = client->trans; struct virtio_chan *chan = client->trans;
struct scatterlist *sgs[2];
p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n"); p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n");
...@@ -266,14 +275,19 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) ...@@ -266,14 +275,19 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
req_retry: req_retry:
spin_lock_irqsave(&chan->lock, flags); spin_lock_irqsave(&chan->lock, flags);
out_sgs = in_sgs = 0;
/* Handle out VirtIO ring buffers */ /* Handle out VirtIO ring buffers */
out = pack_sg_list(chan->sg, 0, out = pack_sg_list(chan->sg, 0,
VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
if (out)
sgs[out_sgs++] = chan->sg;
in = pack_sg_list(chan->sg, out, in = pack_sg_list(chan->sg, out,
VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity);
if (in)
sgs[out_sgs + in_sgs++] = chan->sg + out;
err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
GFP_ATOMIC); GFP_ATOMIC);
if (err < 0) { if (err < 0) {
if (err == -ENOSPC) { if (err == -ENOSPC) {
...@@ -289,7 +303,7 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req) ...@@ -289,7 +303,7 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
} else { } else {
spin_unlock_irqrestore(&chan->lock, flags); spin_unlock_irqrestore(&chan->lock, flags);
p9_debug(P9_DEBUG_TRANS, p9_debug(P9_DEBUG_TRANS,
"virtio rpc add_buf returned failure\n"); "virtio rpc add_sgs returned failure\n");
return -EIO; return -EIO;
} }
} }
...@@ -351,11 +365,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, ...@@ -351,11 +365,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
char *uidata, char *uodata, int inlen, char *uidata, char *uodata, int inlen,
int outlen, int in_hdr_len, int kern_buf) int outlen, int in_hdr_len, int kern_buf)
{ {
int in, out, err; int in, out, err, out_sgs, in_sgs;
unsigned long flags; unsigned long flags;
int in_nr_pages = 0, out_nr_pages = 0; int in_nr_pages = 0, out_nr_pages = 0;
struct page **in_pages = NULL, **out_pages = NULL; struct page **in_pages = NULL, **out_pages = NULL;
struct virtio_chan *chan = client->trans; struct virtio_chan *chan = client->trans;
struct scatterlist *sgs[4];
p9_debug(P9_DEBUG_TRANS, "virtio request\n"); p9_debug(P9_DEBUG_TRANS, "virtio request\n");
...@@ -396,13 +411,22 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, ...@@ -396,13 +411,22 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
req->status = REQ_STATUS_SENT; req->status = REQ_STATUS_SENT;
req_retry_pinned: req_retry_pinned:
spin_lock_irqsave(&chan->lock, flags); spin_lock_irqsave(&chan->lock, flags);
out_sgs = in_sgs = 0;
/* out data */ /* out data */
out = pack_sg_list(chan->sg, 0, out = pack_sg_list(chan->sg, 0,
VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
if (out_pages) if (out)
sgs[out_sgs++] = chan->sg;
if (out_pages) {
sgs[out_sgs++] = chan->sg + out;
out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM, out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
out_pages, out_nr_pages, uodata, outlen); out_pages, out_nr_pages, uodata, outlen);
}
/* /*
* Take care of in data * Take care of in data
* For example TREAD have 11. * For example TREAD have 11.
...@@ -412,11 +436,17 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, ...@@ -412,11 +436,17 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
*/ */
in = pack_sg_list(chan->sg, out, in = pack_sg_list(chan->sg, out,
VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len);
if (in_pages) if (in)
sgs[out_sgs + in_sgs++] = chan->sg + out;
if (in_pages) {
sgs[out_sgs + in_sgs++] = chan->sg + out + in;
in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM, in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
in_pages, in_nr_pages, uidata, inlen); in_pages, in_nr_pages, uidata, inlen);
}
err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc, BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs));
err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
GFP_ATOMIC); GFP_ATOMIC);
if (err < 0) { if (err < 0) {
if (err == -ENOSPC) { if (err == -ENOSPC) {
...@@ -432,7 +462,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, ...@@ -432,7 +462,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
} else { } else {
spin_unlock_irqrestore(&chan->lock, flags); spin_unlock_irqrestore(&chan->lock, flags);
p9_debug(P9_DEBUG_TRANS, p9_debug(P9_DEBUG_TRANS,
"virtio rpc add_buf returned failure\n"); "virtio rpc add_sgs returned failure\n");
err = -EIO; err = -EIO;
goto err_out; goto err_out;
} }
......
...@@ -70,7 +70,7 @@ Running Lguest: ...@@ -70,7 +70,7 @@ Running Lguest:
- Run an lguest as root: - Run an lguest as root:
Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
--block=rootfile root=/dev/vda --block=rootfile root=/dev/vda
Explanation: Explanation:
......
all: test mod all: test mod
test: virtio_test test: virtio_test vringh_test
virtio_test: virtio_ring.o virtio_test.o virtio_test: virtio_ring.o virtio_test.o
CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -MMD vringh_test: vringh_test.o vringh.o virtio_ring.o
vpath %.c ../../drivers/virtio
CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE
vpath %.c ../../drivers/virtio ../../drivers/vhost
mod: mod:
${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test
.PHONY: all test mod clean .PHONY: all test mod clean
clean: clean:
${RM} *.o vhost_test/*.o vhost_test/.*.cmd \ ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
vhost_test/Module.symvers vhost_test/modules.order *.d vhost_test/Module.symvers vhost_test/modules.order *.d
-include *.d -include *.d
#if defined(__i386__) || defined(__x86_64__)
#define barrier() asm volatile("" ::: "memory")
#define mb() __sync_synchronize()
#define smp_mb() mb()
# define smp_rmb() barrier()
# define smp_wmb() barrier()
/* Weak barriers should be used. If not - it's a bug */
# define rmb() abort()
# define wmb() abort()
#else
#error Please fill in barrier macros
#endif
#ifndef BUG_H
#define BUG_H
#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
#define BUILD_BUG_ON(x)
#define BUG() abort()
#endif /* BUG_H */
#ifndef ERR_H
#define ERR_H
#define MAX_ERRNO 4095
#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
static inline void * __must_check ERR_PTR(long error)
{
return (void *) error;
}
static inline long __must_check PTR_ERR(const void *ptr)
{
return (long) ptr;
}
static inline long __must_check IS_ERR(const void *ptr)
{
return IS_ERR_VALUE((unsigned long)ptr);
}
static inline long __must_check IS_ERR_OR_NULL(const void *ptr)
{
return !ptr || IS_ERR_VALUE((unsigned long)ptr);
}
#endif /* ERR_H */
#define EXPORT_SYMBOL(sym)
#define EXPORT_SYMBOL_GPL(sym)
#define EXPORT_SYMBOL_GPL_FUTURE(sym)
#define EXPORT_UNUSED_SYMBOL(sym)
#define EXPORT_UNUSED_SYMBOL_GPL(sym)
#include "../../../include/linux/irqreturn.h"
#ifndef KERNEL_H
#define KERNEL_H
#include <stdbool.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdarg.h>
#include <linux/types.h>
#include <linux/printk.h>
#include <linux/bug.h>
#include <errno.h>
#include <unistd.h>
#include <asm/barrier.h>
#define CONFIG_SMP
#define PAGE_SIZE getpagesize()
#define PAGE_MASK (~(PAGE_SIZE-1))
typedef unsigned long long dma_addr_t;
typedef size_t __kernel_size_t;
struct page {
unsigned long long dummy;
};
/* Physical == Virtual */
#define virt_to_phys(p) ((unsigned long)p)
#define phys_to_virt(a) ((void *)(unsigned long)(a))
/* Page address: Virtual / 4K */
#define page_to_phys(p) ((dma_addr_t)(unsigned long)(p))
#define virt_to_page(p) ((struct page *)((unsigned long)p & PAGE_MASK))
#define offset_in_page(p) (((unsigned long)p) % PAGE_SIZE)
#define __printf(a,b) __attribute__((format(printf,a,b)))
typedef enum {
GFP_KERNEL,
GFP_ATOMIC,
__GFP_HIGHMEM,
__GFP_HIGH
} gfp_t;
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
extern void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
static inline void *kmalloc(size_t s, gfp_t gfp)
{
if (__kmalloc_fake)
return __kmalloc_fake;
return malloc(s);
}
static inline void kfree(void *p)
{
if (p >= __kfree_ignore_start && p < __kfree_ignore_end)
return;
free(p);
}
static inline void *krealloc(void *p, size_t s, gfp_t gfp)
{
return realloc(p, s);
}
static inline unsigned long __get_free_page(gfp_t gfp)
{
void *p;
posix_memalign(&p, PAGE_SIZE, PAGE_SIZE);
return (unsigned long)p;
}
static inline void free_page(unsigned long addr)
{
free((void *)addr);
}
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
#define uninitialized_var(x) x = x
# ifndef likely
# define likely(x) (__builtin_expect(!!(x), 1))
# endif
# ifndef unlikely
# define unlikely(x) (__builtin_expect(!!(x), 0))
# endif
#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#ifdef DEBUG
#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#else
#define pr_debug(format, ...) do {} while (0)
#endif
#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#endif /* KERNEL_H */
#include "../../../include/linux/kern_levels.h"
#define printk printf
#define vprintk vprintf
#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) int name = 0
#define __ratelimit(x) (*(x))
#ifndef SCATTERLIST_H
#define SCATTERLIST_H
#include <linux/kernel.h>
struct scatterlist {
unsigned long page_link;
unsigned int offset;
unsigned int length;
dma_addr_t dma_address;
};
/* Scatterlist helpers, stolen from linux/scatterlist.h */
#define sg_is_chain(sg) ((sg)->page_link & 0x01)
#define sg_is_last(sg) ((sg)->page_link & 0x02)
#define sg_chain_ptr(sg) \
((struct scatterlist *) ((sg)->page_link & ~0x03))
/**
* sg_assign_page - Assign a given page to an SG entry
* @sg: SG entry
* @page: The page
*
* Description:
* Assign page to sg entry. Also see sg_set_page(), the most commonly used
* variant.
*
**/
static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
{
unsigned long page_link = sg->page_link & 0x3;
/*
* In order for the low bit stealing approach to work, pages
* must be aligned at a 32-bit boundary as a minimum.
*/
BUG_ON((unsigned long) page & 0x03);
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
BUG_ON(sg_is_chain(sg));
#endif
sg->page_link = page_link | (unsigned long) page;
}
/**
* sg_set_page - Set sg entry to point at given page
* @sg: SG entry
* @page: The page
* @len: Length of data
* @offset: Offset into page
*
* Description:
* Use this function to set an sg entry pointing at a page, never assign
* the page directly. We encode sg table information in the lower bits
* of the page pointer. See sg_page() for looking up the page belonging
* to an sg entry.
*
**/
static inline void sg_set_page(struct scatterlist *sg, struct page *page,
unsigned int len, unsigned int offset)
{
sg_assign_page(sg, page);
sg->offset = offset;
sg->length = len;
}
static inline struct page *sg_page(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
BUG_ON(sg_is_chain(sg));
#endif
return (struct page *)((sg)->page_link & ~0x3);
}
/*
* Loop over each sg element, following the pointer to a new list if necessary
*/
#define for_each_sg(sglist, sg, nr, __i) \
for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
/**
* sg_chain - Chain two sglists together
* @prv: First scatterlist
* @prv_nents: Number of entries in prv
* @sgl: Second scatterlist
*
* Description:
* Links @prv@ and @sgl@ together, to form a longer scatterlist.
*
**/
static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
struct scatterlist *sgl)
{
/*
* offset and length are unused for chain entry. Clear them.
*/
prv[prv_nents - 1].offset = 0;
prv[prv_nents - 1].length = 0;
/*
* Set lowest bit to indicate a link pointer, and make sure to clear
* the termination bit if it happens to be set.
*/
prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02;
}
/**
* sg_mark_end - Mark the end of the scatterlist
* @sg: SG entry
*
* Description:
* Marks the passed in sg entry as the termination point for the sg
* table. A call to sg_next() on this entry will return NULL.
*
**/
static inline void sg_mark_end(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
#endif
/*
* Set termination bit, clear potential chain bit
*/
sg->page_link |= 0x02;
sg->page_link &= ~0x01;
}
/**
* sg_unmark_end - Undo setting the end of the scatterlist
* @sg: SG entry
*
* Description:
* Removes the termination marker from the given entry of the scatterlist.
*
**/
static inline void sg_unmark_end(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
#endif
sg->page_link &= ~0x02;
}
static inline struct scatterlist *sg_next(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
BUG_ON(sg->sg_magic != SG_MAGIC);
#endif
if (sg_is_last(sg))
return NULL;
sg++;
if (unlikely(sg_is_chain(sg)))
sg = sg_chain_ptr(sg);
return sg;
}
static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
{
memset(sgl, 0, sizeof(*sgl) * nents);
#ifdef CONFIG_DEBUG_SG
{
unsigned int i;
for (i = 0; i < nents; i++)
sgl[i].sg_magic = SG_MAGIC;
}
#endif
sg_mark_end(&sgl[nents - 1]);
}
static inline dma_addr_t sg_phys(struct scatterlist *sg)
{
return page_to_phys(sg_page(sg)) + sg->offset;
}
static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
unsigned int buflen)
{
sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
}
static inline void sg_init_one(struct scatterlist *sg,
const void *buf, unsigned int buflen)
{
sg_init_table(sg, 1);
sg_set_buf(sg, buf, buflen);
}
#endif /* SCATTERLIST_H */
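As a quick orientation for how these userspace scatterlist stubs are exercised (a hedged sketch, not part of the test harness; the names fill_and_walk, a and b are invented), a caller fills a table, which terminates the last entry, and then walks it with for_each_sg():

	/* Illustrative use of the scatterlist stubs above. */
	static void fill_and_walk(char *a, char *b, size_t alen, size_t blen)
	{
		struct scatterlist sg[2], *s;
		unsigned int i;

		sg_init_table(sg, 2);		/* zeroes the table, marks sg[1] as the end */
		sg_set_buf(&sg[0], a, alen);
		sg_set_buf(&sg[1], b, blen);

		for_each_sg(sg, s, 2, i)
			pr_debug("entry %u: %u bytes\n", i, s->length);
	}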
#ifndef TYPES_H
#define TYPES_H
#include <stdint.h>
#define __force
#define __user
#define __must_check
#define __cold
typedef uint64_t u64;
typedef int64_t s64;
typedef uint32_t u32;
typedef int32_t s32;
typedef uint16_t u16;
typedef int16_t s16;
typedef uint8_t u8;
typedef int8_t s8;
typedef uint64_t __u64;
typedef int64_t __s64;
typedef uint32_t __u32;
typedef int32_t __s32;
typedef uint16_t __u16;
typedef int16_t __s16;
typedef uint8_t __u8;
typedef int8_t __s8;
#endif /* TYPES_H */
#ifndef UACCESS_H
#define UACCESS_H
extern void *__user_addr_min, *__user_addr_max;
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
static inline void __chk_user_ptr(const volatile void *p, size_t size)
{
assert(p >= __user_addr_min && p + size <= __user_addr_max);
}
#define put_user(x, ptr) \
({ \
typeof(ptr) __pu_ptr = (ptr); \
__chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \
ACCESS_ONCE(*(__pu_ptr)) = x; \
0; \
})
#define get_user(x, ptr) \
({ \
typeof(ptr) __pu_ptr = (ptr); \
__chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \
x = ACCESS_ONCE(*(__pu_ptr)); \
0; \
})
static void volatile_memcpy(volatile char *to, const volatile char *from,
unsigned long n)
{
while (n--)
*(to++) = *(from++);
}
static inline int copy_from_user(void *to, const void __user volatile *from,
unsigned long n)
{
__chk_user_ptr(from, n);
volatile_memcpy(to, from, n);
return 0;
}
static inline int copy_to_user(void __user volatile *to, const void *from,
unsigned long n)
{
__chk_user_ptr(to, n);
volatile_memcpy(to, from, n);
return 0;
}
#endif /* UACCESS_H */
#include <linux/kernel.h>
#include "../../../include/linux/uio.h"
#ifndef LINUX_VIRTIO_H #ifndef LINUX_VIRTIO_H
#define LINUX_VIRTIO_H #define LINUX_VIRTIO_H
#include <linux/scatterlist.h>
#include <stdbool.h> #include <linux/kernel.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <linux/types.h>
#include <errno.h>
typedef unsigned long long dma_addr_t;
struct scatterlist {
unsigned long page_link;
unsigned int offset;
unsigned int length;
dma_addr_t dma_address;
};
struct page {
unsigned long long dummy;
};
#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
/* Physical == Virtual */
#define virt_to_phys(p) ((unsigned long)p)
#define phys_to_virt(a) ((void *)(unsigned long)(a))
/* Page address: Virtual / 4K */
#define virt_to_page(p) ((struct page*)((virt_to_phys(p) / 4096) * \
sizeof(struct page)))
#define offset_in_page(p) (((unsigned long)p) % 4096)
#define sg_phys(sg) ((sg->page_link & ~0x3) / sizeof(struct page) * 4096 + \
sg->offset)
static inline void sg_mark_end(struct scatterlist *sg)
{
/*
* Set termination bit, clear potential chain bit
*/
sg->page_link |= 0x02;
sg->page_link &= ~0x01;
}
static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
{
memset(sgl, 0, sizeof(*sgl) * nents);
sg_mark_end(&sgl[nents - 1]);
}
static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
{
unsigned long page_link = sg->page_link & 0x3;
/*
* In order for the low bit stealing approach to work, pages
* must be aligned at a 32-bit boundary as a minimum.
*/
BUG_ON((unsigned long) page & 0x03);
sg->page_link = page_link | (unsigned long) page;
}
static inline void sg_set_page(struct scatterlist *sg, struct page *page,
unsigned int len, unsigned int offset)
{
sg_assign_page(sg, page);
sg->offset = offset;
sg->length = len;
}
static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
unsigned int buflen)
{
sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
}
static inline void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen)
{
sg_init_table(sg, 1);
sg_set_buf(sg, buf, buflen);
}
typedef __u16 u16;
typedef enum {
GFP_KERNEL,
GFP_ATOMIC,
} gfp_t;
typedef enum {
IRQ_NONE,
IRQ_HANDLED
} irqreturn_t;
static inline void *kmalloc(size_t s, gfp_t gfp)
{
return malloc(s);
}
static inline void kfree(void *p)
{
free(p);
}
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
#define uninitialized_var(x) x = x
# ifndef likely
# define likely(x) (__builtin_expect(!!(x), 1))
# endif
# ifndef unlikely
# define unlikely(x) (__builtin_expect(!!(x), 0))
# endif
#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#ifdef DEBUG
#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#else
#define pr_debug(format, ...) do {} while (0)
#endif
#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
/* TODO: empty stubs for now. Broken but enough for virtio_ring.c */ /* TODO: empty stubs for now. Broken but enough for virtio_ring.c */
#define list_add_tail(a, b) do {} while (0) #define list_add_tail(a, b) do {} while (0)
...@@ -131,6 +11,7 @@ static inline void kfree(void *p) ...@@ -131,6 +11,7 @@ static inline void kfree(void *p)
#define BITS_PER_BYTE 8 #define BITS_PER_BYTE 8
#define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE) #define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE)
#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
/* TODO: Not atomic as it should be: /* TODO: Not atomic as it should be:
* we don't use this for anything important. */ * we don't use this for anything important. */
static inline void clear_bit(int nr, volatile unsigned long *addr) static inline void clear_bit(int nr, volatile unsigned long *addr)
...@@ -145,10 +26,6 @@ static inline int test_bit(int nr, const volatile unsigned long *addr) ...@@ -145,10 +26,6 @@ static inline int test_bit(int nr, const volatile unsigned long *addr)
{ {
return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
} }
/* The only feature we care to support */
#define virtio_has_feature(dev, feature) \
test_bit((feature), (dev)->features)
/* end of stubs */ /* end of stubs */
struct virtio_device { struct virtio_device {
...@@ -163,39 +40,32 @@ struct virtqueue { ...@@ -163,39 +40,32 @@ struct virtqueue {
void (*callback)(struct virtqueue *vq); void (*callback)(struct virtqueue *vq);
const char *name; const char *name;
struct virtio_device *vdev; struct virtio_device *vdev;
unsigned int index;
unsigned int num_free;
void *priv; void *priv;
}; };
#define EXPORT_SYMBOL_GPL(__EXPORT_SYMBOL_GPL_name) \
void __EXPORT_SYMBOL_GPL##__EXPORT_SYMBOL_GPL_name() { \
}
#define MODULE_LICENSE(__MODULE_LICENSE_value) \ #define MODULE_LICENSE(__MODULE_LICENSE_value) \
const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value
#define CONFIG_SMP
#if defined(__i386__) || defined(__x86_64__)
#define barrier() asm volatile("" ::: "memory")
#define mb() __sync_synchronize()
#define smp_mb() mb()
# define smp_rmb() barrier()
# define smp_wmb() barrier()
/* Weak barriers should be used. If not - it's a bug */
# define rmb() abort()
# define wmb() abort()
#else
#error Please fill in barrier macros
#endif
/* Interfaces exported by virtio_ring. */ /* Interfaces exported by virtio_ring. */
int virtqueue_add_buf(struct virtqueue *vq, int virtqueue_add_sgs(struct virtqueue *vq,
struct scatterlist sg[], struct scatterlist *sgs[],
unsigned int out_num, unsigned int out_sgs,
unsigned int in_num, unsigned int in_sgs,
void *data, void *data,
gfp_t gfp); gfp_t gfp);
int virtqueue_add_outbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp);
int virtqueue_add_inbuf(struct virtqueue *vq,
struct scatterlist sg[], unsigned int num,
void *data,
gfp_t gfp);
void virtqueue_kick(struct virtqueue *vq); void virtqueue_kick(struct virtqueue *vq);
void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len);
...@@ -206,7 +76,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq); ...@@ -206,7 +76,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
bool virtqueue_enable_cb_delayed(struct virtqueue *vq); bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
void *virtqueue_detach_unused_buf(struct virtqueue *vq); void *virtqueue_detach_unused_buf(struct virtqueue *vq);
struct virtqueue *vring_new_virtqueue(unsigned int num, struct virtqueue *vring_new_virtqueue(unsigned int index,
unsigned int num,
unsigned int vring_align, unsigned int vring_align,
struct virtio_device *vdev, struct virtio_device *vdev,
bool weak_barriers, bool weak_barriers,
......
#define VIRTIO_TRANSPORT_F_START 28
#define VIRTIO_TRANSPORT_F_END 32
#define virtio_has_feature(dev, feature) \
test_bit((feature), (dev)->features)
#include "../../../include/linux/virtio_ring.h"
#include "../../../include/linux/vringh.h"
#include "../../../../include/uapi/linux/virtio_config.h"
#ifndef VIRTIO_RING_H
#define VIRTIO_RING_H
#include "../../../../include/uapi/linux/virtio_ring.h"
#endif /* VIRTIO_RING_H */
...@@ -10,11 +10,15 @@ ...@@ -10,11 +10,15 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include <fcntl.h> #include <fcntl.h>
#include <stdbool.h>
#include <linux/vhost.h> #include <linux/vhost.h>
#include <linux/virtio.h> #include <linux/virtio.h>
#include <linux/virtio_ring.h> #include <linux/virtio_ring.h>
#include "../../drivers/vhost/test.h" #include "../../drivers/vhost/test.h"
/* Unused */
void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
struct vq_info { struct vq_info {
int kick; int kick;
int call; int call;
...@@ -92,7 +96,8 @@ static void vq_info_add(struct vdev_info *dev, int num) ...@@ -92,7 +96,8 @@ static void vq_info_add(struct vdev_info *dev, int num)
assert(r >= 0); assert(r >= 0);
memset(info->ring, 0, vring_size(num, 4096)); memset(info->ring, 0, vring_size(num, 4096));
vring_init(&info->vring, num, info->ring, 4096); vring_init(&info->vring, num, info->ring, 4096);
info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev, info->vq = vring_new_virtqueue(info->idx,
info->vring.num, 4096, &dev->vdev,
true, info->ring, true, info->ring,
vq_notify, vq_callback, "test"); vq_notify, vq_callback, "test");
assert(info->vq); assert(info->vq);
...@@ -161,9 +166,9 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq, ...@@ -161,9 +166,9 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq,
do { do {
if (started < bufs) { if (started < bufs) {
sg_init_one(&sl, dev->buf, dev->buf_size); sg_init_one(&sl, dev->buf, dev->buf_size);
r = virtqueue_add_buf(vq->vq, &sl, 1, 0, r = virtqueue_add_outbuf(vq->vq, &sl, 1,
dev->buf + started, dev->buf + started,
GFP_ATOMIC); GFP_ATOMIC);
if (likely(r == 0)) { if (likely(r == 0)) {
++started; ++started;
virtqueue_kick(vq->vq); virtqueue_kick(vq->vq);
......
/* Simple test of virtio code, entirely in userspace. */
#define _GNU_SOURCE
#include <sched.h>
#include <err.h>
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/virtio.h>
#include <linux/vringh.h>
#include <linux/virtio_ring.h>
#include <linux/uaccess.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <fcntl.h>
#define USER_MEM (1024*1024)
void *__user_addr_min, *__user_addr_max;
void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
static u64 user_addr_offset;
#define RINGSIZE 256
#define ALIGN 4096
static void never_notify_host(struct virtqueue *vq)
{
abort();
}
static void never_callback_guest(struct virtqueue *vq)
{
abort();
}
static bool getrange_iov(struct vringh *vrh, u64 addr, struct vringh_range *r)
{
if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
return false;
if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
return false;
r->start = (u64)(unsigned long)__user_addr_min - user_addr_offset;
r->end_incl = (u64)(unsigned long)__user_addr_max - 1 - user_addr_offset;
r->offset = user_addr_offset;
return true;
}
/* We return single byte ranges. */
static bool getrange_slow(struct vringh *vrh, u64 addr, struct vringh_range *r)
{
if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
return false;
if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
return false;
r->start = addr;
r->end_incl = r->start;
r->offset = user_addr_offset;
return true;
}
struct guest_virtio_device {
struct virtio_device vdev;
int to_host_fd;
unsigned long notifies;
};
static void parallel_notify_host(struct virtqueue *vq)
{
struct guest_virtio_device *gvdev;
gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev);
write(gvdev->to_host_fd, "", 1);
gvdev->notifies++;
}
static void no_notify_host(struct virtqueue *vq)
{
}
#define NUM_XFERS (10000000)
/* We aim for two "distant" cpus. */
static void find_cpus(unsigned int *first, unsigned int *last)
{
unsigned int i;
*first = -1U;
*last = 0;
for (i = 0; i < 4096; i++) {
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(i, &set);
if (sched_setaffinity(getpid(), sizeof(set), &set) == 0) {
if (i < *first)
*first = i;
if (i > *last)
*last = i;
}
}
}
/* Opencoded version for fast mode */
static inline int vringh_get_head(struct vringh *vrh, u16 *head)
{
u16 avail_idx, i;
int err;
err = get_user(avail_idx, &vrh->vring.avail->idx);
if (err)
return err;
if (vrh->last_avail_idx == avail_idx)
return 0;
/* Only get avail ring entries after they have been exposed by guest. */
virtio_rmb(vrh->weak_barriers);
i = vrh->last_avail_idx & (vrh->vring.num - 1);
err = get_user(*head, &vrh->vring.avail->ring[i]);
if (err)
return err;
vrh->last_avail_idx++;
return 1;
}
static int parallel_test(unsigned long features,
bool (*getrange)(struct vringh *vrh,
u64 addr, struct vringh_range *r),
bool fast_vringh)
{
void *host_map, *guest_map;
int fd, mapsize, to_guest[2], to_host[2];
unsigned long xfers = 0, notifies = 0, receives = 0;
unsigned int first_cpu, last_cpu;
cpu_set_t cpu_set;
char buf[128];
/* Create real file to mmap. */
fd = open("/tmp/vringh_test-file", O_RDWR|O_CREAT|O_TRUNC, 0600);
if (fd < 0)
err(1, "Opening /tmp/vringh_test-file");
/* Extra room at the end for some data, and indirects */
mapsize = vring_size(RINGSIZE, ALIGN)
+ RINGSIZE * 2 * sizeof(int)
+ RINGSIZE * 6 * sizeof(struct vring_desc);
mapsize = (mapsize + getpagesize() - 1) & ~(getpagesize() - 1);
ftruncate(fd, mapsize);
/* Parent and child use separate addresses, to check our mapping logic! */
host_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
guest_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
pipe(to_guest);
pipe(to_host);
CPU_ZERO(&cpu_set);
find_cpus(&first_cpu, &last_cpu);
printf("Using CPUS %u and %u\n", first_cpu, last_cpu);
fflush(stdout);
if (fork() != 0) {
struct vringh vrh;
int status, err, rlen = 0;
char rbuf[5];
/* We are the host: never access guest addresses! */
munmap(guest_map, mapsize);
__user_addr_min = host_map;
__user_addr_max = __user_addr_min + mapsize;
user_addr_offset = host_map - guest_map;
assert(user_addr_offset);
close(to_guest[0]);
close(to_host[1]);
vring_init(&vrh.vring, RINGSIZE, host_map, ALIGN);
vringh_init_user(&vrh, features, RINGSIZE, true,
vrh.vring.desc, vrh.vring.avail, vrh.vring.used);
CPU_SET(first_cpu, &cpu_set);
if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set))
errx(1, "Could not set affinity to cpu %u", first_cpu);
while (xfers < NUM_XFERS) {
struct iovec host_riov[2], host_wiov[2];
struct vringh_iov riov, wiov;
u16 head, written;
if (fast_vringh) {
for (;;) {
err = vringh_get_head(&vrh, &head);
if (err != 0)
break;
err = vringh_need_notify_user(&vrh);
if (err < 0)
errx(1, "vringh_need_notify_user: %i",
err);
if (err) {
write(to_guest[1], "", 1);
notifies++;
}
}
if (err != 1)
errx(1, "vringh_get_head");
written = 0;
goto complete;
} else {
vringh_iov_init(&riov,
host_riov,
ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov,
host_wiov,
ARRAY_SIZE(host_wiov));
err = vringh_getdesc_user(&vrh, &riov, &wiov,
getrange, &head);
}
if (err == 0) {
err = vringh_need_notify_user(&vrh);
if (err < 0)
errx(1, "vringh_need_notify_user: %i",
err);
if (err) {
write(to_guest[1], "", 1);
notifies++;
}
if (!vringh_notify_enable_user(&vrh))
continue;
/* Swallow all notifies at once. */
if (read(to_host[0], buf, sizeof(buf)) < 1)
break;
vringh_notify_disable_user(&vrh);
receives++;
continue;
}
if (err != 1)
errx(1, "vringh_getdesc_user: %i", err);
/* We simply copy bytes. */
if (riov.used) {
rlen = vringh_iov_pull_user(&riov, rbuf,
sizeof(rbuf));
if (rlen != 4)
errx(1, "vringh_iov_pull_user: %i",
rlen);
assert(riov.i == riov.used);
written = 0;
} else {
err = vringh_iov_push_user(&wiov, rbuf, rlen);
if (err != rlen)
errx(1, "vringh_iov_push_user: %i",
err);
assert(wiov.i == wiov.used);
written = err;
}
complete:
xfers++;
err = vringh_complete_user(&vrh, head, written);
if (err != 0)
errx(1, "vringh_complete_user: %i", err);
}
err = vringh_need_notify_user(&vrh);
if (err < 0)
errx(1, "vringh_need_notify_user: %i", err);
if (err) {
write(to_guest[1], "", 1);
notifies++;
}
wait(&status);
if (!WIFEXITED(status))
errx(1, "Child died with signal %i?", WTERMSIG(status));
if (WEXITSTATUS(status) != 0)
errx(1, "Child exited %i?", WEXITSTATUS(status));
printf("Host: notified %lu, pinged %lu\n", notifies, receives);
return 0;
} else {
struct guest_virtio_device gvdev;
struct virtqueue *vq;
unsigned int *data;
struct vring_desc *indirects;
unsigned int finished = 0;
/* We pass sg[]s pointing into here, but we need RINGSIZE+1 */
data = guest_map + vring_size(RINGSIZE, ALIGN);
indirects = (void *)data + (RINGSIZE + 1) * 2 * sizeof(int);
/* We are the guest. */
munmap(host_map, mapsize);
close(to_guest[1]);
close(to_host[0]);
gvdev.vdev.features[0] = features;
gvdev.to_host_fd = to_host[1];
gvdev.notifies = 0;
CPU_SET(first_cpu, &cpu_set);
if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set))
err(1, "Could not set affinity to cpu %u", first_cpu);
vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev, true,
guest_map, fast_vringh ? no_notify_host
: parallel_notify_host,
never_callback_guest, "guest vq");
/* Don't kfree indirects. */
__kfree_ignore_start = indirects;
__kfree_ignore_end = indirects + RINGSIZE * 6;
while (xfers < NUM_XFERS) {
struct scatterlist sg[4];
unsigned int num_sg, len;
int *dbuf, err;
bool output = !(xfers % 2);
/* Consume bufs. */
while ((dbuf = virtqueue_get_buf(vq, &len)) != NULL) {
if (len == 4)
assert(*dbuf == finished - 1);
else if (!fast_vringh)
assert(*dbuf == finished);
finished++;
}
/* Produce a buffer. */
dbuf = data + (xfers % (RINGSIZE + 1));
if (output)
*dbuf = xfers;
else
*dbuf = -1;
switch ((xfers / sizeof(*dbuf)) % 4) {
case 0:
/* Nasty three-element sg list. */
sg_init_table(sg, num_sg = 3);
sg_set_buf(&sg[0], (void *)dbuf, 1);
sg_set_buf(&sg[1], (void *)dbuf + 1, 2);
sg_set_buf(&sg[2], (void *)dbuf + 3, 1);
break;
case 1:
sg_init_table(sg, num_sg = 2);
sg_set_buf(&sg[0], (void *)dbuf, 1);
sg_set_buf(&sg[1], (void *)dbuf + 1, 3);
break;
case 2:
sg_init_table(sg, num_sg = 1);
sg_set_buf(&sg[0], (void *)dbuf, 4);
break;
case 3:
sg_init_table(sg, num_sg = 4);
sg_set_buf(&sg[0], (void *)dbuf, 1);
sg_set_buf(&sg[1], (void *)dbuf + 1, 1);
sg_set_buf(&sg[2], (void *)dbuf + 2, 1);
sg_set_buf(&sg[3], (void *)dbuf + 3, 1);
break;
}
/* May allocate an indirect, so force it to allocate
* user addr */
__kmalloc_fake = indirects + (xfers % RINGSIZE) * 4;
if (output)
err = virtqueue_add_outbuf(vq, sg, num_sg, dbuf,
GFP_KERNEL);
else
err = virtqueue_add_inbuf(vq, sg, num_sg,
dbuf, GFP_KERNEL);
if (err == -ENOSPC) {
if (!virtqueue_enable_cb_delayed(vq))
continue;
/* Swallow all notifies at once. */
if (read(to_guest[0], buf, sizeof(buf)) < 1)
break;
receives++;
virtqueue_disable_cb(vq);
continue;
}
if (err)
errx(1, "virtqueue_add_in/outbuf: %i", err);
xfers++;
virtqueue_kick(vq);
}
/* Any extra? */
while (finished != xfers) {
int *dbuf;
unsigned int len;
/* Consume bufs. */
dbuf = virtqueue_get_buf(vq, &len);
if (dbuf) {
if (len == 4)
assert(*dbuf == finished - 1);
else
assert(len == 0);
finished++;
continue;
}
if (!virtqueue_enable_cb_delayed(vq))
continue;
if (read(to_guest[0], buf, sizeof(buf)) < 1)
break;
receives++;
virtqueue_disable_cb(vq);
}
printf("Guest: notified %lu, pinged %lu\n",
gvdev.notifies, receives);
vring_del_virtqueue(vq);
return 0;
}
}
int main(int argc, char *argv[])
{
struct virtio_device vdev;
struct virtqueue *vq;
struct vringh vrh;
struct scatterlist guest_sg[RINGSIZE], *sgs[2];
struct iovec host_riov[2], host_wiov[2];
struct vringh_iov riov, wiov;
struct vring_used_elem used[RINGSIZE];
char buf[28];
u16 head;
int err;
unsigned i;
void *ret;
bool (*getrange)(struct vringh *vrh, u64 addr, struct vringh_range *r);
bool fast_vringh = false, parallel = false;
getrange = getrange_iov;
vdev.features[0] = 0;
while (argv[1]) {
if (strcmp(argv[1], "--indirect") == 0)
vdev.features[0] |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
else if (strcmp(argv[1], "--eventidx") == 0)
vdev.features[0] |= (1 << VIRTIO_RING_F_EVENT_IDX);
else if (strcmp(argv[1], "--slow-range") == 0)
getrange = getrange_slow;
else if (strcmp(argv[1], "--fast-vringh") == 0)
fast_vringh = true;
else if (strcmp(argv[1], "--parallel") == 0)
parallel = true;
else
errx(1, "Unknown arg %s", argv[1]);
argv++;
}
if (parallel)
return parallel_test(vdev.features[0], getrange, fast_vringh);
if (posix_memalign(&__user_addr_min, PAGE_SIZE, USER_MEM) != 0)
abort();
__user_addr_max = __user_addr_min + USER_MEM;
memset(__user_addr_min, 0, vring_size(RINGSIZE, ALIGN));
/* Set up guest side. */
vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true,
__user_addr_min,
never_notify_host, never_callback_guest,
"guest vq");
/* Set up host side. */
vring_init(&vrh.vring, RINGSIZE, __user_addr_min, ALIGN);
vringh_init_user(&vrh, vdev.features[0], RINGSIZE, true,
vrh.vring.desc, vrh.vring.avail, vrh.vring.used);
/* No descriptor to get yet... */
err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
if (err != 0)
errx(1, "vringh_getdesc_user: %i", err);
/* Guest puts in a descriptor. */
memcpy(__user_addr_max - 1, "a", 1);
sg_init_table(guest_sg, 1);
sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1);
sg_init_table(guest_sg+1, 1);
sg_set_buf(&guest_sg[1], __user_addr_max - 3, 2);
sgs[0] = &guest_sg[0];
sgs[1] = &guest_sg[1];
/* May allocate an indirect, so force it to allocate user addr */
__kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN);
err = virtqueue_add_sgs(vq, sgs, 1, 1, &err, GFP_KERNEL);
if (err)
errx(1, "virtqueue_add_sgs: %i", err);
__kmalloc_fake = NULL;
/* Host retrieves it. */
vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
if (err != 1)
errx(1, "vringh_getdesc_user: %i", err);
assert(riov.used == 1);
assert(riov.iov[0].iov_base == __user_addr_max - 1);
assert(riov.iov[0].iov_len == 1);
if (getrange != getrange_slow) {
assert(wiov.used == 1);
assert(wiov.iov[0].iov_base == __user_addr_max - 3);
assert(wiov.iov[0].iov_len == 2);
} else {
assert(wiov.used == 2);
assert(wiov.iov[0].iov_base == __user_addr_max - 3);
assert(wiov.iov[0].iov_len == 1);
assert(wiov.iov[1].iov_base == __user_addr_max - 2);
assert(wiov.iov[1].iov_len == 1);
}
err = vringh_iov_pull_user(&riov, buf, 5);
if (err != 1)
errx(1, "vringh_iov_pull_user: %i", err);
assert(buf[0] == 'a');
assert(riov.i == 1);
assert(vringh_iov_pull_user(&riov, buf, 5) == 0);
memcpy(buf, "bcdef", 5);
err = vringh_iov_push_user(&wiov, buf, 5);
if (err != 2)
errx(1, "vringh_iov_push_user: %i", err);
assert(memcmp(__user_addr_max - 3, "bc", 2) == 0);
assert(wiov.i == wiov.used);
assert(vringh_iov_push_user(&wiov, buf, 5) == 0);
/* Host is done. */
err = vringh_complete_user(&vrh, head, err);
if (err != 0)
errx(1, "vringh_complete_user: %i", err);
/* Guest should see used token now. */
__kfree_ignore_start = __user_addr_min + vring_size(RINGSIZE, ALIGN);
__kfree_ignore_end = __kfree_ignore_start + 1;
ret = virtqueue_get_buf(vq, &i);
if (ret != &err)
errx(1, "virtqueue_get_buf: %p", ret);
assert(i == 2);
/* Guest puts in a huge descriptor. */
sg_init_table(guest_sg, RINGSIZE);
for (i = 0; i < RINGSIZE; i++) {
sg_set_buf(&guest_sg[i],
__user_addr_max - USER_MEM/4, USER_MEM/4);
}
/* Fill contents with recognisable garbage. */
for (i = 0; i < USER_MEM/4; i++)
((char *)__user_addr_max - USER_MEM/4)[i] = i;
/* This will allocate an indirect, so force it to allocate user addr */
__kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN);
err = virtqueue_add_outbuf(vq, guest_sg, RINGSIZE, &err, GFP_KERNEL);
if (err)
errx(1, "virtqueue_add_outbuf (large): %i", err);
__kmalloc_fake = NULL;
/* Host picks it up (allocates new iov). */
vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
if (err != 1)
errx(1, "vringh_getdesc_user: %i", err);
assert(riov.max_num & VRINGH_IOV_ALLOCATED);
assert(riov.iov != host_riov);
if (getrange != getrange_slow)
assert(riov.used == RINGSIZE);
else
assert(riov.used == RINGSIZE * USER_MEM/4);
assert(!(wiov.max_num & VRINGH_IOV_ALLOCATED));
assert(wiov.used == 0);
/* Pull data back out (in odd chunks), should be as expected. */
for (i = 0; i < RINGSIZE * USER_MEM/4; i += 3) {
err = vringh_iov_pull_user(&riov, buf, 3);
if (err != 3 && i + err != RINGSIZE * USER_MEM/4)
errx(1, "vringh_iov_pull_user large: %i", err);
assert(buf[0] == (char)i);
assert(err < 2 || buf[1] == (char)(i + 1));
assert(err < 3 || buf[2] == (char)(i + 2));
}
assert(riov.i == riov.used);
vringh_iov_cleanup(&riov);
vringh_iov_cleanup(&wiov);
/* Complete using multi interface, just because we can. */
used[0].id = head;
used[0].len = 0;
err = vringh_complete_multi_user(&vrh, used, 1);
if (err)
errx(1, "vringh_complete_multi_user(1): %i", err);
/* Free up those descriptors. */
ret = virtqueue_get_buf(vq, &i);
if (ret != &err)
errx(1, "virtqueue_get_buf: %p", ret);
/* Add lots of descriptors. */
sg_init_table(guest_sg, 1);
sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1);
for (i = 0; i < RINGSIZE; i++) {
err = virtqueue_add_outbuf(vq, guest_sg, 1, &err, GFP_KERNEL);
if (err)
errx(1, "virtqueue_add_outbuf (multiple): %i", err);
}
/* Now get many, and consume them all at once. */
vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
for (i = 0; i < RINGSIZE; i++) {
err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
if (err != 1)
errx(1, "vringh_getdesc_user: %i", err);
used[i].id = head;
used[i].len = 0;
}
/* Make sure it wraps around ring, to test! */
assert(vrh.vring.used->idx % RINGSIZE != 0);
err = vringh_complete_multi_user(&vrh, used, RINGSIZE);
if (err)
errx(1, "vringh_complete_multi_user: %i", err);
/* Free those buffers. */
for (i = 0; i < RINGSIZE; i++) {
unsigned len;
assert(virtqueue_get_buf(vq, &len) != NULL);
}
/* Test weird (but legal!) indirect. */
if (vdev.features[0] & (1 << VIRTIO_RING_F_INDIRECT_DESC)) {
char *data = __user_addr_max - USER_MEM/4;
struct vring_desc *d = __user_addr_max - USER_MEM/2;
struct vring vring;
/* Force creation of direct, which we modify. */
vdev.features[0] &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true,
__user_addr_min,
never_notify_host,
never_callback_guest,
"guest vq");
sg_init_table(guest_sg, 4);
sg_set_buf(&guest_sg[0], d, sizeof(*d)*2);
sg_set_buf(&guest_sg[1], d + 2, sizeof(*d)*1);
sg_set_buf(&guest_sg[2], data + 6, 4);
sg_set_buf(&guest_sg[3], d + 3, sizeof(*d)*3);
err = virtqueue_add_outbuf(vq, guest_sg, 4, &err, GFP_KERNEL);
if (err)
errx(1, "virtqueue_add_outbuf (indirect): %i", err);
vring_init(&vring, RINGSIZE, __user_addr_min, ALIGN);
/* They're used in order, but double-check... */
assert(vring.desc[0].addr == (unsigned long)d);
assert(vring.desc[1].addr == (unsigned long)(d+2));
assert(vring.desc[2].addr == (unsigned long)data + 6);
assert(vring.desc[3].addr == (unsigned long)(d+3));
vring.desc[0].flags |= VRING_DESC_F_INDIRECT;
vring.desc[1].flags |= VRING_DESC_F_INDIRECT;
vring.desc[3].flags |= VRING_DESC_F_INDIRECT;
/* First indirect */
d[0].addr = (unsigned long)data;
d[0].len = 1;
d[0].flags = VRING_DESC_F_NEXT;
d[0].next = 1;
d[1].addr = (unsigned long)data + 1;
d[1].len = 2;
d[1].flags = 0;
/* Second indirect */
d[2].addr = (unsigned long)data + 3;
d[2].len = 3;
d[2].flags = 0;
/* Third indirect */
d[3].addr = (unsigned long)data + 10;
d[3].len = 5;
d[3].flags = VRING_DESC_F_NEXT;
d[3].next = 1;
d[4].addr = (unsigned long)data + 15;
d[4].len = 6;
d[4].flags = VRING_DESC_F_NEXT;
d[4].next = 2;
d[5].addr = (unsigned long)data + 21;
d[5].len = 7;
d[5].flags = 0;
/* Host picks it up (allocates new iov). */
vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
if (err != 1)
errx(1, "vringh_getdesc_user: %i", err);
if (head != 0)
errx(1, "vringh_getdesc_user: head %i not 0", head);
assert(riov.max_num & VRINGH_IOV_ALLOCATED);
if (getrange != getrange_slow)
assert(riov.used == 7);
else
assert(riov.used == 28);
err = vringh_iov_pull_user(&riov, buf, 29);
assert(err == 28);
/* Data should be linear. */
for (i = 0; i < err; i++)
assert(buf[i] == i);
vringh_iov_cleanup(&riov);
}
/* Don't leak memory... */
vring_del_virtqueue(vq);
free(__user_addr_min);
return 0;
}