Commit 2c750557 authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest

* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest:
  lguest: documentation update
  lguest: Add to maintainers file.
  lguest: build fix
  lguest: clean up lguest_launcher.h
  lguest: remove unused "wake" element from struct lguest
  lguest: use defines from x86 headers instead of magic numbers
  lguest: example launcher header cleanup.
parents fc42dabe e1e72965
...@@ -34,25 +34,24 @@ ...@@ -34,25 +34,24 @@
#include <zlib.h> #include <zlib.h>
#include <assert.h> #include <assert.h>
#include <sched.h> #include <sched.h>
/*L:110 We can ignore the 30 include files we need for this program, but I do
* want to draw attention to the use of kernel-style types.
*
* As Linus said, "C is a Spartan language, and so should your naming be." I
* like these abbreviations and the header we need uses them, so we define them
* here.
*/
typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
#include "linux/lguest_launcher.h" #include "linux/lguest_launcher.h"
#include "linux/pci_ids.h"
#include "linux/virtio_config.h" #include "linux/virtio_config.h"
#include "linux/virtio_net.h" #include "linux/virtio_net.h"
#include "linux/virtio_blk.h" #include "linux/virtio_blk.h"
#include "linux/virtio_console.h" #include "linux/virtio_console.h"
#include "linux/virtio_ring.h" #include "linux/virtio_ring.h"
#include "asm-x86/bootparam.h" #include "asm-x86/bootparam.h"
/*L:110 We can ignore the 38 include files we need for this program, but I do
* want to draw attention to the use of kernel-style types.
*
* As Linus said, "C is a Spartan language, and so should your naming be." I
* like these abbreviations, so we define them here. Note that u64 is always
* unsigned long long, which works on all Linux systems: this means that we can
* use %llu in printf for any u64. */
typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
/*:*/ /*:*/
#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ #define PAGE_PRESENT 0x7 /* Present, RW, Execute */
...@@ -361,8 +360,8 @@ static unsigned long load_bzimage(int fd) ...@@ -361,8 +360,8 @@ static unsigned long load_bzimage(int fd)
} }
/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
* come wrapped up in the self-decompressing "bzImage" format. With some funky * come wrapped up in the self-decompressing "bzImage" format. With a little
* coding, we can load those, too. */ * work, we can load those, too. */
static unsigned long load_kernel(int fd) static unsigned long load_kernel(int fd)
{ {
Elf32_Ehdr hdr; Elf32_Ehdr hdr;
...@@ -465,6 +464,7 @@ static unsigned long setup_pagetables(unsigned long mem, ...@@ -465,6 +464,7 @@ static unsigned long setup_pagetables(unsigned long mem,
* to know where it is. */ * to know where it is. */
return to_guest_phys(pgdir); return to_guest_phys(pgdir);
} }
/*:*/
/* Simple routine to roll all the commandline arguments together with spaces /* Simple routine to roll all the commandline arguments together with spaces
* between them. */ * between them. */
...@@ -481,9 +481,9 @@ static void concat(char *dst, char *args[]) ...@@ -481,9 +481,9 @@ static void concat(char *dst, char *args[])
dst[len] = '\0'; dst[len] = '\0';
} }
/* This is where we actually tell the kernel to initialize the Guest. We saw /*L:185 This is where we actually tell the kernel to initialize the Guest. We
* the arguments it expects when we looked at initialize() in lguest_user.c: * saw the arguments it expects when we looked at initialize() in lguest_user.c:
* the base of guest "physical" memory, the top physical page to allow, the * the base of Guest "physical" memory, the top physical page to allow, the
* top level pagetable and the entry point for the Guest. */ * top level pagetable and the entry point for the Guest. */
static int tell_kernel(unsigned long pgdir, unsigned long start) static int tell_kernel(unsigned long pgdir, unsigned long start)
{ {
...@@ -513,13 +513,14 @@ static void add_device_fd(int fd) ...@@ -513,13 +513,14 @@ static void add_device_fd(int fd)
/*L:200 /*L:200
* The Waker. * The Waker.
* *
* With a console and network devices, we can have lots of input which we need * With console, block and network devices, we can have lots of input which we
* to process. We could try to tell the kernel what file descriptors to watch, * need to process. We could try to tell the kernel what file descriptors to
* but handing a file descriptor mask through to the kernel is fairly icky. * watch, but handing a file descriptor mask through to the kernel is fairly
* icky.
* *
* Instead, we fork off a process which watches the file descriptors and writes * Instead, we fork off a process which watches the file descriptors and writes
* the LHREQ_BREAK command to the /dev/lguest filedescriptor to tell the Host * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
* loop to stop running the Guest. This causes it to return from the * stop running the Guest. This causes the Launcher to return from the
* /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
* the LHREQ_BREAK and wake us up again. * the LHREQ_BREAK and wake us up again.
* *
...@@ -545,7 +546,9 @@ static void wake_parent(int pipefd, int lguest_fd) ...@@ -545,7 +546,9 @@ static void wake_parent(int pipefd, int lguest_fd)
if (read(pipefd, &fd, sizeof(fd)) == 0) if (read(pipefd, &fd, sizeof(fd)) == 0)
exit(0); exit(0);
/* Otherwise it's telling us to change what file /* Otherwise it's telling us to change what file
* descriptors we're to listen to. */ * descriptors we're to listen to. Positive means
* listen to a new one, negative means stop
* listening. */
if (fd >= 0) if (fd >= 0)
FD_SET(fd, &devices.infds); FD_SET(fd, &devices.infds);
else else
...@@ -560,7 +563,7 @@ static int setup_waker(int lguest_fd) ...@@ -560,7 +563,7 @@ static int setup_waker(int lguest_fd)
{ {
int pipefd[2], child; int pipefd[2], child;
/* We create a pipe to talk to the waker, and also so it knows when the /* We create a pipe to talk to the Waker, and also so it knows when the
* Launcher dies (and closes pipe). */ * Launcher dies (and closes pipe). */
pipe(pipefd); pipe(pipefd);
child = fork(); child = fork();
...@@ -568,7 +571,8 @@ static int setup_waker(int lguest_fd) ...@@ -568,7 +571,8 @@ static int setup_waker(int lguest_fd)
err(1, "forking"); err(1, "forking");
if (child == 0) { if (child == 0) {
/* Close the "writing" end of our copy of the pipe */ /* We are the Waker: close the "writing" end of our copy of the
* pipe and start waiting for input. */
close(pipefd[1]); close(pipefd[1]);
wake_parent(pipefd[0], lguest_fd); wake_parent(pipefd[0], lguest_fd);
} }
...@@ -579,12 +583,12 @@ static int setup_waker(int lguest_fd) ...@@ -579,12 +583,12 @@ static int setup_waker(int lguest_fd)
return pipefd[1]; return pipefd[1];
} }
/*L:210 /*
* Device Handling. * Device Handling.
* *
* When the Guest sends DMA to us, it sends us an array of addresses and sizes. * When the Guest gives us a buffer, it sends an array of addresses and sizes.
* We need to make sure it's not trying to reach into the Launcher itself, so * We need to make sure it's not trying to reach into the Launcher itself, so
* we have a convenient routine which check it and exits with an error message * we have a convenient routine which checks it and exits with an error message
* if something funny is going on: * if something funny is going on:
*/ */
static void *_check_pointer(unsigned long addr, unsigned int size, static void *_check_pointer(unsigned long addr, unsigned int size,
...@@ -601,7 +605,9 @@ static void *_check_pointer(unsigned long addr, unsigned int size, ...@@ -601,7 +605,9 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
/* A macro which transparently hands the line number to the real function. */ /* A macro which transparently hands the line number to the real function. */
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
/* This function returns the next descriptor in the chain, or vq->vring.num. */ /* Each buffer in the virtqueues is actually a chain of descriptors. This
* function returns the next descriptor in the chain, or vq->vring.num if we're
* at the end. */
static unsigned next_desc(struct virtqueue *vq, unsigned int i) static unsigned next_desc(struct virtqueue *vq, unsigned int i)
{ {
unsigned int next; unsigned int next;
...@@ -680,13 +686,14 @@ static unsigned get_vq_desc(struct virtqueue *vq, ...@@ -680,13 +686,14 @@ static unsigned get_vq_desc(struct virtqueue *vq,
return head; return head;
} }
/* Once we've used one of their buffers, we tell them about it. We'll then /* After we've used one of their buffers, we tell them about it. We'll then
* want to send them an interrupt, using trigger_irq(). */ * want to send them an interrupt, using trigger_irq(). */
static void add_used(struct virtqueue *vq, unsigned int head, int len) static void add_used(struct virtqueue *vq, unsigned int head, int len)
{ {
struct vring_used_elem *used; struct vring_used_elem *used;
/* Get a pointer to the next entry in the used ring. */ /* The virtqueue contains a ring of used buffers. Get a pointer to the
* next entry in that used ring. */
used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
used->id = head; used->id = head;
used->len = len; used->len = len;
...@@ -700,6 +707,7 @@ static void trigger_irq(int fd, struct virtqueue *vq) ...@@ -700,6 +707,7 @@ static void trigger_irq(int fd, struct virtqueue *vq)
{ {
unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
/* If they don't want an interrupt, don't send one. */
if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
return; return;
...@@ -716,8 +724,11 @@ static void add_used_and_trigger(int fd, struct virtqueue *vq, ...@@ -716,8 +724,11 @@ static void add_used_and_trigger(int fd, struct virtqueue *vq,
trigger_irq(fd, vq); trigger_irq(fd, vq);
} }
/* Here is the input terminal setting we save, and the routine to restore them /*
* on exit so the user can see what they type next. */ * The Console
*
* Here is the input terminal setting we save, and the routine to restore them
* on exit so the user gets their terminal back. */
static struct termios orig_term; static struct termios orig_term;
static void restore_term(void) static void restore_term(void)
{ {
...@@ -818,7 +829,10 @@ static void handle_console_output(int fd, struct virtqueue *vq) ...@@ -818,7 +829,10 @@ static void handle_console_output(int fd, struct virtqueue *vq)
} }
} }
/* Handling output for network is also simple: we get all the output buffers /*
* The Network
*
* Handling output for network is also simple: we get all the output buffers
* and write them (ignoring the first element) to this device's file descriptor * and write them (ignoring the first element) to this device's file descriptor
* (stdout). */ * (stdout). */
static void handle_net_output(int fd, struct virtqueue *vq) static void handle_net_output(int fd, struct virtqueue *vq)
...@@ -831,8 +845,9 @@ static void handle_net_output(int fd, struct virtqueue *vq) ...@@ -831,8 +845,9 @@ static void handle_net_output(int fd, struct virtqueue *vq)
while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
if (in) if (in)
errx(1, "Input buffers in output queue?"); errx(1, "Input buffers in output queue?");
/* Check header, but otherwise ignore it (we said we supported /* Check header, but otherwise ignore it (we told the Guest we
* no features). */ * supported no features, so it shouldn't have anything
* interesting). */
(void)convert(&iov[0], struct virtio_net_hdr); (void)convert(&iov[0], struct virtio_net_hdr);
len = writev(vq->dev->fd, iov+1, out-1); len = writev(vq->dev->fd, iov+1, out-1);
add_used_and_trigger(fd, vq, head, len); add_used_and_trigger(fd, vq, head, len);
...@@ -883,7 +898,8 @@ static bool handle_tun_input(int fd, struct device *dev) ...@@ -883,7 +898,8 @@ static bool handle_tun_input(int fd, struct device *dev)
return true; return true;
} }
/* This callback ensures we try again, in case we stopped console or net /*L:215 This is the callback attached to the network and console input
* virtqueues: it ensures we try again, in case we stopped console or net
* delivery because Guest didn't have any buffers. */ * delivery because Guest didn't have any buffers. */
static void enable_fd(int fd, struct virtqueue *vq) static void enable_fd(int fd, struct virtqueue *vq)
{ {
...@@ -919,7 +935,7 @@ static void handle_output(int fd, unsigned long addr) ...@@ -919,7 +935,7 @@ static void handle_output(int fd, unsigned long addr)
strnlen(from_guest_phys(addr), guest_limit - addr)); strnlen(from_guest_phys(addr), guest_limit - addr));
} }
/* This is called when the waker wakes us up: check for incoming file /* This is called when the Waker wakes us up: check for incoming file
* descriptors. */ * descriptors. */
static void handle_input(int fd) static void handle_input(int fd)
{ {
...@@ -986,8 +1002,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) ...@@ -986,8 +1002,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
} }
/* Each device descriptor is followed by some configuration information. /* Each device descriptor is followed by some configuration information.
* The first byte is a "status" byte for the Guest to report what's happening. * Each configuration field looks like: u8 type, u8 len, [... len bytes...].
* After that are fields: u8 type, u8 len, [... len bytes...].
* *
* This routine adds a new field to an existing device's descriptor. It only * This routine adds a new field to an existing device's descriptor. It only
* works for the last device, but that's OK because that's how we use it. */ * works for the last device, but that's OK because that's how we use it. */
...@@ -1044,14 +1059,17 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, ...@@ -1044,14 +1059,17 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
/* Link virtqueue back to device. */ /* Link virtqueue back to device. */
vq->dev = dev; vq->dev = dev;
/* Set up handler. */ /* Set the routine to call when the Guest does something to this
* virtqueue. */
vq->handle_output = handle_output; vq->handle_output = handle_output;
/* Set the "Don't Notify Me" flag if we don't have a handler */
if (!handle_output) if (!handle_output)
vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
} }
/* This routine does all the creation and setup of a new device, including /* This routine does all the creation and setup of a new device, including
* caling new_dev_desc() to allocate the descriptor and device memory. */ * calling new_dev_desc() to allocate the descriptor and device memory. */
static struct device *new_device(const char *name, u16 type, int fd, static struct device *new_device(const char *name, u16 type, int fd,
bool (*handle_input)(int, struct device *)) bool (*handle_input)(int, struct device *))
{ {
...@@ -1060,7 +1078,7 @@ static struct device *new_device(const char *name, u16 type, int fd, ...@@ -1060,7 +1078,7 @@ static struct device *new_device(const char *name, u16 type, int fd,
/* Append to device list. Prepending to a single-linked list is /* Append to device list. Prepending to a single-linked list is
* easier, but the user expects the devices to be arranged on the bus * easier, but the user expects the devices to be arranged on the bus
* in command-line order. The first network device on the command line * in command-line order. The first network device on the command line
* is eth0, the first block device /dev/lgba, etc. */ * is eth0, the first block device /dev/vda, etc. */
*devices.lastdev = dev; *devices.lastdev = dev;
dev->next = NULL; dev->next = NULL;
devices.lastdev = &dev->next; devices.lastdev = &dev->next;
...@@ -1104,7 +1122,7 @@ static void setup_console(void) ...@@ -1104,7 +1122,7 @@ static void setup_console(void)
/* The console needs two virtqueues: the input then the output. When /* The console needs two virtqueues: the input then the output. When
* they put something the input queue, we make sure we're listening to * they put something the input queue, we make sure we're listening to
* stdin. When they put something in the output queue, we write it to * stdin. When they put something in the output queue, we write it to
* stdout. */ * stdout. */
add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
...@@ -1252,21 +1270,17 @@ static void setup_tun_net(const char *arg) ...@@ -1252,21 +1270,17 @@ static void setup_tun_net(const char *arg)
verbose("attached to bridge: %s\n", br_name); verbose("attached to bridge: %s\n", br_name);
} }
/* Our block (disk) device should be really simple: the Guest asks for a block
/* * number and we read or write that position in the file. Unfortunately, that
* Block device. * was amazingly slow: the Guest waits until the read is finished before
* running anything else, even if it could have been doing useful work.
* *
* Serving a block device is really easy: the Guest asks for a block number and * We could use async I/O, except it's reputed to suck so hard that characters
* we read or write that position in the file. * actually go missing from your code when you try to use it.
*
* Unfortunately, this is amazingly slow: the Guest waits until the read is
* finished before running anything else, even if it could be doing useful
* work. We could use async I/O, except it's reputed to suck so hard that
* characters actually go missing from your code when you try to use it.
* *
* So we farm the I/O out to thread, and communicate with it via a pipe. */ * So we farm the I/O out to thread, and communicate with it via a pipe. */
/* This hangs off device->priv, with the data. */ /* This hangs off device->priv. */
struct vblk_info struct vblk_info
{ {
/* The size of the file. */ /* The size of the file. */
...@@ -1282,8 +1296,14 @@ struct vblk_info ...@@ -1282,8 +1296,14 @@ struct vblk_info
* Launcher triggers interrupt to Guest. */ * Launcher triggers interrupt to Guest. */
int done_fd; int done_fd;
}; };
/*:*/
/* This is the core of the I/O thread. It returns true if it did something. */ /*L:210
* The Disk
*
* Remember that the block device is handled by a separate I/O thread. We head
* straight into the core of that thread here:
*/
static bool service_io(struct device *dev) static bool service_io(struct device *dev)
{ {
struct vblk_info *vblk = dev->priv; struct vblk_info *vblk = dev->priv;
...@@ -1294,10 +1314,14 @@ static bool service_io(struct device *dev) ...@@ -1294,10 +1314,14 @@ static bool service_io(struct device *dev)
struct iovec iov[dev->vq->vring.num]; struct iovec iov[dev->vq->vring.num];
off64_t off; off64_t off;
/* See if there's a request waiting. If not, nothing to do. */
head = get_vq_desc(dev->vq, iov, &out_num, &in_num); head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
if (head == dev->vq->vring.num) if (head == dev->vq->vring.num)
return false; return false;
/* Every block request should contain at least one output buffer
* (detailing the location on disk and the type of request) and one
* input buffer (to hold the result). */
if (out_num == 0 || in_num == 0) if (out_num == 0 || in_num == 0)
errx(1, "Bad virtblk cmd %u out=%u in=%u", errx(1, "Bad virtblk cmd %u out=%u in=%u",
head, out_num, in_num); head, out_num, in_num);
...@@ -1306,10 +1330,15 @@ static bool service_io(struct device *dev) ...@@ -1306,10 +1330,15 @@ static bool service_io(struct device *dev)
in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
off = out->sector * 512; off = out->sector * 512;
/* This is how we implement barriers. Pretty poor, no? */ /* The block device implements "barriers", where the Guest indicates
* that it wants all previous writes to occur before this write. We
* don't have a way of asking our kernel to do a barrier, so we just
* synchronize all the data in the file. Pretty poor, no? */
if (out->type & VIRTIO_BLK_T_BARRIER) if (out->type & VIRTIO_BLK_T_BARRIER)
fdatasync(vblk->fd); fdatasync(vblk->fd);
/* In general the virtio block driver is allowed to try SCSI commands.
* It'd be nice if we supported eject, for example, but we don't. */
if (out->type & VIRTIO_BLK_T_SCSI_CMD) { if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
fprintf(stderr, "Scsi commands unsupported\n"); fprintf(stderr, "Scsi commands unsupported\n");
in->status = VIRTIO_BLK_S_UNSUPP; in->status = VIRTIO_BLK_S_UNSUPP;
...@@ -1375,7 +1404,7 @@ static int io_thread(void *_dev) ...@@ -1375,7 +1404,7 @@ static int io_thread(void *_dev)
/* When this read fails, it means Launcher died, so we follow. */ /* When this read fails, it means Launcher died, so we follow. */
while (read(vblk->workpipe[0], &c, 1) == 1) { while (read(vblk->workpipe[0], &c, 1) == 1) {
/* We acknowledge each request immediately, to reduce latency, /* We acknowledge each request immediately to reduce latency,
* rather than waiting until we've done them all. I haven't * rather than waiting until we've done them all. I haven't
* measured to see if it makes any difference. */ * measured to see if it makes any difference. */
while (service_io(dev)) while (service_io(dev))
...@@ -1384,12 +1413,14 @@ static int io_thread(void *_dev) ...@@ -1384,12 +1413,14 @@ static int io_thread(void *_dev)
return 0; return 0;
} }
/* When the thread says some I/O is done, we interrupt the Guest. */ /* Now we've seen the I/O thread, we return to the Launcher to see what happens
* when the thread tells us it's completed some I/O. */
static bool handle_io_finish(int fd, struct device *dev) static bool handle_io_finish(int fd, struct device *dev)
{ {
char c; char c;
/* If child died, presumably it printed message. */ /* If the I/O thread died, presumably it printed the error, so we
* simply exit. */
if (read(dev->fd, &c, 1) != 1) if (read(dev->fd, &c, 1) != 1)
exit(1); exit(1);
...@@ -1398,7 +1429,7 @@ static bool handle_io_finish(int fd, struct device *dev) ...@@ -1398,7 +1429,7 @@ static bool handle_io_finish(int fd, struct device *dev)
return true; return true;
} }
/* When the Guest submits some I/O, we wake the I/O thread. */ /* When the Guest submits some I/O, we just need to wake the I/O thread. */
static void handle_virtblk_output(int fd, struct virtqueue *vq) static void handle_virtblk_output(int fd, struct virtqueue *vq)
{ {
struct vblk_info *vblk = vq->dev->priv; struct vblk_info *vblk = vq->dev->priv;
...@@ -1410,7 +1441,7 @@ static void handle_virtblk_output(int fd, struct virtqueue *vq) ...@@ -1410,7 +1441,7 @@ static void handle_virtblk_output(int fd, struct virtqueue *vq)
exit(1); exit(1);
} }
/* This creates a virtual block device. */ /*L:198 This actually sets up a virtual block device. */
static void setup_block_file(const char *filename) static void setup_block_file(const char *filename)
{ {
int p[2]; int p[2];
...@@ -1426,7 +1457,7 @@ static void setup_block_file(const char *filename) ...@@ -1426,7 +1457,7 @@ static void setup_block_file(const char *filename)
/* The device responds to return from I/O thread. */ /* The device responds to return from I/O thread. */
dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
/* The device has a virtqueue. */ /* The device has one virtqueue, where the Guest places requests. */
add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
/* Allocate the room for our own bookkeeping */ /* Allocate the room for our own bookkeeping */
...@@ -1448,7 +1479,8 @@ static void setup_block_file(const char *filename) ...@@ -1448,7 +1479,8 @@ static void setup_block_file(const char *filename)
/* The I/O thread writes to this end of the pipe when done. */ /* The I/O thread writes to this end of the pipe when done. */
vblk->done_fd = p[1]; vblk->done_fd = p[1];
/* This is how we tell the I/O thread about more work. */ /* This is the second pipe, which is how we tell the I/O thread about
* more work. */
pipe(vblk->workpipe); pipe(vblk->workpipe);
/* Create stack for thread and run it */ /* Create stack for thread and run it */
...@@ -1487,24 +1519,25 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) ...@@ -1487,24 +1519,25 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
char reason[1024] = { 0 }; char reason[1024] = { 0 };
read(lguest_fd, reason, sizeof(reason)-1); read(lguest_fd, reason, sizeof(reason)-1);
errx(1, "%s", reason); errx(1, "%s", reason);
/* EAGAIN means the waker wanted us to look at some input. /* EAGAIN means the Waker wanted us to look at some input.
* Anything else means a bug or incompatible change. */ * Anything else means a bug or incompatible change. */
} else if (errno != EAGAIN) } else if (errno != EAGAIN)
err(1, "Running guest failed"); err(1, "Running guest failed");
/* Service input, then unset the BREAK which releases /* Service input, then unset the BREAK to release the Waker. */
* the Waker. */
handle_input(lguest_fd); handle_input(lguest_fd);
if (write(lguest_fd, args, sizeof(args)) < 0) if (write(lguest_fd, args, sizeof(args)) < 0)
err(1, "Resetting break"); err(1, "Resetting break");
} }
} }
/* /*
* This is the end of the Launcher. * This is the end of the Launcher. The good news: we are over halfway
* through! The bad news: the most fiendish part of the code still lies ahead
* of us.
* *
* But wait! We've seen I/O from the Launcher, and we've seen I/O from the * Are you ready? Take a deep breath and join me in the core of the Host, in
* Drivers. If we were to see the Host kernel I/O code, our understanding * "make Host".
* would be complete... :*/ :*/
static struct option opts[] = { static struct option opts[] = {
{ "verbose", 0, NULL, 'v' }, { "verbose", 0, NULL, 'v' },
...@@ -1527,7 +1560,7 @@ int main(int argc, char *argv[]) ...@@ -1527,7 +1560,7 @@ int main(int argc, char *argv[])
/* Memory, top-level pagetable, code startpoint and size of the /* Memory, top-level pagetable, code startpoint and size of the
* (optional) initrd. */ * (optional) initrd. */
unsigned long mem = 0, pgdir, start, initrd_size = 0; unsigned long mem = 0, pgdir, start, initrd_size = 0;
/* A temporary and the /dev/lguest file descriptor. */ /* Two temporaries and the /dev/lguest file descriptor. */
int i, c, lguest_fd; int i, c, lguest_fd;
/* The boot information for the Guest. */ /* The boot information for the Guest. */
struct boot_params *boot; struct boot_params *boot;
...@@ -1622,6 +1655,7 @@ int main(int argc, char *argv[]) ...@@ -1622,6 +1655,7 @@ int main(int argc, char *argv[])
/* The boot header contains a command line pointer: we put the command /* The boot header contains a command line pointer: we put the command
* line after the boot header. */ * line after the boot header. */
boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
/* We use a simple helper to copy the arguments separated by spaces. */
concat((char *)(boot + 1), argv+optind+2); concat((char *)(boot + 1), argv+optind+2);
/* Boot protocol version: 2.07 supports the fields for lguest. */ /* Boot protocol version: 2.07 supports the fields for lguest. */
......
...@@ -2259,6 +2259,13 @@ L: legousb-devel@lists.sourceforge.net ...@@ -2259,6 +2259,13 @@ L: legousb-devel@lists.sourceforge.net
W: http://legousb.sourceforge.net/ W: http://legousb.sourceforge.net/
S: Maintained S: Maintained
LGUEST
P: Rusty Russell
M: rusty@rustcorp.com.au
L: lguest@ozlabs.org
W: http://lguest.ozlabs.org/
S: Maintained
LINUX FOR IBM pSERIES (RS/6000) LINUX FOR IBM pSERIES (RS/6000)
P: Paul Mackerras P: Paul Mackerras
M: paulus@au.ibm.com M: paulus@au.ibm.com
......
...@@ -56,6 +56,7 @@ ...@@ -56,6 +56,7 @@
#include <linux/lguest.h> #include <linux/lguest.h>
#include <linux/lguest_launcher.h> #include <linux/lguest_launcher.h>
#include <linux/virtio_console.h> #include <linux/virtio_console.h>
#include <linux/pm.h>
#include <asm/paravirt.h> #include <asm/paravirt.h>
#include <asm/param.h> #include <asm/param.h>
#include <asm/page.h> #include <asm/page.h>
...@@ -98,7 +99,7 @@ static cycle_t clock_base; ...@@ -98,7 +99,7 @@ static cycle_t clock_base;
* When lazy_mode is set, it means we're allowed to defer all hypercalls and do * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
* them as a batch when lazy_mode is eventually turned off. Because hypercalls * them as a batch when lazy_mode is eventually turned off. Because hypercalls
* are reasonably expensive, batching them up makes sense. For example, a * are reasonably expensive, batching them up makes sense. For example, a
* large mmap might update dozens of page table entries: that code calls * large munmap might update dozens of page table entries: that code calls
* paravirt_enter_lazy_mmu(), does the dozen updates, then calls * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
* lguest_leave_lazy_mode(). * lguest_leave_lazy_mode().
* *
...@@ -163,8 +164,8 @@ void async_hcall(unsigned long call, ...@@ -163,8 +164,8 @@ void async_hcall(unsigned long call,
/*:*/ /*:*/
/*G:033 /*G:033
* Here are our first native-instruction replacements: four functions for * After that diversion we return to our first native-instruction
* interrupt control. * replacements: four functions for interrupt control.
* *
* The simplest way of implementing these would be to have "turn interrupts * The simplest way of implementing these would be to have "turn interrupts
* off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow:
...@@ -183,7 +184,7 @@ static unsigned long save_fl(void) ...@@ -183,7 +184,7 @@ static unsigned long save_fl(void)
return lguest_data.irq_enabled; return lguest_data.irq_enabled;
} }
/* "restore_flags" just sets the flags back to the value given. */ /* restore_flags() just sets the flags back to the value given. */
static void restore_fl(unsigned long flags) static void restore_fl(unsigned long flags)
{ {
lguest_data.irq_enabled = flags; lguest_data.irq_enabled = flags;
...@@ -356,7 +357,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, ...@@ -356,7 +357,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
* it. The Host needs to know when the Guest wants to change them, so we have * it. The Host needs to know when the Guest wants to change them, so we have
* a whole series of functions like read_cr0() and write_cr0(). * a whole series of functions like read_cr0() and write_cr0().
* *
* We start with CR0. CR0 allows you to turn on and off all kinds of basic * We start with cr0. cr0 allows you to turn on and off all kinds of basic
* features, but Linux only really cares about one: the horrifically-named Task * features, but Linux only really cares about one: the horrifically-named Task
* Switched (TS) bit at bit 3 (ie. 8) * Switched (TS) bit at bit 3 (ie. 8)
* *
...@@ -371,8 +372,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, ...@@ -371,8 +372,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
static unsigned long current_cr0, current_cr3; static unsigned long current_cr0, current_cr3;
static void lguest_write_cr0(unsigned long val) static void lguest_write_cr0(unsigned long val)
{ {
/* 8 == TS bit. */ lazy_hcall(LHCALL_TS, val & X86_CR0_TS, 0, 0);
lazy_hcall(LHCALL_TS, val & 8, 0, 0);
current_cr0 = val; current_cr0 = val;
} }
...@@ -387,10 +387,10 @@ static unsigned long lguest_read_cr0(void) ...@@ -387,10 +387,10 @@ static unsigned long lguest_read_cr0(void)
static void lguest_clts(void) static void lguest_clts(void)
{ {
lazy_hcall(LHCALL_TS, 0, 0, 0); lazy_hcall(LHCALL_TS, 0, 0, 0);
current_cr0 &= ~8U; current_cr0 &= ~X86_CR0_TS;
} }
/* CR2 is the virtual address of the last page fault, which the Guest only ever /* cr2 is the virtual address of the last page fault, which the Guest only ever
* reads. The Host kindly writes this into our "struct lguest_data", so we * reads. The Host kindly writes this into our "struct lguest_data", so we
* just read it out of there. */ * just read it out of there. */
static unsigned long lguest_read_cr2(void) static unsigned long lguest_read_cr2(void)
...@@ -398,7 +398,7 @@ static unsigned long lguest_read_cr2(void) ...@@ -398,7 +398,7 @@ static unsigned long lguest_read_cr2(void)
return lguest_data.cr2; return lguest_data.cr2;
} }
/* CR3 is the current toplevel pagetable page: the principle is the same as /* cr3 is the current toplevel pagetable page: the principle is the same as
* cr0. Keep a local copy, and tell the Host when it changes. */ * cr0. Keep a local copy, and tell the Host when it changes. */
static void lguest_write_cr3(unsigned long cr3) static void lguest_write_cr3(unsigned long cr3)
{ {
...@@ -411,7 +411,7 @@ static unsigned long lguest_read_cr3(void) ...@@ -411,7 +411,7 @@ static unsigned long lguest_read_cr3(void)
return current_cr3; return current_cr3;
} }
/* CR4 is used to enable and disable PGE, but we don't care. */ /* cr4 is used to enable and disable PGE, but we don't care. */
static unsigned long lguest_read_cr4(void) static unsigned long lguest_read_cr4(void)
{ {
return 0; return 0;
...@@ -432,7 +432,7 @@ static void lguest_write_cr4(unsigned long val) ...@@ -432,7 +432,7 @@ static void lguest_write_cr4(unsigned long val)
* maps virtual addresses to physical addresses using "page tables". We could * maps virtual addresses to physical addresses using "page tables". We could
* use one huge index of 1 million entries: each address is 4 bytes, so that's * use one huge index of 1 million entries: each address is 4 bytes, so that's
* 1024 pages just to hold the page tables. But since most virtual addresses * 1024 pages just to hold the page tables. But since most virtual addresses
* are unused, we use a two level index which saves space. The CR3 register * are unused, we use a two level index which saves space. The cr3 register
* contains the physical address of the top level "page directory" page, which * contains the physical address of the top level "page directory" page, which
* contains physical addresses of up to 1024 second-level pages. Each of these * contains physical addresses of up to 1024 second-level pages. Each of these
* second level pages contains up to 1024 physical addresses of actual pages, * second level pages contains up to 1024 physical addresses of actual pages,
...@@ -440,7 +440,7 @@ static void lguest_write_cr4(unsigned long val) ...@@ -440,7 +440,7 @@ static void lguest_write_cr4(unsigned long val)
* *
* Here's a diagram, where arrows indicate physical addresses: * Here's a diagram, where arrows indicate physical addresses:
* *
* CR3 ---> +---------+ * cr3 ---> +---------+
* | --------->+---------+ * | --------->+---------+
* | | | PADDR1 | * | | | PADDR1 |
* Top-level | | PADDR2 | * Top-level | | PADDR2 |
...@@ -498,8 +498,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) ...@@ -498,8 +498,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
* *
* ... except in early boot when the kernel sets up the initial pagetables, * ... except in early boot when the kernel sets up the initial pagetables,
* which makes booting astonishingly slow. So we don't even tell the Host * which makes booting astonishingly slow. So we don't even tell the Host
* anything changed until we've done the first page table switch. * anything changed until we've done the first page table switch. */
*/
static void lguest_set_pte(pte_t *ptep, pte_t pteval) static void lguest_set_pte(pte_t *ptep, pte_t pteval)
{ {
*ptep = pteval; *ptep = pteval;
...@@ -720,10 +719,10 @@ static void lguest_time_init(void) ...@@ -720,10 +719,10 @@ static void lguest_time_init(void)
/* Set up the timer interrupt (0) to go to our simple timer routine */ /* Set up the timer interrupt (0) to go to our simple timer routine */
set_irq_handler(0, lguest_time_irq); set_irq_handler(0, lguest_time_irq);
/* Our clock structure look like arch/i386/kernel/tsc.c if we can use /* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can
* the TSC, otherwise it's a dumb nanosecond-resolution clock. Either * use the TSC, otherwise it's a dumb nanosecond-resolution clock.
* way, the "rating" is initialized so high that it's always chosen * Either way, the "rating" is set so high that it's always chosen over
* over any other clocksource. */ * any other clocksource. */
if (lguest_data.tsc_khz) if (lguest_data.tsc_khz)
lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
lguest_clock.shift); lguest_clock.shift);
...@@ -749,7 +748,7 @@ static void lguest_time_init(void) ...@@ -749,7 +748,7 @@ static void lguest_time_init(void)
* to work. They're pretty simple. * to work. They're pretty simple.
*/ */
/* The Guest needs to tell the host what stack it expects traps to use. For /* The Guest needs to tell the Host what stack it expects traps to use. For
* native hardware, this is part of the Task State Segment mentioned above in * native hardware, this is part of the Task State Segment mentioned above in
* lguest_load_tr_desc(), but to help hypervisors there's this special call. * lguest_load_tr_desc(), but to help hypervisors there's this special call.
* *
...@@ -850,13 +849,16 @@ static __init char *lguest_memory_setup(void) ...@@ -850,13 +849,16 @@ static __init char *lguest_memory_setup(void)
return "LGUEST"; return "LGUEST";
} }
/* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to /* We will eventually use the virtio console device to produce console output,
* produce console output. */ * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
* console output. */
static __init int early_put_chars(u32 vtermno, const char *buf, int count) static __init int early_put_chars(u32 vtermno, const char *buf, int count)
{ {
char scratch[17]; char scratch[17];
unsigned int len = count; unsigned int len = count;
/* We use a nul-terminated string, so we have to make a copy. Icky,
* huh? */
if (len > sizeof(scratch) - 1) if (len > sizeof(scratch) - 1)
len = sizeof(scratch) - 1; len = sizeof(scratch) - 1;
scratch[len] = '\0'; scratch[len] = '\0';
...@@ -883,7 +885,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) ...@@ -883,7 +885,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
* Our current solution is to allow the paravirt back end to optionally patch * Our current solution is to allow the paravirt back end to optionally patch
* over the indirect calls to replace them with something more efficient. We * over the indirect calls to replace them with something more efficient. We
* patch the four most commonly called functions: disable interrupts, enable * patch the four most commonly called functions: disable interrupts, enable
* interrupts, restore interrupts and save interrupts. We usually have 10 * interrupts, restore interrupts and save interrupts. We usually have 6 or 10
* bytes to patch into: the Guest versions of these operations are small enough * bytes to patch into: the Guest versions of these operations are small enough
* that we can fit comfortably. * that we can fit comfortably.
* *
...@@ -1015,7 +1017,7 @@ __init void lguest_init(void) ...@@ -1015,7 +1017,7 @@ __init void lguest_init(void)
asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
/* The Host uses the top of the Guest's virtual address space for the /* The Host uses the top of the Guest's virtual address space for the
* Host<->Guest Switcher, and it tells us how much it needs in * Host<->Guest Switcher, and it tells us how big that is in
* lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
reserve_top_address(lguest_data.reserve_mem); reserve_top_address(lguest_data.reserve_mem);
...@@ -1065,6 +1067,6 @@ __init void lguest_init(void) ...@@ -1065,6 +1067,6 @@ __init void lguest_init(void)
/* /*
* This marks the end of stage II of our journey, The Guest. * This marks the end of stage II of our journey, The Guest.
* *
* It is now time for us to explore the nooks and crannies of the three Guest * It is now time for us to explore the layer of virtual drivers and complete
* devices and complete our understanding of the Guest in "make Drivers". * our understanding of the Guest in "make Drivers".
*/ */
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <asm/processor-flags.h> #include <asm/processor-flags.h>
/*G:020 This is where we begin: head.S notes that the boot header's platform /*G:020 This is where we begin: head.S notes that the boot header's platform
* type field is "1" (lguest), so calls us here. The boot header is in %esi. * type field is "1" (lguest), so calls us here.
* *
* WARNING: be very careful here! We're running at addresses equal to physical * WARNING: be very careful here! We're running at addresses equal to physical
* addresses (around 0), not above PAGE_OFFSET as most code expects * addresses (around 0), not above PAGE_OFFSET as most code expects
...@@ -17,13 +17,15 @@ ...@@ -17,13 +17,15 @@
* boot. */ * boot. */
.section .init.text, "ax", @progbits .section .init.text, "ax", @progbits
ENTRY(lguest_entry) ENTRY(lguest_entry)
/* Make initial hypercall now, so we can set up the pagetables. */ /* We make the "initialization" hypercall now to tell the Host about
* us, and also find out where it put our page tables. */
movl $LHCALL_LGUEST_INIT, %eax movl $LHCALL_LGUEST_INIT, %eax
movl $lguest_data - __PAGE_OFFSET, %edx movl $lguest_data - __PAGE_OFFSET, %edx
int $LGUEST_TRAP_ENTRY int $LGUEST_TRAP_ENTRY
/* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
* instruction uses %esi implicitly. */ * instruction uses %esi implicitly as the source for the copy we're
* about to do. */
movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
/* Copy first 32 entries of page directory to __PAGE_OFFSET entries. /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
......
...@@ -128,9 +128,12 @@ static void unmap_switcher(void) ...@@ -128,9 +128,12 @@ static void unmap_switcher(void)
__free_pages(switcher_page[i], 0); __free_pages(switcher_page[i], 0);
} }
/*L:305 /*H:032
* Dealing With Guest Memory. * Dealing With Guest Memory.
* *
* Before we go too much further into the Host, we need to grok the routines
* we use to deal with Guest memory.
*
* When the Guest gives us (what it thinks is) a physical address, we can use * When the Guest gives us (what it thinks is) a physical address, we can use
* the normal copy_from_user() & copy_to_user() on the corresponding place in * the normal copy_from_user() & copy_to_user() on the corresponding place in
* the memory region allocated by the Launcher. * the memory region allocated by the Launcher.
......
...@@ -90,6 +90,7 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args) ...@@ -90,6 +90,7 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args)
lg->pending_notify = args->arg1; lg->pending_notify = args->arg1;
break; break;
default: default:
/* It should be an architecture-specific hypercall. */
if (lguest_arch_do_hcall(lg, args)) if (lguest_arch_do_hcall(lg, args))
kill_guest(lg, "Bad hypercall %li\n", args->arg0); kill_guest(lg, "Bad hypercall %li\n", args->arg0);
} }
...@@ -157,7 +158,6 @@ static void do_async_hcalls(struct lguest *lg) ...@@ -157,7 +158,6 @@ static void do_async_hcalls(struct lguest *lg)
* Guest makes a hypercall, we end up here to set things up: */ * Guest makes a hypercall, we end up here to set things up: */
static void initialize(struct lguest *lg) static void initialize(struct lguest *lg)
{ {
/* You can't do anything until you're initialized. The Guest knows the /* You can't do anything until you're initialized. The Guest knows the
* rules, so we're unforgiving here. */ * rules, so we're unforgiving here. */
if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) {
...@@ -174,7 +174,8 @@ static void initialize(struct lguest *lg) ...@@ -174,7 +174,8 @@ static void initialize(struct lguest *lg)
|| get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
kill_guest(lg, "bad guest page %p", lg->lguest_data); kill_guest(lg, "bad guest page %p", lg->lguest_data);
/* We write the current time into the Guest's data page once now. */ /* We write the current time into the Guest's data page once so it can
* set its clock. */
write_timestamp(lg); write_timestamp(lg);
/* page_tables.c will also do some setup. */ /* page_tables.c will also do some setup. */
...@@ -182,8 +183,8 @@ static void initialize(struct lguest *lg) ...@@ -182,8 +183,8 @@ static void initialize(struct lguest *lg)
/* This is the one case where the above accesses might have been the /* This is the one case where the above accesses might have been the
* first write to a Guest page. This may have caused a copy-on-write * first write to a Guest page. This may have caused a copy-on-write
* fault, but the Guest might be referring to the old (read-only) * fault, but the old page might be (read-only) in the Guest
* page. */ * pagetable. */
guest_pagetable_clear_all(lg); guest_pagetable_clear_all(lg);
} }
...@@ -220,7 +221,7 @@ void do_hypercalls(struct lguest *lg) ...@@ -220,7 +221,7 @@ void do_hypercalls(struct lguest *lg)
* Normally it doesn't matter: the Guest will run again and * Normally it doesn't matter: the Guest will run again and
* update the trap number before we come back here. * update the trap number before we come back here.
* *
* However, if we are signalled or the Guest sends DMA to the * However, if we are signalled or the Guest sends I/O to the
* Launcher, the run_guest() loop will exit without running the * Launcher, the run_guest() loop will exit without running the
* Guest. When it comes back it would try to re-run the * Guest. When it comes back it would try to re-run the
* hypercall. */ * hypercall. */
......
...@@ -92,8 +92,8 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) ...@@ -92,8 +92,8 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
/* Remember that we never let the Guest actually disable interrupts, so /* Remember that we never let the Guest actually disable interrupts, so
* the "Interrupt Flag" bit is always set. We copy that bit from the * the "Interrupt Flag" bit is always set. We copy that bit from the
* Guest's "irq_enabled" field into the eflags word: the Guest copies * Guest's "irq_enabled" field into the eflags word: we saw the Guest
* it back in "lguest_iret". */ * copy it back in "lguest_iret". */
eflags = lg->regs->eflags; eflags = lg->regs->eflags;
if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0
&& !(irq_enable & X86_EFLAGS_IF)) && !(irq_enable & X86_EFLAGS_IF))
...@@ -124,7 +124,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) ...@@ -124,7 +124,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
kill_guest(lg, "Disabling interrupts"); kill_guest(lg, "Disabling interrupts");
} }
/*H:200 /*H:205
* Virtual Interrupts. * Virtual Interrupts.
* *
* maybe_do_interrupt() gets called before every entry to the Guest, to see if * maybe_do_interrupt() gets called before every entry to the Guest, to see if
...@@ -256,19 +256,21 @@ int deliver_trap(struct lguest *lg, unsigned int num) ...@@ -256,19 +256,21 @@ int deliver_trap(struct lguest *lg, unsigned int num)
* bogus one in): if we fail here, the Guest will be killed. */ * bogus one in): if we fail here, the Guest will be killed. */
if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b))
return 0; return 0;
set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, has_err(num)); set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b,
has_err(num));
return 1; return 1;
} }
/*H:250 Here's the hard part: returning to the Host every time a trap happens /*H:250 Here's the hard part: returning to the Host every time a trap happens
* and then calling deliver_trap() and re-entering the Guest is slow. * and then calling deliver_trap() and re-entering the Guest is slow.
* Particularly because Guest userspace system calls are traps (trap 128). * Particularly because Guest userspace system calls are traps (usually trap
* 128).
* *
* So we'd like to set up the IDT to tell the CPU to deliver traps directly * So we'd like to set up the IDT to tell the CPU to deliver traps directly
* into the Guest. This is possible, but the complexities cause the size of * into the Guest. This is possible, but the complexities cause the size of
* this file to double! However, 150 lines of code is worth writing for taking * this file to double! However, 150 lines of code is worth writing for taking
* system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all
* the other hypervisors would tease it. * the other hypervisors would beat it up at lunchtime.
* *
* This routine indicates if a particular trap number could be delivered * This routine indicates if a particular trap number could be delivered
* directly. */ * directly. */
...@@ -331,7 +333,7 @@ void pin_stack_pages(struct lguest *lg) ...@@ -331,7 +333,7 @@ void pin_stack_pages(struct lguest *lg)
* change stacks on each context switch. */ * change stacks on each context switch. */
void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
{ {
/* You are not allowd have a stack segment with privilege level 0: bad /* You are not allowed to have a stack segment with privilege level 0: bad
* Guest! */ * Guest! */
if ((seg & 0x3) != GUEST_PL) if ((seg & 0x3) != GUEST_PL)
kill_guest(lg, "bad stack segment %i", seg); kill_guest(lg, "bad stack segment %i", seg);
...@@ -350,7 +352,7 @@ void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) ...@@ -350,7 +352,7 @@ void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
* part of the Host: page table handling. */ * part of the Host: page table handling. */
/*H:235 This is the routine which actually checks the Guest's IDT entry and /*H:235 This is the routine which actually checks the Guest's IDT entry and
* transfers it into our entry in "struct lguest": */ * transfers it into the entry in "struct lguest": */
static void set_trap(struct lguest *lg, struct desc_struct *trap, static void set_trap(struct lguest *lg, struct desc_struct *trap,
unsigned int num, u32 lo, u32 hi) unsigned int num, u32 lo, u32 hi)
{ {
...@@ -456,6 +458,18 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt, ...@@ -456,6 +458,18 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
} }
} }
/*H:200
* The Guest Clock.
*
* There are two sources of virtual interrupts. We saw one in lguest_user.c:
* the Launcher sending interrupts for virtual devices. The other is the Guest
* timer interrupt.
*
* The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long to
* the next timer interrupt (in nanoseconds). We use the high-resolution timer
* infrastructure to set a callback at that time.
*
* 0 means "turn off the clock". */
void guest_set_clockevent(struct lguest *lg, unsigned long delta) void guest_set_clockevent(struct lguest *lg, unsigned long delta)
{ {
ktime_t expires; ktime_t expires;
...@@ -466,20 +480,27 @@ void guest_set_clockevent(struct lguest *lg, unsigned long delta) ...@@ -466,20 +480,27 @@ void guest_set_clockevent(struct lguest *lg, unsigned long delta)
return; return;
} }
/* We use wallclock time here, so the Guest might not be running for
* all the time between now and the timer interrupt it asked for. This
* is almost always the right thing to do. */
expires = ktime_add_ns(ktime_get_real(), delta); expires = ktime_add_ns(ktime_get_real(), delta);
hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS);
} }
/* This is the function called when the Guest's timer expires. */
static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
{ {
struct lguest *lg = container_of(timer, struct lguest, hrt); struct lguest *lg = container_of(timer, struct lguest, hrt);
/* Remember the first interrupt is the timer interrupt. */
set_bit(0, lg->irqs_pending); set_bit(0, lg->irqs_pending);
/* If the Guest is actually stopped, we need to wake it up. */
if (lg->halted) if (lg->halted)
wake_up_process(lg->tsk); wake_up_process(lg->tsk);
return HRTIMER_NORESTART; return HRTIMER_NORESTART;
} }
/* This sets up the timer for this Guest. */
void init_clockdev(struct lguest *lg) void init_clockdev(struct lguest *lg)
{ {
hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
......
...@@ -74,9 +74,6 @@ struct lguest ...@@ -74,9 +74,6 @@ struct lguest
u32 pgdidx; u32 pgdidx;
struct pgdir pgdirs[4]; struct pgdir pgdirs[4];
/* Cached wakeup: we hold a reference to this task. */
struct task_struct *wake;
unsigned long noirq_start, noirq_end; unsigned long noirq_start, noirq_end;
unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
...@@ -103,7 +100,7 @@ int lguest_address_ok(const struct lguest *lg, ...@@ -103,7 +100,7 @@ int lguest_address_ok(const struct lguest *lg,
void __lgread(struct lguest *, void *, unsigned long, unsigned); void __lgread(struct lguest *, void *, unsigned long, unsigned);
void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); void __lgwrite(struct lguest *, unsigned long, const void *, unsigned);
/*L:306 Using memory-copy operations like that is usually inconvient, so we /*H:035 Using memory-copy operations like that is usually inconvenient, so we
* have the following helper macros which read and write a specific type (often * have the following helper macros which read and write a specific type (often
* an unsigned long). * an unsigned long).
* *
...@@ -191,7 +188,7 @@ void write_timestamp(struct lguest *lg); ...@@ -191,7 +188,7 @@ void write_timestamp(struct lguest *lg);
* Let's step aside for the moment, to study one important routine that's used * Let's step aside for the moment, to study one important routine that's used
* widely in the Host code. * widely in the Host code.
* *
* There are many cases where the Guest does something invalid, like pass crap * There are many cases where the Guest can do something invalid, like pass crap
* to a hypercall. Since only the Guest kernel can make hypercalls, it's quite * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite
* acceptable to simply terminate the Guest and give the Launcher a nicely * acceptable to simply terminate the Guest and give the Launcher a nicely
* formatted reason. It's also simpler for the Guest itself, which doesn't * formatted reason. It's also simpler for the Guest itself, which doesn't
......
...@@ -53,7 +53,8 @@ struct lguest_device { ...@@ -53,7 +53,8 @@ struct lguest_device {
* Device configurations * Device configurations
* *
* The configuration information for a device consists of a series of fields. * The configuration information for a device consists of a series of fields.
* The device will look for these fields during setup. * We don't really care what they are: the Launcher set them up, and the driver
* will look at them during setup.
* *
* For us these fields come immediately after that device's descriptor in the * For us these fields come immediately after that device's descriptor in the
* lguest_devices page. * lguest_devices page.
...@@ -122,8 +123,8 @@ static void lg_set_status(struct virtio_device *vdev, u8 status) ...@@ -122,8 +123,8 @@ static void lg_set_status(struct virtio_device *vdev, u8 status)
* The other piece of infrastructure virtio needs is a "virtqueue": a way of * The other piece of infrastructure virtio needs is a "virtqueue": a way of
* the Guest device registering buffers for the other side to read from or * the Guest device registering buffers for the other side to read from or
* write into (ie. send and receive buffers). Each device can have multiple * write into (ie. send and receive buffers). Each device can have multiple
* virtqueues: for example the console has one queue for sending and one for * virtqueues: for example the console driver uses one queue for sending and
* receiving. * another for receiving.
* *
* Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
* already exists in virtio_ring.c. We just need to connect it up. * already exists in virtio_ring.c. We just need to connect it up.
...@@ -158,7 +159,7 @@ static void lg_notify(struct virtqueue *vq) ...@@ -158,7 +159,7 @@ static void lg_notify(struct virtqueue *vq)
* *
* This is kind of an ugly duckling. It'd be nicer to have a standard * This is kind of an ugly duckling. It'd be nicer to have a standard
* representation of a virtqueue in the configuration space, but it seems that * representation of a virtqueue in the configuration space, but it seems that
* everyone wants to do it differently. The KVM guys want the Guest to * everyone wants to do it differently. The KVM coders want the Guest to
* allocate its own pages and tell the Host where they are, but for lguest it's * allocate its own pages and tell the Host where they are, but for lguest it's
* simpler for the Host to simply tell us where the pages are. * simpler for the Host to simply tell us where the pages are.
* *
...@@ -284,6 +285,8 @@ static void add_lguest_device(struct lguest_device_desc *d) ...@@ -284,6 +285,8 @@ static void add_lguest_device(struct lguest_device_desc *d)
{ {
struct lguest_device *ldev; struct lguest_device *ldev;
/* Start with zeroed memory; Linux's device layer seems to count on
* it. */
ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
if (!ldev) { if (!ldev) {
printk(KERN_EMERG "Cannot allocate lguest dev %u\n", printk(KERN_EMERG "Cannot allocate lguest dev %u\n",
......
...@@ -8,20 +8,22 @@ ...@@ -8,20 +8,22 @@
#include <linux/fs.h> #include <linux/fs.h>
#include "lg.h" #include "lg.h"
/*L:315 To force the Guest to stop running and return to the Launcher, the /*L:055 When something happens, the Waker process needs a way to stop the
* Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The * kernel running the Guest and return to the Launcher. So the Waker writes
* Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher
* has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
* the Waker. */
static int break_guest_out(struct lguest *lg, const unsigned long __user *input) static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
{ {
unsigned long on; unsigned long on;
/* Fetch whether they're turning break on or off.. */ /* Fetch whether they're turning break on or off. */
if (get_user(on, input) != 0) if (get_user(on, input) != 0)
return -EFAULT; return -EFAULT;
if (on) { if (on) {
lg->break_out = 1; lg->break_out = 1;
/* Pop it out (may be running on different CPU) */ /* Pop it out of the Guest (may be running on different CPU) */
wake_up_process(lg->tsk); wake_up_process(lg->tsk);
/* Wait for them to reset it */ /* Wait for them to reset it */
return wait_event_interruptible(lg->break_wq, !lg->break_out); return wait_event_interruptible(lg->break_wq, !lg->break_out);
...@@ -58,7 +60,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) ...@@ -58,7 +60,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
if (!lg) if (!lg)
return -EINVAL; return -EINVAL;
/* If you're not the task which owns the guest, go away. */ /* If you're not the task which owns the Guest, go away. */
if (current != lg->tsk) if (current != lg->tsk)
return -EPERM; return -EPERM;
...@@ -92,8 +94,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) ...@@ -92,8 +94,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
* base: The start of the Guest-physical memory inside the Launcher memory. * base: The start of the Guest-physical memory inside the Launcher memory.
* *
* pfnlimit: The highest (Guest-physical) page number the Guest should be * pfnlimit: The highest (Guest-physical) page number the Guest should be
* allowed to access. The Launcher has to live in Guest memory, so it sets * allowed to access. The Guest memory lives inside the Launcher, so it sets
* this to ensure the Guest can't reach it. * this to ensure the Guest can only reach its own memory.
* *
* pgdir: The (Guest-physical) address of the top of the initial Guest * pgdir: The (Guest-physical) address of the top of the initial Guest
* pagetables (which are set up by the Launcher). * pagetables (which are set up by the Launcher).
...@@ -189,7 +191,7 @@ static int initialize(struct file *file, const unsigned long __user *input) ...@@ -189,7 +191,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
} }
/*L:010 The first operation the Launcher does must be a write. All writes /*L:010 The first operation the Launcher does must be a write. All writes
* start with a 32 bit number: for the first write this must be * start with an unsigned long number: for the first write this must be
* LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
* writes of other values to send interrupts. */ * writes of other values to send interrupts. */
static ssize_t write(struct file *file, const char __user *in, static ssize_t write(struct file *file, const char __user *in,
...@@ -275,8 +277,7 @@ static int close(struct inode *inode, struct file *file) ...@@ -275,8 +277,7 @@ static int close(struct inode *inode, struct file *file)
* The Launcher is the Host userspace program which sets up, runs and services * The Launcher is the Host userspace program which sets up, runs and services
* the Guest. In fact, many comments in the Drivers which refer to "the Host" * the Guest. In fact, many comments in the Drivers which refer to "the Host"
* doing things are inaccurate: the Launcher does all the device handling for * doing things are inaccurate: the Launcher does all the device handling for
* the Guest. The Guest can't tell what's done by the the Launcher and what by * the Guest, but the Guest can't know that.
* the Host.
* *
* Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
* shall see more of that later. * shall see more of that later.
......
...@@ -26,7 +26,8 @@ ...@@ -26,7 +26,8 @@
* *
* We use two-level page tables for the Guest. If you're not entirely * We use two-level page tables for the Guest. If you're not entirely
* comfortable with virtual addresses, physical addresses and page tables then * comfortable with virtual addresses, physical addresses and page tables then
* I recommend you review lguest.c's "Page Table Handling" (with diagrams!). * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with
* diagrams!).
* *
* The Guest keeps page tables, but we maintain the actual ones here: these are * The Guest keeps page tables, but we maintain the actual ones here: these are
* called "shadow" page tables. Which is a very Guest-centric name: these are * called "shadow" page tables. Which is a very Guest-centric name: these are
...@@ -36,11 +37,11 @@ ...@@ -36,11 +37,11 @@
* *
* Anyway, this is the most complicated part of the Host code. There are seven * Anyway, this is the most complicated part of the Host code. There are seven
* parts to this: * parts to this:
* (i) Setting up a page table entry for the Guest when it faults, * (i) Looking up a page table entry when the Guest faults,
* (ii) Setting up the page table entry for the Guest stack, * (ii) Making sure the Guest stack is mapped,
* (iii) Setting up a page table entry when the Guest tells us it has changed, * (iii) Setting up a page table entry when the Guest tells us one has changed,
* (iv) Switching page tables, * (iv) Switching page tables,
* (v) Flushing (thowing away) page tables, * (v) Flushing (throwing away) page tables,
* (vi) Mapping the Switcher when the Guest is about to run, * (vi) Mapping the Switcher when the Guest is about to run,
* (vii) Setting up the page tables initially. * (vii) Setting up the page tables initially.
:*/ :*/
...@@ -57,16 +58,15 @@ ...@@ -57,16 +58,15 @@
static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
/*H:320 With our shadow and Guest types established, we need to deal with /*H:320 The page table code is curly enough to need helper functions to keep it
* them: the page table code is curly enough to need helper functions to keep * clear and clean.
* it clear and clean.
* *
* There are two functions which return pointers to the shadow (aka "real") * There are two functions which return pointers to the shadow (aka "real")
* page tables. * page tables.
* *
* spgd_addr() takes the virtual address and returns a pointer to the top-level * spgd_addr() takes the virtual address and returns a pointer to the top-level
* page directory entry for that address. Since we keep track of several page * page directory entry (PGD) for that address. Since we keep track of several
* tables, the "i" argument tells us which one we're interested in (it's * page tables, the "i" argument tells us which one we're interested in (it's
* usually the current one). */ * usually the current one). */
static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
{ {
...@@ -81,9 +81,9 @@ static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) ...@@ -81,9 +81,9 @@ static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
return &lg->pgdirs[i].pgdir[index]; return &lg->pgdirs[i].pgdir[index];
} }
/* This routine then takes the PGD entry given above, which contains the /* This routine then takes the page directory entry returned above, which
* address of the PTE page. It then returns a pointer to the PTE entry for the * contains the address of the page table entry (PTE) page. It then returns a
* given address. */ * pointer to the PTE entry for the given address. */
static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
{ {
pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
...@@ -191,7 +191,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd) ...@@ -191,7 +191,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd)
} }
/*H:330 /*H:330
* (i) Setting up a page table entry for the Guest when it faults * (i) Looking up a page table entry when the Guest faults.
* *
* We saw this call in run_guest(): when we see a page fault in the Guest, we * We saw this call in run_guest(): when we see a page fault in the Guest, we
* come here. That's because we only set up the shadow page tables lazily as * come here. That's because we only set up the shadow page tables lazily as
...@@ -199,7 +199,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd) ...@@ -199,7 +199,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd)
* and return to the Guest without it knowing. * and return to the Guest without it knowing.
* *
* If we fixed up the fault (ie. we mapped the address), this routine returns * If we fixed up the fault (ie. we mapped the address), this routine returns
* true. */ * true. Otherwise, it was a real fault and we need to tell the Guest. */
int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
{ {
pgd_t gpgd; pgd_t gpgd;
...@@ -246,16 +246,16 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) ...@@ -246,16 +246,16 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
return 0; return 0;
/* User access to a kernel page? (bit 3 == user access) */ /* User access to a kernel-only page? (bit 3 == user access) */
if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
return 0; return 0;
/* Check that the Guest PTE flags are OK, and the page number is below /* Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit (ie. not mapping the Launcher binary). */ * the pfn_limit (ie. not mapping the Launcher binary). */
check_gpte(lg, gpte); check_gpte(lg, gpte);
/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
gpte = pte_mkyoung(gpte); gpte = pte_mkyoung(gpte);
if (errcode & 2) if (errcode & 2)
gpte = pte_mkdirty(gpte); gpte = pte_mkdirty(gpte);
...@@ -272,23 +272,28 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) ...@@ -272,23 +272,28 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
else else
/* If this is a read, don't set the "writable" bit in the page /* If this is a read, don't set the "writable" bit in the page
* table entry, even if the Guest says it's writable. That way * table entry, even if the Guest says it's writable. That way
* we come back here when a write does actually ocur, so we can * we will come back here when a write does actually occur, so
* update the Guest's _PAGE_DIRTY flag. */ * we can update the Guest's _PAGE_DIRTY flag. */
*spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);
/* Finally, we write the Guest PTE entry back: we've set the /* Finally, we write the Guest PTE entry back: we've set the
* _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
lgwrite(lg, gpte_ptr, pte_t, gpte); lgwrite(lg, gpte_ptr, pte_t, gpte);
/* We succeeded in mapping the page! */ /* The fault is fixed, the page table is populated, the mapping
* manipulated, the result returned and the code complete. A small
* delay and a trace of alliteration are the only indications the Guest
* has that a page fault occurred at all. */
return 1; return 1;
} }
/*H:360 (ii) Setting up the page table entry for the Guest stack. /*H:360
* (ii) Making sure the Guest stack is mapped.
* *
* Remember pin_stack_pages() which makes sure the stack is mapped? It could * Remember that direct traps into the Guest need a mapped Guest kernel stack.
* simply call demand_page(), but as we've seen that logic is quite long, and * pin_stack_pages() calls us here: we could simply call demand_page(), but as
* usually the stack pages are already mapped anyway, so it's not required. * we've seen that logic is quite long, and usually the stack pages are already
* mapped, so it's overkill.
* *
* This is a quick version which answers the question: is this virtual address * This is a quick version which answers the question: is this virtual address
* mapped by the shadow page tables, and is it writable? */ * mapped by the shadow page tables, and is it writable? */
...@@ -297,7 +302,7 @@ static int page_writable(struct lguest *lg, unsigned long vaddr) ...@@ -297,7 +302,7 @@ static int page_writable(struct lguest *lg, unsigned long vaddr)
pgd_t *spgd; pgd_t *spgd;
unsigned long flags; unsigned long flags;
/* Look at the top level entry: is it present? */ /* Look at the current top level entry: is it present? */
spgd = spgd_addr(lg, lg->pgdidx, vaddr); spgd = spgd_addr(lg, lg->pgdidx, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
return 0; return 0;
...@@ -333,15 +338,14 @@ static void release_pgd(struct lguest *lg, pgd_t *spgd) ...@@ -333,15 +338,14 @@ static void release_pgd(struct lguest *lg, pgd_t *spgd)
release_pte(ptepage[i]); release_pte(ptepage[i]);
/* Now we can free the page of PTEs */ /* Now we can free the page of PTEs */
free_page((long)ptepage); free_page((long)ptepage);
/* And zero out the PGD entry we we never release it twice. */ /* And zero out the PGD entry so we never release it twice. */
*spgd = __pgd(0); *spgd = __pgd(0);
} }
} }
/*H:440 (v) Flushing (thowing away) page tables, /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()
* * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
* We saw flush_user_mappings() called when we re-used a top-level pgdir page. * It simply releases every PTE page from 0 up to the Guest's kernel address. */
* It simply releases every PTE page from 0 up to the kernel address. */
static void flush_user_mappings(struct lguest *lg, int idx) static void flush_user_mappings(struct lguest *lg, int idx)
{ {
unsigned int i; unsigned int i;
...@@ -350,8 +354,10 @@ static void flush_user_mappings(struct lguest *lg, int idx) ...@@ -350,8 +354,10 @@ static void flush_user_mappings(struct lguest *lg, int idx)
release_pgd(lg, lg->pgdirs[idx].pgdir + i); release_pgd(lg, lg->pgdirs[idx].pgdir + i);
} }
/* The Guest also has a hypercall to do this manually: it's used when a large /*H:440 (v) Flushing (throwing away) page tables,
* number of mappings have been changed. */ *
* The Guest has a hypercall to throw away the page tables: it's used when a
* large number of mappings have been changed. */
void guest_pagetable_flush_user(struct lguest *lg) void guest_pagetable_flush_user(struct lguest *lg)
{ {
/* Drop the userspace part of the current page table. */ /* Drop the userspace part of the current page table. */
...@@ -423,8 +429,9 @@ static unsigned int new_pgdir(struct lguest *lg, ...@@ -423,8 +429,9 @@ static unsigned int new_pgdir(struct lguest *lg,
/*H:430 (iv) Switching page tables /*H:430 (iv) Switching page tables
* *
* This is what happens when the Guest changes page tables (ie. changes the * Now we've seen all the page table setting and manipulation, let's see what
* top-level pgdir). This happens on almost every context switch. */ * happens when the Guest changes page tables (ie. changes the top-level
* pgdir). This occurs on almost every context switch. */
void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
{ {
int newpgdir, repin = 0; int newpgdir, repin = 0;
...@@ -443,7 +450,8 @@ void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) ...@@ -443,7 +450,8 @@ void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
} }
/*H:470 Finally, a routine which throws away everything: all PGD entries in all /*H:470 Finally, a routine which throws away everything: all PGD entries in all
* the shadow page tables. This is used when we destroy the Guest. */ * the shadow page tables, including the Guest's kernel mappings. This is used
* when we destroy the Guest. */
static void release_all_pagetables(struct lguest *lg) static void release_all_pagetables(struct lguest *lg)
{ {
unsigned int i, j; unsigned int i, j;
...@@ -458,13 +466,22 @@ static void release_all_pagetables(struct lguest *lg) ...@@ -458,13 +466,22 @@ static void release_all_pagetables(struct lguest *lg)
/* We also throw away everything when a Guest tells us it's changed a kernel /* We also throw away everything when a Guest tells us it's changed a kernel
* mapping. Since kernel mappings are in every page table, it's easiest to * mapping. Since kernel mappings are in every page table, it's easiest to
* throw them all away. This is amazingly slow, but thankfully rare. */ * throw them all away. This traps the Guest in amber for a while as
* everything faults back in, but it's rare. */
void guest_pagetable_clear_all(struct lguest *lg) void guest_pagetable_clear_all(struct lguest *lg)
{ {
release_all_pagetables(lg); release_all_pagetables(lg);
/* We need the Guest kernel stack mapped again. */ /* We need the Guest kernel stack mapped again. */
pin_stack_pages(lg); pin_stack_pages(lg);
} }
/*:*/
/*M:009 Since we throw away all mappings when a kernel mapping changes, our
* performance sucks for guests using highmem. In fact, a guest with
* PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
* usually slower than a Guest with less memory.
*
* This, of course, cannot be fixed. It would take some kind of... well, I
* don't know, but the term "puissant code-fu" comes to mind. :*/
/*H:420 This is the routine which actually sets the page table entry for the /*H:420 This is the routine which actually sets the page table entry for the
* "idx"'th shadow page table. * "idx"'th shadow page table.
...@@ -483,7 +500,7 @@ void guest_pagetable_clear_all(struct lguest *lg) ...@@ -483,7 +500,7 @@ void guest_pagetable_clear_all(struct lguest *lg)
static void do_set_pte(struct lguest *lg, int idx, static void do_set_pte(struct lguest *lg, int idx,
unsigned long vaddr, pte_t gpte) unsigned long vaddr, pte_t gpte)
{ {
/* Look up the matching shadow page directot entry. */ /* Look up the matching shadow page directory entry. */
pgd_t *spgd = spgd_addr(lg, idx, vaddr); pgd_t *spgd = spgd_addr(lg, idx, vaddr);
/* If the top level isn't present, there's no entry to update. */ /* If the top level isn't present, there's no entry to update. */
...@@ -500,7 +517,8 @@ static void do_set_pte(struct lguest *lg, int idx, ...@@ -500,7 +517,8 @@ static void do_set_pte(struct lguest *lg, int idx,
*spte = gpte_to_spte(lg, gpte, *spte = gpte_to_spte(lg, gpte,
pte_flags(gpte) & _PAGE_DIRTY); pte_flags(gpte) & _PAGE_DIRTY);
} else } else
/* Otherwise we can demand_page() it in later. */ /* Otherwise kill it and we can demand_page() it in
* later. */
*spte = __pte(0); *spte = __pte(0);
} }
} }
...@@ -535,7 +553,7 @@ void guest_set_pte(struct lguest *lg, ...@@ -535,7 +553,7 @@ void guest_set_pte(struct lguest *lg,
} }
/*H:400 /*H:400
* (iii) Setting up a page table entry when the Guest tells us it has changed. * (iii) Setting up a page table entry when the Guest tells us one has changed.
* *
* Just like we did in interrupts_and_traps.c, it makes sense for us to deal * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
* with the other side of page tables while we're here: what happens when the * with the other side of page tables while we're here: what happens when the
...@@ -612,9 +630,10 @@ void free_guest_pagetable(struct lguest *lg) ...@@ -612,9 +630,10 @@ void free_guest_pagetable(struct lguest *lg)
/*H:480 (vi) Mapping the Switcher when the Guest is about to run. /*H:480 (vi) Mapping the Switcher when the Guest is about to run.
* *
* The Switcher and the two pages for this CPU need to be available to the * The Switcher and the two pages for this CPU need to be visible in the
* Guest (and not the pages for other CPUs). We have the appropriate PTE pages * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
* for each CPU already set up, we just need to hook them in. */ * for each CPU already set up, we just need to hook them in now we know which
* Guest is about to run on this CPU. */
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
{ {
pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
...@@ -677,6 +696,18 @@ static __init void populate_switcher_pte_page(unsigned int cpu, ...@@ -677,6 +696,18 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
} }
/* We've made it through the page table code. Perhaps our tired brains are
* still processing the details, or perhaps we're simply glad it's over.
*
* If nothing else, note that all this complexity in juggling shadow page
* tables in sync with the Guest's page tables is for one reason: for most
* Guests this page table dance determines how bad performance will be. This
* is why Xen uses exotic direct Guest pagetable manipulation, and why both
* Intel and AMD have implemented shadow page table support directly into
* hardware.
*
* There is just one file remaining in the Host. */
/*H:510 At boot or module load time, init_pagetables() allocates and populates /*H:510 At boot or module load time, init_pagetables() allocates and populates
* the Switcher PTE page for each CPU. */ * the Switcher PTE page for each CPU. */
__init int init_pagetables(struct page **switcher_page, unsigned int pages) __init int init_pagetables(struct page **switcher_page, unsigned int pages)
......
...@@ -12,8 +12,6 @@ ...@@ -12,8 +12,6 @@
#include "lg.h" #include "lg.h"
/*H:600 /*H:600
* We've almost completed the Host; there's just one file to go!
*
* Segments & The Global Descriptor Table * Segments & The Global Descriptor Table
* *
* (That title sounds like a bad Nerdcore group. Not to suggest that there are * (That title sounds like a bad Nerdcore group. Not to suggest that there are
...@@ -55,7 +53,7 @@ static int ignored_gdt(unsigned int num) ...@@ -55,7 +53,7 @@ static int ignored_gdt(unsigned int num)
|| num == GDT_ENTRY_DOUBLEFAULT_TSS); || num == GDT_ENTRY_DOUBLEFAULT_TSS);
} }
/*H:610 Once the GDT has been changed, we fix the new entries up a little. We /*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We
* don't care if they're invalid: the worst that can happen is a General * don't care if they're invalid: the worst that can happen is a General
* Protection Fault in the Switcher when it restores a Guest segment register * Protection Fault in the Switcher when it restores a Guest segment register
* which tries to use that entry. Then we kill the Guest for causing such a * which tries to use that entry. Then we kill the Guest for causing such a
...@@ -84,25 +82,33 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) ...@@ -84,25 +82,33 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
} }
} }
/* This routine is called at boot or modprobe time for each CPU to set up the /*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep
* "constant" GDT entries for Guests running on that CPU. */ * a GDT for each CPU, and copy across the Guest's entries each time we want to
* run the Guest on that CPU.
*
* This routine is called at boot or modprobe time for each CPU to set up the
* constant GDT entries: the ones which are the same no matter what Guest we're
* running. */
void setup_default_gdt_entries(struct lguest_ro_state *state) void setup_default_gdt_entries(struct lguest_ro_state *state)
{ {
struct desc_struct *gdt = state->guest_gdt; struct desc_struct *gdt = state->guest_gdt;
unsigned long tss = (unsigned long)&state->guest_tss; unsigned long tss = (unsigned long)&state->guest_tss;
/* The hypervisor segments are full 0-4G segments, privilege level 0 */ /* The Switcher segments are full 0-4G segments, privilege level 0 */
gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
/* The TSS segment refers to the TSS entry for this CPU, so we cannot /* The TSS segment refers to the TSS entry for this particular CPU.
* copy it from the Guest. Forgive the magic flags */ * Forgive the magic flags: the 0x8900 means the entry is Present, it's
* privilege level 0 Available 386 TSS system segment, and the 0x67
* means Saturn is eclipsed by Mercury in the twelfth house. */
gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
| ((tss >> 16) & 0x000000FF); | ((tss >> 16) & 0x000000FF);
} }
/* This routine is called before the Guest is run for the first time. */ /* This routine sets up the initial Guest GDT for booting. All entries start
* as 0 (unusable). */
void setup_guest_gdt(struct lguest *lg) void setup_guest_gdt(struct lguest *lg)
{ {
/* Start with full 0-4G segments... */ /* Start with full 0-4G segments... */
...@@ -114,13 +120,8 @@ void setup_guest_gdt(struct lguest *lg) ...@@ -114,13 +120,8 @@ void setup_guest_gdt(struct lguest *lg)
lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
} }
/* Like the IDT, we never simply use the GDT the Guest gives us. We set up the /*H:650 An optimization of copy_gdt(), for just the three "thread-local storage"
* GDTs for each CPU, then we copy across the entries each time we want to run * entries. */
* a different Guest on that CPU. */
/* A partial GDT load, for the three "thead-local storage" entries. Otherwise
* it's just like load_guest_gdt(). So much, in fact, it would probably be
* neater to have a single hypercall to cover both. */
void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
{ {
unsigned int i; unsigned int i;
...@@ -129,7 +130,9 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) ...@@ -129,7 +130,9 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
gdt[i] = lg->arch.gdt[i]; gdt[i] = lg->arch.gdt[i];
} }
/* This is the full version */ /*H:640 When the Guest is run on a different CPU, or the GDT entries have
* changed, copy_gdt() is called to copy the Guest's GDT entries across to this
* CPU's GDT. */
void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
{ {
unsigned int i; unsigned int i;
...@@ -141,7 +144,8 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) ...@@ -141,7 +144,8 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
gdt[i] = lg->arch.gdt[i]; gdt[i] = lg->arch.gdt[i];
} }
/* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */ /*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT).
* We copy it from the Guest and tweak the entries. */
void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
{ {
/* We assume the Guest has the same number of GDT entries as the /* We assume the Guest has the same number of GDT entries as the
...@@ -157,16 +161,22 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) ...@@ -157,16 +161,22 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
lg->changed |= CHANGED_GDT; lg->changed |= CHANGED_GDT;
} }
/* This is the fast-track version for just changing the three TLS entries.
* Remember that this happens on every context switch, so it's worth
* optimizing. But wouldn't it be neater to have a single hypercall to cover
* both cases? */
void guest_load_tls(struct lguest *lg, unsigned long gtls) void guest_load_tls(struct lguest *lg, unsigned long gtls)
{ {
struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN];
__lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
/* Note that just the TLS entries have changed. */
lg->changed |= CHANGED_GDT_TLS; lg->changed |= CHANGED_GDT_TLS;
} }
/*:*/
/* /*H:660
* With this, we have finished the Host. * With this, we have finished the Host.
* *
* Five of the seven parts of our task are complete. You have made it through * Five of the seven parts of our task are complete. You have made it through
......
...@@ -63,7 +63,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu) ...@@ -63,7 +63,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
static DEFINE_PER_CPU(struct lguest *, last_guest); static DEFINE_PER_CPU(struct lguest *, last_guest);
/*S:010 /*S:010
* We are getting close to the Switcher. * We approach the Switcher.
* *
* Remember that each CPU has two pages which are visible to the Guest when it * Remember that each CPU has two pages which are visible to the Guest when it
* runs on that CPU. This has to contain the state for that Guest: we copy the * runs on that CPU. This has to contain the state for that Guest: we copy the
...@@ -134,7 +134,7 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) ...@@ -134,7 +134,7 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
* *
* The lcall also pushes the old code segment (KERNEL_CS) onto the * The lcall also pushes the old code segment (KERNEL_CS) onto the
* stack, then the address of this call. This stack layout happens to * stack, then the address of this call. This stack layout happens to
* exactly match the stack of an interrupt... */ * exactly match the stack layout created by an interrupt... */
asm volatile("pushf; lcall *lguest_entry" asm volatile("pushf; lcall *lguest_entry"
/* This is how we tell GCC that %eax ("a") and %ebx ("b") /* This is how we tell GCC that %eax ("a") and %ebx ("b")
* are changed by this routine. The "=" means output. */ * are changed by this routine. The "=" means output. */
...@@ -151,40 +151,46 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) ...@@ -151,40 +151,46 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
} }
/*:*/ /*:*/
/*M:002 There are hooks in the scheduler which we can register to tell when we
* get kicked off the CPU (preempt_notifier_register()). This would allow us
* to lazily disable SYSENTER which would regain some performance, and should
* also simplify copy_in_guest_info(). Note that we'd still need to restore
* things when we exit to Launcher userspace, but that's fairly easy.
*
* The hooks were designed for KVM, but we can also put them to good use. :*/
/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts /*H:040 This is the i386-specific code to setup and run the Guest. Interrupts
* are disabled: we own the CPU. */ * are disabled: we own the CPU. */
void lguest_arch_run_guest(struct lguest *lg) void lguest_arch_run_guest(struct lguest *lg)
{ {
/* Remember the awfully-named TS bit? If the Guest has asked /* Remember the awfully-named TS bit? If the Guest has asked to set it
* to set it we set it now, so we can trap and pass that trap * we set it now, so we can trap and pass that trap to the Guest if it
* to the Guest if it uses the FPU. */ * uses the FPU. */
if (lg->ts) if (lg->ts)
lguest_set_ts(); lguest_set_ts();
/* SYSENTER is an optimized way of doing system calls. We /* SYSENTER is an optimized way of doing system calls. We can't allow
* can't allow it because it always jumps to privilege level 0. * it because it always jumps to privilege level 0. A normal Guest
* A normal Guest won't try it because we don't advertise it in * won't try it because we don't advertise it in CPUID, but a malicious
* CPUID, but a malicious Guest (or malicious Guest userspace * Guest (or malicious Guest userspace program) could, so we tell the
* program) could, so we tell the CPU to disable it before * CPU to disable it before running the Guest. */
* running the Guest. */
if (boot_cpu_has(X86_FEATURE_SEP)) if (boot_cpu_has(X86_FEATURE_SEP))
wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
/* Now we actually run the Guest. It will pop back out when /* Now we actually run the Guest. It will return when something
* something interesting happens, and we can examine its * interesting happens, and we can examine its registers to see what it
* registers to see what it was doing. */ * was doing. */
run_guest_once(lg, lguest_pages(raw_smp_processor_id())); run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
/* The "regs" pointer contains two extra entries which are not /* Note that the "regs" pointer contains two extra entries which are
* really registers: a trap number which says what interrupt or * not really registers: a trap number which says what interrupt or
* trap made the switcher code come back, and an error code * trap made the switcher code come back, and an error code which some
* which some traps set. */ * traps set. */
/* If the Guest page faulted, then the cr2 register will tell /* If the Guest page faulted, then the cr2 register will tell us the
* us the bad virtual address. We have to grab this now, * bad virtual address. We have to grab this now, because once we
* because once we re-enable interrupts an interrupt could * re-enable interrupts an interrupt could fault and thus overwrite
* fault and thus overwrite cr2, or we could even move off to a * cr2, or we could even move off to a different CPU. */
* different CPU. */
if (lg->regs->trapnum == 14) if (lg->regs->trapnum == 14)
lg->arch.last_pagefault = read_cr2(); lg->arch.last_pagefault = read_cr2();
/* Similarly, if we took a trap because the Guest used the FPU, /* Similarly, if we took a trap because the Guest used the FPU,
...@@ -197,14 +203,15 @@ void lguest_arch_run_guest(struct lguest *lg) ...@@ -197,14 +203,15 @@ void lguest_arch_run_guest(struct lguest *lg)
wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
} }
/*H:130 Our Guest is usually so well behaved; it never tries to do things it /*H:130 Now we've examined the hypercall code; our Guest can make requests.
* isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't * Our Guest is usually so well behaved; it never tries to do things it isn't
* quite complete, because it doesn't contain replacements for the Intel I/O * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual
* instructions. As a result, the Guest sometimes fumbles across one during * infrastructure isn't quite complete, because it doesn't contain replacements
* the boot process as it probes for various things which are usually attached * for the Intel I/O instructions. As a result, the Guest sometimes fumbles
* to a PC. * across one during the boot process as it probes for various things which are
* usually attached to a PC.
* *
* When the Guest uses one of these instructions, we get trap #13 (General * When the Guest uses one of these instructions, we get a trap (General
* Protection Fault) and come here. We see if it's one of those troublesome * Protection Fault) and come here. We see if it's one of those troublesome
* instructions and skip over it. We return true if we did. */ * instructions and skip over it. We return true if we did. */
static int emulate_insn(struct lguest *lg) static int emulate_insn(struct lguest *lg)
...@@ -275,43 +282,43 @@ static int emulate_insn(struct lguest *lg) ...@@ -275,43 +282,43 @@ static int emulate_insn(struct lguest *lg)
void lguest_arch_handle_trap(struct lguest *lg) void lguest_arch_handle_trap(struct lguest *lg)
{ {
switch (lg->regs->trapnum) { switch (lg->regs->trapnum) {
case 13: /* We've intercepted a GPF. */ case 13: /* We've intercepted a General Protection Fault. */
/* Check if this was one of those annoying IN or OUT /* Check if this was one of those annoying IN or OUT
* instructions which we need to emulate. If so, we * instructions which we need to emulate. If so, we just go
* just go back into the Guest after we've done it. */ * back into the Guest after we've done it. */
if (lg->regs->errcode == 0) { if (lg->regs->errcode == 0) {
if (emulate_insn(lg)) if (emulate_insn(lg))
return; return;
} }
break; break;
case 14: /* We've intercepted a page fault. */ case 14: /* We've intercepted a Page Fault. */
/* The Guest accessed a virtual address that wasn't /* The Guest accessed a virtual address that wasn't mapped.
* mapped. This happens a lot: we don't actually set * This happens a lot: we don't actually set up most of the
* up most of the page tables for the Guest at all when * page tables for the Guest at all when we start: as it runs
* we start: as it runs it asks for more and more, and * it asks for more and more, and we set them up as
* we set them up as required. In this case, we don't * required. In this case, we don't even tell the Guest that
* even tell the Guest that the fault happened. * the fault happened.
* *
* The errcode tells whether this was a read or a * The errcode tells whether this was a read or a write, and
* write, and whether kernel or userspace code. */ * whether kernel or userspace code. */
if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
return; return;
/* OK, it's really not there (or not OK): the Guest /* OK, it's really not there (or not OK): the Guest needs to
* needs to know. We write out the cr2 value so it * know. We write out the cr2 value so it knows where the
* knows where the fault occurred. * fault occurred.
* *
* Note that if the Guest were really messed up, this * Note that if the Guest were really messed up, this could
* could happen before it's done the INITIALIZE * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
* hypercall, so lg->lguest_data will be NULL */ * lg->lguest_data could be NULL */
if (lg->lguest_data && if (lg->lguest_data &&
put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
kill_guest(lg, "Writing cr2"); kill_guest(lg, "Writing cr2");
break; break;
case 7: /* We've intercepted a Device Not Available fault. */ case 7: /* We've intercepted a Device Not Available fault. */
/* If the Guest doesn't want to know, we already /* If the Guest doesn't want to know, we already restored the
* restored the Floating Point Unit, so we just * Floating Point Unit, so we just continue without telling
* continue without telling it. */ * it. */
if (!lg->ts) if (!lg->ts)
return; return;
break; break;
...@@ -536,9 +543,6 @@ int lguest_arch_init_hypercalls(struct lguest *lg) ...@@ -536,9 +543,6 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
return 0; return 0;
} }
/* Now we've examined the hypercall code; our Guest can make requests. There
* is one other way we can do things for the Guest, as we see in
* emulate_insn(). :*/
/*L:030 lguest_arch_setup_regs() /*L:030 lguest_arch_setup_regs()
* *
...@@ -562,7 +566,7 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) ...@@ -562,7 +566,7 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
* is supposed to always be "1". Bit 9 (0x200) controls whether * is supposed to always be "1". Bit 9 (0x200) controls whether
* interrupts are enabled. We always leave interrupts enabled while * interrupts are enabled. We always leave interrupts enabled while
* running the Guest. */ * running the Guest. */
regs->eflags = 0x202; regs->eflags = X86_EFLAGS_IF | 0x2;
/* The "Extended Instruction Pointer" register says where the Guest is /* The "Extended Instruction Pointer" register says where the Guest is
* running. */ * running. */
...@@ -570,8 +574,8 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) ...@@ -570,8 +574,8 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
/* %esi points to our boot information, at physical address 0, so don't /* %esi points to our boot information, at physical address 0, so don't
* touch it. */ * touch it. */
/* There are a couple of GDT entries the Guest expects when first /* There are a couple of GDT entries the Guest expects when first
* booting. */ * booting. */
setup_guest_gdt(lg); setup_guest_gdt(lg);
} }
...@@ -6,6 +6,37 @@ ...@@ -6,6 +6,37 @@
* are feeling invigorated and refreshed then the next, more challenging stage * are feeling invigorated and refreshed then the next, more challenging stage
* can be found in "make Guest". :*/ * can be found in "make Guest". :*/
/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
* gain at least 1% more performance. Since neither LOC nor performance can be
* measured beforehand, it generally means implementing a feature then deciding
* if it's worth it. And once it's implemented, who can say no?
*
* This is why I haven't implemented this idea myself. I want to, but I
* haven't. You could, though.
*
* The main place where lguest performance sucks is Guest page faulting. When
* a Guest userspace process hits an unmapped page we switch back to the Host,
* walk the page tables, find it's not mapped, switch back to the Guest page
* fault handler, which calls a hypercall to set the page table entry, then
* finally returns to userspace. That's two round-trips.
*
* If we had a small walker in the Switcher, we could quickly check the Guest
* page table and if the page isn't mapped, immediately reflect the fault back
* into the Guest. This means the Switcher would have to know the top of the
* Guest page table and the page fault handler address.
*
* For simplicity, the Guest should only handle the case where the privilege
* level of the fault is 3 and probably only not present or write faults. It
* should also detect recursive faults, and hand the original fault to the
* Host (which is actually really easy).
*
* Two questions remain. Would the performance gain outweigh the complexity?
* And who would write the verse documenting it? :*/
/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their
* code). It's worth doing though, since it would let us use oprofile in the
* Host when a Guest is running. :*/
/*S:100 /*S:100
* Welcome to the Switcher itself! * Welcome to the Switcher itself!
* *
...@@ -88,7 +119,7 @@ ENTRY(switch_to_guest) ...@@ -88,7 +119,7 @@ ENTRY(switch_to_guest)
// All saved and there's now five steps before us: // All saved and there's now five steps before us:
// Stack, GDT, IDT, TSS // Stack, GDT, IDT, TSS
// And last of all the page tables are flipped. // Then last of all the page tables are flipped.
// Yet beware that our stack pointer must be // Yet beware that our stack pointer must be
// Always valid lest an NMI hits // Always valid lest an NMI hits
...@@ -103,25 +134,25 @@ ENTRY(switch_to_guest) ...@@ -103,25 +134,25 @@ ENTRY(switch_to_guest)
lgdt LGUEST_PAGES_guest_gdt_desc(%eax) lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
// The Guest's IDT we did partially // The Guest's IDT we did partially
// Move to the "struct lguest_pages" as well. // Copy to "struct lguest_pages" as well.
lidt LGUEST_PAGES_guest_idt_desc(%eax) lidt LGUEST_PAGES_guest_idt_desc(%eax)
// The TSS entry which controls traps // The TSS entry which controls traps
// Must be loaded up with "ltr" now: // Must be loaded up with "ltr" now:
// The GDT entry that TSS uses
// Changes type when we load it: damn Intel!
// For after we switch over our page tables // For after we switch over our page tables
// It (as the rest) will be writable no more. // That entry will be read-only: we'd crash.
// (The GDT entry TSS needs
// Changes type when we load it: damn Intel!)
movl $(GDT_ENTRY_TSS*8), %edx movl $(GDT_ENTRY_TSS*8), %edx
ltr %dx ltr %dx
// Look back now, before we take this last step! // Look back now, before we take this last step!
// The Host's TSS entry was also marked used; // The Host's TSS entry was also marked used;
// Let's clear it again, ere we return. // Let's clear it again for our return.
// The GDT descriptor of the Host // The GDT descriptor of the Host
// Points to the table after two "size" bytes // Points to the table after two "size" bytes
movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
// Clear the type field of "used" (byte 5, bit 2) // Clear "used" from type field (byte 5, bit 2)
andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
// Once our page table's switched, the Guest is live! // Once our page table's switched, the Guest is live!
...@@ -131,7 +162,7 @@ ENTRY(switch_to_guest) ...@@ -131,7 +162,7 @@ ENTRY(switch_to_guest)
// The page table change did one tricky thing: // The page table change did one tricky thing:
// The Guest's register page has been mapped // The Guest's register page has been mapped
// Writable onto our %esp (stack) -- // Writable under our %esp (stack) --
// We can simply pop off all Guest regs. // We can simply pop off all Guest regs.
popl %eax popl %eax
popl %ebx popl %ebx
...@@ -152,16 +183,15 @@ ENTRY(switch_to_guest) ...@@ -152,16 +183,15 @@ ENTRY(switch_to_guest)
addl $8, %esp addl $8, %esp
// The last five stack slots hold return address // The last five stack slots hold return address
// And everything needed to change privilege // And everything needed to switch privilege
// Into the Guest privilege level of 1, // From Switcher's level 0 to Guest's 1,
// And the stack where the Guest had last left it. // And the stack where the Guest had last left it.
// Interrupts are turned back on: we are Guest. // Interrupts are turned back on: we are Guest.
iret iret
// There are two paths where we switch to the Host // We treat two paths to switch back to the Host
// Yet both must save Guest state and restore Host
// So we put the routine in a macro. // So we put the routine in a macro.
// We are on our way home, back to the Host
// Interrupted out of the Guest, we come here.
#define SWITCH_TO_HOST \ #define SWITCH_TO_HOST \
/* We save the Guest state: all registers first \ /* We save the Guest state: all registers first \
* Laid out just as "struct lguest_regs" defines */ \ * Laid out just as "struct lguest_regs" defines */ \
...@@ -194,7 +224,7 @@ ENTRY(switch_to_guest) ...@@ -194,7 +224,7 @@ ENTRY(switch_to_guest)
movl %esp, %eax; \ movl %esp, %eax; \
andl $(~(1 << PAGE_SHIFT - 1)), %eax; \ andl $(~(1 << PAGE_SHIFT - 1)), %eax; \
/* Save our trap number: the switch will obscure it \ /* Save our trap number: the switch will obscure it \
* (The Guest regs are not mapped here in the Host) \ * (In the Host the Guest regs are not mapped here) \
* %ebx holds it safe for deliver_to_host */ \ * %ebx holds it safe for deliver_to_host */ \
movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
/* The Host GDT, IDT and stack! \ /* The Host GDT, IDT and stack! \
...@@ -210,9 +240,9 @@ ENTRY(switch_to_guest) ...@@ -210,9 +240,9 @@ ENTRY(switch_to_guest)
/* Switch to Host's GDT, IDT. */ \ /* Switch to Host's GDT, IDT. */ \
lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
lidt LGUEST_PAGES_host_idt_desc(%eax); \ lidt LGUEST_PAGES_host_idt_desc(%eax); \
/* Restore the Host's stack where it's saved regs lie */ \ /* Restore the Host's stack where its saved regs lie */ \
movl LGUEST_PAGES_host_sp(%eax), %esp; \ movl LGUEST_PAGES_host_sp(%eax), %esp; \
/* Last the TSS: our Host is complete */ \ /* Last the TSS: our Host is returned */ \
movl $(GDT_ENTRY_TSS*8), %edx; \ movl $(GDT_ENTRY_TSS*8), %edx; \
ltr %dx; \ ltr %dx; \
/* Restore now the regs saved right at the first. */ \ /* Restore now the regs saved right at the first. */ \
...@@ -222,14 +252,15 @@ ENTRY(switch_to_guest) ...@@ -222,14 +252,15 @@ ENTRY(switch_to_guest)
popl %ds; \ popl %ds; \
popl %es popl %es
// Here's where we come when the Guest has just trapped: // The first path is trod when the Guest has trapped:
// (Which trap we'll see has been pushed on the stack). // (Which trap it was has been pushed on the stack).
// We need only switch back, and the Host will decode // We need only switch back, and the Host will decode
// Why we came home, and what needs to be done. // Why we came home, and what needs to be done.
return_to_host: return_to_host:
SWITCH_TO_HOST SWITCH_TO_HOST
iret iret
// We are led to the second path like so:
// An interrupt, with some cause external // An interrupt, with some cause external
// Has ajerked us rudely from the Guest's code // Has ajerked us rudely from the Guest's code
// Again we must return home to the Host // Again we must return home to the Host
...@@ -238,7 +269,7 @@ deliver_to_host: ...@@ -238,7 +269,7 @@ deliver_to_host:
// But now we must go home via that place // But now we must go home via that place
// Where that interrupt was supposed to go // Where that interrupt was supposed to go
// Had we not been ensconced, running the Guest. // Had we not been ensconced, running the Guest.
// Here we see the cleverness of our stack: // Here we see the trickiness of run_guest_once():
// The Host stack is formed like an interrupt // The Host stack is formed like an interrupt
// With EIP, CS and EFLAGS layered. // With EIP, CS and EFLAGS layered.
// Interrupt handlers end with "iret" // Interrupt handlers end with "iret"
...@@ -263,7 +294,7 @@ deliver_to_host: ...@@ -263,7 +294,7 @@ deliver_to_host:
xorw %ax, %ax xorw %ax, %ax
orl %eax, %edx orl %eax, %edx
// Now the address of the handler's in %edx // Now the address of the handler's in %edx
// We call it now: its "iret" takes us home. // We call it now: its "iret" drops us home.
jmp *%edx jmp *%edx
// Every interrupt can come to us here // Every interrupt can come to us here
......
...@@ -18,12 +18,17 @@ ...@@ -18,12 +18,17 @@
#define LHCALL_LOAD_TLS 16 #define LHCALL_LOAD_TLS 16
#define LHCALL_NOTIFY 17 #define LHCALL_NOTIFY 17
#define LGUEST_TRAP_ENTRY 0x1F
#ifndef __ASSEMBLY__
#include <asm/hw_irq.h>
/*G:031 First, how does our Guest contact the Host to ask for privileged /*G:031 First, how does our Guest contact the Host to ask for privileged
* operations? There are two ways: the direct way is to make a "hypercall", * operations? There are two ways: the direct way is to make a "hypercall",
* to make requests of the Host Itself. * to make requests of the Host Itself.
* *
* Our hypercall mechanism uses the highest unused trap code (traps 32 and * Our hypercall mechanism uses the highest unused trap code (traps 32 and
* above are used by real hardware interrupts). Seventeen hypercalls are * above are used by real hardware interrupts). Fifteen hypercalls are
* available: the hypercall number is put in the %eax register, and the * available: the hypercall number is put in the %eax register, and the
* arguments (when required) are placed in %edx, %ebx and %ecx. If a return * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
* value makes sense, it's returned in %eax. * value makes sense, it's returned in %eax.
...@@ -31,20 +36,15 @@ ...@@ -31,20 +36,15 @@
* Grossly invalid calls result in Sudden Death at the hands of the vengeful * Grossly invalid calls result in Sudden Death at the hands of the vengeful
* Host, rather than returning failure. This reflects Winston Churchill's * Host, rather than returning failure. This reflects Winston Churchill's
* definition of a gentleman: "someone who is only rude intentionally". */ * definition of a gentleman: "someone who is only rude intentionally". */
#define LGUEST_TRAP_ENTRY 0x1F
#ifndef __ASSEMBLY__
#include <asm/hw_irq.h>
static inline unsigned long static inline unsigned long
hcall(unsigned long call, hcall(unsigned long call,
unsigned long arg1, unsigned long arg2, unsigned long arg3) unsigned long arg1, unsigned long arg2, unsigned long arg3)
{ {
/* "int" is the Intel instruction to trigger a trap. */ /* "int" is the Intel instruction to trigger a trap. */
asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
/* The call is in %eax (aka "a"), and can be replaced */ /* The call in %eax (aka "a") might be overwritten */
: "=a"(call) : "=a"(call)
/* The other arguments are in %eax, %edx, %ebx & %ecx */ /* The arguments are in %eax, %edx, %ebx & %ecx */
: "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
/* "memory" means this might write somewhere in memory. /* "memory" means this might write somewhere in memory.
* This isn't true for all calls, but it's safe to tell * This isn't true for all calls, but it's safe to tell
......
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
#define LG_CLOCK_MAX_DELTA ULONG_MAX #define LG_CLOCK_MAX_DELTA ULONG_MAX
/*G:032 The second method of communicating with the Host is via "struct /*G:032 The second method of communicating with the Host is via "struct
* lguest_data". The Guest's very first hypercall is to tell the Host where * lguest_data". Once the Guest's initialization hypercall tells the Host where
* this is, and then the Guest and Host both publish information in it. :*/ * this is, the Guest and Host both publish information in it. :*/
struct lguest_data struct lguest_data
{ {
/* 512 == enabled (same as eflags in normal hardware). The Guest /* 512 == enabled (same as eflags in normal hardware). The Guest
......
#ifndef _ASM_LGUEST_USER #ifndef _LINUX_LGUEST_LAUNCHER
#define _ASM_LGUEST_USER #define _LINUX_LGUEST_LAUNCHER
/* Everything the "lguest" userspace program needs to know. */ /* Everything the "lguest" userspace program needs to know. */
#include <linux/types.h> #include <linux/types.h>
/* They can register up to 32 arrays of lguest_dma. */
#define LGUEST_MAX_DMA 32
/* At most we can dma 16 lguest_dma in one op. */
#define LGUEST_MAX_DMA_SECTIONS 16
/* How many devices? Assume each one wants up to two dma arrays per device. */
#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)
/* Where the Host expects the Guest to SEND_DMA console output to. */
#define LGUEST_CONSOLE_DMA_KEY 0
/*D:010 /*D:010
* Drivers * Drivers
...@@ -20,7 +10,11 @@ ...@@ -20,7 +10,11 @@
* real devices (think of the damage it could do!) we provide virtual devices. * real devices (think of the damage it could do!) we provide virtual devices.
* We could emulate a PCI bus with various devices on it, but that is a fairly * We could emulate a PCI bus with various devices on it, but that is a fairly
* complex burden for the Host and suboptimal for the Guest, so we have our own * complex burden for the Host and suboptimal for the Guest, so we have our own
* "lguest" bus and simple drivers. * simple lguest bus and we use "virtio" drivers. These drivers need a set of
* routines from us which will actually do the virtual I/O, but they handle all
* the net/block/console stuff themselves. This means that if we want to add
* a new device, we simply need to write a new virtio driver and create support
* for it in the Launcher: this code won't need to change.
* *
* Devices are described by a simplified ID, a status byte, and some "config" * Devices are described by a simplified ID, a status byte, and some "config"
* bytes which describe this device's configuration. This is placed by the * bytes which describe this device's configuration. This is placed by the
...@@ -51,9 +45,9 @@ struct lguest_vqconfig { ...@@ -51,9 +45,9 @@ struct lguest_vqconfig {
/* Write command first word is a request. */ /* Write command first word is a request. */
enum lguest_req enum lguest_req
{ {
LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */ LHREQ_INITIALIZE, /* + base, pfnlimit, pgdir, start */
LHREQ_GETDMA, /* No longer used */ LHREQ_GETDMA, /* No longer used */
LHREQ_IRQ, /* + irq */ LHREQ_IRQ, /* + irq */
LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
}; };
#endif /* _ASM_LGUEST_USER */ #endif /* _LINUX_LGUEST_LAUNCHER */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment