Commit a7e1aabb authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus

* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
  lguest: Fix in/out emulation
  lguest: Fix translation count about wikipedia's cpuid page
  lguest: Fix three simple typos in comments
  lguest: update comments
  lguest: Simplify device initialization.
  lguest: don't rewrite vmcall instructions
  lguest: remove remaining vmcall
  lguest: use a special 1:1 linear pagetable mode until first switch.
  lguest: Do not exit on non-fatal errors
parents 111ad119 996ba96a
...@@ -51,7 +51,7 @@ ...@@ -51,7 +51,7 @@
#include <asm/bootparam.h> #include <asm/bootparam.h>
#include "../../../include/linux/lguest_launcher.h" #include "../../../include/linux/lguest_launcher.h"
/*L:110 /*L:110
* We can ignore the 42 include files we need for this program, but I do want * We can ignore the 43 include files we need for this program, but I do want
* to draw attention to the use of kernel-style types. * to draw attention to the use of kernel-style types.
* *
* As Linus said, "C is a Spartan language, and so should your naming be." I * As Linus said, "C is a Spartan language, and so should your naming be." I
...@@ -65,7 +65,6 @@ typedef uint16_t u16; ...@@ -65,7 +65,6 @@ typedef uint16_t u16;
typedef uint8_t u8; typedef uint8_t u8;
/*:*/ /*:*/
#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
#define BRIDGE_PFX "bridge:" #define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF #ifndef SIOCBRADDIF
#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ #define SIOCBRADDIF 0x89a2 /* add interface to bridge */
...@@ -861,8 +860,10 @@ static void console_output(struct virtqueue *vq) ...@@ -861,8 +860,10 @@ static void console_output(struct virtqueue *vq)
/* writev can return a partial write, so we loop here. */ /* writev can return a partial write, so we loop here. */
while (!iov_empty(iov, out)) { while (!iov_empty(iov, out)) {
int len = writev(STDOUT_FILENO, iov, out); int len = writev(STDOUT_FILENO, iov, out);
if (len <= 0) if (len <= 0) {
err(1, "Write to stdout gave %i", len); warn("Write to stdout gave %i (%d)", len, errno);
break;
}
iov_consume(iov, out, len); iov_consume(iov, out, len);
} }
...@@ -898,7 +899,7 @@ static void net_output(struct virtqueue *vq) ...@@ -898,7 +899,7 @@ static void net_output(struct virtqueue *vq)
* same format: what a coincidence! * same format: what a coincidence!
*/ */
if (writev(net_info->tunfd, iov, out) < 0) if (writev(net_info->tunfd, iov, out) < 0)
errx(1, "Write to tun failed?"); warnx("Write to tun failed (%d)?", errno);
/* /*
* Done with that one; wait_for_vq_desc() will send the interrupt if * Done with that one; wait_for_vq_desc() will send the interrupt if
...@@ -955,7 +956,7 @@ static void net_input(struct virtqueue *vq) ...@@ -955,7 +956,7 @@ static void net_input(struct virtqueue *vq)
*/ */
len = readv(net_info->tunfd, iov, in); len = readv(net_info->tunfd, iov, in);
if (len <= 0) if (len <= 0)
err(1, "Failed to read from tun."); warn("Failed to read from tun (%d).", errno);
/* /*
* Mark that packet buffer as used, but don't interrupt here. We want * Mark that packet buffer as used, but don't interrupt here. We want
...@@ -1093,8 +1094,9 @@ static void update_device_status(struct device *dev) ...@@ -1093,8 +1094,9 @@ static void update_device_status(struct device *dev)
warnx("Device %s configuration FAILED", dev->name); warnx("Device %s configuration FAILED", dev->name);
if (dev->running) if (dev->running)
reset_device(dev); reset_device(dev);
} else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { } else {
if (!dev->running) if (dev->running)
err(1, "Device %s features finalized twice", dev->name);
start_device(dev); start_device(dev);
} }
} }
...@@ -1120,25 +1122,11 @@ static void handle_output(unsigned long addr) ...@@ -1120,25 +1122,11 @@ static void handle_output(unsigned long addr)
return; return;
} }
/* /* Devices should not be used before features are finalized. */
* Devices *can* be used before status is set to DRIVER_OK.
* The original plan was that they would never do this: they
* would always finish setting up their status bits before
* actually touching the virtqueues. In practice, we allowed
* them to, and they do (eg. the disk probes for partition
* tables as part of initialization).
*
* If we see this, we start the device: once it's running, we
* expect the device to catch all the notifications.
*/
for (vq = i->vq; vq; vq = vq->next) { for (vq = i->vq; vq; vq = vq->next) {
if (addr != vq->config.pfn*getpagesize()) if (addr != vq->config.pfn*getpagesize())
continue; continue;
if (i->running) errx(1, "Notification on %s before setup!", i->name);
errx(1, "Notification on running %s", i->name);
/* This just calls create_thread() for each virtqueue */
start_device(i);
return;
} }
} }
...@@ -1370,7 +1358,7 @@ static void setup_console(void) ...@@ -1370,7 +1358,7 @@ static void setup_console(void)
* --sharenet=<name> option which opens or creates a named pipe. This can be * --sharenet=<name> option which opens or creates a named pipe. This can be
* used to send packets to another guest in a 1:1 manner. * used to send packets to another guest in a 1:1 manner.
* *
* More sopisticated is to use one of the tools developed for project like UML * More sophisticated is to use one of the tools developed for project like UML
* to do networking. * to do networking.
* *
* Faster is to do virtio bonding in kernel. Doing this 1:1 would be * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
...@@ -1380,7 +1368,7 @@ static void setup_console(void) ...@@ -1380,7 +1368,7 @@ static void setup_console(void)
* multiple inter-guest channels behind one interface, although it would * multiple inter-guest channels behind one interface, although it would
* require some manner of hotplugging new virtio channels. * require some manner of hotplugging new virtio channels.
* *
* Finally, we could implement a virtio network switch in the kernel. * Finally, we could use a virtio network switch in the kernel, ie. vhost.
:*/ :*/
static u32 str2ip(const char *ipaddr) static u32 str2ip(const char *ipaddr)
...@@ -2017,10 +2005,7 @@ int main(int argc, char *argv[]) ...@@ -2017,10 +2005,7 @@ int main(int argc, char *argv[])
/* Tell the entry path not to try to reload segment registers. */ /* Tell the entry path not to try to reload segment registers. */
boot->hdr.loadflags |= KEEP_SEGMENTS; boot->hdr.loadflags |= KEEP_SEGMENTS;
/* /* We tell the kernel to initialize the Guest. */
* We tell the kernel to initialize the Guest: this returns the open
* /dev/lguest file descriptor.
*/
tell_kernel(start); tell_kernel(start);
/* Ensure that we terminate if a device-servicing child dies. */ /* Ensure that we terminate if a device-servicing child dies. */
......
...@@ -61,6 +61,7 @@ hcall(unsigned long call, ...@@ -61,6 +61,7 @@ hcall(unsigned long call,
: "memory"); : "memory");
return call; return call;
} }
/*:*/
/* Can't use our min() macro here: needs to be a constant */ /* Can't use our min() macro here: needs to be a constant */
#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
......
...@@ -63,7 +63,6 @@ void foo(void) ...@@ -63,7 +63,6 @@ void foo(void)
BLANK(); BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
BLANK(); BLANK();
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
......
...@@ -71,7 +71,8 @@ ...@@ -71,7 +71,8 @@
#include <asm/stackprotector.h> #include <asm/stackprotector.h>
#include <asm/reboot.h> /* for struct machine_ops */ #include <asm/reboot.h> /* for struct machine_ops */
/*G:010 Welcome to the Guest! /*G:010
* Welcome to the Guest!
* *
* The Guest in our tale is a simple creature: identical to the Host but * The Guest in our tale is a simple creature: identical to the Host but
* behaving in simplified but equivalent ways. In particular, the Guest is the * behaving in simplified but equivalent ways. In particular, the Guest is the
...@@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call, ...@@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call,
#endif #endif
/*G:036 /*G:036
* When lazy mode is turned off reset the per-cpu lazy mode variable and then * When lazy mode is turned off, we issue the do-nothing hypercall to
* issue the do-nothing hypercall to flush any stored calls. * flush any stored calls, and call the generic helper to reset the
:*/ * per-cpu lazy mode variable.
*/
static void lguest_leave_lazy_mmu_mode(void) static void lguest_leave_lazy_mmu_mode(void)
{ {
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
paravirt_leave_lazy_mmu(); paravirt_leave_lazy_mmu();
} }
/*
* We also catch the end of context switch; we enter lazy mode for much of
* that too, so again we need to flush here.
*
* (Technically, this is lazy CPU mode, and normally we're in lazy MMU
* mode, but unlike Xen, lguest doesn't care about the difference).
*/
static void lguest_end_context_switch(struct task_struct *next) static void lguest_end_context_switch(struct task_struct *next)
{ {
hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
...@@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void) ...@@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void)
* giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * giant ball of hair. Its entry in the current Intel manual runs to 28 pages.
* *
* This instruction even has its own Wikipedia entry. The Wikipedia entry * This instruction even has its own Wikipedia entry. The Wikipedia entry
* has been translated into 5 languages. I am not making this up! * has been translated into 6 languages. I am not making this up!
* *
* We could get funky here and identify ourselves as "GenuineLguest", but * We could get funky here and identify ourselves as "GenuineLguest", but
* instead we just use the real "cpuid" instruction. Then I pretty much turned * instead we just use the real "cpuid" instruction. Then I pretty much turned
...@@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, ...@@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
/* /*
* PAE systems can mark pages as non-executable. Linux calls this the * PAE systems can mark pages as non-executable. Linux calls this the
* NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
* Virus Protection). We just switch turn if off here, since we don't * Virus Protection). We just switch it off here, since we don't
* support it. * support it.
*/ */
case 0x80000001: case 0x80000001:
...@@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void) ...@@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void)
/* See lguest_set_pte() below. */ /* See lguest_set_pte() below. */
static bool cr3_changed = false; static bool cr3_changed = false;
static unsigned long current_cr3;
/* /*
* cr3 is the current toplevel pagetable page: the principle is the same as * cr3 is the current toplevel pagetable page: the principle is the same as
* cr0. Keep a local copy, and tell the Host when it changes. The only * cr0. Keep a local copy, and tell the Host when it changes.
* difference is that our local copy is in lguest_data because the Host needs
* to set it upon our initial hypercall.
*/ */
static void lguest_write_cr3(unsigned long cr3) static void lguest_write_cr3(unsigned long cr3)
{ {
lguest_data.pgdir = cr3;
lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
current_cr3 = cr3;
/* These two page tables are simple, linear, and used during boot */ /* These two page tables are simple, linear, and used during boot */
if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
...@@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3) ...@@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3)
static unsigned long lguest_read_cr3(void) static unsigned long lguest_read_cr3(void)
{ {
return lguest_data.pgdir; return current_cr3;
} }
/* cr4 is used to enable and disable PGE, but we don't care. */ /* cr4 is used to enable and disable PGE, but we don't care. */
...@@ -641,7 +649,7 @@ static void lguest_write_cr4(unsigned long val) ...@@ -641,7 +649,7 @@ static void lguest_write_cr4(unsigned long val)
/* /*
* The Guest calls this after it has set a second-level entry (pte), ie. to map * The Guest calls this after it has set a second-level entry (pte), ie. to map
* a page into a process' address space. Wetell the Host the toplevel and * a page into a process' address space. We tell the Host the toplevel and
* address this corresponds to. The Guest uses one pagetable per process, so * address this corresponds to. The Guest uses one pagetable per process, so
* we need to tell the Host which one we're changing (mm->pgd). * we need to tell the Host which one we're changing (mm->pgd).
*/ */
...@@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp) ...@@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
static void lguest_flush_tlb_single(unsigned long addr) static void lguest_flush_tlb_single(unsigned long addr)
{ {
/* Simply set it to zero: if it was not, it will fault back in. */ /* Simply set it to zero: if it was not, it will fault back in. */
lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
} }
/* /*
...@@ -1140,7 +1148,7 @@ static struct notifier_block paniced = { ...@@ -1140,7 +1148,7 @@ static struct notifier_block paniced = {
static __init char *lguest_memory_setup(void) static __init char *lguest_memory_setup(void)
{ {
/* /*
*The Linux bootloader header contains an "e820" memory map: the * The Linux bootloader header contains an "e820" memory map: the
* Launcher populated the first entry with our memory limit. * Launcher populated the first entry with our memory limit.
*/ */
e820_add_region(boot_params.e820_map[0].addr, e820_add_region(boot_params.e820_map[0].addr,
......
...@@ -6,18 +6,22 @@ ...@@ -6,18 +6,22 @@
#include <asm/processor-flags.h> #include <asm/processor-flags.h>
/*G:020 /*G:020
* Our story starts with the kernel booting into startup_32 in
* arch/x86/kernel/head_32.S. It expects a boot header, which is created by * Our story starts with the bzImage: booting starts at startup_32 in
* the bootloader (the Launcher in our case). * arch/x86/boot/compressed/head_32.S. This merely uncompresses the real
* kernel in place and then jumps into it: startup_32 in
* arch/x86/kernel/head_32.S. Both routines expect a boot header in the %esi
* register, which is created by the bootloader (the Launcher in our case).
* *
* The startup_32 function does very little: it clears the uninitialized global * The startup_32 function does very little: it clears the uninitialized global
* C variables which we expect to be zero (ie. BSS) and then copies the boot * C variables which we expect to be zero (ie. BSS) and then copies the boot
* header and kernel command line somewhere safe. Finally it checks the * header and kernel command line somewhere safe, and populates some initial
* 'hardware_subarch' field. This was introduced in 2.6.24 for lguest and Xen: * page tables. Finally it checks the 'hardware_subarch' field. This was
* if it's set to '1' (lguest's assigned number), then it calls us here. * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
* assigned number), then it calls us here.
* *
* WARNING: be very careful here! We're running at addresses equal to physical * WARNING: be very careful here! We're running at addresses equal to physical
* addesses (around 0), not above PAGE_OFFSET as most code expectes * addresses (around 0), not above PAGE_OFFSET as most code expects
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
* data without remembering to subtract __PAGE_OFFSET! * data without remembering to subtract __PAGE_OFFSET!
* *
...@@ -27,13 +31,18 @@ ...@@ -27,13 +31,18 @@
.section .init.text, "ax", @progbits .section .init.text, "ax", @progbits
ENTRY(lguest_entry) ENTRY(lguest_entry)
/* /*
* We make the "initialization" hypercall now to tell the Host about * We make the "initialization" hypercall now to tell the Host where
* us, and also find out where it put our page tables. * our lguest_data struct is.
*/ */
movl $LHCALL_LGUEST_INIT, %eax movl $LHCALL_LGUEST_INIT, %eax
movl $lguest_data - __PAGE_OFFSET, %ebx movl $lguest_data - __PAGE_OFFSET, %ebx
int $LGUEST_TRAP_ENTRY int $LGUEST_TRAP_ENTRY
/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
movl $LHCALL_NEW_PGTABLE, %eax
movl $(initial_page_table - __PAGE_OFFSET), %ebx
int $LGUEST_TRAP_ENTRY
/* Set up the initial stack so we can run C code. */ /* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp movl $(init_thread_union+THREAD_SIZE),%esp
...@@ -96,12 +105,8 @@ send_interrupts: ...@@ -96,12 +105,8 @@ send_interrupts:
*/ */
pushl %eax pushl %eax
movl $LHCALL_SEND_INTERRUPTS, %eax movl $LHCALL_SEND_INTERRUPTS, %eax
/* /* This is the actual hypercall trap. */
* This is a vmcall instruction (same thing that KVM uses). Older int $LGUEST_TRAP_ENTRY
* assembler versions might not know the "vmcall" instruction, so we
* create one manually here.
*/
.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
/* Put eax back the way we found it. */ /* Put eax back the way we found it. */
popl %eax popl %eax
ret ret
......
...@@ -117,7 +117,7 @@ static __init int map_switcher(void) ...@@ -117,7 +117,7 @@ static __init int map_switcher(void)
/* /*
* Now the Switcher is mapped at the right address, we can't fail! * Now the Switcher is mapped at the right address, we can't fail!
* Copy in the compiled-in Switcher code (from <arch>_switcher.S). * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
*/ */
memcpy(switcher_vma->addr, start_switcher_text, memcpy(switcher_vma->addr, start_switcher_text,
end_switcher_text - start_switcher_text); end_switcher_text - start_switcher_text);
......
...@@ -375,11 +375,9 @@ static bool direct_trap(unsigned int num) ...@@ -375,11 +375,9 @@ static bool direct_trap(unsigned int num)
/* /*
* The Host needs to see page faults (for shadow paging and to save the * The Host needs to see page faults (for shadow paging and to save the
* fault address), general protection faults (in/out emulation) and * fault address), general protection faults (in/out emulation) and
* device not available (TS handling), invalid opcode fault (kvm hcall), * device not available (TS handling) and of course, the hypercall trap.
* and of course, the hypercall trap.
*/ */
return num != 14 && num != 13 && num != 7 && return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY;
num != 6 && num != LGUEST_TRAP_ENTRY;
} }
/*:*/ /*:*/
...@@ -429,8 +427,8 @@ void pin_stack_pages(struct lg_cpu *cpu) ...@@ -429,8 +427,8 @@ void pin_stack_pages(struct lg_cpu *cpu)
/* /*
* Direct traps also mean that we need to know whenever the Guest wants to use * Direct traps also mean that we need to know whenever the Guest wants to use
* a different kernel stack, so we can change the IDT entries to use that * a different kernel stack, so we can change the guest TSS to use that
* stack. The IDT entries expect a virtual address, so unlike most addresses * stack. The TSS entries expect a virtual address, so unlike most addresses
* the Guest gives us, the "esp" (stack pointer) value here is virtual, not * the Guest gives us, the "esp" (stack pointer) value here is virtual, not
* physical. * physical.
* *
......
...@@ -59,6 +59,8 @@ struct lg_cpu { ...@@ -59,6 +59,8 @@ struct lg_cpu {
struct lguest_pages *last_pages; struct lguest_pages *last_pages;
/* Initialization mode: linear map everything. */
bool linear_pages;
int cpu_pgd; /* Which pgd this cpu is currently using */ int cpu_pgd; /* Which pgd this cpu is currently using */
/* If a hypercall was asked for, this points to the arguments. */ /* If a hypercall was asked for, this points to the arguments. */
......
...@@ -108,6 +108,17 @@ static u32 lg_get_features(struct virtio_device *vdev) ...@@ -108,6 +108,17 @@ static u32 lg_get_features(struct virtio_device *vdev)
return features; return features;
} }
/*
* To notify on reset or feature finalization, we (ab)use the NOTIFY
* hypercall, with the descriptor address of the device.
*/
static void status_notify(struct virtio_device *vdev)
{
unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
}
/* /*
* The virtio core takes the features the Host offers, and copies the ones * The virtio core takes the features the Host offers, and copies the ones
* supported by the driver into the vdev->features array. Once that's all * supported by the driver into the vdev->features array. Once that's all
...@@ -135,6 +146,9 @@ static void lg_finalize_features(struct virtio_device *vdev) ...@@ -135,6 +146,9 @@ static void lg_finalize_features(struct virtio_device *vdev)
if (test_bit(i, vdev->features)) if (test_bit(i, vdev->features))
out_features[i / 8] |= (1 << (i % 8)); out_features[i / 8] |= (1 << (i % 8));
} }
/* Tell Host we've finished with this device's feature negotiation */
status_notify(vdev);
} }
/* Once they've found a field, getting a copy of it is easy. */ /* Once they've found a field, getting a copy of it is easy. */
...@@ -168,28 +182,21 @@ static u8 lg_get_status(struct virtio_device *vdev) ...@@ -168,28 +182,21 @@ static u8 lg_get_status(struct virtio_device *vdev)
return to_lgdev(vdev)->desc->status; return to_lgdev(vdev)->desc->status;
} }
/*
* To notify on status updates, we (ab)use the NOTIFY hypercall, with the
* descriptor address of the device. A zero status means "reset".
*/
static void set_status(struct virtio_device *vdev, u8 status)
{
unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
/* We set the status. */
to_lgdev(vdev)->desc->status = status;
hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
}
static void lg_set_status(struct virtio_device *vdev, u8 status) static void lg_set_status(struct virtio_device *vdev, u8 status)
{ {
BUG_ON(!status); BUG_ON(!status);
set_status(vdev, status); to_lgdev(vdev)->desc->status = status;
/* Tell Host immediately if we failed. */
if (status & VIRTIO_CONFIG_S_FAILED)
status_notify(vdev);
} }
static void lg_reset(struct virtio_device *vdev) static void lg_reset(struct virtio_device *vdev)
{ {
set_status(vdev, 0); /* 0 status means "reset" */
to_lgdev(vdev)->desc->status = 0;
status_notify(vdev);
} }
/* /*
......
/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher /*P:200 This contains all the /dev/lguest code, whereby the userspace
* controls and communicates with the Guest. For example, the first write will * launcher controls and communicates with the Guest. For example,
* tell us the Guest's memory layout and entry point. A read will run the * the first write will tell us the Guest's memory layout and entry
* Guest until something happens, such as a signal or the Guest doing a NOTIFY * point. A read will run the Guest until something happens, such as
* out to the Launcher. * a signal or the Guest doing a NOTIFY out to the Launcher. There is
* also a way for the Launcher to attach eventfds to particular NOTIFY
* values instead of returning from the read() call.
:*/ :*/
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/miscdevice.h> #include <linux/miscdevice.h>
...@@ -357,8 +359,8 @@ static int initialize(struct file *file, const unsigned long __user *input) ...@@ -357,8 +359,8 @@ static int initialize(struct file *file, const unsigned long __user *input)
goto free_eventfds; goto free_eventfds;
/* /*
* Initialize the Guest's shadow page tables, using the toplevel * Initialize the Guest's shadow page tables. This allocates
* address the Launcher gave us. This allocates memory, so can fail. * memory, so can fail.
*/ */
err = init_guest_pagetable(lg); err = init_guest_pagetable(lg);
if (err) if (err)
...@@ -516,6 +518,7 @@ static const struct file_operations lguest_fops = { ...@@ -516,6 +518,7 @@ static const struct file_operations lguest_fops = {
.read = read, .read = read,
.llseek = default_llseek, .llseek = default_llseek,
}; };
/*:*/
/* /*
* This is a textbook example of a "misc" character device. Populate a "struct * This is a textbook example of a "misc" character device. Populate a "struct
......
This diff is collapsed.
...@@ -269,10 +269,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) ...@@ -269,10 +269,10 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
static int emulate_insn(struct lg_cpu *cpu) static int emulate_insn(struct lg_cpu *cpu)
{ {
u8 insn; u8 insn;
unsigned int insnlen = 0, in = 0, shift = 0; unsigned int insnlen = 0, in = 0, small_operand = 0;
/* /*
* The eip contains the *virtual* address of the Guest's instruction: * The eip contains the *virtual* address of the Guest's instruction:
* guest_pa just subtracts the Guest's page_offset. * walk the Guest's page tables to find the "physical" address.
*/ */
unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
...@@ -300,11 +300,10 @@ static int emulate_insn(struct lg_cpu *cpu) ...@@ -300,11 +300,10 @@ static int emulate_insn(struct lg_cpu *cpu)
} }
/* /*
* 0x66 is an "operand prefix". It means it's using the upper 16 bits * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
* of the eax register.
*/ */
if (insn == 0x66) { if (insn == 0x66) {
shift = 16; small_operand = 1;
/* The instruction is 1 byte so far, read the next byte. */ /* The instruction is 1 byte so far, read the next byte. */
insnlen = 1; insnlen = 1;
insn = lgread(cpu, physaddr + insnlen, u8); insn = lgread(cpu, physaddr + insnlen, u8);
...@@ -340,11 +339,14 @@ static int emulate_insn(struct lg_cpu *cpu) ...@@ -340,11 +339,14 @@ static int emulate_insn(struct lg_cpu *cpu)
* traditionally means "there's nothing there". * traditionally means "there's nothing there".
*/ */
if (in) { if (in) {
/* Lower bit tells is whether it's a 16 or 32 bit access */ /* Lower bit set means it's a 32/16 bit access */
if (insn & 0x1) if (insn & 0x1) {
cpu->regs->eax = 0xFFFFFFFF; if (small_operand)
cpu->regs->eax |= 0xFFFF;
else else
cpu->regs->eax |= (0xFFFF << shift); cpu->regs->eax = 0xFFFFFFFF;
} else
cpu->regs->eax |= 0xFF;
} }
/* Finally, we've "done" the instruction, so move past it. */ /* Finally, we've "done" the instruction, so move past it. */
cpu->regs->eip += insnlen; cpu->regs->eip += insnlen;
...@@ -352,69 +354,6 @@ static int emulate_insn(struct lg_cpu *cpu) ...@@ -352,69 +354,6 @@ static int emulate_insn(struct lg_cpu *cpu)
return 1; return 1;
} }
/*
* Our hypercalls mechanism used to be based on direct software interrupts.
* After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to
* change over to using kvm hypercalls.
*
* KVM_HYPERCALL is actually a "vmcall" instruction, which generates an invalid
* opcode fault (fault 6) on non-VT cpus, so the easiest solution seemed to be
* an *emulation approach*: if the fault was really produced by an hypercall
* (is_hypercall() does exactly this check), we can just call the corresponding
* hypercall host implementation function.
*
* But these invalid opcode faults are notably slower than software interrupts.
* So we implemented the *patching (or rewriting) approach*: every time we hit
* the KVM_HYPERCALL opcode in Guest code, we patch it to the old "int 0x1f"
* opcode, so next time the Guest calls this hypercall it will use the
* faster trap mechanism.
*
* Matias even benchmarked it to convince you: this shows the average cycle
* cost of a hypercall. For each alternative solution mentioned above we've
* made 5 runs of the benchmark:
*
* 1) direct software interrupt: 2915, 2789, 2764, 2721, 2898
* 2) emulation technique: 3410, 3681, 3466, 3392, 3780
* 3) patching (rewrite) technique: 2977, 2975, 2891, 2637, 2884
*
* One two-line function is worth a 20% hypercall speed boost!
*/
static void rewrite_hypercall(struct lg_cpu *cpu)
{
/*
* This are the opcodes we use to patch the Guest. The opcode for "int
* $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we
* complete the sequence with a NOP (0x90).
*/
u8 insn[3] = {0xcd, 0x1f, 0x90};
__lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn));
/*
* The above write might have caused a copy of that page to be made
* (if it was read-only). We need to make sure the Guest has
* up-to-date pagetables. As this doesn't happen often, we can just
* drop them all.
*/
guest_pagetable_clear_all(cpu);
}
static bool is_hypercall(struct lg_cpu *cpu)
{
u8 insn[3];
/*
* This must be the Guest kernel trying to do something.
* The bottom two bits of the CS segment register are the privilege
* level.
*/
if ((cpu->regs->cs & 3) != GUEST_PL)
return false;
/* Is it a vmcall? */
__lgread(cpu, insn, guest_pa(cpu, cpu->regs->eip), sizeof(insn));
return insn[0] == 0x0f && insn[1] == 0x01 && insn[2] == 0xc1;
}
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
void lguest_arch_handle_trap(struct lg_cpu *cpu) void lguest_arch_handle_trap(struct lg_cpu *cpu)
{ {
...@@ -429,20 +368,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) ...@@ -429,20 +368,6 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
if (emulate_insn(cpu)) if (emulate_insn(cpu))
return; return;
} }
/*
* If KVM is active, the vmcall instruction triggers a General
* Protection Fault. Normally it triggers an invalid opcode
* fault (6):
*/
case 6:
/*
* We need to check if ring == GUEST_PL and faulting
* instruction == vmcall.
*/
if (is_hypercall(cpu)) {
rewrite_hypercall(cpu);
return;
}
break; break;
case 14: /* We've intercepted a Page Fault. */ case 14: /* We've intercepted a Page Fault. */
/* /*
...@@ -486,7 +411,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) ...@@ -486,7 +411,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
* These values mean a real interrupt occurred, in which case * These values mean a real interrupt occurred, in which case
* the Host handler has already been run. We just do a * the Host handler has already been run. We just do a
* friendly check if another process should now be run, then * friendly check if another process should now be run, then
* return to run the Guest again * return to run the Guest again.
*/ */
cond_resched(); cond_resched();
return; return;
...@@ -536,7 +461,7 @@ void __init lguest_arch_host_init(void) ...@@ -536,7 +461,7 @@ void __init lguest_arch_host_init(void)
int i; int i;
/* /*
* Most of the i386/switcher.S doesn't care that it's been moved; on * Most of the x86/switcher_32.S doesn't care that it's been moved; on
* Intel, jumps are relative, and it doesn't access any references to * Intel, jumps are relative, and it doesn't access any references to
* external code or data. * external code or data.
* *
...@@ -664,7 +589,7 @@ void __init lguest_arch_host_init(void) ...@@ -664,7 +589,7 @@ void __init lguest_arch_host_init(void)
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
} }
put_online_cpus(); put_online_cpus();
}; }
/*:*/ /*:*/
void __exit lguest_arch_host_fini(void) void __exit lguest_arch_host_fini(void)
...@@ -747,8 +672,6 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) ...@@ -747,8 +672,6 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
/*:*/ /*:*/
/*L:030 /*L:030
* lguest_arch_setup_regs()
*
* Most of the Guest's registers are left alone: we used get_zeroed_page() to * Most of the Guest's registers are left alone: we used get_zeroed_page() to
* allocate the structure, so they will be 0. * allocate the structure, so they will be 0.
*/ */
......
...@@ -59,8 +59,6 @@ struct lguest_data { ...@@ -59,8 +59,6 @@ struct lguest_data {
unsigned long reserve_mem; unsigned long reserve_mem;
/* KHz for the TSC clock. */ /* KHz for the TSC clock. */
u32 tsc_khz; u32 tsc_khz;
/* Page where the top-level pagetable is */
unsigned long pgdir;
/* Fields initialized by the Guest at boot: */ /* Fields initialized by the Guest at boot: */
/* Instruction range to suppress interrupts even if enabled */ /* Instruction range to suppress interrupts even if enabled */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment