Commit bff672e6 authored by Rusty Russell, committed by Linus Torvalds

lguest: documentation V: Host

Documentation: The Host
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent dde79789
@@ -28,37 +28,63 @@
#include <irq_vectors.h>
#include "lg.h"
/*H:120 This is the core hypercall routine: where the Guest gets what it
* wants. Or gets killed. Or, in the case of LHCALL_CRASH, both.
*
* Remember from the Guest: %eax == which call to make, and the arguments are
* packed into %edx, %ebx and %ecx if needed. */
static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
{
switch (regs->eax) {
case LHCALL_FLUSH_ASYNC:
/* This call does nothing, except by breaking out of the Guest
* it makes us process all the asynchronous hypercalls. */
break;
case LHCALL_LGUEST_INIT:
/* You can't get here unless you're already initialized. Don't
* do that. */
kill_guest(lg, "already have lguest_data");
break;
case LHCALL_CRASH: {
/* Crash is such a trivial hypercall that we do it in four
* lines right here. */
char msg[128];
/* If the lgread fails, it will call kill_guest() itself; the
* kill_guest() with the message will be ignored. */
lgread(lg, msg, regs->edx, sizeof(msg));
msg[sizeof(msg)-1] = '\0';
kill_guest(lg, "CRASH: %s", msg);
break;
}
case LHCALL_FLUSH_TLB:
/* FLUSH_TLB comes in two flavors, depending on the
* argument: */
if (regs->edx)
guest_pagetable_clear_all(lg);
else
guest_pagetable_flush_user(lg);
break;
case LHCALL_GET_WALLCLOCK: {
/* The Guest wants to know the real time in seconds since 1970,
* in good Unix tradition. */
struct timespec ts;
ktime_get_real_ts(&ts);
regs->eax = ts.tv_sec;
break;
}
case LHCALL_BIND_DMA:
/* BIND_DMA really wants four arguments, but it's the only call
* which does. So the Guest packs the number of buffers and
* the interrupt number into the final argument, and we decode
* it here. This can legitimately fail, since we currently
* place a limit on the number of DMA pools a Guest can have.
* So we return true or false from this call. */
regs->eax = bind_dma(lg, regs->edx, regs->ebx,
regs->ecx >> 8, regs->ecx & 0xFF);
break;
/* All these calls simply pass the arguments through to the right
* routines. */
case LHCALL_SEND_DMA:
send_dma(lg, regs->edx, regs->ebx);
break;
@@ -86,10 +112,13 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
case LHCALL_SET_CLOCKEVENT:
guest_set_clockevent(lg, regs->edx);
break;
case LHCALL_TS:
/* This sets the TS flag, as we saw used in run_guest(). */
lg->ts = regs->edx;
break;
case LHCALL_HALT:
/* Similarly, this sets the halted flag for run_guest(). */
lg->halted = 1;
break;
default:
@@ -97,25 +126,42 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
}
}
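For reference, the Guest side of a normal hypercall is believed to look roughly like the sketch below (a simplification, not a copy of the routine from the Guest chapters): the call number goes in %eax, the arguments in %edx, %ebx and %ecx, and the "int" instruction raises LGUEST_TRAP_ENTRY, the trap which do_hcall() above ends up handling. Any return value comes back in %eax.

static unsigned long hcall(unsigned long call,
			   unsigned long arg1, unsigned long arg2,
			   unsigned long arg3)
{
	/* The trap drops us into the Host; do_hcall() reads these registers
	 * and may write a result back into %eax. */
	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
		     : "=a"(call)
		     : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
		     : "memory");
	return call;
}

So the LHCALL_BIND_DMA case above would have been issued as something like hcall(LHCALL_BIND_DMA, key, dmas, (num << 8) | irq) (argument names illustrative): the buffer count and interrupt number share the final register exactly as do_hcall() unpacks them.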
/* We always do queued calls before the actual hypercall. */
/* Asynchronous hypercalls are easy: we just look in the array in the Guest's
* "struct lguest_data" and see if there are any new ones marked "ready".
*
* We are careful to do these in order: obviously we respect the order the
* Guest put them in the ring, but we also promise the Guest that they will
* happen before any normal hypercall (which is why we check this before
* checking for a normal hcall). */
static void do_async_hcalls(struct lguest *lg)
{
unsigned int i;
u8 st[LHCALL_RING_SIZE];
/* For simplicity, we copy the entire call status array in at once. */
if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
return;
/* We process "struct lguest_data"s hcalls[] ring once. */
for (i = 0; i < ARRAY_SIZE(st); i++) {
struct lguest_regs regs;
/* We remember where we were up to from last time. This makes
* sure that the hypercalls are done in the order the Guest
* places them in the ring. */
unsigned int n = lg->next_hcall;
/* 0xFF means there's no call here (yet). */
if (st[n] == 0xFF)
break;
/* OK, we have a hypercall. Increment the "next_hcall" cursor,
* and wrap back to 0 if we reach the end. */
if (++lg->next_hcall == LHCALL_RING_SIZE)
lg->next_hcall = 0;
/* We copy the hypercall arguments into a fake register
* structure. This makes life simple for do_hcall(). */
if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax)
|| get_user(regs.edx, &lg->lguest_data->hcalls[n].edx)
|| get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx)
@@ -124,74 +170,126 @@ static void do_async_hcalls(struct lguest *lg)
break;
}
/* Do the hypercall, same as a normal one. */
do_hcall(lg, &regs);
/* Mark the hypercall done. */
if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
kill_guest(lg, "Writing result for async hypercall");
break;
}
/* Stop doing hypercalls if we've just done a DMA to the
* Launcher: it needs to service this first. */
if (lg->dma_is_pending)
break;
}
}
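For completeness, the Guest side of this ring is believed to look something like the sketch below: drop the call into the next free slot of lguest_data.hcalls[], then mark the matching hcall_status[] byte ready (anything other than 0xFF), and the next trap into the Host flushes the whole ring. The locking and barrier details here are illustrative, not authoritative.

static void async_hcall(unsigned long call, unsigned long arg1,
			unsigned long arg2, unsigned long arg3)
{
	/* There's only one ring, so keep a single cursor and don't let an
	 * interrupt handler race us for it. */
	static unsigned int next_call;
	unsigned long flags;

	local_irq_save(flags);
	if (lguest_data.hcall_status[next_call] != 0xFF) {
		/* Ring full: fall back to a normal hypercall, which also
		 * makes the Host drain the ring first. */
		hcall(call, arg1, arg2, arg3);
	} else {
		lguest_data.hcalls[next_call].eax = call;
		lguest_data.hcalls[next_call].edx = arg1;
		lguest_data.hcalls[next_call].ebx = arg2;
		lguest_data.hcalls[next_call].ecx = arg3;
		/* The arguments must be visible before the "ready" mark. */
		wmb();
		lguest_data.hcall_status[next_call] = 0;
		if (++next_call == LHCALL_RING_SIZE)
			next_call = 0;
	}
	local_irq_restore(flags);
}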
/* Last of all, we look at what happens first of all. The very first time the
* Guest makes a hypercall, we end up here to set things up: */
static void initialize(struct lguest *lg)
{
u32 tsc_speed;
/* You can't do anything until you're initialized. The Guest knows the
* rules, so we're unforgiving here. */
if (lg->regs->eax != LHCALL_LGUEST_INIT) {
kill_guest(lg, "hypercall %li before LGUEST_INIT",
lg->regs->eax);
return;
}
/* We only tell the guest to use the TSC if it's reliable. */
/* We insist that the Time Stamp Counter exist and doesn't change with
* cpu frequency. Some devious chip manufacturers decided that TSC
* changes could be handled in software. I decided that time going
* backwards might be good for benchmarks, but it's bad for users.
*
* We also insist that the TSC be stable: the kernel detects unreliable
* TSCs for its own purposes, and we use that here. */
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
tsc_speed = tsc_khz;
else
tsc_speed = 0;
/* The pointer to the Guest's "struct lguest_data" is the only
* argument. */
lg->lguest_data = (struct lguest_data __user *)lg->regs->edx;
/* We check here so we can simply copy_to_user/from_user */
/* If we check the address they gave is OK now, we can simply
* copy_to_user/from_user from now on rather than using lgread/lgwrite.
* I put this in to show that I'm not immune to writing stupid
* optimizations. */
if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) {
kill_guest(lg, "bad guest page %p", lg->lguest_data);
return;
}
/* The Guest tells us where we're not to deliver interrupts by putting
* the range of addresses into "struct lguest_data". */
if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
|| get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
/* We reserve the top pgd entry. */
/* We tell the Guest that it can't use the top 4MB of virtual
* addresses used by the Switcher. */
|| put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
|| put_user(tsc_speed, &lg->lguest_data->tsc_khz)
/* We also give the Guest a unique id, as used in lguest_net.c. */
|| put_user(lg->guestid, &lg->lguest_data->guestid))
kill_guest(lg, "bad guest page %p", lg->lguest_data);
/* This is the one case where the above accesses might have been the
* first write to a Guest page. This may have caused a copy-on-write
* fault, but the Guest might be referring to the old (read-only)
* page. */
guest_pagetable_clear_all(lg);
}
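Pulling together the accesses made in this file, the parts of "struct lguest_data" the Host touches look roughly like the sketch below. This only reflects what do_async_hcalls() and initialize() imply; the field order and exact types are guesses, and the real declaration lives in include/linux/lguest.h.

struct lguest_data_sketch {
	/* The asynchronous hypercall ring: 0xFF means the slot is free (or
	 * done), anything else means it's ready for the Host. */
	u8 hcall_status[LHCALL_RING_SIZE];
	struct { u32 eax, edx, ebx, ecx; } hcalls[LHCALL_RING_SIZE];

	/* Written by the Guest, read in initialize(): the address range
	 * where interrupts must not be delivered. */
	u32 noirq_start, noirq_end;

	/* Written by initialize() for the Guest to read. */
	u32 reserve_mem;	/* top of virtual space reserved (4MB here) */
	u32 tsc_khz;		/* 0 if the TSC can't be trusted */
	u16 guestid;		/* unique id, used by lguest_net.c */
};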
/* Now we've examined the hypercall code; our Guest can make requests. There
* is one other way we can do things for the Guest, as we see in
* emulate_insn(). */
/* Even if we go out to userspace and come back, we don't want to do
* the hypercall again. */
/*H:110 Tricky point: we mark the hypercall as "done" once we've done it.
* Normally we don't need to do this: the Guest will run again and update the
* trap number before we come back around the run_guest() loop to
* do_hypercalls().
*
* However, if we are signalled or the Guest sends DMA to the Launcher, that
* loop will exit without running the Guest. When it comes back it would try
* to re-run the hypercall. */
static void clear_hcall(struct lguest *lg)
{
lg->regs->trapnum = 255;
}
/*H:100
* Hypercalls
*
* Remember from the Guest, hypercalls come in two flavors: normal and
* asynchronous. This file handles both types.
*/
void do_hypercalls(struct lguest *lg)
{
/* Not initialized yet? */
if (unlikely(!lg->lguest_data)) {
/* Did the Guest make a hypercall? We might have come back for
* some other reason (an interrupt, a different trap). */
if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
/* Set up the "struct lguest_data" */
initialize(lg);
/* The hypercall is done. */
clear_hcall(lg);
}
return;
}
/* The Guest has initialized.
*
* Look in the hypercall ring for the async hypercalls: */
do_async_hcalls(lg);
/* If we stopped reading the hypercall ring because the Guest did a
* SEND_DMA to the Launcher, we want to return now. Otherwise if the
* Guest asked us to do a hypercall, we do it. */
if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
do_hcall(lg, lg->regs);
/* The hypercall is done. */
clear_hcall(lg);
}
}
@@ -58,9 +58,18 @@ struct lguest_dma_info
u8 interrupt; /* 0 when not registered */
};
/* We have separate types for the guest's ptes & pgds and the shadow ptes &
* pgds. Since this host might use three-level pagetables and the guest and
* shadow pagetables don't, we can't use the normal pte_t/pgd_t. */
/*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He
* reviewed the original code which used "u32" for all page table entries, and
* insisted that it would be far clearer with explicit typing. I thought it
* was overkill, but he was right: it is much clearer than it was before.
*
* We have separate types for the Guest's ptes & pgds and the shadow ptes &
* pgds. There's already a Linux type for these (pte_t and pgd_t) but they
* change depending on kernel config options (PAE). */
/* Each entry is identical: lower 12 bits of flags and upper 20 bits for the
* "page frame number" (0 == first physical page, etc). They are different
* types so the compiler will warn us if we mix them improperly. */
typedef union {
struct { unsigned flags:12, pfn:20; };
struct { unsigned long val; } raw;
@@ -77,8 +86,12 @@ typedef union {
struct { unsigned flags:12, pfn:20; };
struct { unsigned long val; } raw;
} gpte_t;
/* We have two convenient macros to convert a "raw" value as handed to us by
* the Guest into the correct Guest PGD or PTE type. */
#define mkgpte(_val) ((gpte_t){.raw.val = _val})
#define mkgpgd(_val) ((gpgd_t){.raw.val = _val})
/*:*/
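As a worked example (not in the original source): the raw Guest pte value 0x001FF067 splits into flags = 0x067 in the low 12 bits (on x86 that's PRESENT | RW | USER | ACCESSED | DIRTY) and pfn = 0x1FF in the upper 20 bits, i.e. physical page 0x1FF.

	gpte_t gpte = mkgpte(0x001FF067);
	/* gpte.flags == 0x067, gpte.pfn == 0x1FF, gpte.raw.val == 0x001FF067.
	 * (GCC on x86 allocates bitfields from the least significant bit up,
	 * which is what this union quietly relies on.) */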
struct pgdir
{
@@ -11,17 +11,58 @@
* from frolicking through its parklike serenity. :*/
#include "lg.h"
/*H:600
* We've almost completed the Host; there's just one file to go!
*
* Segments & The Global Descriptor Table
*
* (That title sounds like a bad Nerdcore group. Not to suggest that there are
* any good Nerdcore groups, but in high school a friend of mine had a band
* called Joe Fish and the Chips, so there are definitely worse band names).
*
* To refresh: the GDT is a table of 8-byte values describing segments. Once
* set up, these segments can be loaded into one of the 6 "segment registers".
*
* GDT entries are passed around as "struct desc_struct"s, which like IDT
* entries are split into two 32-bit members, "a" and "b". One day, someone
* will clean that up, and be declared a Hero. (No pressure, I'm just saying).
*
* Anyway, the GDT entry contains a base (the start address of the segment), a
* limit (the size of the segment - 1), and some flags. Sounds simple, and it
* would be, except those zany Intel engineers decided that it was too boring
* to put the base at one end, the limit at the other, and the flags in
* between. They decided to shotgun the bits at random throughout the 8 bytes,
* like so:
*
* 0               16             40      48  52  56   63
* [ limit part 1 ][ base part 1 ][ flags ][li][fl][base ]
*                                          mit ags part 2
*                                          part 2
*
* As a result, this file contains a certain amount of magic numeracy. Let's
* begin.
*/
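To make the bit-scatter concrete, here is a self-contained sketch. pack_desc() is a hypothetical helper, not part of lguest; it packs a flat 0-4GB data segment exactly as the diagram above describes and checks the result against the conventional flat-segment encoding (0x0000ffff / 0x00cf9300), which is presumably along the lines of the FULL_SEGMENT macros used later in this file.

#include <assert.h>
#include <stdint.h>

static void pack_desc(uint32_t base, uint32_t limit, uint8_t access,
		      uint8_t gran, uint32_t *a, uint32_t *b)
{
	/* "a": limit 15:0, then base 15:0. */
	*a = (limit & 0xFFFF) | ((base & 0xFFFF) << 16);
	/* "b": base 23:16, access byte, limit 19:16, flag nibble, base 31:24. */
	*b = ((base >> 16) & 0xFF) | ((uint32_t)access << 8)
	     | (limit & 0xF0000) | ((uint32_t)(gran & 0xF) << 20)
	     | (base & 0xFF000000);
}

int main(void)
{
	uint32_t a, b;

	/* Flat 4GB data segment: base 0, limit 0xFFFFF pages,
	 * access 0x93 (present, DPL 0, data, read/write, accessed),
	 * flags 0xC (4K granularity, 32-bit). */
	pack_desc(0, 0xFFFFF, 0x93, 0xC, &a, &b);
	assert(a == 0x0000ffff && b == 0x00cf9300);
	return 0;
}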
/* Is the descriptor the Guest wants us to put in OK?
*
* The flag which Intel says must be zero: must be zero. The descriptor must
* be present (this is actually checked earlier, but is here for thoroughness),
* and the descriptor type must be 1 (a memory segment). */
static int desc_ok(const struct desc_struct *gdt)
{
/* MBZ=0, P=1, DT=1 */
return ((gdt->b & 0x00209000) == 0x00009000);
}
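A worked decode of that magic number, assuming the standard x86 layout of the high word "b":

	/* 0x00209000 selects three bits of "b":
	 *     bit 21 - the reserved bit Intel says must be zero,
	 *     bit 15 - P, the present bit,
	 *     bit 12 - S, 1 for a code/data (memory) segment.
	 * (b & 0x00209000) == 0x00009000 therefore means bit 21 == 0,
	 * P == 1 and S == 1: exactly the "MBZ=0, P=1, DT=1" rule above. */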
/* Is the segment present? (Otherwise it can't be used by the Guest). */
static int segment_present(const struct desc_struct *gdt)
{
return gdt->b & 0x8000;
}
/* There are several entries we don't let the Guest set. The TSS entry is the
* "Task State Segment" which controls all kinds of delicate things. The
* LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
* Guest can't be trusted to deal with double faults. */
static int ignored_gdt(unsigned int num)
{
return (num == GDT_ENTRY_TSS
@@ -30,9 +71,18 @@ static int ignored_gdt(unsigned int num)
|| num == GDT_ENTRY_DOUBLEFAULT_TSS);
}
/* We don't allow removal of CS, DS or SS; it doesn't make sense. */
/* If the Guest asks us to remove an entry from the GDT, we have to be careful.
* If one of the segment registers is pointing at that entry the Switcher will
* crash when it tries to reload the segment registers for the Guest.
*
* It doesn't make much sense for the Guest to try to remove its own code, data
* or stack segments while they're in use: assume that's a Guest bug. If it's
* one of the lesser segment registers using the removed entry, we simply set
* that register to 0 (unusable). */
static void check_segment_use(struct lguest *lg, unsigned int desc)
{
/* GDT entries are 8 bytes long, so we divide to get the index and
* ignore the bottom bits. */
if (lg->regs->gs / 8 == desc)
lg->regs->gs = 0;
if (lg->regs->fs / 8 == desc)
@@ -45,12 +95,16 @@ static void check_segment_use(struct lguest *lg, unsigned int desc)
kill_guest(lg, "Removed live GDT entry %u", desc);
}
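A worked example of that division (the selector value is illustrative, not from the original file):

	/* Say %gs holds the selector 0x61 (binary 1100001):
	 *     index = 0x61 / 8 = 12, TI = 0 (GDT, not LDT), RPL = 1.
	 * The bottom three bits (TI and RPL) are exactly what the division
	 * throws away; if entry 12 were the one being removed, %gs would be
	 * set to 0 here. */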
/*H:610 Once the GDT has been changed, we look through the changed entries and
* see if they're OK. If not, we'll call kill_guest() and the Guest will never
* get to use the invalid entries. */
static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
{
unsigned int i;
for (i = start; i < end; i++) {
/* We never copy these ones to real gdt */
/* We never copy these ones to the real GDT, so we don't care what
* they say. */
if (ignored_gdt(i))
continue;
@@ -64,41 +118,57 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
if (!desc_ok(&lg->gdt[i]))
kill_guest(lg, "Bad GDT descriptor %i", i);
/* DPL 0 presumably means "for use by guest". */
/* Segment descriptors contain a privilege level: the Guest is
* sometimes careless and leaves this as 0, even though it's
* running at privilege level 1. If so, we fix it here. */
if ((lg->gdt[i].b & 0x00006000) == 0)
lg->gdt[i].b |= (GUEST_PL << 13);
/* Set accessed bit, since gdt isn't writable. */
/* Each descriptor has an "accessed" bit. If we don't set it
* now, the CPU will try to set it when the Guest first loads
* that entry into a segment register. But the GDT isn't
* writable by the Guest, so bad things can happen. */
lg->gdt[i].b |= 0x00000100;
}
}
/* This routine is called at boot or modprobe time for each CPU to set up the
* "constant" GDT entries for Guests running on that CPU. */
void setup_default_gdt_entries(struct lguest_ro_state *state)
{
struct desc_struct *gdt = state->guest_gdt;
unsigned long tss = (unsigned long)&state->guest_tss;
/* Hypervisor segments. */
/* The hypervisor segments are full 0-4G segments, privilege level 0 */
gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
/* This is the one which we *cannot* copy from the Guest, since the TSS
depends on this lguest_ro_state, i.e. this CPU. */
/* The TSS segment refers to the TSS entry for this CPU, so we cannot
* copy it from the Guest. Forgive the magic flags. */
gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
| ((tss >> 16) & 0x000000FF);
}
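Those magic TSS numbers decode as follows (a sketch, assuming the usual x86 system-descriptor layout):

	/* a = 0x00000067 | (tss << 16):
	 *     limit 15:0 = 0x67 (the 104-byte hardware TSS, minus one),
	 *     base 15:0  = the low half of the tss address.
	 * b = 0x00008900 | (tss & 0xFF000000) | ((tss >> 16) & 0xFF):
	 *     access byte = 0x89 (present, DPL 0, type 9: available 32-bit
	 *     TSS), base 31:24 and base 23:16 supply the rest of the tss
	 *     address, and limit 19:16 plus the flag nibble stay 0 (byte
	 *     granularity is fine for such a small limit). */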
/* This routine is called before the Guest is run for the first time. */
void setup_guest_gdt(struct lguest *lg)
{
/* Start with full 0-4G segments... */
lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
/* ...except the Guest is allowed to use them, so set the privilege
* level appropriately in the flags. */
lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
}
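A quick note on the arithmetic above (GUEST_PL's value is assumed here, based on the earlier remark that the Guest runs at privilege level 1):

	/* The DPL field sits in bits 13-14 of "b", so
	 *     (GUEST_PL << 13) == (1 << 13) == 0x2000
	 * turns a ring-0 descriptor (DPL 00) into a ring-1 one (DPL 01)
	 * without disturbing any other bits. */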
/* This is a fast version for the common case where only the three TLS entries
* have changed. */
/* Like the IDT, we never simply use the GDT the Guest gives us. We set up the
* GDTs for each CPU, then we copy across the entries each time we want to run
* a different Guest on that CPU. */
/* A partial GDT load, for the three "thread-local storage" entries. Otherwise
* it's just like load_guest_gdt(): so much so, in fact, that it would probably
* be neater to have a single hypercall to cover both. */
void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
{
unsigned int i;
@@ -107,22 +177,31 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
gdt[i] = lg->gdt[i];
}
/* This is the full version */
void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
{
unsigned int i;
/* The default entries from setup_default_gdt_entries() are not
* replaced. See ignored_gdt() above. */
for (i = 0; i < GDT_ENTRIES; i++)
if (!ignored_gdt(i))
gdt[i] = lg->gdt[i];
}
/* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */
void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
{
/* We assume the Guest has the same number of GDT entries as the
* Host, otherwise we'd have to dynamically allocate the Guest GDT. */
if (num > ARRAY_SIZE(lg->gdt))
kill_guest(lg, "too many gdt entries %i", num);
/* We read the whole thing in, then fix it up. */
lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0]));
fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt));
/* Mark that the GDT changed so the core knows it has to copy it again,
* even if the Guest is run on the same CPU. */
lg->changed |= CHANGED_GDT;
}
@@ -134,3 +213,13 @@ void guest_load_tls(struct lguest *lg, unsigned long gtls)
fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
lg->changed |= CHANGED_GDT_TLS;
}
/*
* With this, we have finished the Host.
*
* Five of the seven parts of our task are complete. You have made it through
* the Bit of Despair (I think that's somewhere in the page table code,
* myself).
*
* Next, we examine "make Switcher". It's short, but intense.
*/