Commit d7e28ffe authored by Rusty Russell's avatar Rusty Russell Committed by Linus Torvalds

lguest: the host code

This is the code for the "lg.ko" module, which allows lguest guests to
be launched.

[akpm@linux-foundation.org: update for futex-new-private-futexes]
[akpm@linux-foundation.org: build fix]
[jmorris@namei.org: lguest: use hrtimers]
[akpm@linux-foundation.org: x86_64 build fix]
Signed-off-by: default avatarRusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 07ad157f
...@@ -27,6 +27,7 @@ static int tsc_enabled; ...@@ -27,6 +27,7 @@ static int tsc_enabled;
* an extra value to store the TSC freq * an extra value to store the TSC freq
*/ */
unsigned int tsc_khz; unsigned int tsc_khz;
EXPORT_SYMBOL_GPL(tsc_khz);
int tsc_disable; int tsc_disable;
...@@ -58,10 +59,11 @@ __setup("notsc", tsc_setup); ...@@ -58,10 +59,11 @@ __setup("notsc", tsc_setup);
*/ */
static int tsc_unstable; static int tsc_unstable;
static inline int check_tsc_unstable(void) int check_tsc_unstable(void)
{ {
return tsc_unstable; return tsc_unstable;
} }
EXPORT_SYMBOL_GPL(check_tsc_unstable);
/* Accellerators for sched_clock() /* Accellerators for sched_clock()
* convert from cycles(64bits) => nanoseconds (64bits) * convert from cycles(64bits) => nanoseconds (64bits)
......
...@@ -44,7 +44,7 @@ unsigned long long sched_clock(void) ...@@ -44,7 +44,7 @@ unsigned long long sched_clock(void)
static int tsc_unstable; static int tsc_unstable;
static inline int check_tsc_unstable(void) inline int check_tsc_unstable(void)
{ {
return tsc_unstable; return tsc_unstable;
} }
......
This diff is collapsed.
/* Actual hypercalls, which allow guests to actually do something.
Copyright (C) 2006 Rusty Russell IBM Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <irq_vectors.h>
#include "lg.h"
/* Dispatch a single hypercall from the Guest: the call number is in %eax,
 * arguments in %edx, %ebx and %ecx of the saved register set.  Results (if
 * any) are written back into regs->eax. */
static void do_hcall(struct lguest *lg, struct lguest_regs *regs)
{
	switch (regs->eax) {
	case LHCALL_FLUSH_ASYNC:
		/* Only exists to force a run of the async-hypercall ring,
		 * which happens before we get here: nothing more to do. */
		break;
	case LHCALL_LGUEST_INIT:
		/* The real LGUEST_INIT is handled in initialize(); getting
		 * here means the Guest sent it twice. */
		kill_guest(lg, "already have lguest_data");
		break;
	case LHCALL_CRASH: {
		/* Guest reports its own panic; fetch a bounded message. */
		char msg[128];
		lgread(lg, msg, regs->edx, sizeof(msg));
		/* Guarantee NUL termination before using it as a string. */
		msg[sizeof(msg)-1] = '\0';
		kill_guest(lg, "CRASH: %s", msg);
		break;
	}
	case LHCALL_FLUSH_TLB:
		/* Non-zero %edx requests a full flush, else user-only. */
		if (regs->edx)
			guest_pagetable_clear_all(lg);
		else
			guest_pagetable_flush_user(lg);
		break;
	case LHCALL_GET_WALLCLOCK: {
		/* Return current wallclock seconds in %eax. */
		struct timespec ts;
		ktime_get_real_ts(&ts);
		regs->eax = ts.tv_sec;
		break;
	}
	case LHCALL_BIND_DMA:
		/* %ecx packs the dma count (top 24 bits) and irq (low 8). */
		regs->eax = bind_dma(lg, regs->edx, regs->ebx,
				     regs->ecx >> 8, regs->ecx & 0xFF);
		break;
	case LHCALL_SEND_DMA:
		send_dma(lg, regs->edx, regs->ebx);
		break;
	case LHCALL_LOAD_GDT:
		load_guest_gdt(lg, regs->edx, regs->ebx);
		break;
	case LHCALL_LOAD_IDT_ENTRY:
		load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
		break;
	case LHCALL_NEW_PGTABLE:
		guest_new_pagetable(lg, regs->edx);
		break;
	case LHCALL_SET_STACK:
		guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
		break;
	case LHCALL_SET_PTE:
		guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx));
		break;
	case LHCALL_SET_PMD:
		guest_set_pmd(lg, regs->edx, regs->ebx);
		break;
	case LHCALL_LOAD_TLS:
		guest_load_tls(lg, regs->edx);
		break;
	case LHCALL_SET_CLOCKEVENT:
		guest_set_clockevent(lg, regs->edx);
		break;
	case LHCALL_TS:
		/* Guest tells us the state of its TS (task-switched) flag. */
		lg->ts = regs->edx;
		break;
	case LHCALL_HALT:
		/* Guest is idle: stop running it until an interrupt. */
		lg->halted = 1;
		break;
	default:
		kill_guest(lg, "Bad hypercall %li\n", regs->eax);
	}
}
/* We always do queued calls before actual hypercall. */
static void do_async_hcalls(struct lguest *lg)
{
	unsigned int i;
	u8 st[LHCALL_RING_SIZE];

	/* Snapshot the ring's status bytes from the Guest's shared page. */
	if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st)))
		return;

	/* At most one full pass over the ring per invocation. */
	for (i = 0; i < ARRAY_SIZE(st); i++) {
		struct lguest_regs regs;
		/* Consume in ring order, resuming where we last stopped. */
		unsigned int n = lg->next_hcall;

		/* 0xFF marks an empty slot: no more queued calls. */
		if (st[n] == 0xFF)
			break;

		/* Advance (with wrap) before doing the call, so a kill
		 * inside do_hcall() can't make us re-run this slot. */
		if (++lg->next_hcall == LHCALL_RING_SIZE)
			lg->next_hcall = 0;

		/* Build a fake register set carrying the call arguments. */
		if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax)
		    || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx)
		    || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx)
		    || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) {
			kill_guest(lg, "Fetching async hypercalls");
			break;
		}

		do_hcall(lg, &regs);

		/* Hand the slot back to the Guest. */
		if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) {
			kill_guest(lg, "Writing result for async hypercall");
			break;
		}

		/* A pending DMA means we must return to the Launcher now. */
		if (lg->dma_is_pending)
			break;
	}
}
/* The first hypercall must be LHCALL_LGUEST_INIT: it tells us where the
 * Guest's "struct lguest_data" page lives, and we fill in the fields the
 * Guest reads (reserved memory size, TSC speed, guest id). */
static void initialize(struct lguest *lg)
{
	u32 tsc_speed;

	if (lg->regs->eax != LHCALL_LGUEST_INIT) {
		kill_guest(lg, "hypercall %li before LGUEST_INIT",
			   lg->regs->eax);
		return;
	}

	/* We only tell the guest to use the TSC if it's reliable. */
	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
		tsc_speed = tsc_khz;
	else
		tsc_speed = 0;	/* 0 == "don't use the TSC" */

	/* %edx carries the Guest address of its lguest_data page. */
	lg->lguest_data = (struct lguest_data __user *)lg->regs->edx;
	/* We check here so we can simply copy_to_user/from_user */
	if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) {
		kill_guest(lg, "bad guest page %p", lg->lguest_data);
		return;
	}

	if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
	    || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
	    /* We reserve the top pgd entry. */
	    || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
	    || put_user(tsc_speed, &lg->lguest_data->tsc_khz)
	    || put_user(lg->guestid, &lg->lguest_data->guestid))
		kill_guest(lg, "bad guest page %p", lg->lguest_data);

	/* This is the one case where the above accesses might have
	 * been the first write to a Guest page. This may have caused
	 * a copy-on-write fault, but the Guest might be referring to
	 * the old (read-only) page. */
	guest_pagetable_clear_all(lg);
}
/* Even if we go out to userspace and come back, we don't want to do
 * the hypercall again. */
static void clear_hcall(struct lguest *lg)
{
	/* 255 is not a real trap number, so the LGUEST_TRAP_ENTRY checks
	 * in do_hypercalls() won't fire a second time. */
	lg->regs->trapnum = 255;
}
/* Entry point: run queued async hypercalls, then the trapped one (if any). */
void do_hypercalls(struct lguest *lg)
{
	/* Until LGUEST_INIT we have no lguest_data page to work with:
	 * the only thing we can accept is the initialization call. */
	if (unlikely(!lg->lguest_data)) {
		if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
			initialize(lg);
			clear_hcall(lg);
		}
		return;
	}

	do_async_hcalls(lg);
	/* If an async call left a DMA pending we must return to the
	 * Launcher first; the real hypercall waits until next time. */
	if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) {
		do_hcall(lg, lg->regs);
		clear_hcall(lg);
	}
}
#include <linux/uaccess.h>
#include "lg.h"
/* Reassemble the handler address from an IDT descriptor: the low 16 bits
 * live in the descriptor's first word, the high 16 bits in the second. */
static unsigned long idt_address(u32 lo, u32 hi)
{
	unsigned long low_half = lo & 0x0000FFFF;
	unsigned long high_half = hi & 0xFFFF0000;

	return high_half | low_half;
}
/* Extract the gate type nibble (bits 8..11 of the descriptor's high word). */
static int idt_type(u32 lo, u32 hi)
{
	return (hi & 0xF00) >> 8;
}
/* Non-zero iff the descriptor's "present" flag (bit 15 of the high word)
 * is set.  The raw bit is returned, not a normalized boolean. */
static int idt_present(u32 lo, u32 hi)
{
	const u32 present_bit = 1u << 15;

	return hi & present_bit;
}
/* Push a 32-bit value onto the Guest stack: *gstack is a Guest-physical
 * stack pointer, decremented before the write (x86 push semantics). */
static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
{
	*gstack -= 4;
	lgwrite_u32(lg, *gstack, val);
}
/* Redirect the Guest to the trap/interrupt handler described by IDT words
 * lo/hi: build the hardware-style interrupt frame on the Guest stack and
 * repoint the saved registers so the switcher "returns" into the handler.
 * has_err: push the saved error code as well (for traps that carry one). */
static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
{
	unsigned long gstack;
	u32 eflags, ss, irq_enable;

	/* If they want a ring change, we use new stack and push old ss/esp */
	if ((lg->regs->ss&0x3) != GUEST_PL) {
		gstack = guest_pa(lg, lg->esp1);
		ss = lg->ss1;
		push_guest_stack(lg, &gstack, lg->regs->ss);
		push_guest_stack(lg, &gstack, lg->regs->esp);
	} else {
		/* Same privilege level: keep using the current stack. */
		gstack = guest_pa(lg, lg->regs->esp);
		ss = lg->regs->ss;
	}

	/* We use IF bit in eflags to indicate whether irqs were disabled
	   (it's always 0, since irqs are enabled when guest is running). */
	eflags = lg->regs->eflags;
	if (get_user(irq_enable, &lg->lguest_data->irq_enabled))
		irq_enable = 0;
	eflags |= (irq_enable & X86_EFLAGS_IF);

	/* Standard iret frame: eflags, cs, eip (+ optional error code). */
	push_guest_stack(lg, &gstack, eflags);
	push_guest_stack(lg, &gstack, lg->regs->cs);
	push_guest_stack(lg, &gstack, lg->regs->eip);
	if (has_err)
		push_guest_stack(lg, &gstack, lg->regs->errcode);

	/* Change the real stack so switcher returns to trap handler */
	lg->regs->ss = ss;
	/* Convert back from Guest-physical to Guest-virtual. */
	lg->regs->esp = gstack + lg->page_offset;
	lg->regs->cs = (__KERNEL_CS|GUEST_PL);
	lg->regs->eip = idt_address(lo, hi);

	/* Disable interrupts for an interrupt gate. */
	if (idt_type(lo, hi) == 0xE)
		if (put_user(0, &lg->lguest_data->irq_enabled))
			kill_guest(lg, "Disabling interrupts");
}
/* Deliver one pending, unblocked virtual interrupt to the Guest, if it is
 * currently willing to take it. */
void maybe_do_interrupt(struct lguest *lg)
{
	unsigned int irq;
	DECLARE_BITMAP(blk, LGUEST_IRQS);
	struct desc_struct *idt;

	/* Nothing to do before the Guest has registered lguest_data. */
	if (!lg->lguest_data)
		return;

	/* Mask out any interrupts they have blocked. */
	if (copy_from_user(&blk, lg->lguest_data->blocked_interrupts,
			   sizeof(blk)))
		return;

	/* blk = pending & ~blocked; pick the lowest-numbered candidate. */
	bitmap_andnot(blk, lg->irqs_pending, blk, LGUEST_IRQS);
	irq = find_first_bit(blk, LGUEST_IRQS);
	if (irq >= LGUEST_IRQS)
		return;

	/* The Guest marks a code range where it must not be interrupted. */
	if (lg->regs->eip >= lg->noirq_start && lg->regs->eip < lg->noirq_end)
		return;

	/* If they're halted, we re-enable interrupts. */
	if (lg->halted) {
		/* Re-enable interrupts. */
		if (put_user(X86_EFLAGS_IF, &lg->lguest_data->irq_enabled))
			kill_guest(lg, "Re-enabling interrupts");
		lg->halted = 0;
	} else {
		/* Maybe they have interrupts disabled? */
		u32 irq_enabled;
		if (get_user(irq_enabled, &lg->lguest_data->irq_enabled))
			irq_enabled = 0;
		if (!irq_enabled)
			return;
	}

	/* Deliver via the Guest's own IDT entry, if one is installed;
	 * otherwise leave the bit pending for later. */
	idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq];
	if (idt_present(idt->a, idt->b)) {
		clear_bit(irq, lg->irqs_pending);
		/* Hardware interrupts never push an error code. */
		set_guest_interrupt(lg, idt->a, idt->b, 0);
	}
}
/* Does this x86 trap number push an error code onto the stack?
 * (Double fault, TSS/segment/stack/GP/page faults, alignment check.) */
static int has_err(unsigned int trap)
{
	switch (trap) {
	case 8:			/* double fault */
	case 10:		/* invalid TSS */
	case 11:		/* segment not present */
	case 12:		/* stack fault */
	case 13:		/* general protection */
	case 14:		/* page fault */
	case 17:		/* alignment check */
		return 1;
	default:
		return 0;
	}
}
int deliver_trap(struct lguest *lg, unsigned int num)
{
u32 lo = lg->idt[num].a, hi = lg->idt[num].b;
if (!idt_present(lo, hi))
return 0;
set_guest_interrupt(lg, lo, hi, has_err(num));
return 1;
}
/* Can this trap be delivered straight to the Guest by the CPU, without
 * bouncing through the Host first? */
static int direct_trap(const struct lguest *lg,
		       const struct desc_struct *trap,
		       unsigned int num)
{
	/* Hardware interrupts don't go to guest (except syscall). */
	if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR)
		return 0;

	/* We intercept page fault (demand shadow paging & cr2 saving),
	 * protection fault (in/out emulation), device not available
	 * (TS handling), and the hypercall trap. */
	switch (num) {
	case 7:
	case 13:
	case 14:
	case LGUEST_TRAP_ENTRY:
		return 0;
	}

	/* Interrupt gates (0xE) or not present (0x0) can't go direct. */
	return idt_type(trap->a, trap->b) == 0xF;
}
/* Make sure the Guest's kernel-stack pages are present in the shadow
 * pagetables, so taking a trap can't itself fault on the stack. */
void pin_stack_pages(struct lguest *lg)
{
	unsigned int page = 0;

	/* lg->esp1 is the stack top; walk downwards a page at a time. */
	while (page < lg->stack_pages) {
		pin_page(lg, lg->esp1 - page * PAGE_SIZE);
		page++;
	}
}
/* LHCALL_SET_STACK: the Guest tells us where its traps should stack.
 * @seg:   stack segment selector -- must be at the Guest's privilege level
 * @esp:   stack pointer (top of stack)
 * @pages: number of stack pages to keep pinned (at most 2)
 * Records the values and pins the pages so delivering a trap can't fault. */
void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
{
	/* You cannot have a stack segment with priv level 0. */
	if ((seg & 0x3) != GUEST_PL)
		/* %u, not %i: seg is an unsigned u32 selector. */
		kill_guest(lg, "bad stack segment %u", seg);
	if (pages > 2)
		kill_guest(lg, "bad stack pages %u", pages);
	/* Note: kill_guest() only marks the Guest dead, so we still fall
	 * through and record the values (matching original behavior). */
	lg->ss1 = seg;
	lg->esp1 = esp;
	lg->stack_pages = pages;
	pin_stack_pages(lg);
}
/* Set up trap in IDT. */
static void set_trap(struct lguest *lg, struct desc_struct *trap,
		     unsigned int num, u32 lo, u32 hi)
{
	u8 type = idt_type(lo, hi);

	/* Not present: store an empty descriptor. */
	if (!idt_present(lo, hi)) {
		trap->a = trap->b = 0;
		return;
	}

	/* Only interrupt (0xE) and trap (0xF) gates are acceptable. */
	if (type != 0xE && type != 0xF)
		kill_guest(lg, "bad IDT type %i", type);

	/* Re-point the handler at our (deprivileged) kernel code segment;
	 * keep the Guest's handler offset.  The 0xFFFFEF00 mask keeps the
	 * present/DPL/type bits and clears the low byte and bit 12. */
	trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
	trap->b = (hi&0xFFFFEF00);
}
/* LHCALL_LOAD_IDT_ENTRY: the Guest updates one descriptor in its IDT. */
void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi)
{
	/* Guest never handles: NMI, doublefault, hypercall, spurious irq. */
	if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
		return;

	/* Tell the run loop the shadow IDT must be re-copied. */
	lg->changed |= CHANGED_IDT;

	if (num < ARRAY_SIZE(lg->idt))
		set_trap(lg, &lg->idt[num], num, lo, hi);
	/* The system-call vector lies above our table; stored separately. */
	else if (num == SYSCALL_VECTOR)
		set_trap(lg, &lg->syscall_idt, num, lo, hi);
}
/* Build the default (Host-side) IDT entry for trap number "trap",
 * pointing at the switcher handler at address "handler". */
static void default_idt_entry(struct desc_struct *idt,
			      int trap,
			      const unsigned long handler)
{
	u32 lo_half = handler & 0x0000FFFF;
	u32 hi_half = handler & 0xFFFF0000;
	/* 0x8e00: present, DPL 0, 32-bit interrupt gate. */
	u32 flags = 0x8e00;

	/* They can't "int" into any of them except hypercall. */
	if (trap == LGUEST_TRAP_ENTRY)
		flags |= (GUEST_PL << 13);

	idt->a = (LGUEST_CS << 16) | lo_half;
	idt->b = hi_half | flags;
}
void setup_default_idt_entries(struct lguest_ro_state *state,
const unsigned long *def)
{
unsigned int i;
for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++)
default_idt_entry(&state->guest_idt[i], i, def[i]);
}
/* Fill the per-cpu IDT "idt" for this Guest: use the Guest's own entry
 * where the trap may go direct, else the default switcher entry from def[]. */
void copy_traps(const struct lguest *lg, struct desc_struct *idt,
		const unsigned long *def)
{
	unsigned int i;

	/* All hardware interrupts are same whatever the guest: only the
	 * traps might be different. */
	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) {
		if (direct_trap(lg, &lg->idt[i], i))
			idt[i] = lg->idt[i];
		else
			default_idt_entry(&idt[i], i, def[i]);
	}

	/* The system-call vector is stored separately (see
	 * load_guest_idt_entry()), so handle it here too. */
	i = SYSCALL_VECTOR;
	if (direct_trap(lg, &lg->syscall_idt, i))
		idt[i] = lg->syscall_idt;
	else
		default_idt_entry(&idt[i], i, def[i]);
}
/* LHCALL_SET_CLOCKEVENT: Guest asks for a (virtual) timer interrupt
 * "delta" nanoseconds from now; delta == 0 cancels any pending timer. */
void guest_set_clockevent(struct lguest *lg, unsigned long delta)
{
	ktime_t expires;

	if (unlikely(delta == 0)) {
		/* Clock event device is shutting down. */
		hrtimer_cancel(&lg->hrt);
		return;
	}

	/* The hrtimer was initialized in absolute mode, so turn the
	 * relative delta into an absolute expiry time. */
	expires = ktime_add_ns(ktime_get_real(), delta);
	hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS);
}
/* hrtimer callback: mark virtual interrupt 0 (the timer) pending and,
 * if the Guest was halted, wake its thread so it gets delivered. */
static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
{
	struct lguest *lg = container_of(timer, struct lguest, hrt);

	set_bit(0, lg->irqs_pending);
	if (lg->halted)
		wake_up_process(lg->tsk);
	/* One-shot: the Guest re-arms via LHCALL_SET_CLOCKEVENT. */
	return HRTIMER_NORESTART;
}
/* Set up this Guest's virtual clock device: one absolute-mode hrtimer. */
void init_clockdev(struct lguest *lg)
{
	hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
	lg->hrt.function = clockdev_fn;
}
This diff is collapsed.
#ifndef _LGUEST_H
#define _LGUEST_H
#include <asm/desc.h>
#define GDT_ENTRY_LGUEST_CS 10
#define GDT_ENTRY_LGUEST_DS 11
#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/init.h>
#include <linux/stringify.h>
#include <linux/binfmts.h>
#include <linux/futex.h>
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <asm/semaphore.h>
#include "irq_vectors.h"
#define GUEST_PL 1
/* Saved Guest register state.  NOTE(review): the field order appears to
 * mirror the switcher assembly's save/restore sequence -- confirm before
 * reordering anything here. */
struct lguest_regs
{
	/* Manually saved part. */
	unsigned long ebx, ecx, edx;
	unsigned long esi, edi, ebp;
	unsigned long gs;
	unsigned long eax;
	unsigned long fs, ds, es;
	/* Which trap fired, and its error code (if any). */
	unsigned long trapnum, errcode;
	/* Trap pushed part */
	unsigned long eip;
	unsigned long cs;
	unsigned long eflags;
	unsigned long esp;
	unsigned long ss;
};
void free_pagetables(void);
int init_pagetables(struct page **switcher_page, unsigned int pages);
/* Full 4G segment descriptors, suitable for CS and DS. */
#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00})
#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300})

/* Book-keeping for one set of Guest DMA buffers bound to a key (io.c). */
struct lguest_dma_info
{
	struct list_head list;
	union futex_key key;	/* the key the buffers are bound to */
	unsigned long dmas;	/* Guest address of the lguest_dma array */
	u16 next_dma;
	u16 num_dmas;		/* entries in the array */
	u16 guestid;		/* which Guest owns this binding */
	u8 interrupt; 	/* 0 when not registered */
};
/* We have separate types for the guest's ptes & pgds and the shadow ptes &
 * pgds. Since this host might use three-level pagetables and the guest and
 * shadow pagetables don't, we can't use the normal pte_t/pgd_t.
 * All four share the same layout: low 12 bits of flags, top 20 bits of
 * page frame number, with a "raw" view of the whole word. */
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} spgd_t;	/* shadow page-directory entry */
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} spte_t;	/* shadow page-table entry */
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} gpgd_t;	/* guest page-directory entry */
typedef union {
	struct { unsigned flags:12, pfn:20; };
	struct { unsigned long val; } raw;
} gpte_t;	/* guest page-table entry */

/* Wrap a raw word as a guest pte/pgd. */
#define mkgpte(_val) ((gpte_t){.raw.val = _val})
#define mkgpgd(_val) ((gpgd_t){.raw.val = _val})
/* One shadow pagetable: the Guest cr3 it mirrors, and our shadow pgd. */
struct pgdir
{
	unsigned long cr3;
	spgd_t *pgdir;
};
/* This is a guest-specific page (mapped ro) into the guest. */
struct lguest_ro_state
{
	/* Host information we need to restore when we switch back. */
	u32 host_cr3;
	struct Xgt_desc_struct host_idt_desc;
	struct Xgt_desc_struct host_gdt_desc;
	u32 host_sp;

	/* Fields which are used when guest is running. */
	struct Xgt_desc_struct guest_idt_desc;
	struct Xgt_desc_struct guest_gdt_desc;
	struct i386_hw_tss guest_tss;
	struct desc_struct guest_idt[IDT_ENTRIES];
	struct desc_struct guest_gdt[GDT_ENTRIES];
};
/* We have two pages shared with guests, per cpu.  */
struct lguest_pages
{
	/* This is the stack page mapped rw in guest */
	char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
	/* regs sits at the very top of the stack page. */
	struct lguest_regs regs;

	/* This is the host state & guest descriptor page, ro in guest */
	struct lguest_ro_state state;
} __attribute__((aligned(PAGE_SIZE)));

/* Bits for lguest->changed: what must be re-copied before next run. */
#define CHANGED_IDT		1
#define CHANGED_GDT		2
#define CHANGED_GDT_TLS		4 /* Actually a subset of CHANGED_GDT */
#define CHANGED_ALL	        3
/* The private info the thread maintains about the guest. */
struct lguest
{
	/* At end of a page shared mapped over lguest_pages in guest.  */
	unsigned long regs_page;
	struct lguest_regs *regs;
	/* The Guest's shared-info page (set by LHCALL_LGUEST_INIT). */
	struct lguest_data __user *lguest_data;
	/* The Launcher task that services this Guest. */
	struct task_struct *tsk;
	struct mm_struct *mm; 	/* == tsk->mm, but that becomes NULL on exit */
	u16 guestid;
	u32 pfn_limit;		/* Guest may not touch pfns above this */
	u32 page_offset;	/* Guest's virtual->physical offset */
	u32 cr2;		/* saved fault address for the Guest */
	int halted;		/* set by LHCALL_HALT */
	int ts;			/* Guest's TS flag (LHCALL_TS) */
	u32 next_hcall;		/* next slot in the async hypercall ring */
	/* Guest's trap-time kernel stack (LHCALL_SET_STACK). */
	u32 esp1;
	u8 ss1;

	/* Do we need to stop what we're doing and return to userspace? */
	int break_out;
	wait_queue_head_t break_wq;

	/* Bitmap of what has changed: see CHANGED_* above. */
	int changed;
	struct lguest_pages *last_pages;

	/* We keep a small number of these. */
	u32 pgdidx;
	struct pgdir pgdirs[4];

	/* Cached wakeup: we hold a reference to this task. */
	struct task_struct *wake;

	/* eip range the Guest must not be interrupted in. */
	unsigned long noirq_start, noirq_end;

	/* Set when a DMA must be handed to the Launcher. */
	int dma_is_pending;
	unsigned long pending_dma; /* struct lguest_dma */
	unsigned long pending_key; /* address they're sending to */

	unsigned int stack_pages;
	u32 tsc_khz;

	struct lguest_dma_info dma[LGUEST_MAX_DMA];

	/* Dead? */
	const char *dead;

	/* The GDT entries copied into lguest_ro_state when running. */
	struct desc_struct gdt[GDT_ENTRIES];

	/* The IDT entries: some copied into lguest_ro_state when running. */
	struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS];
	struct desc_struct syscall_idt;

	/* Virtual clock device */
	struct hrtimer hrt;
	/* Pending virtual interrupts */
	DECLARE_BITMAP(irqs_pending, LGUEST_IRQS);
};
extern struct lguest lguests[];
extern struct mutex lguest_lock;
/* core.c: */
u32 lgread_u32(struct lguest *lg, unsigned long addr);
void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val);
void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len);
void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len);
int find_free_guest(void);
int lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len);
int run_guest(struct lguest *lg, unsigned long __user *user);
/* interrupts_and_traps.c: */
void maybe_do_interrupt(struct lguest *lg);
int deliver_trap(struct lguest *lg, unsigned int num);
void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 hi);
void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages);
void pin_stack_pages(struct lguest *lg);
void setup_default_idt_entries(struct lguest_ro_state *state,
const unsigned long *def);
void copy_traps(const struct lguest *lg, struct desc_struct *idt,
const unsigned long *def);
void guest_set_clockevent(struct lguest *lg, unsigned long delta);
void init_clockdev(struct lguest *lg);
/* segments.c: */
void setup_default_gdt_entries(struct lguest_ro_state *state);
void setup_guest_gdt(struct lguest *lg);
void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num);
void guest_load_tls(struct lguest *lg, unsigned long tls_array);
void copy_gdt(const struct lguest *lg, struct desc_struct *gdt);
void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt);
/* page_tables.c: */
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable);
void free_guest_pagetable(struct lguest *lg);
void guest_new_pagetable(struct lguest *lg, unsigned long pgtable);
void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i);
void guest_pagetable_clear_all(struct lguest *lg);
void guest_pagetable_flush_user(struct lguest *lg);
void guest_set_pte(struct lguest *lg, unsigned long cr3,
unsigned long vaddr, gpte_t val);
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
int demand_page(struct lguest *info, unsigned long cr2, int errcode);
void pin_page(struct lguest *lg, unsigned long vaddr);
/* lguest_user.c: */
int lguest_device_init(void);
void lguest_device_remove(void);
/* io.c: */
void lguest_io_init(void);
int bind_dma(struct lguest *lg,
unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt);
void send_dma(struct lguest *info, unsigned long key, unsigned long udma);
void release_all_dma(struct lguest *lg);
unsigned long get_dma_buffer(struct lguest *lg, unsigned long key,
unsigned long *interrupt);
/* hypercalls.c: */
void do_hypercalls(struct lguest *lg);
/* Mark the Guest dead, recording a printf-style reason in lg->dead.
 * Only the first kill "wins".  GFP_ATOMIC because callers may not sleep;
 * kasprintf failure is recorded as ERR_PTR(-ENOMEM) so readers can tell. */
#define kill_guest(lg, fmt...)					\
do {								\
	if (!(lg)->dead) {					\
		(lg)->dead = kasprintf(GFP_ATOMIC, fmt);	\
		if (!(lg)->dead)				\
			(lg)->dead = ERR_PTR(-ENOMEM);		\
	}							\
} while(0)
/* Convert a Guest virtual address to Guest-physical by undoing the
 * Guest's page offset (the Guest-side analogue of __pa()). */
static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
{
	return vaddr - lg->page_offset;
}
#endif /* __ASSEMBLY__ */
#endif /* _LGUEST_H */
...@@ -25,6 +25,8 @@ ...@@ -25,6 +25,8 @@
#include <linux/screen_info.h> #include <linux/screen_info.h>
#include <linux/irq.h> #include <linux/irq.h>
#include <linux/interrupt.h> #include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/lguest.h> #include <linux/lguest.h>
#include <linux/lguest_launcher.h> #include <linux/lguest_launcher.h>
#include <linux/lguest_bus.h> #include <linux/lguest_bus.h>
...@@ -37,6 +39,7 @@ ...@@ -37,6 +39,7 @@
#include <asm/e820.h> #include <asm/e820.h>
#include <asm/mce.h> #include <asm/mce.h>
#include <asm/io.h> #include <asm/io.h>
//#include <asm/sched-clock.h>
/* Declarations for definitions in lguest_guest.S */ /* Declarations for definitions in lguest_guest.S */
extern char lguest_noirq_start[], lguest_noirq_end[]; extern char lguest_noirq_start[], lguest_noirq_end[];
...@@ -54,7 +57,6 @@ struct lguest_data lguest_data = { ...@@ -54,7 +57,6 @@ struct lguest_data lguest_data = {
.blocked_interrupts = { 1 }, /* Block timer interrupts */ .blocked_interrupts = { 1 }, /* Block timer interrupts */
}; };
struct lguest_device_desc *lguest_devices; struct lguest_device_desc *lguest_devices;
static __initdata const struct lguest_boot_info *boot = __va(0);
static enum paravirt_lazy_mode lazy_mode; static enum paravirt_lazy_mode lazy_mode;
static void lguest_lazy_mode(enum paravirt_lazy_mode mode) static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
...@@ -210,7 +212,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, ...@@ -210,7 +212,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
case 1: /* Basic feature request. */ case 1: /* Basic feature request. */
/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
*ecx &= 0x00002201; *ecx &= 0x00002201;
/* Similarly: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
*edx &= 0x07808101; *edx &= 0x07808101;
/* Host wants to know when we flush kernel pages: set PGE. */ /* Host wants to know when we flush kernel pages: set PGE. */
*edx |= 0x00002000; *edx |= 0x00002000;
...@@ -346,24 +348,104 @@ static unsigned long lguest_get_wallclock(void) ...@@ -346,24 +348,104 @@ static unsigned long lguest_get_wallclock(void)
return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0); return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
} }
static cycle_t lguest_clock_read(void)
{
if (lguest_data.tsc_khz)
return native_read_tsc();
else
return jiffies;
}
/* This is what we tell the kernel is our clocksource. */
static struct clocksource lguest_clock = {
.name = "lguest",
.rating = 400,
.read = lguest_clock_read,
};
/* We also need a "struct clock_event_device": Linux asks us to set it to go
* off some time in the future. Actually, James Morris figured all this out, I
* just applied the patch. */
static int lguest_clockevent_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
if (delta < LG_CLOCK_MIN_DELTA) {
if (printk_ratelimit())
printk(KERN_DEBUG "%s: small delta %lu ns\n",
__FUNCTION__, delta);
return -ETIME;
}
hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
return 0;
}
static void lguest_clockevent_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
switch (mode) {
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
/* A 0 argument shuts the clock down. */
hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0);
break;
case CLOCK_EVT_MODE_ONESHOT:
/* This is what we expect. */
break;
case CLOCK_EVT_MODE_PERIODIC:
BUG();
}
}
/* This describes our primitive timer chip. */
static struct clock_event_device lguest_clockevent = {
.name = "lguest",
.features = CLOCK_EVT_FEAT_ONESHOT,
.set_next_event = lguest_clockevent_set_next_event,
.set_mode = lguest_clockevent_set_mode,
.rating = INT_MAX,
.mult = 1,
.shift = 0,
.min_delta_ns = LG_CLOCK_MIN_DELTA,
.max_delta_ns = LG_CLOCK_MAX_DELTA,
};
/* This is the Guest timer interrupt handler (hardware interrupt 0). We just
* call the clockevent infrastructure and it does whatever needs doing. */
static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
{ {
do_timer(hcall(LHCALL_TIMER_READ, 0, 0, 0)); unsigned long flags;
update_process_times(user_mode_vm(get_irq_regs()));
/* Don't interrupt us while this is running. */
local_irq_save(flags);
lguest_clockevent.event_handler(&lguest_clockevent);
local_irq_restore(flags);
} }
static u64 sched_clock_base;
static void lguest_time_init(void) static void lguest_time_init(void)
{ {
set_irq_handler(0, lguest_time_irq); set_irq_handler(0, lguest_time_irq);
hcall(LHCALL_TIMER_READ, 0, 0, 0);
sched_clock_base = jiffies_64;
enable_lguest_irq(0);
}
static unsigned long long lguest_sched_clock(void) /* We use the TSC if the Host tells us we can, otherwise a dumb
{ * jiffies-based clock. */
return (jiffies_64 - sched_clock_base) * (1000000000 / HZ); if (lguest_data.tsc_khz) {
lguest_clock.shift = 22;
lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
lguest_clock.shift);
lguest_clock.mask = CLOCKSOURCE_MASK(64);
lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS;
} else {
/* To understand this, start at kernel/time/jiffies.c... */
lguest_clock.shift = 8;
lguest_clock.mult = (((u64)NSEC_PER_SEC<<8)/ACTHZ) << 8;
lguest_clock.mask = CLOCKSOURCE_MASK(32);
}
clocksource_register(&lguest_clock);
/* We can't set cpumask in the initializer: damn C limitations! */
lguest_clockevent.cpumask = cpumask_of_cpu(0);
clockevents_register_device(&lguest_clockevent);
enable_lguest_irq(0);
} }
static void lguest_load_esp0(struct tss_struct *tss, static void lguest_load_esp0(struct tss_struct *tss,
...@@ -418,8 +500,7 @@ static __init char *lguest_memory_setup(void) ...@@ -418,8 +500,7 @@ static __init char *lguest_memory_setup(void)
/* We do this here because lockcheck barfs if before start_kernel */ /* We do this here because lockcheck barfs if before start_kernel */
atomic_notifier_chain_register(&panic_notifier_list, &paniced); atomic_notifier_chain_register(&panic_notifier_list, &paniced);
e820.nr_map = 0; add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type);
add_memory_region(0, PFN_PHYS(boot->max_pfn), E820_RAM);
return "LGUEST"; return "LGUEST";
} }
...@@ -450,8 +531,13 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len) ...@@ -450,8 +531,13 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
return insn_len; return insn_len;
} }
__init void lguest_init(void) __init void lguest_init(void *boot)
{ {
/* Copy boot parameters first. */
memcpy(&boot_params, boot, PARAM_SIZE);
memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
COMMAND_LINE_SIZE);
paravirt_ops.name = "lguest"; paravirt_ops.name = "lguest";
paravirt_ops.paravirt_enabled = 1; paravirt_ops.paravirt_enabled = 1;
paravirt_ops.kernel_rpl = 1; paravirt_ops.kernel_rpl = 1;
...@@ -498,10 +584,8 @@ __init void lguest_init(void) ...@@ -498,10 +584,8 @@ __init void lguest_init(void)
paravirt_ops.time_init = lguest_time_init; paravirt_ops.time_init = lguest_time_init;
paravirt_ops.set_lazy_mode = lguest_lazy_mode; paravirt_ops.set_lazy_mode = lguest_lazy_mode;
paravirt_ops.wbinvd = lguest_wbinvd; paravirt_ops.wbinvd = lguest_wbinvd;
paravirt_ops.sched_clock = lguest_sched_clock;
hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
strncpy(boot_command_line, boot->cmdline, COMMAND_LINE_SIZE);
/* We use top of mem for initial pagetables. */ /* We use top of mem for initial pagetables. */
init_pg_tables_end = __pa(pg0); init_pg_tables_end = __pa(pg0);
...@@ -532,13 +616,6 @@ __init void lguest_init(void) ...@@ -532,13 +616,6 @@ __init void lguest_init(void)
add_preferred_console("hvc", 0, NULL); add_preferred_console("hvc", 0, NULL);
if (boot->initrd_size) {
/* We stash this at top of memory. */
INITRD_START = boot->max_pfn*PAGE_SIZE - boot->initrd_size;
INITRD_SIZE = boot->initrd_size;
LOADER_TYPE = 0xFF;
}
pm_power_off = lguest_power_off; pm_power_off = lguest_power_off;
start_kernel(); start_kernel();
} }
...@@ -10,7 +10,8 @@ ...@@ -10,7 +10,8 @@
* This is where we begin: we have a magic signature which the launcher looks * This is where we begin: we have a magic signature which the launcher looks
* for. The plan is that the Linux boot protocol will be extended with a * for. The plan is that the Linux boot protocol will be extended with a
* "platform type" field which will guide us here from the normal entry point, * "platform type" field which will guide us here from the normal entry point,
* but for the moment this suffices. * but for the moment this suffices. We pass the virtual address of the boot
* info to lguest_init().
* *
* We put it in .init.text will be discarded after boot. * We put it in .init.text will be discarded after boot.
*/ */
...@@ -18,6 +19,8 @@ ...@@ -18,6 +19,8 @@
.ascii "GenuineLguest" .ascii "GenuineLguest"
/* Set up initial stack. */ /* Set up initial stack. */
movl $(init_thread_union+THREAD_SIZE),%esp movl $(init_thread_union+THREAD_SIZE),%esp
movl %esi, %eax
addl $__PAGE_OFFSET, %eax
jmp lguest_init jmp lguest_init
/* The templates for inline patching. */ /* The templates for inline patching. */
......
/* Userspace control of the guest, via /dev/lguest. */
#include <linux/uaccess.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include "lg.h"
/* Fill in the register state a brand-new Guest starts with. */
static void setup_regs(struct lguest_regs *regs, unsigned long start)
{
	/* Write out stack in format lguest expects, so we can switch to it. */
	/* Data-ish segments use the kernel data selector, deprivileged. */
	const unsigned long data_sel = __KERNEL_DS|GUEST_PL;

	regs->ds = data_sel;
	regs->es = data_sel;
	regs->ss = data_sel;
	/* Code runs in the (deprivileged) kernel code segment. */
	regs->cs = __KERNEL_CS|GUEST_PL;
	/* Entry point supplied by the Launcher. */
	regs->eip = start;
	regs->eflags = 0x202; /* Interrupts enabled. */
	/* esi points to our boot information (physical address 0) */
}
/* + addr */
/* LHREQ_GETDMA: Launcher asks for a DMA buffer bound to the given key.
 * Returns the Guest address of the buffer, with the bound irq number
 * stashed in its used_len field; -ENOENT if no buffer is available. */
static long user_get_dma(struct lguest *lg, const u32 __user *input)
{
	unsigned long key, udma, irq;

	if (get_user(key, input) != 0)
		return -EFAULT;
	udma = get_dma_buffer(lg, key, &irq);
	if (!udma)
		return -ENOENT;

	/* We put irq number in udma->used_len. */
	lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
	return udma;
}
/* To force the Guest to stop running and return to the Launcher, the
 * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest.  The
 * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */
static int break_guest_out(struct lguest *lg, const u32 __user *input)
{
	unsigned long on;

	/* Fetch whether they're turning break on or off.. */
	if (get_user(on, input) != 0)
		return -EFAULT;

	if (on) {
		lg->break_out = 1;
		/* Pop it out (may be running on different CPU) */
		wake_up_process(lg->tsk);
		/* Wait for them to reset it */
		return wait_event_interruptible(lg->break_wq, !lg->break_out);
	} else {
		lg->break_out = 0;
		wake_up(&lg->break_wq);
		return 0;
	}
}
/* + irq */
/* LHREQ_IRQ: the Launcher raises a virtual interrupt in the Guest. */
static int user_send_irq(struct lguest *lg, const u32 __user *input)
{
	u32 irq;

	if (get_user(irq, input))
		return -EFAULT;
	if (irq >= LGUEST_IRQS)
		return -EINVAL;
	/* Just mark it pending; it's delivered next time the Guest runs. */
	set_bit(irq, lg->irqs_pending);
	return 0;
}
/* Reading /dev/lguest runs the Guest.  A read returns either the Guest's
 * death reason (if it died) or whatever run_guest() produces. */
static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
{
	struct lguest *lg = file->private_data;
	/* Reading only makes sense once LHREQ_INITIALIZE has been done. */
	if (!lg)
		return -EINVAL;
	/* If you're not the task which owns the guest, go away. */
	if (current != lg->tsk)
		return -EPERM;
	/* A dead Guest hands its death reason to the Launcher: either an
	 * errno (lg->dead is an ERR_PTR) or a string. */
	if (lg->dead) {
		size_t len;
		if (IS_ERR(lg->dead))
			return PTR_ERR(lg->dead);
		/* Copy the NUL-terminated reason, capped at "size". */
		len = min(size, strlen(lg->dead)+1);
		if (copy_to_user(user, lg->dead, len) != 0)
			return -EFAULT;
		return len;
	}
	/* Coming back for another read acknowledges any DMA the previous
	 * read handed to the Launcher. */
	if (lg->dma_is_pending)
		lg->dma_is_pending = 0;
	/* Actually run the Guest until something interesting happens. */
	return run_guest(lg, (unsigned long __user *)user);
}
/* Take: pfnlimit, pgdir, start, pageoffset. */
/* LHREQ_INITIALIZE: create a Guest for this file descriptor.  Returns
 * sizeof(args) (the bytes consumed from the write) or a negative errno. */
static int initialize(struct file *file, const u32 __user *input)
{
	struct lguest *lg;
	int err, i;
	u32 args[4];
	/* We grab the Big Lguest lock, which protects the global array
	 * "lguests" and multiple simultaneous initializations. */
	mutex_lock(&lguest_lock);
	/* A file descriptor can only back one Guest. */
	if (file->private_data) {
		err = -EBUSY;
		goto unlock;
	}
	/* Pull in all four arguments at once. */
	if (copy_from_user(args, input, sizeof(args)) != 0) {
		err = -EFAULT;
		goto unlock;
	}
	/* Claim a free slot in the global "lguests" array. */
	i = find_free_guest();
	if (i < 0) {
		err = -ENOSPC;
		goto unlock;
	}
	lg = &lguests[i];
	lg->guestid = i;
	/* args[0] = highest pfn the Guest may touch; args[3] = the Guest's
	 * page offset. */
	lg->pfn_limit = args[0];
	lg->page_offset = args[3];
	/* A zeroed page holds the Guest's register state... */
	lg->regs_page = get_zeroed_page(GFP_KERNEL);
	if (!lg->regs_page) {
		err = -ENOMEM;
		goto release_guest;
	}
	/* ...with the registers themselves at the top of that page. */
	lg->regs = (void *)lg->regs_page + PAGE_SIZE - sizeof(*lg->regs);
	/* args[1] is the Guest's top-level page directory. */
	err = init_guest_pagetable(lg, args[1]);
	if (err)
		goto free_regs;
	/* args[2] is the Guest's entry point (initial eip). */
	setup_regs(lg->regs, args[2]);
	setup_guest_gdt(lg);
	init_clockdev(lg);
	/* The task doing the initialize owns this Guest from now on. */
	lg->tsk = current;
	/* Take a reference on its mm; dropped by mmput() in close(). */
	lg->mm = get_task_mm(lg->tsk);
	init_waitqueue_head(&lg->break_wq);
	lg->last_pages = NULL;
	file->private_data = lg;
	mutex_unlock(&lguest_lock);
	/* Tell the caller how much of its write we consumed. */
	return sizeof(args);
	/* Error unwinding, in reverse order of setup: */
free_regs:
	free_page(lg->regs_page);
release_guest:
	/* Clear the partially-initialized slot so it can be reused. */
	memset(lg, 0, sizeof(*lg));
unlock:
	mutex_unlock(&lguest_lock);
	return err;
}
/* Writing to /dev/lguest issues a request: the first 32-bit word is the
 * request code, and the rest of the buffer is its arguments. */
static ssize_t write(struct file *file, const char __user *input,
		     size_t size, loff_t *off)
{
	struct lguest *lg = file->private_data;
	u32 req;

	if (get_user(req, input) != 0)
		return -EFAULT;
	/* Step past the request word; handlers read their own arguments. */
	input += sizeof(req);

	/* Everything except LHREQ_INITIALIZE needs an existing Guest. */
	if (!lg && req != LHREQ_INITIALIZE)
		return -EINVAL;
	/* A dead Guest accepts no more requests. */
	if (lg && lg->dead)
		return -ENOENT;
	/* If you're not the task which owns the Guest, you can only break */
	if (lg && req != LHREQ_BREAK && current != lg->tsk)
		return -EPERM;

	switch (req) {
	case LHREQ_INITIALIZE:
		return initialize(file, (const u32 __user *)input);
	case LHREQ_GETDMA:
		return user_get_dma(lg, (const u32 __user *)input);
	case LHREQ_IRQ:
		return user_send_irq(lg, (const u32 __user *)input);
	case LHREQ_BREAK:
		return break_guest_out(lg, (const u32 __user *)input);
	default:
		return -EINVAL;
	}
}
/* The Launcher's final close of /dev/lguest tears the Guest down. */
static int close(struct inode *inode, struct file *file)
{
	struct lguest *lg = file->private_data;
	/* If initialize() never ran there is nothing to clean up. */
	if (!lg)
		return 0;
	/* Serialize teardown against other users of the lguests array. */
	mutex_lock(&lguest_lock);
	/* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */
	hrtimer_cancel(&lg->hrt);
	release_all_dma(lg);
	free_guest_pagetable(lg);
	/* Drop the mm reference taken by get_task_mm() in initialize(). */
	mmput(lg->mm);
	/* lg->dead is NULL, a kfree-able string, or an ERR_PTR errno; only
	 * the string case owns memory (kfree(NULL) is a no-op). */
	if (!IS_ERR(lg->dead))
		kfree(lg->dead);
	free_page(lg->regs_page);
	/* Zero the slot so it reads as free for the next initialize(). */
	memset(lg, 0, sizeof(*lg));
	mutex_unlock(&lguest_lock);
	return 0;
}
/* The Launcher's whole interface to lg.ko: writes are requests, reads
 * run the Guest, and the final close tears everything down. */
static struct file_operations lguest_fops = {
	.owner	 = THIS_MODULE,
	.release = close,
	.write	 = write,
	.read	 = read,
};
/* Registered as a dynamic-minor misc device: /dev/lguest. */
static struct miscdevice lguest_dev = {
	.minor	= MISC_DYNAMIC_MINOR,
	.name	= "lguest",
	.fops	= &lguest_fops,
};
/* Module init: expose /dev/lguest.  Returns misc_register()'s result. */
int __init lguest_device_init(void)
{
	return misc_register(&lguest_dev);
}
/* Module exit: remove /dev/lguest again. */
void __exit lguest_device_remove(void)
{
	misc_deregister(&lguest_dev);
}
This diff is collapsed.
#include "lg.h"
/* Is this a GDT descriptor type we allow the Guest to use?  It must be
 * a normal code/data descriptor (DT/S bit set), Present, with the
 * must-be-zero bit (bit 21) clear — no system descriptors or gates. */
static int desc_ok(const struct desc_struct *gdt)
{
	/* MBZ=0, P=1, DT=1 */
	return ((gdt->b & 0x00209000) == 0x00009000);
}
/* Is the descriptor's Present bit (bit 15 of the high word) set? */
static int segment_present(const struct desc_struct *gdt)
{
	return gdt->b & 0x8000;
}
/* These GDT entries belong to the Host's switcher; the Guest's versions
 * are never copied into the real GDT. */
static int ignored_gdt(unsigned int num)
{
	switch (num) {
	case GDT_ENTRY_TSS:
	case GDT_ENTRY_LGUEST_CS:
	case GDT_ENTRY_LGUEST_DS:
	case GDT_ENTRY_DOUBLEFAULT_TSS:
		return 1;
	default:
		return 0;
	}
}
/* We don't allow removal of CS, DS or SS; it doesn't make sense. */
/* Called when GDT entry "desc" has gone non-present: selectors are
 * entry*8, so selector/8 identifies which entry a register uses. */
static void check_segment_use(struct lguest *lg, unsigned int desc)
{
	/* GS, FS and ES can simply be zeroed if they referenced it... */
	if (lg->regs->gs / 8 == desc)
		lg->regs->gs = 0;
	if (lg->regs->fs / 8 == desc)
		lg->regs->fs = 0;
	if (lg->regs->es / 8 == desc)
		lg->regs->es = 0;
	/* ...but removing a live CS, DS or SS is fatal to the Guest. */
	if (lg->regs->ds / 8 == desc
	    || lg->regs->cs / 8 == desc
	    || lg->regs->ss / 8 == desc)
		kill_guest(lg, "Removed live GDT entry %u", desc);
}
/* Sanitize Guest-supplied GDT entries [start, end) so they cannot give
 * the Guest more privilege than GUEST_PL, killing the Guest on entries
 * we can't represent. */
static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
{
	unsigned int i;
	for (i = start; i < end; i++) {
		/* We never copy these ones to real gdt */
		if (ignored_gdt(i))
			continue;
		/* We could fault in switch_to_guest if they are using
		 * a removed segment. */
		if (!segment_present(&lg->gdt[i])) {
			check_segment_use(lg, i);
			continue;
		}
		/* Refuse anything but plain present code/data descriptors. */
		if (!desc_ok(&lg->gdt[i]))
			kill_guest(lg, "Bad GDT descriptor %i", i);
		/* DPL 0 presumably means "for use by guest". */
		/* Bits 13-14 are the DPL; force ring-0 entries to GUEST_PL. */
		if ((lg->gdt[i].b & 0x00006000) == 0)
			lg->gdt[i].b |= (GUEST_PL << 13);
		/* Set accessed bit, since gdt isn't writable. */
		/* (Bit 8: the CPU would otherwise try to set it itself.) */
		lg->gdt[i].b |= 0x00000100;
	}
}
/* Fill in the fixed part of a per-CPU switcher GDT: the hypervisor's
 * own code/data segments and the TSS descriptor for that CPU. */
void setup_default_gdt_entries(struct lguest_ro_state *state)
{
	struct desc_struct *gdt = state->guest_gdt;
	unsigned long tss = (unsigned long)&state->guest_tss;
	/* Hypervisor segments. */
	gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
	gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
	/* This is the one which we *cannot* copy from guest, since tss
	   is depended on this lguest_ro_state, ie. this cpu. */
	/* Hand-build the TSS descriptor: limit 0x67, base "tss" scattered
	 * across a (bits 16-31), b bits 0-7 (base 16-23) and b bits 24-31
	 * (base 24-31); 0x00008900 = Present, type 9 (available 32-bit TSS). */
	gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
	gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
		| ((tss >> 16) & 0x000000FF);
}
/* Give a fresh Guest its initial kernel code and data segments, demoted
 * to Guest privilege (DPL bits 13-14 set to GUEST_PL). */
void setup_guest_gdt(struct lguest *lg)
{
	lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
	lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
	lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
	lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
}
/* Fast path for the common case where only the three TLS entries have
 * changed: copy just that slice of the Guest's GDT. */
void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
{
	unsigned int idx = GDT_ENTRY_TLS_MIN;

	while (idx <= GDT_ENTRY_TLS_MAX) {
		gdt[idx] = lg->gdt[idx];
		idx++;
	}
}
/* Copy the Guest's whole GDT into the real one, skipping the entries
 * the switcher reserves for itself (see ignored_gdt()). */
void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
{
	unsigned int i;

	for (i = 0; i < GDT_ENTRIES; i++) {
		if (ignored_gdt(i))
			continue;
		gdt[i] = lg->gdt[i];
	}
}
/* LHCALL_LOAD_GDT: the Guest hands us a new GDT of "num" entries at
 * guest address "table".  We pull it into lg->gdt and sanitize it.
 *
 * BUG FIX: kill_guest() only marks the Guest dead — it does not stop
 * this function.  The original fell through after the bounds check and
 * still called lgread() with the oversize "num", overflowing the
 * fixed-size lg->gdt array in host memory.  We must return instead. */
void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
{
	if (num > ARRAY_SIZE(lg->gdt)) {
		/* %u: num is unsigned (u32). */
		kill_guest(lg, "too many gdt entries %u", num);
		return;
	}
	/* Copy the Guest's table in, then clamp privilege levels etc. */
	lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0]));
	fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt));
	/* Tell the per-CPU switcher state that the GDT must be reloaded. */
	lg->changed |= CHANGED_GDT;
}
/* LHCALL_LOAD_TLS: fast reload of just the TLS descriptors, read from
 * guest address "gtls". */
void guest_load_tls(struct lguest *lg, unsigned long gtls)
{
	struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN];

	/* Pull all the TLS descriptors in from Guest memory at once... */
	lgread(lg, tls, gtls, GDT_ENTRY_TLS_ENTRIES * sizeof(*tls));
	/* ...and sanitize only the range we just overwrote. */
	fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
	lg->changed |= CHANGED_GDT_TLS;
}
/* This code sits at 0xFFC00000 to do the low-level guest<->host switch.
There is are two pages above us for this CPU (struct lguest_pages).
The second page (struct lguest_ro_state) becomes read-only after the
context switch. The first page (the stack for traps) remains writable,
but while we're in here, the guest cannot be running.
*/
#include <linux/linkage.h>
#include <asm/asm-offsets.h>
#include "lg.h"
.text
ENTRY(start_switcher_text)
/* %eax points to lguest pages for this CPU. %ebx contains cr3 value.
All normal registers can be clobbered! */
ENTRY(switch_to_guest)
/* Save host segments on host stack. */
pushl %es
pushl %ds
pushl %gs
pushl %fs
/* With CONFIG_FRAME_POINTER, gcc doesn't let us clobber this! */
pushl %ebp
/* Save host stack. */
movl %esp, LGUEST_PAGES_host_sp(%eax)
/* Switch to guest stack: if we get NMI we expect to be there. */
movl %eax, %edx
addl $LGUEST_PAGES_regs, %edx
movl %edx, %esp
/* Switch to guest's GDT, IDT. */
lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
lidt LGUEST_PAGES_guest_idt_desc(%eax)
/* Switch to guest's TSS while GDT still writable. */
movl $(GDT_ENTRY_TSS*8), %edx
ltr %dx
/* Set host's TSS GDT entry to available (clear byte 5 bit 2). */
/* (ltr marks a TSS busy; it must be available before the next ltr.) */
movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
/* Switch to guest page tables: lguest_pages->state now read-only. */
movl %ebx, %cr3
/* Restore guest regs: %esp points at the saved register block, laid
 * out to match exactly this pop sequence (see SWITCH_TO_HOST pushes). */
popl %ebx
popl %ecx
popl %edx
popl %esi
popl %edi
popl %ebp
popl %gs
popl %eax
popl %fs
popl %ds
popl %es
/* Skip error code and trap number */
addl $8, %esp
/* The remaining frame (eip/cs/eflags/esp/ss) drops us into the Guest. */
iret
/* SWITCH_TO_HOST is the mirror image of switch_to_guest: save the
 * Guest's registers into lguest_pages, then restore the Host's page
 * tables, GDT, IDT, TSS and stack.  It is expanded at each exit path
 * below; comments cannot be inserted mid-macro without disturbing the
 * backslash continuations, so the inline ones must suffice. */
#define SWITCH_TO_HOST \
/* Save guest state */ \
pushl %es; \
pushl %ds; \
pushl %fs; \
pushl %eax; \
pushl %gs; \
pushl %ebp; \
pushl %edi; \
pushl %esi; \
pushl %edx; \
pushl %ecx; \
pushl %ebx; \
/* Load lguest ds segment for convenience. */ \
movl $(LGUEST_DS), %eax; \
movl %eax, %ds; \
/* Figure out where we are, based on stack (at top of regs). */ \
movl %esp, %eax; \
subl $LGUEST_PAGES_regs, %eax; \
/* Put trap number in %ebx before we switch cr3 and lose it. */ \
movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
/* Switch to host page tables (host GDT, IDT and stack are in host \
mem, so need this first) */ \
movl LGUEST_PAGES_host_cr3(%eax), %edx; \
movl %edx, %cr3; \
/* Set guest's TSS to available (clear byte 5 bit 2). */ \
andb $0xFD, (LGUEST_PAGES_guest_gdt+GDT_ENTRY_TSS*8+5)(%eax); \
/* Switch to host's GDT & IDT. */ \
lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
lidt LGUEST_PAGES_host_idt_desc(%eax); \
/* Switch to host's stack. */ \
movl LGUEST_PAGES_host_sp(%eax), %esp; \
/* Switch to host's TSS */ \
movl $(GDT_ENTRY_TSS*8), %edx; \
ltr %dx; \
/* Pop the host state saved at the top of switch_to_guest. */ \
popl %ebp; \
popl %fs; \
popl %gs; \
popl %ds; \
popl %es
/* Return to run_guest_once. */
return_to_host:
SWITCH_TO_HOST
iret
/* A real hardware interrupt arrived while the Guest ran: restore Host
 * state, then hand off to the Host's own handler for vector %ebx. */
deliver_to_host:
SWITCH_TO_HOST
/* Decode IDT and jump to hosts' irq handler. When that does iret, it
 * will return to run_guest_once. This is a feature. */
/* IDT base is at offset 2 of the idt descriptor; gates are 8 bytes. */
movl (LGUEST_PAGES_host_idt_desc+2)(%eax), %edx
leal (%edx,%ebx,8), %eax
/* Reassemble the handler address from the gate's two 16-bit halves:
 * low word at bytes 0-1, high word at bytes 6-7. */
movzwl (%eax),%edx
movl 4(%eax), %eax
xorw %ax, %ax
orl %eax, %edx
jmp *%edx
/* Real hardware interrupts are delivered straight to the host. Others
cause us to return to run_guest_once so it can decide what to do. Note
that some of these are overridden by the guest to deliver directly, and
never enter here (see load_guest_idt_entry). */
/* IRQ_STUB emits one entry stub: push a (fake, if needed) error code
 * plus the vector number, then jump to TARGET.  It also records the
 * stub's address in the default_idt_entries table in .data. */
.macro IRQ_STUB N TARGET
.data; .long 1f; .text; 1:
/* Make an error number for most traps, which don't have one. */
/* (Vectors 8, 10-14 and 17 get a real error code from the CPU.) */
.if (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
pushl $0
.endif
pushl $\N
jmp \TARGET
ALIGN
.endm
/* Emit stubs for every vector in the range FIRST..LAST inclusive. */
.macro IRQ_STUBS FIRST LAST TARGET
irq=\FIRST
.rept \LAST-\FIRST+1
IRQ_STUB irq \TARGET
irq=irq+1
.endr
.endm
/* We intercept every interrupt, because we may need to switch back to
 * host. Unfortunately we can't tell them apart except by entry
 * point, so we need 256 entry points.
 */
.data
.global default_idt_entries
default_idt_entries:
.text
IRQ_STUBS 0 1 return_to_host /* First two traps */
IRQ_STUB 2 handle_nmi /* NMI */
IRQ_STUBS 3 31 return_to_host /* Rest of traps */
IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */
IRQ_STUB 128 return_to_host /* System call (overridden) */
IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */
/* We ignore NMI and return. */
handle_nmi:
/* Drop the error code and vector number the stub pushed... */
addl $8, %esp
/* ...and resume whatever the NMI interrupted. */
iret
ENTRY(end_switcher_text)
...@@ -63,6 +63,7 @@ extern void tsc_init(void); ...@@ -63,6 +63,7 @@ extern void tsc_init(void);
extern void mark_tsc_unstable(char *reason); extern void mark_tsc_unstable(char *reason);
extern int unsynchronized_tsc(void); extern int unsynchronized_tsc(void);
extern void init_tsc_clocksource(void); extern void init_tsc_clocksource(void);
int check_tsc_unstable(void);
/* /*
* Boot-time check whether the TSCs are synchronized across * Boot-time check whether the TSCs are synchronized across
......
...@@ -3,11 +3,6 @@ ...@@ -3,11 +3,6 @@
#ifndef _ASM_LGUEST_H #ifndef _ASM_LGUEST_H
#define _ASM_LGUEST_H #define _ASM_LGUEST_H
/* These are randomly chosen numbers which indicate we're an lguest at boot */
#define LGUEST_MAGIC_EBP 0x4C687970
#define LGUEST_MAGIC_EDI 0x652D4D65
#define LGUEST_MAGIC_ESI 0xFFFFFFFF
#ifndef __ASSEMBLY__ #ifndef __ASSEMBLY__
#include <asm/irq.h> #include <asm/irq.h>
...@@ -20,7 +15,7 @@ ...@@ -20,7 +15,7 @@
#define LHCALL_LOAD_IDT_ENTRY 6 #define LHCALL_LOAD_IDT_ENTRY 6
#define LHCALL_SET_STACK 7 #define LHCALL_SET_STACK 7
#define LHCALL_TS 8 #define LHCALL_TS 8
#define LHCALL_TIMER_READ 9 #define LHCALL_SET_CLOCKEVENT 9
#define LHCALL_HALT 10 #define LHCALL_HALT 10
#define LHCALL_GET_WALLCLOCK 11 #define LHCALL_GET_WALLCLOCK 11
#define LHCALL_BIND_DMA 12 #define LHCALL_BIND_DMA 12
...@@ -29,6 +24,9 @@ ...@@ -29,6 +24,9 @@
#define LHCALL_SET_PMD 15 #define LHCALL_SET_PMD 15
#define LHCALL_LOAD_TLS 16 #define LHCALL_LOAD_TLS 16
#define LG_CLOCK_MIN_DELTA 100UL
#define LG_CLOCK_MAX_DELTA ULONG_MAX
#define LGUEST_TRAP_ENTRY 0x1F #define LGUEST_TRAP_ENTRY 0x1F
static inline unsigned long static inline unsigned long
...@@ -75,6 +73,8 @@ struct lguest_data ...@@ -75,6 +73,8 @@ struct lguest_data
unsigned long reserve_mem; unsigned long reserve_mem;
/* ID of this guest (used by network driver to set ethernet address) */ /* ID of this guest (used by network driver to set ethernet address) */
u16 guestid; u16 guestid;
/* KHz for the TSC clock. */
u32 tsc_khz;
/* Fields initialized by the guest at boot: */ /* Fields initialized by the guest at boot: */
/* Instruction range to suppress interrupts even if enabled */ /* Instruction range to suppress interrupts even if enabled */
......
#ifndef _ASM_LGUEST_USER
#define _ASM_LGUEST_USER
/* Everything the "lguest" userspace program needs to know. */
/* They can register up to 32 arrays of lguest_dma. */
#define LGUEST_MAX_DMA 32
/* At most we can dma 16 lguest_dma in one op. */
#define LGUEST_MAX_DMA_SECTIONS 16
/* How many devices? Assume each one wants up to two dma arrays per device. */
#define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2)
/* A scatter-gather list of buffers for DMA between Guest and Host. */
struct lguest_dma
{
	/* 0 if free to be used, filled by hypervisor. */
	u32 used_len;
	/* Parallel arrays: up to LGUEST_MAX_DMA_SECTIONS (addr, len) pairs. */
	unsigned long addr[LGUEST_MAX_DMA_SECTIONS];
	u16 len[LGUEST_MAX_DMA_SECTIONS];
};
/* One block-device request/response, shared with the block driver. */
struct lguest_block_page
{
	/* 0 is a read, 1 is a write. */
	int type;
	u32 sector; 	/* Offset in device = sector * 512. */
	u32 bytes;	/* Length expected to be read/written in bytes */
	/* 0 = pending, 1 = done, 2 = done, error */
	int result;
	u32 num_sectors; /* Disk length = num_sectors * 512 */
};
/* There is a shared page of these. */
struct lguest_net
{
	/* Simply the mac address (with multicast bit meaning promisc). */
	unsigned char mac[6];
};
/* Where the Host expects the Guest to SEND_DMA console output to. */
#define LGUEST_CONSOLE_DMA_KEY 0
/* We have a page of these descriptors in the lguest_device page. */
struct lguest_device_desc {
	/* Device type: one of the LGUEST_DEVICE_T_* values below. */
	u16 type;
#define LGUEST_DEVICE_T_CONSOLE 1
#define LGUEST_DEVICE_T_NET 2
#define LGUEST_DEVICE_T_BLOCK 3
	/* Per-type feature bits (LGUEST_NET_F_*, LGUEST_DEVICE_F_*). */
	u16 features;
#define LGUEST_NET_F_NOCSUM 0x4000 /* Don't bother checksumming */
#define LGUEST_DEVICE_F_RANDOMNESS 0x8000 /* IRQ is fairly random */
	/* Device status bits (LGUEST_DEVICE_S_* below). */
	u16 status;
/* 256 and above are device specific. */
#define LGUEST_DEVICE_S_ACKNOWLEDGE 1 /* We have seen device. */
#define LGUEST_DEVICE_S_DRIVER 2 /* We have found a driver */
#define LGUEST_DEVICE_S_DRIVER_OK 4 /* Driver says OK! */
#define LGUEST_DEVICE_S_REMOVED 8 /* Device has gone away. */
#define LGUEST_DEVICE_S_REMOVED_ACK 16 /* Driver has been told. */
#define LGUEST_DEVICE_S_FAILED 128 /* Something actually failed */
	/* The device's shared memory: num_pages pages starting at pfn. */
	u16 num_pages;
	u32 pfn;
};
/* Write command first word is a request. */
enum lguest_req
{
	LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */
	LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */
	LHREQ_IRQ, /* + irq */
	LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
};
#endif /* _ASM_LGUEST_USER */
...@@ -127,7 +127,6 @@ void __put_task_struct(struct task_struct *tsk) ...@@ -127,7 +127,6 @@ void __put_task_struct(struct task_struct *tsk)
if (!profile_handoff_task(tsk)) if (!profile_handoff_task(tsk))
free_task(tsk); free_task(tsk);
} }
EXPORT_SYMBOL_GPL(__put_task_struct);
void __init fork_init(unsigned long mempages) void __init fork_init(unsigned long mempages)
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment