Commit 5dea1c88 authored by Rusty Russell

lguest: use a special 1:1 linear pagetable mode until first switch.

The Host used to create some page tables for the Guest to use at the
top of Guest memory; it would then tell the Guest where this was.  In
particular, it created linear mappings for 0 and 0xC0000000 addresses
because lguest used to switch to its real page tables quite late in
boot.

However, since commit d50d8fe1, Linux has initialized its boot page
tables in head_32.S even before the "are we lguest?" boot jump.  So
now we can simplify things: the Host pagetable code assumes a 1:1
linear mapping until the Guest first calls the LHCALL_NEW_PGTABLE
hypercall, which we now do before we reach C code.
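
Concretely, while the Guest is in this mode the Host never reads any
Guest page tables: on each demand_page() fault it simply invents a
present, writable PTE whose frame is the faulting page itself, so
guest-virtual equals guest-physical.  A minimal sketch of the idea
(fake_linear_pte is a hypothetical helper; the real patch below
inlines the same computation and also fakes the PGD/PMD levels):

        /* Sketch: fabricate a 1:1 ("linear") mapping for a faulting
         * address.  The PTE's frame is the vaddr's own page, marked
         * present and writable, so no Guest page table need exist. */
        static pte_t fake_linear_pte(unsigned long vaddr)
        {
                return __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
        }

The Guest's first LHCALL_NEW_PGTABLE ends this mode; after that the
Host walks real Guest page tables as before.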

This also means that the Host doesn't need to know anything about the
Guest's PAGE_OFFSET.  (Non-Linux guests might not even have such a
thing).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
parent e0377e25
@@ -63,7 +63,6 @@ void foo(void)
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
 	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 	BLANK();
 	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
...
@@ -520,17 +520,16 @@ static unsigned long lguest_read_cr2(void)
 
 /* See lguest_set_pte() below. */
 static bool cr3_changed = false;
+static unsigned long current_cr3;
 
 /*
  * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0.  Keep a local copy, and tell the Host when it changes.  The only
- * difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall.
+ * cr0.  Keep a local copy, and tell the Host when it changes.
  */
 static void lguest_write_cr3(unsigned long cr3)
 {
-	lguest_data.pgdir = cr3;
 	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
+	current_cr3 = cr3;
 
 	/* These two page tables are simple, linear, and used during boot */
 	if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
@@ -539,7 +538,7 @@ static void lguest_write_cr3(unsigned long cr3)
 
 static unsigned long lguest_read_cr3(void)
 {
-	return lguest_data.pgdir;
+	return current_cr3;
 }
 
 /* cr4 is used to enable and disable PGE, but we don't care. */
@@ -758,7 +757,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
 static void lguest_flush_tlb_single(unsigned long addr)
 {
 	/* Simply set it to zero: if it was not, it will fault back in. */
-	lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
 }
 
 /*
...
@@ -27,13 +27,18 @@
 .section .init.text, "ax", @progbits
 ENTRY(lguest_entry)
 	/*
-	 * We make the "initialization" hypercall now to tell the Host about
-	 * us, and also find out where it put our page tables.
+	 * We make the "initialization" hypercall now to tell the Host where
+	 * our lguest_data struct is.
	 */
 	movl $LHCALL_LGUEST_INIT, %eax
 	movl $lguest_data - __PAGE_OFFSET, %ebx
 	int $LGUEST_TRAP_ENTRY
 
+	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
+	movl $LHCALL_NEW_PGTABLE, %eax
+	movl $(initial_page_table - __PAGE_OFFSET), %ebx
+	int $LGUEST_TRAP_ENTRY
+
 	/* Set up the initial stack so we can run C code. */
 	movl $(init_thread_union+THREAD_SIZE),%esp
...
@@ -59,6 +59,8 @@ struct lg_cpu {
 
 	struct lguest_pages *last_pages;
 
+	/* Initialization mode: linear map everything. */
+	bool linear_pages;
 	int cpu_pgd; /* Which pgd this cpu is currently using */
 
 	/* If a hypercall was asked for, this points to the arguments. */
...
@@ -17,7 +17,6 @@
 #include <linux/percpu.h>
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
-#include <asm/bootparam.h>
 #include "lg.h"
 
 /*M:008
@@ -325,10 +324,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 #endif
 
 	/* First step: get the top-level Guest page table entry. */
-	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
-	/* Toplevel not present?  We can't map it in. */
-	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
-		return false;
+	if (unlikely(cpu->linear_pages)) {
+		/* Faking up a linear mapping. */
+		gpgd = __pgd(CHECK_GPGD_MASK);
+	} else {
+		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
+		/* Toplevel not present?  We can't map it in. */
+		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+			return false;
+	}
 
 	/* Now look at the matching shadow entry. */
 	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
@@ -353,10 +357,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	}
 
 #ifdef CONFIG_X86_PAE
-	gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
-	/* Middle level not present?  We can't map it in. */
-	if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
-		return false;
+	if (unlikely(cpu->linear_pages)) {
+		/* Faking up a linear mapping. */
+		gpmd = __pmd(_PAGE_TABLE);
+	} else {
+		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+		/* Middle level not present?  We can't map it in. */
+		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+			return false;
+	}
 
 	/* Now look at the matching shadow entry. */
 	spmd = spmd_addr(cpu, *spgd, vaddr);
@@ -397,8 +406,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
 #endif
 
-	/* Read the actual PTE value. */
-	gpte = lgread(cpu, gpte_ptr, pte_t);
+	if (unlikely(cpu->linear_pages)) {
+		/* Linear?  Make up a PTE which points to same page. */
+		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
+	} else {
+		/* Read the actual PTE value. */
+		gpte = lgread(cpu, gpte_ptr, pte_t);
+	}
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
 	if (!(pte_flags(gpte) & _PAGE_PRESENT))
@@ -454,7 +468,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	 * Finally, we write the Guest PTE entry back: we've set the
 	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
 	 */
-	lgwrite(cpu, gpte_ptr, pte_t, gpte);
+	if (likely(!cpu->linear_pages))
+		lgwrite(cpu, gpte_ptr, pte_t, gpte);
 
 	/*
 	 * The fault is fixed, the page table is populated, the mapping
@@ -612,6 +627,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
 #ifdef CONFIG_X86_PAE
 	pmd_t gpmd;
 #endif
+
+	/* Still not set up?  Just map 1:1. */
+	if (unlikely(cpu->linear_pages))
+		return vaddr;
+
 	/* First step: get the top-level Guest page table entry. */
 	gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
 	/* Toplevel not present?  We can't map it in. */
@@ -708,32 +728,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
 	return next;
 }
 
-/*H:430
- * (iv) Switching page tables
- *
- * Now we've seen all the page table setting and manipulation, let's see
- * what happens when the Guest changes page tables (ie. changes the top-level
- * pgdir).  This occurs on almost every context switch.
- */
-void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
-{
-	int newpgdir, repin = 0;
-
-	/* Look to see if we have this one already. */
-	newpgdir = find_pgdir(cpu->lg, pgtable);
-	/*
-	 * If not, we allocate or mug an existing one: if it's a fresh one,
-	 * repin gets set to 1.
-	 */
-	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
-		newpgdir = new_pgdir(cpu, pgtable, &repin);
-	/* Change the current pgd index to the new one. */
-	cpu->cpu_pgd = newpgdir;
-	/* If it was completely blank, we map in the Guest kernel stack */
-	if (repin)
-		pin_stack_pages(cpu);
-}
-
 /*H:470
  * Finally, a routine which throws away everything: all PGD entries in all
  * the shadow page tables, including the Guest's kernel mappings.  This is used
@@ -780,6 +774,44 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
 	/* We need the Guest kernel stack mapped again. */
 	pin_stack_pages(cpu);
 }
+
+/*H:430
+ * (iv) Switching page tables
+ *
+ * Now we've seen all the page table setting and manipulation, let's see
+ * what happens when the Guest changes page tables (ie. changes the top-level
+ * pgdir).  This occurs on almost every context switch.
+ */
+void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
+{
+	int newpgdir, repin = 0;
+
+	/*
+	 * The very first time they call this, we're actually running without
+	 * any page tables; we've been making it up.  Throw them away now.
+	 */
+	if (unlikely(cpu->linear_pages)) {
+		release_all_pagetables(cpu->lg);
+		cpu->linear_pages = false;
+		/* Force allocation of a new pgdir. */
+		newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
+	} else {
+		/* Look to see if we have this one already. */
+		newpgdir = find_pgdir(cpu->lg, pgtable);
+	}
+
+	/*
+	 * If not, we allocate or mug an existing one: if it's a fresh one,
+	 * repin gets set to 1.
+	 */
+	if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
+		newpgdir = new_pgdir(cpu, pgtable, &repin);
+	/* Change the current pgd index to the new one. */
+	cpu->cpu_pgd = newpgdir;
+	/* If it was completely blank, we map in the Guest kernel stack */
+	if (repin)
+		pin_stack_pages(cpu);
+}
 /*:*/
 
 /*M:009
@@ -919,168 +951,26 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
 }
 #endif
 
-/*H:505
- * To get through boot, we construct simple identity page mappings (which
- * set virtual == physical) and linear mappings which will get the Guest far
- * enough into the boot to create its own.  The linear mapping means we
- * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
- * as you'll see.
- *
- * We lay them out of the way, just below the initrd (which is why we need to
- * know its size here).
- */
-static unsigned long setup_pagetables(struct lguest *lg,
-				      unsigned long mem,
-				      unsigned long initrd_size)
-{
-	pgd_t __user *pgdir;
-	pte_t __user *linear;
-	unsigned long mem_base = (unsigned long)lg->mem_base;
-	unsigned int mapped_pages, i, linear_pages;
-#ifdef CONFIG_X86_PAE
-	pmd_t __user *pmds;
-	unsigned int j;
-	pgd_t pgd;
-	pmd_t pmd;
-#else
-	unsigned int phys_linear;
-#endif
-
-	/*
-	 * We have mapped_pages frames to map, so we need linear_pages page
-	 * tables to map them.
-	 */
-	mapped_pages = mem / PAGE_SIZE;
-	linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
-
-	/* We put the toplevel page directory page at the top of memory. */
-	pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);
-
-	/* Now we use the next linear_pages pages as pte pages */
-	linear = (void *)pgdir - linear_pages * PAGE_SIZE;
-
-#ifdef CONFIG_X86_PAE
-	/*
-	 * And the single mid page goes below that.  We only use one, but
-	 * that's enough to map 1G, which definitely gets us through boot.
-	 */
-	pmds = (void *)linear - PAGE_SIZE;
-#endif
-
-	/*
-	 * Linear mapping is easy: put every page's address into the
-	 * mapping in order.
-	 */
-	for (i = 0; i < mapped_pages; i++) {
-		pte_t pte;
-		pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
-		if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
-			return -EFAULT;
-	}
-
-#ifdef CONFIG_X86_PAE
-	/*
-	 * Make the Guest PMD entries point to the corresponding place in the
-	 * linear mapping (up to one page worth of PMD).
-	 */
-	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
-	     i += PTRS_PER_PTE, j++) {
-		pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE,
-			      __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
-		if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
-			return -EFAULT;
-	}
-
-	/* One PGD entry, pointing to that PMD page. */
-	pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT);
-	/* Copy it in as the first PGD entry (ie. addresses 0-1G). */
-	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
-		return -EFAULT;
-	/*
-	 * And the other PGD entry to make the linear mapping at PAGE_OFFSET
-	 */
-	if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd)))
-		return -EFAULT;
-#else
-	/*
-	 * The top level points to the linear page table pages above.
-	 * We setup the identity and linear mappings here.
-	 */
-	phys_linear = (unsigned long)linear - mem_base;
-	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
-		pgd_t pgd;
-		/*
-		 * Create a PGD entry which points to the right part of the
-		 * linear PTE pages.
-		 */
-		pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
-			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
-
-		/*
-		 * Copy it into the PGD page at 0 and PAGE_OFFSET.
-		 */
-		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
-		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
-					   + i / PTRS_PER_PTE],
-				    &pgd, sizeof(pgd)))
-			return -EFAULT;
-	}
-#endif
-
-	/*
-	 * We return the top level (guest-physical) address: we remember where
-	 * this is to write it into lguest_data when the Guest initializes.
-	 */
-	return (unsigned long)pgdir - mem_base;
-}
-
 /*H:500
  * (vii) Setting up the page tables initially.
  *
- * When a Guest is first created, the Launcher tells us where the toplevel of
- * its first page table is.  We set some things up here:
+ * When a Guest is first created, we initialize a shadow page table, which
+ * we will populate on future faults.  The Guest doesn't have any actual
+ * page tables yet, so we set linear_pages to tell demand_page() to fake
+ * them for the moment.
  */
 int init_guest_pagetable(struct lguest *lg)
 {
-	u64 mem;
-	u32 initrd_size;
-	struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
-#ifdef CONFIG_X86_PAE
-	pgd_t *pgd;
-	pmd_t *pmd_table;
-#endif
-	/*
-	 * Get the Guest memory size and the ramdisk size from the boot header
-	 * located at lg->mem_base (Guest address 0).
-	 */
-	if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
-	    || get_user(initrd_size, &boot->hdr.ramdisk_size))
-		return -EFAULT;
+	struct lg_cpu *cpu = &lg->cpus[0];
+	int allocated = 0;
 
-	/*
-	 * We start on the first shadow page table, and give it a blank PGD
-	 * page.
-	 */
-	lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
-	if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
-		return lg->pgdirs[0].gpgdir;
-	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
-	if (!lg->pgdirs[0].pgdir)
+	/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir. */
+	cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
+	if (!allocated)
 		return -ENOMEM;
 
-#ifdef CONFIG_X86_PAE
-	/* For PAE, we also create the initial mid-level. */
-	pgd = lg->pgdirs[0].pgdir;
-	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
-	if (!pmd_table)
-		return -ENOMEM;
-
-	set_pgd(pgd + SWITCHER_PGD_INDEX,
-		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
-#endif
-
-	/* This is the current page table. */
-	lg->cpus[0].cpu_pgd = 0;
+	/* We start with a linear mapping until the Guest initializes. */
+	cpu->linear_pages = true;
 	return 0;
 }
@@ -1095,10 +985,10 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
 	 * of virtual addresses used by the Switcher.
 	 */
 	    || put_user(RESERVE_MEM * 1024 * 1024,
-			&cpu->lg->lguest_data->reserve_mem)
-	    || put_user(cpu->lg->pgdirs[0].gpgdir,
-			&cpu->lg->lguest_data->pgdir))
+			&cpu->lg->lguest_data->reserve_mem)) {
 		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
+		return;
+	}
 
 	/*
 	 * In flush_user_mappings() we loop from 0 to
...
@@ -59,8 +59,6 @@ struct lguest_data {
 	unsigned long reserve_mem;
 	/* KHz for the TSC clock. */
 	u32 tsc_khz;
-	/* Page where the top-level pagetable is */
-	unsigned long pgdir;
 
 	/* Fields initialized by the Guest at boot: */
 	/* Instruction range to suppress interrupts even if enabled */
...