Commit acdd0b62 authored by Matias Zabaljauregui, committed by Rusty Russell

lguest: PAE support

This version requires that the host and the guest have the same PAE status.
The NX capability is not offered to the guest yet.
Signed-off-by: Matias Zabaljauregui <zabaljauregui@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
parent cefcad17
@@ -37,7 +37,6 @@ Running Lguest:
     "Paravirtualized guest support" = Y
        "Lguest guest support" = Y
     "High Memory Support" = off/4GB
-    "PAE (Physical Address Extension) Support" = N
     "Alignment value to which kernel should be aligned" = 0x100000
     (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
      CONFIG_PHYSICAL_ALIGN=0x100000)
...
@@ -17,8 +17,13 @@
 /* Pages for switcher itself, then two pages per cpu */
 #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
 
-/* We map at -4M for ease of mapping into the guest (one PTE page). */
+/* We map at -4M (-2M when PAE is activated) for ease of mapping
+ * into the guest (one PTE page). */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_ADDR 0xFFE00000
+#else
 #define SWITCHER_ADDR 0xFFC00000
+#endif
 
 /* Found in switcher.S */
 extern unsigned long default_idt_entries[];
...
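A quick way to see why SWITCHER_ADDR moves from 0xFFC00000 (-4M) to 0xFFE00000 (-2M) under PAE: the switcher is kept small enough to be mapped into the Guest with a single page of PTEs, as the comment above says. Without PAE a page holds 1024 4-byte entries and so covers 4 MB; with PAE the entries are 8 bytes, so one page covers only 2 MB, and the switcher has to sit 2 MB below the top of the address space instead of 4 MB. A standalone sketch of that arithmetic (ordinary userspace C, not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long page_size = 4096;
		unsigned long long entries_nonpae = page_size / 4;	/* 1024 4-byte PTEs */
		unsigned long long entries_pae = page_size / 8;		/*  512 8-byte PTEs */

		/* Top of the 32-bit address space minus the area one PTE page maps. */
		printf("non-PAE: %llu MB at 0x%08llx\n",
		       (entries_nonpae * page_size) >> 20,
		       0x100000000ULL - entries_nonpae * page_size);
		printf("PAE:     %llu MB at 0x%08llx\n",
		       (entries_pae * page_size) >> 20,
		       0x100000000ULL - entries_pae * page_size);
		return 0;
	}

Running it prints 4 MB at 0xffc00000 and 2 MB at 0xffe00000, the two constants chosen above.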
@@ -12,6 +12,7 @@
 #define LHCALL_TS		8
 #define LHCALL_SET_CLOCKEVENT	9
 #define LHCALL_HALT		10
+#define LHCALL_SET_PMD		13
 #define LHCALL_SET_PTE		14
 #define LHCALL_SET_PGD		15
 #define LHCALL_LOAD_TLS		16
@@ -33,7 +34,7 @@
  * operations?  There are two ways: the direct way is to make a "hypercall",
  * to make requests of the Host Itself.
  *
- * We use the KVM hypercall mechanism.  Eighteen hypercalls are
+ * We use the KVM hypercall mechanism.  Seventeen hypercalls are
  * available: the hypercall number is put in the %eax register, and the
  * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
  * If a return value makes sense, it's returned in %eax.
...
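The comment above is the whole calling convention: the hypercall number goes in %eax, up to four arguments in %ebx, %ecx, %edx and %esi, and any result comes back in %eax. In the Guest these calls go through the kernel's kvm_hypercall*() wrappers rather than hand-written assembly; the sketch below only illustrates what a four-argument call reduces to. The function name is made up and the hard-coded vmcall opcode stands in for whatever instruction actually gets patched in at runtime, so treat it as an illustration, not the patch's code:

	static inline unsigned long example_hcall4(unsigned long call,
						   unsigned long a1, unsigned long a2,
						   unsigned long a3, unsigned long a4)
	{
		unsigned long ret;

		/* %eax = hypercall number, %ebx/%ecx/%edx/%esi = arguments,
		 * result comes back in %eax, exactly as described above. */
		asm volatile(".byte 0x0f, 0x01, 0xc1"	/* vmcall */
			     : "=a" (ret)
			     : "a" (call), "b" (a1), "c" (a2), "d" (a3), "S" (a4)
			     : "memory");
		return ret;
	}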
@@ -2,7 +2,6 @@ config LGUEST_GUEST
 	bool "Lguest guest support"
 	select PARAVIRT
 	depends on X86_32
-	depends on !X86_PAE
 	select VIRTIO
 	select VIRTIO_RING
 	select VIRTIO_CONSOLE
...
@@ -167,6 +167,7 @@ static void lazy_hcall3(unsigned long call,
 		async_hcall(call, arg1, arg2, arg3, 0);
 }
 
+#ifdef CONFIG_X86_PAE
 static void lazy_hcall4(unsigned long call,
 			unsigned long arg1,
 			unsigned long arg2,
@@ -178,6 +179,7 @@ static void lazy_hcall4(unsigned long call,
 	else
 		async_hcall(call, arg1, arg2, arg3, arg4);
 }
+#endif
 
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
  * issue the do-nothing hypercall to flush any stored calls. */
@@ -380,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 	case 1:	/* Basic feature request. */
 		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
 		*cx &= 0x00002201;
-		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
-		*dx &= 0x07808111;
+		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
+		*dx &= 0x07808151;
 		/* The Host can do a nice optimization if it knows that the
 		 * kernel mappings (addresses above 0xC0000000 or whatever
 		 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
@@ -400,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 		if (*ax > 0x80000008)
 			*ax = 0x80000008;
 		break;
+	case 0x80000001:
+		/* Here we should fix nx cap depending on host. */
+		/* For this version of PAE, we just clear NX bit. */
+		*dx &= ~(1 << 20);
+		break;
 	}
 }
 
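The two CPUID masks above are easiest to read bit by bit: the only bit added to the CPUID.01h:EDX mask is bit 6, the PAE feature flag, and the bit cleared from CPUID.80000001h:EDX is bit 20, the NX flag, which matches the commit message's note that NX is not offered yet. A standalone check of that arithmetic (not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned int old_mask = 0x07808111;	/* pre-PAE CPUID.01h:EDX mask */
		unsigned int new_mask = 0x07808151;	/* same mask with PAE allowed */

		/* The masks differ in exactly one bit: bit 6, the PAE flag. */
		printf("added bits: 0x%08x (bit %d)\n",
		       new_mask ^ old_mask, __builtin_ctz(new_mask ^ old_mask));

		/* The NX bit cleared from CPUID.80000001h:EDX is bit 20. */
		printf("NX mask:    0x%08x\n", 1u << 20);
		return 0;
	}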
@@ -533,7 +540,12 @@ static void lguest_write_cr4(unsigned long val)
 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
 			       pte_t *ptep)
 {
+#ifdef CONFIG_X86_PAE
+	lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
+		    ptep->pte_low, ptep->pte_high);
+#else
 	lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+#endif
 }
 
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -543,15 +555,37 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 	lguest_pte_update(mm, addr, ptep);
 }
 
-/* The Guest calls this to set a top-level entry.  Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
+/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+ * to set a middle-level entry when PAE is activated.
+ * Again, we set the entry then tell the Host which page we changed,
+ * and the index of the entry we changed. */
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+	native_set_pud(pudp, pudval);
+	/* 32 bytes aligned pdpt address and the index. */
+	lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
+		   (__pa(pudp) & 0x1F) / sizeof(pud_t));
+}
+
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	native_set_pmd(pmdp, pmdval);
+	lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
+		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
+}
+#else
+
+/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
+ * activated. */
 static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
 {
 	native_set_pmd(pmdp, pmdval);
 	lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
 		   (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
 }
+#endif
 
 /* There are a couple of legacy places where the kernel sets a PTE, but we
 * don't know the top level any more.  This is useless for us, since we don't
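lguest_set_pud above reports a top-level change by passing the Host the 32-byte-aligned address of the page-directory-pointer table and the index of the entry that was written: under PAE the pdpt is four 8-byte entries, so masking with 0xFFFFFFE0 recovers its base and the low five address bits divided by sizeof(pud_t) give the index. lguest_set_pmd does the same per 4 KB page of 512 8-byte entries. A standalone sketch of the index arithmetic, using a made-up physical address (not from the patch):

	#include <stdio.h>

	int main(void)
	{
		/* Made-up physical address of entry 3 in a pdpt that starts at
		 * 0x1000: base + 3 entries of 8 bytes each. */
		unsigned long pa = 0x1000 + 3 * 8;

		printf("pdpt base:  0x%08lx\n", pa & 0xFFFFFFE0);	/* 0x1000 */
		printf("pdpt index: %lu\n", (pa & 0x1F) / 8);		/* 3 */
		return 0;
	}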
@@ -569,6 +603,26 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+	native_set_pte_atomic(ptep, pte);
+	if (cr3_changed)
+		lazy_hcall1(LHCALL_FLUSH_TLB, 1);
+}
+
+void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	native_pte_clear(mm, addr, ptep);
+	lguest_pte_update(mm, addr, ptep);
+}
+
+void lguest_pmd_clear(pmd_t *pmdp)
+{
+	lguest_set_pmd(pmdp, __pmd(0));
+}
+#endif
+
 /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
  * native page table operations.  On native hardware you can set a new page
  * table entry whenever you want, but if you want to remove one you have to do
@@ -1035,6 +1089,7 @@ __init void lguest_init(void)
 	pv_info.name = "lguest";
 	pv_info.paravirt_enabled = 1;
 	pv_info.kernel_rpl = 1;
+	pv_info.shared_kernel_pmd = 1;
 
 	/* We set up all the lguest overrides for sensitive operations.  These
 	 * are detailed with the operations themselves. */
@@ -1080,6 +1135,12 @@
 	pv_mmu_ops.set_pte = lguest_set_pte;
 	pv_mmu_ops.set_pte_at = lguest_set_pte_at;
 	pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+	pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+	pv_mmu_ops.pte_clear = lguest_pte_clear;
+	pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+	pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
 	pv_mmu_ops.read_cr2 = lguest_read_cr2;
 	pv_mmu_ops.read_cr3 = lguest_read_cr3;
 	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
...
 config LGUEST
 	tristate "Linux hypervisor example code"
-	depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
+	depends on X86_32 && EXPERIMENTAL && FUTEX
 	select HVC_DRIVER
 	---help---
 	  This is a very simple module which allows you to run
...
@@ -77,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
 		break;
 	case LHCALL_SET_PTE:
+#ifdef CONFIG_X86_PAE
+		guest_set_pte(cpu, args->arg1, args->arg2,
+				__pte(args->arg3 | (u64)args->arg4 << 32));
+#else
 		guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
+#endif
 		break;
 	case LHCALL_SET_PGD:
 		guest_set_pgd(cpu->lg, args->arg1, args->arg2);
 		break;
+#ifdef CONFIG_X86_PAE
+	case LHCALL_SET_PMD:
+		guest_set_pmd(cpu->lg, args->arg1, args->arg2);
+		break;
+#endif
 	case LHCALL_SET_CLOCKEVENT:
 		guest_set_clockevent(cpu, args->arg1);
 		break;
...
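The LHCALL_SET_PTE case above is the Host half of the PAE PTE path. A PAE PTE is 64 bits wide (it has to carry frame numbers above 4 GB), but hypercall arguments are 32-bit registers, so lguest_pte_update in the Guest passes pte_low and pte_high separately and do_hcall rebuilds the value as args->arg3 | (u64)args->arg4 << 32. A standalone round-trip check with a made-up PTE value (not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		/* Made-up PAE PTE: a frame just above 4 GB with Present + RW set. */
		unsigned long long pte = 0x123456003ULL;
		unsigned int low = (unsigned int)pte;
		unsigned int high = (unsigned int)(pte >> 32);

		/* What the Host does with args->arg3 and args->arg4. */
		unsigned long long rebuilt = low | (unsigned long long)high << 32;
		printf("low=0x%08x high=0x%08x round-trip ok: %d\n",
		       low, high, rebuilt == pte);
		return 0;
	}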
@@ -137,6 +137,8 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
  * in the kernel. */
 #define pgd_flags(x)	(pgd_val(x) & ~PAGE_MASK)
 #define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
+#define pmd_flags(x)	(pmd_val(x) & ~PAGE_MASK)
+#define pmd_pfn(x)	(pmd_val(x) >> PAGE_SHIFT)
 
 /* interrupts_and_traps.c: */
 unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
@@ -170,6 +172,9 @@ int init_guest_pagetable(struct lguest *lg);
 void free_guest_pagetable(struct lguest *lg);
 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
 void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#ifdef CONFIG_X86_PAE
+void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#endif
 void guest_pagetable_clear_all(struct lg_cpu *cpu);
 void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
...
(The rest of this commit's diff is collapsed and not shown.)