Commit 2c57ee6f authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (249 commits)
  KVM: Move apic timer migration away from critical section
  KVM: Put kvm_para.h include outside __KERNEL__
  KVM: Fix unbounded preemption latency
  KVM: Initialize the mmu caches only after verifying cpu support
  KVM: MMU: Fix dirty page setting for pages removed from rmap
  KVM: Portability: Move kvm_fpu to asm-x86/kvm.h
  KVM: x86 emulator: Only allow VMCALL/VMMCALL trapped by #UD
  KVM: MMU: Merge shadow level check in FNAME(fetch)
  KVM: MMU: Move kvm_free_some_pages() into critical section
  KVM: MMU: Switch to mmu spinlock
  KVM: MMU: Avoid calling gfn_to_page() in mmu_set_spte()
  KVM: Add kvm_read_guest_atomic()
  KVM: MMU: Concurrent guest walkers
  KVM: Disable vapic support on Intel machines with FlexPriority
  KVM: Accelerated apic support
  KVM: local APIC TPR access reporting facility
  KVM: Print data for unimplemented wrmsr
  KVM: MMU: Add cache miss statistic
  KVM: MMU: Coalesce remote tlb flushes
  KVM: Expose ioapic to ia64 save/restore APIs
  ...
parents f389e9fc 2f52d58c
......@@ -107,6 +107,7 @@ config ARCH_SUPPORTS_OPROFILE
bool
default y
select HAVE_KVM
config ZONE_DMA32
bool
......@@ -1598,4 +1599,6 @@ source "security/Kconfig"
source "crypto/Kconfig"
source "arch/x86/kvm/Kconfig"
source "lib/Kconfig"
......@@ -7,6 +7,8 @@ else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
core-$(CONFIG_KVM) += arch/x86/kvm/
# BITS is used as extension for files which are available in a 32 bit
# and a 64 bit version to simplify shared Makefiles.
# e.g.: obj-y += foo_$(BITS).o
......
#
# KVM configuration
#
config HAVE_KVM
bool
menuconfig VIRTUALIZATION
bool "Virtualization"
depends on X86
depends on HAVE_KVM || X86
default y
---help---
Say Y here to get to see options for using your Linux host to run other
......@@ -16,7 +19,7 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on X86 && EXPERIMENTAL
depends on HAVE_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
select ANON_INODES
---help---
......
......@@ -2,7 +2,11 @@
# Makefile for Kernel-based Virtual Machine module
#
kvm-objs := kvm_main.o mmu.o x86_emulate.o i8259.o irq.o lapic.o ioapic.o
common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
obj-$(CONFIG_KVM) += kvm.o
kvm-intel-objs = vmx.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
......
......@@ -28,6 +28,8 @@
#include <linux/mm.h>
#include "irq.h"
#include <linux/kvm_host.h>
/*
* set irq level. If an edge is detected, then the IRR is set to 1
*/
......@@ -181,10 +183,8 @@ int kvm_pic_read_irq(struct kvm_pic *s)
return intno;
}
static void pic_reset(void *opaque)
void kvm_pic_reset(struct kvm_kpic_state *s)
{
struct kvm_kpic_state *s = opaque;
s->last_irr = 0;
s->irr = 0;
s->imr = 0;
......@@ -209,7 +209,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
addr &= 1;
if (addr == 0) {
if (val & 0x10) {
pic_reset(s); /* init */
kvm_pic_reset(s); /* init */
/*
* deassert a pending interrupt
*/
......
......@@ -20,8 +20,8 @@
*/
#include <linux/module.h>
#include <linux/kvm_host.h>
#include "kvm.h"
#include "irq.h"
/*
......@@ -63,26 +63,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
}
EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
static void vcpu_kick_intr(void *info)
{
#ifdef DEBUG
struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
#endif
}
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
int ipi_pcpu = vcpu->cpu;
if (waitqueue_active(&vcpu->wq)) {
wake_up_interruptible(&vcpu->wq);
++vcpu->stat.halt_wakeup;
}
if (vcpu->guest_mode)
smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
}
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
kvm_inject_apic_timer_irqs(vcpu);
......
......@@ -22,7 +22,16 @@
#ifndef __IRQ_H
#define __IRQ_H
#include "kvm.h"
#include <linux/mm_types.h>
#include <linux/hrtimer.h>
#include <linux/kvm_host.h>
#include "iodev.h"
#include "ioapic.h"
#include "lapic.h"
struct kvm;
struct kvm_vcpu;
typedef void irq_request_func(void *opaque, int level);
......@@ -57,109 +66,23 @@ struct kvm_pic {
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
void kvm_pic_set_irq(void *opaque, int irq, int level);
int kvm_pic_read_irq(struct kvm_pic *s);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
void kvm_pic_update_irq(struct kvm_pic *s);
#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
#define IOAPIC_EDGE_TRIG 0
#define IOAPIC_LEVEL_TRIG 1
#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
#define IOAPIC_MEM_LENGTH 0x100
static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
{
return kvm->arch.vpic;
}
/* Direct registers. */
#define IOAPIC_REG_SELECT 0x00
#define IOAPIC_REG_WINDOW 0x10
#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
static inline int irqchip_in_kernel(struct kvm *kvm)
{
return pic_irqchip(kvm) != NULL;
}
/* Indirect registers. */
#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
#define IOAPIC_REG_VERSION 0x01
#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
struct kvm_ioapic {
u64 base_address;
u32 ioregsel;
u32 id;
u32 irr;
u32 pad;
union ioapic_redir_entry {
u64 bits;
struct {
u8 vector;
u8 delivery_mode:3;
u8 dest_mode:1;
u8 delivery_status:1;
u8 polarity:1;
u8 remote_irr:1;
u8 trig_mode:1;
u8 mask:1;
u8 reserve:7;
u8 reserved[4];
u8 dest_id;
} fields;
} redirtbl[IOAPIC_NUM_PINS];
struct kvm_io_device dev;
struct kvm *kvm;
};
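The union ioapic_redir_entry above (which this series moves out of irq.h into the shared ioapic code under virt/kvm) overlays a 64-bit I/O APIC redirection-table entry with named bit-fields. The following is a minimal user-space sketch of the same layout, not part of the patch; it assumes GCC/Clang bit-field handling on 8-bit types as the kernel uses, and only the field order and widths come from the struct above — everything else is illustrative.

/*
 * Illustrative sketch (not part of the patch): decode an I/O APIC
 * redirection-table entry with the same field layout as the
 * ioapic_redir_entry union shown above.
 */
#include <stdint.h>
#include <stdio.h>

union redir_entry {
	uint64_t bits;
	struct {
		uint8_t vector;
		uint8_t delivery_mode:3;
		uint8_t dest_mode:1;
		uint8_t delivery_status:1;
		uint8_t polarity:1;
		uint8_t remote_irr:1;
		uint8_t trig_mode:1;
		uint8_t mask:1;
		uint8_t reserve:7;
		uint8_t reserved[4];
		uint8_t dest_id;
	} fields;
};

int main(void)
{
	union redir_entry e = { .bits = 0 };

	e.fields.vector = 0x30;		/* interrupt vector 0x30 */
	e.fields.trig_mode = 1;		/* 1 == level triggered   */
	e.fields.dest_id = 2;		/* destination APIC id    */

	printf("raw entry %#llx\n", (unsigned long long)e.bits);
	printf("vector %#x, %s triggered, dest id %#x\n",
	       e.fields.vector,
	       e.fields.trig_mode ? "level" : "edge",
	       e.fields.dest_id);
	return 0;
}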
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
struct {
atomic_t pending;
s64 period; /* unit: ns */
u32 divide_count;
ktime_t last_update;
struct hrtimer dev;
} timer;
struct kvm_vcpu *vcpu;
struct page *regs_page;
void *regs;
};
#ifdef DEBUG
#define ASSERT(x) \
do { \
if (!(x)) { \
printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
__FILE__, __LINE__, #x); \
BUG(); \
} \
} while (0)
#else
#define ASSERT(x) do { } while (0)
#endif
void kvm_pic_reset(struct kvm_kpic_state *s);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
int kvm_create_lapic(struct kvm_vcpu *vcpu);
void kvm_lapic_reset(struct kvm_vcpu *vcpu);
void kvm_free_apic(struct kvm_lapic *apic);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
unsigned long bitmap);
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig);
void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
#endif
......@@ -4,10 +4,10 @@
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/kvm_host.h>
#include <asm/msr.h>
#include "svm.h"
#include "kvm.h"
static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
......
......@@ -17,7 +17,7 @@
* the COPYING file in the top-level directory.
*/
#include "kvm.h"
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/mm.h>
#include <linux/highmem.h>
......@@ -56,6 +56,7 @@
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)
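A quick worked example of these two macros, which locate a vector inside the APIC's bank of 32-bit IRR/ISR/TMR registers spaced 0x10 bytes apart: for vector 0x31, REG_POS(0x31) = (0x31 >> 5) << 4 = 0x10 selects the second register of the bank, and VEC_POS(0x31) = 0x31 & 31 = 17 selects bit 17 within it.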
static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
{
return *((u32 *) (apic->regs + reg_off));
......@@ -88,7 +89,7 @@ static inline void apic_clear_vector(int vec, void *bitmap)
static inline int apic_hw_enabled(struct kvm_lapic *apic)
{
return (apic)->vcpu->apic_base & MSR_IA32_APICBASE_ENABLE;
return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
}
static inline int apic_sw_enabled(struct kvm_lapic *apic)
......@@ -172,7 +173,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
if (!apic)
......@@ -183,8 +184,10 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
int kvm_apic_set_irq(struct kvm_lapic *apic, u8 vec, u8 trig)
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
{
struct kvm_lapic *apic = vcpu->arch.apic;
if (!apic_test_and_set_irr(vec, apic)) {
/* a new pending irq is set in IRR */
if (trig)
......@@ -268,7 +271,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, int dest, int dest_mode)
{
int result = 0;
struct kvm_lapic *target = vcpu->apic;
struct kvm_lapic *target = vcpu->arch.apic;
apic_debug("target %p, source %p, dest 0x%x, "
"dest_mode 0x%x, short_hand 0x%x",
......@@ -335,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
kvm_vcpu_kick(vcpu);
else if (vcpu->mp_state == VCPU_MP_STATE_HALTED) {
vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
}
......@@ -359,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_INIT:
if (level) {
if (vcpu->mp_state == VCPU_MP_STATE_RUNNABLE)
if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
printk(KERN_DEBUG
"INIT on a runnable vcpu %d\n",
vcpu->vcpu_id);
vcpu->mp_state = VCPU_MP_STATE_INIT_RECEIVED;
vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
kvm_vcpu_kick(vcpu);
} else {
printk(KERN_DEBUG
......@@ -376,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_STARTUP:
printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
vcpu->vcpu_id, vector);
if (vcpu->mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
vcpu->sipi_vector = vector;
vcpu->mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
}
......@@ -392,15 +395,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
return result;
}
struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
unsigned long bitmap)
{
int vcpu_id;
int last;
int next;
struct kvm_lapic *apic;
struct kvm_lapic *apic = NULL;
last = kvm->round_robin_prev_vcpu;
last = kvm->arch.round_robin_prev_vcpu;
next = last;
do {
......@@ -408,25 +410,30 @@ struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
next = 0;
if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
continue;
apic = kvm->vcpus[next]->apic;
apic = kvm->vcpus[next]->arch.apic;
if (apic && apic_enabled(apic))
break;
apic = NULL;
} while (next != last);
kvm->round_robin_prev_vcpu = next;
kvm->arch.round_robin_prev_vcpu = next;
if (!apic) {
vcpu_id = ffs(bitmap) - 1;
if (vcpu_id < 0) {
vcpu_id = 0;
printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
}
apic = kvm->vcpus[vcpu_id]->apic;
}
if (!apic)
printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
return apic;
}
struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
unsigned long bitmap)
{
struct kvm_lapic *apic;
apic = kvm_apic_round_robin(kvm, vector, bitmap);
if (apic)
return apic->vcpu;
return NULL;
}
static void apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
......@@ -458,7 +465,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
unsigned int vector = icr_low & APIC_VECTOR_MASK;
struct kvm_lapic *target;
struct kvm_vcpu *target;
struct kvm_vcpu *vcpu;
unsigned long lpr_map = 0;
int i;
......@@ -474,20 +481,20 @@ static void apic_send_ipi(struct kvm_lapic *apic)
if (!vcpu)
continue;
if (vcpu->apic &&
if (vcpu->arch.apic &&
apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
if (delivery_mode == APIC_DM_LOWEST)
set_bit(vcpu->vcpu_id, &lpr_map);
else
__apic_accept_irq(vcpu->apic, delivery_mode,
__apic_accept_irq(vcpu->arch.apic, delivery_mode,
vector, level, trig_mode);
}
}
if (delivery_mode == APIC_DM_LOWEST) {
target = kvm_apic_round_robin(vcpu->kvm, vector, lpr_map);
target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
if (target != NULL)
__apic_accept_irq(target, delivery_mode,
__apic_accept_irq(target->arch.apic, delivery_mode,
vector, level, trig_mode);
}
}
......@@ -544,6 +551,23 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
return tmcct;
}
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
{
struct kvm_vcpu *vcpu = apic->vcpu;
struct kvm_run *run = vcpu->run;
set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
kvm_x86_ops->cache_regs(vcpu);
run->tpr_access.rip = vcpu->arch.rip;
run->tpr_access.is_write = write;
}
static inline void report_tpr_access(struct kvm_lapic *apic, bool write)
{
if (apic->vcpu->arch.tpr_access_reporting)
__report_tpr_access(apic, write);
}
static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
{
u32 val = 0;
......@@ -561,6 +585,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
val = apic_get_tmcct(apic);
break;
case APIC_TASKPRI:
report_tpr_access(apic, false);
/* fall thru */
default:
apic_update_ppr(apic);
val = apic_get_reg(apic, offset);
......@@ -670,6 +697,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
break;
case APIC_TASKPRI:
report_tpr_access(apic, true);
apic_set_tpr(apic, val & 0xff);
break;
......@@ -762,19 +790,17 @@ static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr)
return ret;
}
void kvm_free_apic(struct kvm_lapic *apic)
void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
if (!apic)
if (!vcpu->arch.apic)
return;
hrtimer_cancel(&apic->timer.dev);
hrtimer_cancel(&vcpu->arch.apic->timer.dev);
if (apic->regs_page) {
__free_page(apic->regs_page);
apic->regs_page = 0;
}
if (vcpu->arch.apic->regs_page)
__free_page(vcpu->arch.apic->regs_page);
kfree(apic);
kfree(vcpu->arch.apic);
}
/*
......@@ -785,16 +811,17 @@ void kvm_free_apic(struct kvm_lapic *apic)
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
struct kvm_lapic *apic = vcpu->arch.apic;
if (!apic)
return;
apic_set_tpr(apic, ((cr8 & 0x0f) << 4));
apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
| (apic_get_reg(apic, APIC_TASKPRI) & 4));
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
struct kvm_lapic *apic = vcpu->arch.apic;
u64 tpr;
if (!apic)
......@@ -807,29 +834,29 @@ EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
{
struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
struct kvm_lapic *apic = vcpu->arch.apic;
if (!apic) {
value |= MSR_IA32_APICBASE_BSP;
vcpu->apic_base = value;
vcpu->arch.apic_base = value;
return;
}
if (apic->vcpu->vcpu_id)
value &= ~MSR_IA32_APICBASE_BSP;
vcpu->apic_base = value;
apic->base_address = apic->vcpu->apic_base &
vcpu->arch.apic_base = value;
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
/* with FSB delivery interrupt, we can restart APIC functionality */
apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
"0x%lx.\n", apic->apic_base, apic->base_address);
"0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
}
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu)
{
return vcpu->apic_base;
return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_lapic_get_base);
......@@ -841,7 +868,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_debug("%s\n", __FUNCTION__);
ASSERT(vcpu);
apic = vcpu->apic;
apic = vcpu->arch.apic;
ASSERT(apic != NULL);
/* Stop the timer in case it's a reset to an active apic */
......@@ -872,19 +899,19 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
update_divide_count(apic);
atomic_set(&apic->timer.pending, 0);
if (vcpu->vcpu_id == 0)
vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
apic_update_ppr(apic);
apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
vcpu, kvm_apic_id(apic),
vcpu->apic_base, apic->base_address);
vcpu->arch.apic_base, apic->base_address);
}
EXPORT_SYMBOL_GPL(kvm_lapic_reset);
int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = (struct kvm_lapic *)vcpu->apic;
struct kvm_lapic *apic = vcpu->arch.apic;
int ret = 0;
if (!apic)
......@@ -908,9 +935,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
wait_queue_head_t *q = &apic->vcpu->wq;
atomic_inc(&apic->timer.pending);
if (waitqueue_active(q))
{
apic->vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
if (waitqueue_active(q)) {
apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
wake_up_interruptible(q);
}
if (apic_lvtt_period(apic)) {
......@@ -956,13 +982,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
if (!apic)
goto nomem;
vcpu->apic = apic;
vcpu->arch.apic = apic;
apic->regs_page = alloc_page(GFP_KERNEL);
if (apic->regs_page == NULL) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);
goto nomem;
goto nomem_free_apic;
}
apic->regs = page_address(apic->regs_page);
memset(apic->regs, 0, PAGE_SIZE);
......@@ -971,7 +997,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
apic->timer.dev.function = apic_timer_fn;
apic->base_address = APIC_DEFAULT_PHYS_BASE;
vcpu->apic_base = APIC_DEFAULT_PHYS_BASE;
vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
kvm_lapic_reset(vcpu);
apic->dev.read = apic_mmio_read;
......@@ -980,15 +1006,16 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
apic->dev.private = apic;
return 0;
nomem_free_apic:
kfree(apic);
nomem:
kvm_free_apic(apic);
return -ENOMEM;
}
EXPORT_SYMBOL_GPL(kvm_create_lapic);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->apic;
struct kvm_lapic *apic = vcpu->arch.apic;
int highest_irr;
if (!apic || !apic_enabled(apic))
......@@ -1004,11 +1031,11 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
{
u32 lvt0 = apic_get_reg(vcpu->apic, APIC_LVT0);
u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
if (vcpu->vcpu_id == 0) {
if (!apic_hw_enabled(vcpu->apic))
if (!apic_hw_enabled(vcpu->arch.apic))
r = 1;
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
......@@ -1019,7 +1046,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->apic;
struct kvm_lapic *apic = vcpu->arch.apic;
if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
atomic_read(&apic->timer.pending) > 0) {
......@@ -1030,7 +1057,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
struct kvm_lapic *apic = vcpu->apic;
struct kvm_lapic *apic = vcpu->arch.apic;
if (apic && apic_lvt_vector(apic, APIC_LVTT) == vec)
apic->timer.last_update = ktime_add_ns(
......@@ -1041,7 +1068,7 @@ void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
{
int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->apic;
struct kvm_lapic *apic = vcpu->arch.apic;
if (vector == -1)
return -1;
......@@ -1054,9 +1081,9 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->apic;
struct kvm_lapic *apic = vcpu->arch.apic;
apic->base_address = vcpu->apic_base &
apic->base_address = vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
apic_set_reg(apic, APIC_LVR, APIC_VERSION);
apic_update_ppr(apic);
......@@ -1065,9 +1092,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
start_apic_timer(apic);
}
void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->apic;
struct kvm_lapic *apic = vcpu->arch.apic;
struct hrtimer *timer;
if (!apic)
......@@ -1077,4 +1104,51 @@ void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
if (hrtimer_cancel(timer))
hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL_GPL(kvm_migrate_apic_timer);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
{
u32 data;
void *vapic;
if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
return;
vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr));
kunmap_atomic(vapic, KM_USER0);
apic_set_tpr(vcpu->arch.apic, data & 0xff);
}
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
{
u32 data, tpr;
int max_irr, max_isr;
struct kvm_lapic *apic;
void *vapic;
if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
return;
apic = vcpu->arch.apic;
tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
max_irr = apic_find_highest_irr(apic);
if (max_irr < 0)
max_irr = 0;
max_isr = apic_find_highest_isr(apic);
if (max_isr < 0)
max_isr = 0;
data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0);
*(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data;
kunmap_atomic(vapic, KM_USER0);
}
void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
{
if (!irqchip_in_kernel(vcpu->kvm))
return;
vcpu->arch.apic->vapic_addr = vapic_addr;
}
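kvm_lapic_sync_to_vapic() above publishes a single 32-bit word at vapic_addr inside the guest's vapic page: byte 0 is the TPR (the only byte kvm_lapic_sync_from_vapic() reads back), byte 1 is the highest in-service vector masked to its 16-vector class, and byte 3 is the highest pending IRR vector. Below is a stand-alone sketch of that packing with the result worked out by hand; the helper name pack_vapic is illustrative, only the bit layout comes from the code above.

/* Illustrative sketch (not part of the patch) of the vapic word layout. */
#include <stdint.h>
#include <stdio.h>

static uint32_t pack_vapic(uint32_t tpr, int max_isr, int max_irr)
{
	if (max_isr < 0)
		max_isr = 0;
	if (max_irr < 0)
		max_irr = 0;
	/* same expression as kvm_lapic_sync_to_vapic() above */
	return (tpr & 0xff) | ((max_isr & 0xf0) << 8) | ((uint32_t)max_irr << 24);
}

int main(void)
{
	/* TPR 0x20, highest in-service vector 0x31, highest pending vector 0x41 */
	uint32_t data = pack_vapic(0x20, 0x31, 0x41);

	printf("packed word %#010x\n", data);		/* 0x41003020 */
	printf("guest-visible TPR %#x\n", data & 0xff);	/* what sync_from_vapic reads */
	return 0;
}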
#ifndef __KVM_X86_LAPIC_H
#define __KVM_X86_LAPIC_H
#include "iodev.h"
#include <linux/kvm_host.h>
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
struct {
atomic_t pending;
s64 period; /* unit: ns */
u32 divide_count;
ktime_t last_update;
struct hrtimer dev;
} timer;
struct kvm_vcpu *vcpu;
struct page *regs_page;
void *regs;
gpa_t vapic_addr;
struct page *vapic_page;
};
int kvm_create_lapic(struct kvm_vcpu *vcpu);
void kvm_free_lapic(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
void kvm_lapic_reset(struct kvm_vcpu *vcpu);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
void kvm_apic_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
#endif
......@@ -18,16 +18,19 @@
*/
#include "vmx.h"
#include "kvm.h"
#include "mmu.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#undef MMU_DEBUG
......@@ -82,7 +85,8 @@ static int dbg = 1;
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_MASK (1ULL << 63)
#define PT64_NX_SHIFT 63
#define PT64_NX_MASK (1ULL << PT64_NX_SHIFT)
#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
......@@ -90,7 +94,8 @@ static int dbg = 1;
#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
#define PT32_DIR_PSE36_MASK \
(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
#define PT_FIRST_AVAIL_BITS_SHIFT 9
......@@ -103,7 +108,7 @@ static int dbg = 1;
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) \
( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
#define PT64_LEVEL_MASK(level) \
(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
......@@ -115,7 +120,7 @@ static int dbg = 1;
#define PT32_LEVEL_BITS 10
#define PT32_LEVEL_SHIFT(level) \
( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
#define PT32_LEVEL_MASK(level) \
(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
......@@ -132,6 +137,8 @@ static int dbg = 1;
#define PT32_DIR_BASE_ADDR_MASK \
(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
| PT64_NX_MASK)
#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
......@@ -147,6 +154,11 @@ static int dbg = 1;
#define RMAP_EXT 4
#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
struct kvm_rmap_desc {
u64 *shadow_ptes[RMAP_EXT];
struct kvm_rmap_desc *more;
......@@ -156,9 +168,19 @@ static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
shadow_trap_nonpresent_pte = trap_pte;
shadow_notrap_nonpresent_pte = notrap_pte;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
static int is_write_protection(struct kvm_vcpu *vcpu)
{
return vcpu->cr0 & X86_CR0_WP;
return vcpu->arch.cr0 & X86_CR0_WP;
}
static int is_cpuid_PSE36(void)
......@@ -168,7 +190,7 @@ static int is_cpuid_PSE36(void)
static int is_nx(struct kvm_vcpu *vcpu)
{
return vcpu->shadow_efer & EFER_NX;
return vcpu->arch.shadow_efer & EFER_NX;
}
static int is_present_pte(unsigned long pte)
......@@ -176,11 +198,23 @@ static int is_present_pte(unsigned long pte)
return pte & PT_PRESENT_MASK;
}
static int is_shadow_present_pte(u64 pte)
{
pte &= ~PT_SHADOW_IO_MARK;
return pte != shadow_trap_nonpresent_pte
&& pte != shadow_notrap_nonpresent_pte;
}
static int is_writeble_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
}
static int is_dirty_pte(unsigned long pte)
{
return pte & PT_DIRTY_MASK;
}
static int is_io_pte(unsigned long pte)
{
return pte & PT_SHADOW_IO_MARK;
......@@ -188,8 +222,15 @@ static int is_io_pte(unsigned long pte)
static int is_rmap_pte(u64 pte)
{
return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
== (PT_WRITABLE_MASK | PT_PRESENT_MASK);
return pte != shadow_trap_nonpresent_pte
&& pte != shadow_notrap_nonpresent_pte;
}
static gfn_t pse36_gfn_delta(u32 gpte)
{
int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
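pse36_gfn_delta() recovers the extra physical-address bits of a 32-bit PSE-36 page-directory entry: with PT32_DIR_PSE36_SIZE of 4, entry bits 13-16 hold physical-address bits 32-35, which as frame-number bits sit at positions 20-23, hence the shift of 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT = 7. Below is a stand-alone check of that arithmetic, not part of the patch, reusing the constants defined earlier in this file.

/* Illustrative check (not part of the patch) of the PSE-36 gfn arithmetic. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT		12
#define PT32_DIR_PSE36_SIZE	4
#define PT32_DIR_PSE36_SHIFT	13
#define PT32_DIR_PSE36_MASK \
	(((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)

static uint64_t pse36_gfn_delta(uint32_t gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

int main(void)
{
	/* PDE whose PSE-36 bits 13-16 hold 0x5, i.e. physical bits 32-35 = 0x5 */
	uint32_t gpte = 0x5u << PT32_DIR_PSE36_SHIFT;

	/* prints 0x500000: gfn bits 20-23, i.e. physical bits 32-35 */
	printf("gfn delta %#llx\n", (unsigned long long)pse36_gfn_delta(gpte));
	return 0;
}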
static void set_shadow_pte(u64 *sptep, u64 spte)
......@@ -250,19 +291,18 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
int r;
kvm_mmu_free_some_pages(vcpu);
r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
pte_chain_cache, 4);
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
rmap_desc_cache, 1);
if (r)
goto out;
r = mmu_topup_memory_cache_page(&vcpu->mmu_page_cache, 4);
r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
mmu_page_header_cache, 4);
out:
return r;
......@@ -270,10 +310,10 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
mmu_free_memory_cache_page(&vcpu->mmu_page_cache);
mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
......@@ -289,7 +329,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
{
return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
sizeof(struct kvm_pte_chain));
}
......@@ -300,7 +340,7 @@ static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
{
return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
sizeof(struct kvm_rmap_desc));
}
......@@ -309,36 +349,53 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
kfree(rd);
}
/*
* Take gfn and return the reverse mapping to it.
 * Note: gfn must be unaliased before this function gets called

*/
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
slot = gfn_to_memslot(kvm, gfn);
return &slot->rmap[gfn - slot->base_gfn];
}
/*
* Reverse mapping data structures:
*
* If page->private bit zero is zero, then page->private points to the
* shadow page table entry that points to page_address(page).
 * If rmapp bit zero is zero, then rmapp points to the shadow page table entry
* that points to page_address(page).
*
* If page->private bit zero is one, (then page->private & ~1) points
* to a struct kvm_rmap_desc containing more mappings.
 * If rmapp bit zero is one, then (rmapp & ~1) points to a struct kvm_rmap_desc
* containing more mappings.
*/
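The comment above describes a tagged-pointer scheme: bit zero of the per-gfn rmap word says whether the word holds a single spte pointer directly, or points (with bit zero masked off) to a kvm_rmap_desc holding up to RMAP_EXT sptes plus an overflow chain. Below is a minimal user-space sketch of that encoding and of the walk that rmap_next() performs later in this file; it is not part of the patch, and it assumes, as the kernel does here, that spte pointers are aligned so bit zero is free — the types and names are illustrative.

/* Illustrative sketch (not part of the patch) of the rmap tagged pointer. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RMAP_EXT 4

struct rmap_desc {
	uint64_t *sptes[RMAP_EXT];
	struct rmap_desc *more;
};

/* Walk every spte recorded in one rmap word, whichever encoding it uses. */
static void rmap_walk(unsigned long rmapp)
{
	if (!rmapp)
		return;				/* no mappings */
	if (!(rmapp & 1)) {
		printf("single spte %p\n", (void *)rmapp);
		return;
	}
	for (struct rmap_desc *d = (struct rmap_desc *)(rmapp & ~1ul); d; d = d->more)
		for (int i = 0; i < RMAP_EXT && d->sptes[i]; i++)
			printf("spte %p\n", (void *)d->sptes[i]);
}

int main(void)
{
	uint64_t spte_a, spte_b;

	/* one mapping: store the pointer directly, bit zero clear */
	unsigned long rmapp = (unsigned long)&spte_a;
	rmap_walk(rmapp);

	/* two mappings: switch to a descriptor, bit zero set */
	struct rmap_desc *d = calloc(1, sizeof(*d));
	if (!d)
		return 1;
	d->sptes[0] = &spte_a;
	d->sptes[1] = &spte_b;
	rmapp = (unsigned long)d | 1;
	rmap_walk(rmapp);

	free(d);
	return 0;
}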
static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
struct page *page;
struct kvm_mmu_page *sp;
struct kvm_rmap_desc *desc;
unsigned long *rmapp;
int i;
if (!is_rmap_pte(*spte))
return;
page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
if (!page_private(page)) {
gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
sp->gfns[spte - sp->spt] = gfn;
rmapp = gfn_to_rmap(vcpu->kvm, gfn);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
set_page_private(page,(unsigned long)spte);
} else if (!(page_private(page) & 1)) {
*rmapp = (unsigned long)spte;
} else if (!(*rmapp & 1)) {
rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_rmap_desc(vcpu);
desc->shadow_ptes[0] = (u64 *)page_private(page);
desc->shadow_ptes[0] = (u64 *)*rmapp;
desc->shadow_ptes[1] = spte;
set_page_private(page,(unsigned long)desc | 1);
*rmapp = (unsigned long)desc | 1;
} else {
rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
desc = desc->more;
if (desc->shadow_ptes[RMAP_EXT-1]) {
......@@ -351,7 +408,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
}
}
static void rmap_desc_remove_entry(struct page *page,
static void rmap_desc_remove_entry(unsigned long *rmapp,
struct kvm_rmap_desc *desc,
int i,
struct kvm_rmap_desc *prev_desc)
......@@ -365,44 +422,53 @@ static void rmap_desc_remove_entry(struct page *page,
if (j != 0)
return;
if (!prev_desc && !desc->more)
set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
*rmapp = (unsigned long)desc->shadow_ptes[0];
else
if (prev_desc)
prev_desc->more = desc->more;
else
set_page_private(page,(unsigned long)desc->more | 1);
*rmapp = (unsigned long)desc->more | 1;
mmu_free_rmap_desc(desc);
}
static void rmap_remove(u64 *spte)
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct page *page;
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
struct kvm_mmu_page *sp;
struct page *page;
unsigned long *rmapp;
int i;
if (!is_rmap_pte(*spte))
return;
sp = page_header(__pa(spte));
page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
if (!page_private(page)) {
mark_page_accessed(page);
if (is_writeble_pte(*spte))
kvm_release_page_dirty(page);
else
kvm_release_page_clean(page);
rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
} else if (!(page_private(page) & 1)) {
} else if (!(*rmapp & 1)) {
rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
if ((u64 *)page_private(page) != spte) {
if ((u64 *)*rmapp != spte) {
printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
spte, *spte);
BUG();
}
set_page_private(page,0);
*rmapp = 0;
} else {
rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
prev_desc = NULL;
while (desc) {
for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
if (desc->shadow_ptes[i] == spte) {
rmap_desc_remove_entry(page,
rmap_desc_remove_entry(rmapp,
desc, i,
prev_desc);
return;
......@@ -414,33 +480,56 @@ static void rmap_remove(u64 *spte)
}
}
static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
struct kvm *kvm = vcpu->kvm;
struct page *page;
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
u64 *prev_spte;
int i;
if (!*rmapp)
return NULL;
else if (!(*rmapp & 1)) {
if (!spte)
return (u64 *)*rmapp;
return NULL;
}
desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
prev_desc = NULL;
prev_spte = NULL;
while (desc) {
for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) {
if (prev_spte == spte)
return desc->shadow_ptes[i];
prev_spte = desc->shadow_ptes[i];
}
desc = desc->more;
}
return NULL;
}
static void rmap_write_protect(struct kvm *kvm, u64 gfn)
{
unsigned long *rmapp;
u64 *spte;
int write_protected = 0;
page = gfn_to_page(kvm, gfn);
BUG_ON(!page);
gfn = unalias_gfn(kvm, gfn);
rmapp = gfn_to_rmap(kvm, gfn);
while (page_private(page)) {
if (!(page_private(page) & 1))
spte = (u64 *)page_private(page);
else {
desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
spte = desc->shadow_ptes[0];
}
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
BUG_ON(!spte);
BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
!= page_to_pfn(page));
BUG_ON(!(*spte & PT_PRESENT_MASK));
BUG_ON(!(*spte & PT_WRITABLE_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
rmap_remove(spte);
set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
kvm_flush_remote_tlbs(vcpu->kvm);
if (is_writeble_pte(*spte)) {
set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
}
if (write_protected)
kvm_flush_remote_tlbs(kvm);
}
#ifdef MMU_DEBUG
......@@ -450,7 +539,7 @@ static int is_empty_shadow_page(u64 *spt)
u64 *end;
for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
if (*pos != 0) {
if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
pos, *pos);
return 0;
......@@ -459,14 +548,14 @@ static int is_empty_shadow_page(u64 *spt)
}
#endif
static void kvm_mmu_free_page(struct kvm *kvm,
struct kvm_mmu_page *page_head)
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(page_head->spt));
list_del(&page_head->link);
__free_page(virt_to_page(page_head->spt));
kfree(page_head);
++kvm->n_free_mmu_pages;
ASSERT(is_empty_shadow_page(sp->spt));
list_del(&sp->link);
__free_page(virt_to_page(sp->spt));
__free_page(virt_to_page(sp->gfns));
kfree(sp);
++kvm->arch.n_free_mmu_pages;
}
static unsigned kvm_page_table_hashfn(gfn_t gfn)
......@@ -477,26 +566,23 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
u64 *parent_pte)
{
struct kvm_mmu_page *page;
if (!vcpu->kvm->n_free_mmu_pages)
return NULL;
struct kvm_mmu_page *sp;
page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
sizeof *page);
page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
set_page_private(virt_to_page(page->spt), (unsigned long)page);
list_add(&page->link, &vcpu->kvm->active_mmu_pages);
ASSERT(is_empty_shadow_page(page->spt));
page->slot_bitmap = 0;
page->multimapped = 0;
page->parent_pte = parent_pte;
--vcpu->kvm->n_free_mmu_pages;
return page;
sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
ASSERT(is_empty_shadow_page(sp->spt));
sp->slot_bitmap = 0;
sp->multimapped = 0;
sp->parent_pte = parent_pte;
--vcpu->kvm->arch.n_free_mmu_pages;
return sp;
}
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page, u64 *parent_pte)
struct kvm_mmu_page *sp, u64 *parent_pte)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
......@@ -504,20 +590,20 @@ static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
if (!parent_pte)
return;
if (!page->multimapped) {
u64 *old = page->parent_pte;
if (!sp->multimapped) {
u64 *old = sp->parent_pte;
if (!old) {
page->parent_pte = parent_pte;
sp->parent_pte = parent_pte;
return;
}
page->multimapped = 1;
sp->multimapped = 1;
pte_chain = mmu_alloc_pte_chain(vcpu);
INIT_HLIST_HEAD(&page->parent_ptes);
hlist_add_head(&pte_chain->link, &page->parent_ptes);
INIT_HLIST_HEAD(&sp->parent_ptes);
hlist_add_head(&pte_chain->link, &sp->parent_ptes);
pte_chain->parent_ptes[0] = old;
}
hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
continue;
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
......@@ -528,23 +614,23 @@ static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
}
pte_chain = mmu_alloc_pte_chain(vcpu);
BUG_ON(!pte_chain);
hlist_add_head(&pte_chain->link, &page->parent_ptes);
hlist_add_head(&pte_chain->link, &sp->parent_ptes);
pte_chain->parent_ptes[0] = parent_pte;
}
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
u64 *parent_pte)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
int i;
if (!page->multimapped) {
BUG_ON(page->parent_pte != parent_pte);
page->parent_pte = NULL;
if (!sp->multimapped) {
BUG_ON(sp->parent_pte != parent_pte);
sp->parent_pte = NULL;
return;
}
hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
if (!pte_chain->parent_ptes[i])
break;
......@@ -560,9 +646,9 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
if (i == 0) {
hlist_del(&pte_chain->link);
mmu_free_pte_chain(pte_chain);
if (hlist_empty(&page->parent_ptes)) {
page->multimapped = 0;
page->parent_pte = NULL;
if (hlist_empty(&sp->parent_ptes)) {
sp->multimapped = 0;
sp->parent_pte = NULL;
}
}
return;
......@@ -570,22 +656,21 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
BUG();
}
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
gfn_t gfn)
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{
unsigned index;
struct hlist_head *bucket;
struct kvm_mmu_page *page;
struct kvm_mmu_page *sp;
struct hlist_node *node;
pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
bucket = &vcpu->kvm->mmu_page_hash[index];
hlist_for_each_entry(page, node, bucket, hash_link)
if (page->gfn == gfn && !page->role.metaphysical) {
bucket = &kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link)
if (sp->gfn == gfn && !sp->role.metaphysical) {
pgprintk("%s: found role %x\n",
__FUNCTION__, page->role.word);
return page;
__FUNCTION__, sp->role.word);
return sp;
}
return NULL;
}
......@@ -595,22 +680,23 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gva_t gaddr,
unsigned level,
int metaphysical,
unsigned hugepage_access,
u64 *parent_pte)
unsigned access,
u64 *parent_pte,
bool *new_page)
{
union kvm_mmu_page_role role;
unsigned index;
unsigned quadrant;
struct hlist_head *bucket;
struct kvm_mmu_page *page;
struct kvm_mmu_page *sp;
struct hlist_node *node;
role.word = 0;
role.glevels = vcpu->mmu.root_level;
role.glevels = vcpu->arch.mmu.root_level;
role.level = level;
role.metaphysical = metaphysical;
role.hugepage_access = hugepage_access;
if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
role.access = access;
if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
......@@ -618,39 +704,43 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
gfn, role.word);
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
bucket = &vcpu->kvm->mmu_page_hash[index];
hlist_for_each_entry(page, node, bucket, hash_link)
if (page->gfn == gfn && page->role.word == role.word) {
mmu_page_add_parent_pte(vcpu, page, parent_pte);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link)
if (sp->gfn == gfn && sp->role.word == role.word) {
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
pgprintk("%s: found\n", __FUNCTION__);
return page;
return sp;
}
page = kvm_mmu_alloc_page(vcpu, parent_pte);
if (!page)
return page;
++vcpu->kvm->stat.mmu_cache_miss;
sp = kvm_mmu_alloc_page(vcpu, parent_pte);
if (!sp)
return sp;
pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
page->gfn = gfn;
page->role = role;
hlist_add_head(&page->hash_link, bucket);
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
vcpu->arch.mmu.prefetch_page(vcpu, sp);
if (!metaphysical)
rmap_write_protect(vcpu, gfn);
return page;
rmap_write_protect(vcpu->kvm, gfn);
if (new_page)
*new_page = 1;
return sp;
}
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *page)
struct kvm_mmu_page *sp)
{
unsigned i;
u64 *pt;
u64 ent;
pt = page->spt;
pt = sp->spt;
if (page->role.level == PT_PAGE_TABLE_LEVEL) {
if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
if (pt[i] & PT_PRESENT_MASK)
rmap_remove(&pt[i]);
pt[i] = 0;
if (is_shadow_present_pte(pt[i]))
rmap_remove(kvm, &pt[i]);
pt[i] = shadow_trap_nonpresent_pte;
}
kvm_flush_remote_tlbs(kvm);
return;
......@@ -659,8 +749,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
ent = pt[i];
pt[i] = 0;
if (!(ent & PT_PRESENT_MASK))
pt[i] = shadow_trap_nonpresent_pte;
if (!is_shadow_present_pte(ent))
continue;
ent &= PT64_BASE_ADDR_MASK;
mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
......@@ -668,147 +758,240 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
}
static void kvm_mmu_put_page(struct kvm_mmu_page *page,
u64 *parent_pte)
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
{
mmu_page_remove_parent_pte(sp, parent_pte);
}
static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
{
mmu_page_remove_parent_pte(page, parent_pte);
int i;
for (i = 0; i < KVM_MAX_VCPUS; ++i)
if (kvm->vcpus[i])
kvm->vcpus[i]->arch.last_pte_updated = NULL;
}
static void kvm_mmu_zap_page(struct kvm *kvm,
struct kvm_mmu_page *page)
static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *parent_pte;
while (page->multimapped || page->parent_pte) {
if (!page->multimapped)
parent_pte = page->parent_pte;
++kvm->stat.mmu_shadow_zapped;
while (sp->multimapped || sp->parent_pte) {
if (!sp->multimapped)
parent_pte = sp->parent_pte;
else {
struct kvm_pte_chain *chain;
chain = container_of(page->parent_ptes.first,
chain = container_of(sp->parent_ptes.first,
struct kvm_pte_chain, link);
parent_pte = chain->parent_ptes[0];
}
BUG_ON(!parent_pte);
kvm_mmu_put_page(page, parent_pte);
set_shadow_pte(parent_pte, 0);
kvm_mmu_put_page(sp, parent_pte);
set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
}
kvm_mmu_page_unlink_children(kvm, page);
if (!page->root_count) {
hlist_del(&page->hash_link);
kvm_mmu_free_page(kvm, page);
kvm_mmu_page_unlink_children(kvm, sp);
if (!sp->root_count) {
hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp);
} else
list_move(&page->link, &kvm->active_mmu_pages);
list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_mmu_reset_last_pte_updated(kvm);
}
/*
* Changing the number of mmu pages allocated to the vm
 * Note: if kvm_nr_mmu_pages is too small, you will get a deadlock
*/
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
{
/*
 * If we set the number of mmu pages to be smaller than the
 * number of active pages, we must free some mmu pages before we
 * change the value
*/
if ((kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages) >
kvm_nr_mmu_pages) {
int n_used_mmu_pages = kvm->arch.n_alloc_mmu_pages
- kvm->arch.n_free_mmu_pages;
while (n_used_mmu_pages > kvm_nr_mmu_pages) {
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(kvm, page);
n_used_mmu_pages--;
}
kvm->arch.n_free_mmu_pages = 0;
}
else
kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
- kvm->arch.n_alloc_mmu_pages;
kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages;
}
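A worked example of the shrink path above: with n_alloc_mmu_pages = 1024 and n_free_mmu_pages = 200, 824 pages are in use, so lowering the limit to 512 zaps 312 pages off the tail of the active list, then leaves n_free_mmu_pages at 0 and n_alloc_mmu_pages at 512; raising the limit instead simply adds the difference to n_free_mmu_pages.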
static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
unsigned index;
struct hlist_head *bucket;
struct kvm_mmu_page *page;
struct kvm_mmu_page *sp;
struct hlist_node *node, *n;
int r;
pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
r = 0;
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
bucket = &vcpu->kvm->mmu_page_hash[index];
hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
if (page->gfn == gfn && !page->role.metaphysical) {
bucket = &kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
if (sp->gfn == gfn && !sp->role.metaphysical) {
pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
page->role.word);
kvm_mmu_zap_page(vcpu->kvm, page);
sp->role.word);
kvm_mmu_zap_page(kvm, sp);
r = 1;
}
return r;
}
static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
{
struct kvm_mmu_page *page;
struct kvm_mmu_page *sp;
while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
pgprintk("%s: zap %lx %x\n",
__FUNCTION__, gfn, page->role.word);
kvm_mmu_zap_page(vcpu->kvm, page);
while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
kvm_mmu_zap_page(kvm, sp);
}
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
{
int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
struct kvm_mmu_page *page_head = page_header(__pa(pte));
int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
struct kvm_mmu_page *sp = page_header(__pa(pte));
__set_bit(slot, &page_head->slot_bitmap);
__set_bit(slot, &sp->slot_bitmap);
}
hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
hpa_t hpa = gpa_to_hpa(vcpu, gpa);
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
if (gpa == UNMAPPED_GVA)
return NULL;
return gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
}
hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
int *ptwrite, gfn_t gfn, struct page *page)
{
struct page *page;
u64 spte;
int was_rmapped = is_rmap_pte(*shadow_pte);
int was_writeble = is_writeble_pte(*shadow_pte);
ASSERT((gpa & HPA_ERR_MASK) == 0);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
if (!page)
return gpa | HPA_ERR_MASK;
return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
| (gpa & (PAGE_SIZE-1));
}
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
__FUNCTION__, *shadow_pte, pt_access,
write_fault, user_fault, gfn);
hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
* demand paging).
*/
spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
if (!dirty)
pte_access &= ~ACC_WRITE_MASK;
if (!(pte_access & ACC_EXEC_MASK))
spte |= PT64_NX_MASK;
spte |= PT_PRESENT_MASK;
if (pte_access & ACC_USER_MASK)
spte |= PT_USER_MASK;
if (is_error_page(page)) {
set_shadow_pte(shadow_pte,
shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
kvm_release_page_clean(page);
return;
}
if (gpa == UNMAPPED_GVA)
return UNMAPPED_GVA;
return gpa_to_hpa(vcpu, gpa);
}
spte |= page_to_phys(page);
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
struct kvm_mmu_page *shadow;
if (gpa == UNMAPPED_GVA)
return NULL;
return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
spte |= PT_WRITABLE_MASK;
if (user_fault) {
mmu_unshadow(vcpu->kvm, gfn);
goto unshadowed;
}
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
if (shadow) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
pte_access &= ~ACC_WRITE_MASK;
if (is_writeble_pte(spte)) {
spte &= ~PT_WRITABLE_MASK;
kvm_x86_ops->tlb_flush(vcpu);
}
if (write_fault)
*ptwrite = 1;
}
}
unshadowed:
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
set_shadow_pte(shadow_pte, spte);
page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
if (!was_rmapped) {
rmap_add(vcpu, shadow_pte, gfn);
if (!is_rmap_pte(*shadow_pte))
kvm_release_page_clean(page);
} else {
if (was_writeble)
kvm_release_page_dirty(page);
else
kvm_release_page_clean(page);
}
if (!ptwrite || !*ptwrite)
vcpu->arch.last_pte_updated = shadow_pte;
}
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
gfn_t gfn, struct page *page)
{
int level = PT32E_ROOT_LEVEL;
hpa_t table_addr = vcpu->mmu.root_hpa;
hpa_t table_addr = vcpu->arch.mmu.root_hpa;
int pt_write = 0;
for (; ; level--) {
u32 index = PT64_INDEX(v, level);
u64 *table;
u64 pte;
ASSERT(VALID_PAGE(table_addr));
table = __va(table_addr);
if (level == 1) {
pte = table[index];
if (is_present_pte(pte) && is_writeble_pte(pte))
return 0;
mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
page_header_update_slot(vcpu->kvm, table, v);
table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
PT_USER_MASK;
rmap_add(vcpu, &table[index]);
return 0;
mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
0, write, 1, &pt_write, gfn, page);
return pt_write || is_io_pte(table[index]);
}
if (table[index] == 0) {
if (table[index] == shadow_trap_nonpresent_pte) {
struct kvm_mmu_page *new_table;
gfn_t pseudo_gfn;
......@@ -816,9 +999,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
>> PAGE_SHIFT;
new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
v, level - 1,
1, 0, &table[index]);
1, ACC_ALL, &table[index],
NULL);
if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n");
kvm_release_page_clean(page);
return -ENOMEM;
}
......@@ -829,77 +1014,109 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
}
}
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
struct page *page;
down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gfn);
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
r = __nonpaging_map(vcpu, v, write, gfn, page);
spin_unlock(&vcpu->kvm->mmu_lock);
up_read(&current->mm->mmap_sem);
return r;
}
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
int i;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
sp->spt[i] = shadow_trap_nonpresent_pte;
}
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *page;
struct kvm_mmu_page *sp;
if (!VALID_PAGE(vcpu->mmu.root_hpa))
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
spin_lock(&vcpu->kvm->mmu_lock);
#ifdef CONFIG_X86_64
if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->mmu.root_hpa;
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
page = page_header(root);
--page->root_count;
vcpu->mmu.root_hpa = INVALID_PAGE;
sp = page_header(root);
--sp->root_count;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
}
#endif
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->mmu.pae_root[i];
hpa_t root = vcpu->arch.mmu.pae_root[i];
if (root) {
root &= PT64_BASE_ADDR_MASK;
page = page_header(root);
--page->root_count;
sp = page_header(root);
--sp->root_count;
}
vcpu->mmu.pae_root[i] = INVALID_PAGE;
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
vcpu->mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
int i;
gfn_t root_gfn;
struct kvm_mmu_page *page;
struct kvm_mmu_page *sp;
root_gfn = vcpu->cr3 >> PAGE_SHIFT;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
#ifdef CONFIG_X86_64
if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->mmu.root_hpa;
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
page = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, 0, 0, NULL);
root = __pa(page->spt);
++page->root_count;
vcpu->mmu.root_hpa = root;
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, 0, ACC_ALL, NULL, NULL);
root = __pa(sp->spt);
++sp->root_count;
vcpu->arch.mmu.root_hpa = root;
return;
}
#endif
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->mmu.pae_root[i];
hpa_t root = vcpu->arch.mmu.pae_root[i];
ASSERT(!VALID_PAGE(root));
if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
if (!is_present_pte(vcpu->pdptrs[i])) {
vcpu->mmu.pae_root[i] = 0;
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
if (!is_present_pte(vcpu->arch.pdptrs[i])) {
vcpu->arch.mmu.pae_root[i] = 0;
continue;
}
root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
} else if (vcpu->mmu.root_level == 0)
root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
} else if (vcpu->arch.mmu.root_level == 0)
root_gfn = 0;
page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, !is_paging(vcpu),
0, NULL);
root = __pa(page->spt);
++page->root_count;
vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, !is_paging(vcpu),
ACC_ALL, NULL, NULL);
root = __pa(sp->spt);
++sp->root_count;
vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
......@@ -908,26 +1125,23 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
}
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
u32 error_code)
{
gpa_t addr = gva;
hpa_t paddr;
gfn_t gfn;
int r;
pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
gfn = gva >> PAGE_SHIFT;
if (is_error_hpa(paddr))
return 1;
return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
return nonpaging_map(vcpu, gva & PAGE_MASK,
error_code & PFERR_WRITE_MASK, gfn);
}
static void nonpaging_free(struct kvm_vcpu *vcpu)
......@@ -937,19 +1151,20 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->mmu;
struct kvm_mmu *context = &vcpu->arch.mmu;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
return 0;
}
static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
kvm_x86_ops->tlb_flush(vcpu);
......@@ -965,7 +1180,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
u64 addr,
u32 err_code)
{
kvm_x86_ops->inject_page_fault(vcpu, addr, err_code);
kvm_inject_page_fault(vcpu, addr, err_code);
}
static void paging_free(struct kvm_vcpu *vcpu)
......@@ -983,12 +1198,13 @@ static void paging_free(struct kvm_vcpu *vcpu)
static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
struct kvm_mmu *context = &vcpu->mmu;
struct kvm_mmu *context = &vcpu->arch.mmu;
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->prefetch_page = paging64_prefetch_page;
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
......@@ -1003,12 +1219,13 @@ static int paging64_init_context(struct kvm_vcpu *vcpu)
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->mmu;
struct kvm_mmu *context = &vcpu->arch.mmu;
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
context->prefetch_page = paging32_prefetch_page;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
......@@ -1023,7 +1240,7 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (!is_paging(vcpu))
return nonpaging_init_context(vcpu);
......@@ -1038,9 +1255,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
if (VALID_PAGE(vcpu->mmu.root_hpa)) {
vcpu->mmu.free(vcpu);
vcpu->mmu.root_hpa = INVALID_PAGE;
if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
vcpu->arch.mmu.free(vcpu);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
}
......@@ -1055,15 +1272,16 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
mutex_lock(&vcpu->kvm->lock);
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
kvm_x86_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
out:
mutex_unlock(&vcpu->kvm->lock);
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_load);
......@@ -1074,47 +1292,116 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
}
static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page,
struct kvm_mmu_page *sp,
u64 *spte)
{
u64 pte;
struct kvm_mmu_page *child;
pte = *spte;
if (is_present_pte(pte)) {
if (page->role.level == PT_PAGE_TABLE_LEVEL)
rmap_remove(spte);
if (is_shadow_present_pte(pte)) {
if (sp->role.level == PT_PAGE_TABLE_LEVEL)
rmap_remove(vcpu->kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
}
}
set_shadow_pte(spte, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
set_shadow_pte(spte, shadow_trap_nonpresent_pte);
}
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page,
struct kvm_mmu_page *sp,
u64 *spte,
const void *new, int bytes)
const void *new, int bytes,
int offset_in_pte)
{
if (page->role.level != PT_PAGE_TABLE_LEVEL)
if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
++vcpu->kvm->stat.mmu_pde_zapped;
return;
}
++vcpu->kvm->stat.mmu_pte_updated;
if (sp->role.glevels == PT32_ROOT_LEVEL)
paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
else
paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
}
static bool need_remote_flush(u64 old, u64 new)
{
if (!is_shadow_present_pte(old))
return false;
if (!is_shadow_present_pte(new))
return true;
if ((old ^ new) & PT64_BASE_ADDR_MASK)
return true;
old ^= PT64_NX_MASK;
new ^= PT64_NX_MASK;
return (old & ~new & PT64_PERM_MASK) != 0;
}
if (page->role.glevels == PT32_ROOT_LEVEL)
paging32_update_pte(vcpu, page, spte, new, bytes);
static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
{
if (need_remote_flush(old, new))
kvm_flush_remote_tlbs(vcpu->kvm);
else
paging64_update_pte(vcpu, page, spte, new, bytes);
kvm_mmu_flush_tlb(vcpu);
}
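The remote-flush test above only fires when the new spte narrows what the old one allowed: after the separate base-address check, the XOR with the NX mask flips that bit so every bit in PT64_PERM_MASK reads as "permission granted" before the old & ~new comparison. A stand-alone sketch of the same idea, with made-up mask values rather than the real PT64_* constants and the address check left out:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SK_PRESENT   (1ULL << 0)   /* illustrative stand-ins, not PT64_* */
#define SK_WRITABLE  (1ULL << 1)
#define SK_USER      (1ULL << 2)
#define SK_NX        (1ULL << 63)
#define SK_PERM_MASK (SK_PRESENT | SK_WRITABLE | SK_USER | SK_NX)

static bool narrows_permissions(uint64_t old_spte, uint64_t new_spte)
{
	if (!(old_spte & SK_PRESENT))
		return false;              /* nothing could be cached */
	if (!(new_spte & SK_PRESENT))
		return true;               /* mapping disappeared */
	old_spte ^= SK_NX;                 /* flip NX: set now means "may execute" */
	new_spte ^= SK_NX;
	/* any permission the old spte granted that the new one does not? */
	return (old_spte & ~new_spte & SK_PERM_MASK) != 0;
}

int main(void)
{
	uint64_t rw = SK_PRESENT | SK_WRITABLE | SK_NX;  /* writable, no-exec */
	uint64_t ro = SK_PRESENT | SK_NX;                /* read-only, no-exec */

	printf("rw -> ro needs remote flush: %d\n", narrows_permissions(rw, ro)); /* 1 */
	printf("ro -> rw needs remote flush: %d\n", narrows_permissions(ro, rw)); /* 0 */
	return 0;
}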
static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
{
u64 *spte = vcpu->arch.last_pte_updated;
return !!(spte && (*spte & PT_ACCESSED_MASK));
}
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes)
{
gfn_t gfn;
int r;
u64 gpte = 0;
if (bytes != 4 && bytes != 8)
return;
/*
* Assume that the pte write is on a page table of the same type
* as the current vcpu paging mode. This is nearly always true
* (might be false while changing modes). Note it is verified later
* by update_pte().
*/
if (is_pae(vcpu)) {
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
if ((bytes == 4) && (gpa % 4 == 0)) {
r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
if (r)
return;
memcpy((void *)&gpte + (gpa % 8), new, 4);
} else if ((bytes == 8) && (gpa % 8 == 0)) {
memcpy((void *)&gpte, new, 8);
}
} else {
if ((bytes == 4) && (gpa % 4 == 0))
memcpy((void *)&gpte, new, 4);
}
if (!is_present_pte(gpte))
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
vcpu->arch.update_pte.gfn = gfn;
vcpu->arch.update_pte.page = gfn_to_page(vcpu->kvm, gfn);
}
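mmu_guess_page_from_pte_write() has to cope with a PAE guest that updates one 64-bit gpte with two 32-bit stores: it reads back the aligned 8-byte entry around the written gpa and overlays the 4 new bytes at gpa % 8 before checking whether the guessed gpte is present. A user-space sketch of that merge, with a plain byte buffer standing in for guest memory (nothing below is kernel API):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t guess_gpte(const uint8_t *guest_ram, uint64_t gpa,
			   const void *new_bytes, int bytes)
{
	uint64_t gpte = 0;

	if (bytes == 4 && (gpa % 4) == 0) {
		/* read the aligned 8-byte pte containing the write ... */
		memcpy(&gpte, guest_ram + (gpa & ~7ULL), 8);
		/* ... then overlay the 4 freshly written bytes */
		memcpy((uint8_t *)&gpte + (gpa % 8), new_bytes, 4);
	} else if (bytes == 8 && (gpa % 8) == 0) {
		memcpy(&gpte, new_bytes, 8);
	}
	return gpte;
}

int main(void)
{
	uint8_t ram[16] = {0};
	uint64_t old = 0x00000000aabbc067ULL;   /* existing pte at gpa 8 */
	uint32_t hi  = 0x00000001;              /* guest writes the high half */

	memcpy(ram + 8, &old, 8);
	printf("guessed gpte: %#llx\n",
	       (unsigned long long)guess_gpte(ram, 12, &hi, 4));
	/* prints 0x1aabbc067: low half kept, high half replaced */
	return 0;
}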
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *page;
struct kvm_mmu_page *sp;
struct hlist_node *node, *n;
struct hlist_head *bucket;
unsigned index;
u64 entry;
u64 *spte;
unsigned offset = offset_in_page(gpa);
unsigned pte_size;
......@@ -1126,20 +1413,27 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int npte;
pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
if (gfn == vcpu->last_pt_write_gfn) {
++vcpu->last_pt_write_count;
if (vcpu->last_pt_write_count >= 3)
mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, "pre pte write");
if (gfn == vcpu->arch.last_pt_write_gfn
&& !last_updated_pte_accessed(vcpu)) {
++vcpu->arch.last_pt_write_count;
if (vcpu->arch.last_pt_write_count >= 3)
flooded = 1;
} else {
vcpu->last_pt_write_gfn = gfn;
vcpu->last_pt_write_count = 1;
vcpu->arch.last_pt_write_gfn = gfn;
vcpu->arch.last_pt_write_count = 1;
vcpu->arch.last_pte_updated = NULL;
}
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
bucket = &vcpu->kvm->mmu_page_hash[index];
hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
if (page->gfn != gfn || page->role.metaphysical)
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
if (sp->gfn != gfn || sp->role.metaphysical)
continue;
pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
misaligned |= bytes < 4;
if (misaligned || flooded) {
......@@ -1154,14 +1448,15 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
* page.
*/
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, page->role.word);
kvm_mmu_zap_page(vcpu->kvm, page);
gpa, bytes, sp->role.word);
kvm_mmu_zap_page(vcpu->kvm, sp);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
page_offset = offset;
level = page->role.level;
level = sp->role.level;
npte = 1;
if (page->role.glevels == PT32_ROOT_LEVEL) {
if (sp->role.glevels == PT32_ROOT_LEVEL) {
page_offset <<= 1; /* 32->64 */
/*
* A 32-bit pde maps 4MB while the shadow pdes map
......@@ -1175,46 +1470,101 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
quadrant = page_offset >> PAGE_SHIFT;
page_offset &= ~PAGE_MASK;
if (quadrant != page->role.quadrant)
if (quadrant != sp->role.quadrant)
continue;
}
spte = &page->spt[page_offset / sizeof(*spte)];
spte = &sp->spt[page_offset / sizeof(*spte)];
while (npte--) {
mmu_pte_write_zap_pte(vcpu, page, spte);
mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
page_offset & (pte_size - 1));
mmu_pte_write_flush_tlb(vcpu, entry, *spte);
++spte;
}
}
kvm_mmu_audit(vcpu, "post pte write");
spin_unlock(&vcpu->kvm->mmu_lock);
if (vcpu->arch.update_pte.page) {
kvm_release_page_clean(vcpu->arch.update_pte.page);
vcpu->arch.update_pte.page = NULL;
}
}
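The misalignment test in kvm_mmu_pte_write() is a small bit trick: (offset ^ (offset + bytes - 1)) & ~(pte_size - 1) is non-zero exactly when the first and last byte of the write land in different naturally aligned pte_size slots, and writes shorter than 4 bytes count as misaligned as well. A quick stand-alone check with illustrative values:

#include <stdio.h>

static int misaligned(unsigned offset, unsigned bytes, unsigned pte_size)
{
	unsigned m = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
	return m || bytes < 4;
}

int main(void)
{
	printf("%d\n", misaligned(0, 8, 8));   /* 0: one whole 64-bit pte  */
	printf("%d\n", misaligned(4, 4, 8));   /* 0: high half of one pte  */
	printf("%d\n", misaligned(4, 8, 8));   /* 1: straddles two ptes    */
	printf("%d\n", misaligned(2, 2, 4));   /* 1: partial (bytes < 4)   */
	return 0;
}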
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
gpa_t gpa;
int r;
down_read(&current->mm->mmap_sem);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
up_read(&current->mm->mmap_sem);
return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
}
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
struct kvm_mmu_page *page;
while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
struct kvm_mmu_page *sp;
page = container_of(vcpu->kvm->active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, page);
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, sp);
++vcpu->kvm->stat.mmu_recycled;
}
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
{
int r;
enum emulation_result er;
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
if (r < 0)
goto out;
if (!r) {
r = 1;
goto out;
}
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0);
switch (er) {
case EMULATE_DONE:
return 1;
case EMULATE_DO_MMIO:
++vcpu->stat.mmio_exits;
return 0;
case EMULATE_FAIL:
kvm_report_emulation_failure(vcpu, "pagetable");
return 1;
default:
BUG();
}
out:
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *page;
struct kvm_mmu_page *sp;
while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
page = container_of(vcpu->kvm->active_mmu_pages.next,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, page);
while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, sp);
}
free_page((unsigned long)vcpu->mmu.pae_root);
free_page((unsigned long)vcpu->arch.mmu.pae_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
......@@ -1224,8 +1574,12 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
ASSERT(vcpu);
vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
if (vcpu->kvm->arch.n_requested_mmu_pages)
vcpu->kvm->arch.n_free_mmu_pages =
vcpu->kvm->arch.n_requested_mmu_pages;
else
vcpu->kvm->arch.n_free_mmu_pages =
vcpu->kvm->arch.n_alloc_mmu_pages;
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
......@@ -1234,9 +1588,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
page = alloc_page(GFP_KERNEL | __GFP_DMA32);
if (!page)
goto error_1;
vcpu->mmu.pae_root = page_address(page);
vcpu->arch.mmu.pae_root = page_address(page);
for (i = 0; i < 4; ++i)
vcpu->mmu.pae_root[i] = INVALID_PAGE;
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
return 0;
......@@ -1248,7 +1602,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
return alloc_mmu_pages(vcpu);
}
......@@ -1256,7 +1610,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
return init_kvm_mmu(vcpu);
}
......@@ -1272,31 +1626,31 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *page;
struct kvm_mmu_page *sp;
list_for_each_entry(page, &kvm->active_mmu_pages, link) {
list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i;
u64 *pt;
if (!test_bit(slot, &page->slot_bitmap))
if (!test_bit(slot, &sp->slot_bitmap))
continue;
pt = page->spt;
pt = sp->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
if (pt[i] & PT_WRITABLE_MASK) {
rmap_remove(&pt[i]);
if (pt[i] & PT_WRITABLE_MASK)
pt[i] &= ~PT_WRITABLE_MASK;
}
}
}
void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *page, *node;
struct kvm_mmu_page *sp, *node;
list_for_each_entry_safe(page, node, &kvm->active_mmu_pages, link)
kvm_mmu_zap_page(kvm, page);
spin_lock(&kvm->mmu_lock);
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
kvm_mmu_zap_page(kvm, sp);
spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
}
......@@ -1337,6 +1691,25 @@ int kvm_mmu_module_init(void)
return -ENOMEM;
}
/*
* Calculate mmu pages needed for kvm.
*/
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
{
int i;
unsigned int nr_mmu_pages;
unsigned int nr_pages = 0;
for (i = 0; i < kvm->nmemslots; i++)
nr_pages += kvm->memslots[i].npages;
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
nr_mmu_pages = max(nr_mmu_pages,
(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
return nr_mmu_pages;
}
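The sizing rule above budgets shadow pages as a fixed per-mille of the guest's memory pages with a lower bound. A worked example; the constants here are assumed values for illustration, the real ones are KVM_PERMILLE_MMU_PAGES and KVM_MIN_ALLOC_MMU_PAGES from the kvm headers:

#include <stdio.h>

#define PERMILLE_MMU_PAGES  20U   /* assumed: 20 shadow pages per 1000 guest pages */
#define MIN_ALLOC_MMU_PAGES 64U   /* assumed floor */

static unsigned int mmu_pages_for(unsigned long guest_pages)
{
	unsigned int n = guest_pages * PERMILLE_MMU_PAGES / 1000;
	return n < MIN_ALLOC_MMU_PAGES ? MIN_ALLOC_MMU_PAGES : n;
}

int main(void)
{
	/* 512 MiB guest = 131072 4K pages -> 2621 shadow pages */
	printf("512 MiB guest: %u mmu pages\n", mmu_pages_for(131072));
	/*   8 MiB guest = 2048 pages -> 40, raised to the 64-page floor */
	printf("  8 MiB guest: %u mmu pages\n", mmu_pages_for(2048));
	return 0;
}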
#ifdef AUDIT
static const char *audit_msg;
......@@ -1359,22 +1732,36 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
u64 ent = pt[i];
if (!(ent & PT_PRESENT_MASK))
if (ent == shadow_trap_nonpresent_pte)
continue;
va = canonicalize(va);
if (level > 1)
if (level > 1) {
if (ent == shadow_notrap_nonpresent_pte)
printk(KERN_ERR "audit: (%s) nontrapping pte"
" in nonleaf level: levels %d gva %lx"
" level %d pte %llx\n", audit_msg,
vcpu->arch.mmu.root_level, va, level, ent);
audit_mappings_page(vcpu, ent, va, level - 1);
else {
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
hpa_t hpa = gpa_to_hpa(vcpu, gpa);
} else {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
struct page *page = gpa_to_page(vcpu, gpa);
hpa_t hpa = page_to_phys(page);
if ((ent & PT_PRESENT_MASK)
if (is_shadow_present_pte(ent)
&& (ent & PT64_BASE_ADDR_MASK) != hpa)
printk(KERN_ERR "audit error: (%s) levels %d"
" gva %lx gpa %llx hpa %llx ent %llx\n",
audit_msg, vcpu->mmu.root_level,
va, gpa, hpa, ent);
printk(KERN_ERR "xx audit error: (%s) levels %d"
" gva %lx gpa %llx hpa %llx ent %llx %d\n",
audit_msg, vcpu->arch.mmu.root_level,
va, gpa, hpa, ent,
is_shadow_present_pte(ent));
else if (ent == shadow_notrap_nonpresent_pte
&& !is_error_hpa(hpa))
printk(KERN_ERR "audit: (%s) notrap shadow,"
" valid guest gva %lx\n", audit_msg, va);
kvm_release_page_clean(page);
}
}
}
......@@ -1383,13 +1770,13 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
{
unsigned i;
if (vcpu->mmu.root_level == 4)
audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
if (vcpu->arch.mmu.root_level == 4)
audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
else
for (i = 0; i < 4; ++i)
if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
audit_mappings_page(vcpu,
vcpu->mmu.pae_root[i],
vcpu->arch.mmu.pae_root[i],
i << 30,
2);
}
......@@ -1404,15 +1791,15 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
struct kvm_rmap_desc *d;
for (j = 0; j < m->npages; ++j) {
struct page *page = m->phys_mem[j];
unsigned long *rmapp = &m->rmap[j];
if (!page->private)
if (!*rmapp)
continue;
if (!(page->private & 1)) {
if (!(*rmapp & 1)) {
++nmaps;
continue;
}
d = (struct kvm_rmap_desc *)(page->private & ~1ul);
d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
while (d) {
for (k = 0; k < RMAP_EXT; ++k)
if (d->shadow_ptes[k])
......@@ -1429,13 +1816,13 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
static int count_writable_mappings(struct kvm_vcpu *vcpu)
{
int nmaps = 0;
struct kvm_mmu_page *page;
struct kvm_mmu_page *sp;
int i;
list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
u64 *pt = page->spt;
list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
u64 *pt = sp->spt;
if (page->role.level != PT_PAGE_TABLE_LEVEL)
if (sp->role.level != PT_PAGE_TABLE_LEVEL)
continue;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
......@@ -1463,23 +1850,23 @@ static void audit_rmap(struct kvm_vcpu *vcpu)
static void audit_write_protection(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *page;
list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
hfn_t hfn;
struct page *pg;
struct kvm_mmu_page *sp;
struct kvm_memory_slot *slot;
unsigned long *rmapp;
gfn_t gfn;
if (page->role.metaphysical)
list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
if (sp->role.metaphysical)
continue;
hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
>> PAGE_SHIFT;
pg = pfn_to_page(hfn);
if (pg->private)
slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
gfn = unalias_gfn(vcpu->kvm, sp->gfn);
rmapp = &slot->rmap[gfn - slot->base_gfn];
if (*rmapp)
printk(KERN_ERR "%s: (%s) shadow page has writable"
" mappings: gfn %lx role %x\n",
__FUNCTION__, audit_msg, page->gfn,
page->role.word);
__FUNCTION__, audit_msg, sp->gfn,
sp->role.word);
}
}
......
#ifndef __KVM_X86_MMU_H
#define __KVM_X86_MMU_H
#include <linux/kvm_host.h>
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
__kvm_mmu_free_some_pages(vcpu);
}
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
}
static inline int is_long_mode(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
return vcpu->arch.shadow_efer & EFER_LME;
#else
return 0;
#endif
}
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return vcpu->arch.cr4 & X86_CR4_PAE;
}
static inline int is_pse(struct kvm_vcpu *vcpu)
{
return vcpu->arch.cr4 & X86_CR4_PSE;
}
static inline int is_paging(struct kvm_vcpu *vcpu)
{
return vcpu->arch.cr0 & X86_CR0_PG;
}
#endif
......@@ -31,9 +31,12 @@
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS 4
#define CMPXCHG cmpxchg
#else
#define CMPXCHG cmpxchg64
#define PT_MAX_FULL_LEVELS 2
#endif
#elif PTTYPE == 32
......@@ -45,11 +48,16 @@
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg
#else
#error Invalid PTTYPE value
#endif
#define gpte_to_gfn FNAME(gpte_to_gfn)
#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde)
/*
* The guest_walker structure emulates the behavior of the hardware page
* table walker.
......@@ -57,16 +65,56 @@
struct guest_walker {
int level;
gfn_t table_gfn[PT_MAX_FULL_LEVELS];
pt_element_t *table;
pt_element_t pte;
pt_element_t *ptep;
struct page *page;
int index;
pt_element_t inherited_ar;
pt_element_t ptes[PT_MAX_FULL_LEVELS];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
unsigned pt_access;
unsigned pte_access;
gfn_t gfn;
u32 error_code;
};
static gfn_t gpte_to_gfn(pt_element_t gpte)
{
return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
}
static gfn_t gpte_to_gfn_pde(pt_element_t gpte)
{
return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
}
static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
gfn_t table_gfn, unsigned index,
pt_element_t orig_pte, pt_element_t new_pte)
{
pt_element_t ret;
pt_element_t *table;
struct page *page;
page = gfn_to_page(kvm, table_gfn);
table = kmap_atomic(page, KM_USER0);
ret = CMPXCHG(&table[index], orig_pte, new_pte);
kunmap_atomic(table, KM_USER0);
kvm_release_page_dirty(page);
return (ret != orig_pte);
}
static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
{
unsigned access;
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
#if PTTYPE == 64
if (is_nx(vcpu))
access &= ~(gpte >> PT64_NX_SHIFT);
#endif
return access;
}
/*
* Fetch a guest pte for a guest virtual address
*/
......@@ -74,103 +122,104 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, gva_t addr,
int write_fault, int user_fault, int fetch_fault)
{
hpa_t hpa;
struct kvm_memory_slot *slot;
pt_element_t *ptep;
pt_element_t root;
pt_element_t pte;
gfn_t table_gfn;
unsigned index, pt_access, pte_access;
gpa_t pte_gpa;
pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
walker->level = vcpu->mmu.root_level;
walker->table = NULL;
walker->page = NULL;
walker->ptep = NULL;
root = vcpu->cr3;
walk:
walker->level = vcpu->arch.mmu.root_level;
pte = vcpu->arch.cr3;
#if PTTYPE == 64
if (!is_long_mode(vcpu)) {
walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
root = *walker->ptep;
walker->pte = root;
if (!(root & PT_PRESENT_MASK))
pte = vcpu->arch.pdptrs[(addr >> 30) & 3];
if (!is_present_pte(pte))
goto not_present;
--walker->level;
}
#endif
table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
walker->table_gfn[walker->level - 1] = table_gfn;
pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
walker->level - 1, table_gfn);
slot = gfn_to_memslot(vcpu->kvm, table_gfn);
hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
walker->page = pfn_to_page(hpa >> PAGE_SHIFT);
walker->table = kmap_atomic(walker->page, KM_USER0);
ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
(vcpu->cr3 & CR3_NONPAE_RESERVED_BITS) == 0);
walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
pt_access = ACC_ALL;
for (;;) {
int index = PT_INDEX(addr, walker->level);
hpa_t paddr;
index = PT_INDEX(addr, walker->level);
ptep = &walker->table[index];
walker->index = index;
ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
((unsigned long)ptep & PAGE_MASK));
table_gfn = gpte_to_gfn(pte);
pte_gpa = gfn_to_gpa(table_gfn);
pte_gpa += index * sizeof(pt_element_t);
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
walker->level - 1, table_gfn);
kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
if (!is_present_pte(*ptep))
if (!is_present_pte(pte))
goto not_present;
if (write_fault && !is_writeble_pte(*ptep))
if (write_fault && !is_writeble_pte(pte))
if (user_fault || is_write_protection(vcpu))
goto access_error;
if (user_fault && !(*ptep & PT_USER_MASK))
if (user_fault && !(pte & PT_USER_MASK))
goto access_error;
#if PTTYPE == 64
if (fetch_fault && is_nx(vcpu) && (*ptep & PT64_NX_MASK))
if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
goto access_error;
#endif
if (!(*ptep & PT_ACCESSED_MASK)) {
if (!(pte & PT_ACCESSED_MASK)) {
mark_page_dirty(vcpu->kvm, table_gfn);
*ptep |= PT_ACCESSED_MASK;
if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
index, pte, pte|PT_ACCESSED_MASK))
goto walk;
pte |= PT_ACCESSED_MASK;
}
pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
walker->ptes[walker->level - 1] = pte;
if (walker->level == PT_PAGE_TABLE_LEVEL) {
walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
>> PAGE_SHIFT;
walker->gfn = gpte_to_gfn(pte);
break;
}
if (walker->level == PT_DIRECTORY_LEVEL
&& (*ptep & PT_PAGE_SIZE_MASK)
&& (pte & PT_PAGE_SIZE_MASK)
&& (PTTYPE == 64 || is_pse(vcpu))) {
walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
>> PAGE_SHIFT;
walker->gfn = gpte_to_gfn_pde(pte);
walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
if (PTTYPE == 32 && is_cpuid_PSE36())
walker->gfn += pse36_gfn_delta(pte);
break;
}
walker->inherited_ar &= walker->table[index];
table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
kunmap_atomic(walker->table, KM_USER0);
paddr = safe_gpa_to_hpa(vcpu, table_gfn << PAGE_SHIFT);
walker->page = pfn_to_page(paddr >> PAGE_SHIFT);
walker->table = kmap_atomic(walker->page, KM_USER0);
pt_access = pte_access;
--walker->level;
walker->table_gfn[walker->level - 1 ] = table_gfn;
pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
walker->level - 1, table_gfn);
}
walker->pte = *ptep;
if (walker->page)
walker->ptep = NULL;
if (walker->table)
kunmap_atomic(walker->table, KM_USER0);
pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
if (write_fault && !is_dirty_pte(pte)) {
bool ret;
mark_page_dirty(vcpu->kvm, table_gfn);
ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
pte|PT_DIRTY_MASK);
if (ret)
goto walk;
pte |= PT_DIRTY_MASK;
kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
walker->ptes[walker->level - 1] = pte;
}
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
__FUNCTION__, (u64)pte, pt_access, pte_access);
return 1;
not_present:
......@@ -187,153 +236,35 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
walker->error_code |= PFERR_USER_MASK;
if (fetch_fault)
walker->error_code |= PFERR_FETCH_MASK;
if (walker->table)
kunmap_atomic(walker->table, KM_USER0);
return 0;
}
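The walker above indexes one guest page table per level with a slice of the virtual address, re-reads each gpte through kvm_read_guest(), and sets the accessed/dirty bits with a cmpxchg, restarting from the walk label if the entry changed underneath it. A toy user-space model of just the indexing loop, using two 512-entry levels in a flat array, a simplified bit layout, and no permission checks or cmpxchg retry:

#include <stdint.h>
#include <stdio.h>

#define PRESENT   (1ULL << 0)
#define ACCESSED  (1ULL << 5)
#define ADDR_MASK (~0xfffULL)

static uint64_t gmem[2 * 512];          /* table 0 = "root", table 1 = leaf */

static uint64_t *table_at(uint64_t gfn) /* gfn -> pte array, like gfn_to_page */
{
	return &gmem[gfn * 512];
}

/* Returns the leaf gfn for vaddr, or -1 if a level is not present. */
static long long walk(uint64_t root_gfn, uint64_t vaddr)
{
	uint64_t gfn = root_gfn;
	int level;

	for (level = 2; level >= 1; level--) {
		int index = (vaddr >> (12 + 9 * (level - 1))) & 511;
		uint64_t *pte = &table_at(gfn)[index];

		if (!(*pte & PRESENT))
			return -1;
		/* the real walker sets ACCESSED with a cmpxchg and restarts
		 * the walk if the pte changed under it */
		*pte |= ACCESSED;
		gfn = (*pte & ADDR_MASK) >> 12;
		if (level == 1)
			return gfn;
	}
	return -1;
}

int main(void)
{
	/* map vaddr 0x201000: root[1] -> table 1, table1[1] -> gfn 0x1234 */
	table_at(0)[1] = (1ULL << 12) | PRESENT;
	table_at(1)[1] = (0x1234ULL << 12) | PRESENT;

	printf("gfn = %llx\n", (unsigned long long)walk(0, 0x201000));
	return 0;
}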
static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
struct guest_walker *walker)
{
mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
}
static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
u64 *shadow_pte,
gpa_t gaddr,
pt_element_t gpte,
u64 access_bits,
int user_fault,
int write_fault,
int *ptwrite,
struct guest_walker *walker,
gfn_t gfn)
{
hpa_t paddr;
int dirty = gpte & PT_DIRTY_MASK;
u64 spte = *shadow_pte;
int was_rmapped = is_rmap_pte(spte);
pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
" user_fault %d gfn %lx\n",
__FUNCTION__, spte, (u64)gpte, access_bits,
write_fault, user_fault, gfn);
if (write_fault && !dirty) {
pt_element_t *guest_ent, *tmp = NULL;
if (walker->ptep)
guest_ent = walker->ptep;
else {
tmp = kmap_atomic(walker->page, KM_USER0);
guest_ent = &tmp[walker->index];
}
*guest_ent |= PT_DIRTY_MASK;
if (!walker->ptep)
kunmap_atomic(tmp, KM_USER0);
dirty = 1;
FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
}
spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
spte |= gpte & PT64_NX_MASK;
if (!dirty)
access_bits &= ~PT_WRITABLE_MASK;
paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
spte |= PT_PRESENT_MASK;
if (access_bits & PT_USER_MASK)
spte |= PT_USER_MASK;
if (is_error_hpa(paddr)) {
spte |= gaddr;
spte |= PT_SHADOW_IO_MARK;
spte &= ~PT_PRESENT_MASK;
set_shadow_pte(shadow_pte, spte);
return;
}
spte |= paddr;
if ((access_bits & PT_WRITABLE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
struct kvm_mmu_page *shadow;
spte |= PT_WRITABLE_MASK;
if (user_fault) {
mmu_unshadow(vcpu, gfn);
goto unshadowed;
}
shadow = kvm_mmu_lookup_page(vcpu, gfn);
if (shadow) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
access_bits &= ~PT_WRITABLE_MASK;
if (is_writeble_pte(spte)) {
spte &= ~PT_WRITABLE_MASK;
kvm_x86_ops->tlb_flush(vcpu);
}
if (write_fault)
*ptwrite = 1;
}
}
unshadowed:
if (access_bits & PT_WRITABLE_MASK)
mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
set_shadow_pte(shadow_pte, spte);
page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
if (!was_rmapped)
rmap_add(vcpu, shadow_pte);
}
static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
u64 *shadow_pte, u64 access_bits,
int user_fault, int write_fault, int *ptwrite,
struct guest_walker *walker, gfn_t gfn)
{
access_bits &= gpte;
FNAME(set_pte_common)(vcpu, shadow_pte, gpte & PT_BASE_ADDR_MASK,
gpte, access_bits, user_fault, write_fault,
ptwrite, walker, gfn);
}
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
u64 *spte, const void *pte, int bytes)
u64 *spte, const void *pte, int bytes,
int offset_in_pte)
{
pt_element_t gpte;
unsigned pte_access;
struct page *npage;
if (bytes < sizeof(pt_element_t))
return;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
if (!offset_in_pte && !is_present_pte(gpte))
set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
return;
}
if (bytes < sizeof(pt_element_t))
return;
pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
0, NULL, NULL,
(gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT);
}
static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
u64 *shadow_pte, u64 access_bits,
int user_fault, int write_fault, int *ptwrite,
struct guest_walker *walker, gfn_t gfn)
{
gpa_t gaddr;
access_bits &= gpde;
gaddr = (gpa_t)gfn << PAGE_SHIFT;
if (PTTYPE == 32 && is_cpuid_PSE36())
gaddr |= (gpde & PT32_DIR_PSE36_MASK) <<
(32 - PT32_DIR_PSE36_SHIFT);
FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
gpde, access_bits, user_fault, write_fault,
ptwrite, walker, gfn);
pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
return;
npage = vcpu->arch.update_pte.page;
if (!npage)
return;
get_page(npage);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
}
/*
......@@ -341,20 +272,21 @@ static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t gpde,
*/
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *walker,
int user_fault, int write_fault, int *ptwrite)
int user_fault, int write_fault, int *ptwrite,
struct page *page)
{
hpa_t shadow_addr;
int level;
u64 *shadow_ent;
u64 *prev_shadow_ent = NULL;
unsigned access = walker->pt_access;
if (!is_present_pte(walker->pte))
if (!is_present_pte(walker->ptes[walker->level - 1]))
return NULL;
shadow_addr = vcpu->mmu.root_hpa;
level = vcpu->mmu.shadow_root_level;
shadow_addr = vcpu->arch.mmu.root_hpa;
level = vcpu->arch.mmu.shadow_root_level;
if (level == PT32E_ROOT_LEVEL) {
shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
shadow_addr &= PT64_BASE_ADDR_MASK;
--level;
}
......@@ -365,54 +297,51 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
u64 shadow_pte;
int metaphysical;
gfn_t table_gfn;
unsigned hugepage_access = 0;
bool new_page = 0;
shadow_ent = ((u64 *)__va(shadow_addr)) + index;
if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
if (level == PT_PAGE_TABLE_LEVEL)
break;
if (level == PT_PAGE_TABLE_LEVEL)
break;
if (is_shadow_present_pte(*shadow_ent)) {
shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
prev_shadow_ent = shadow_ent;
continue;
}
if (level == PT_PAGE_TABLE_LEVEL)
break;
if (level - 1 == PT_PAGE_TABLE_LEVEL
&& walker->level == PT_DIRECTORY_LEVEL) {
metaphysical = 1;
hugepage_access = walker->pte;
hugepage_access &= PT_USER_MASK | PT_WRITABLE_MASK;
if (walker->pte & PT64_NX_MASK)
hugepage_access |= (1 << 2);
hugepage_access >>= PT_WRITABLE_SHIFT;
table_gfn = (walker->pte & PT_BASE_ADDR_MASK)
>> PAGE_SHIFT;
if (!is_dirty_pte(walker->ptes[level - 1]))
access &= ~ACC_WRITE_MASK;
table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
} else {
metaphysical = 0;
table_gfn = walker->table_gfn[level - 2];
}
shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
metaphysical, hugepage_access,
shadow_ent);
metaphysical, access,
shadow_ent, &new_page);
if (new_page && !metaphysical) {
int r;
pt_element_t curr_pte;
r = kvm_read_guest_atomic(vcpu->kvm,
walker->pte_gpa[level - 2],
&curr_pte, sizeof(curr_pte));
if (r || curr_pte != walker->ptes[level - 2]) {
kvm_release_page_clean(page);
return NULL;
}
}
shadow_addr = __pa(shadow_page->spt);
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
*shadow_ent = shadow_pte;
prev_shadow_ent = shadow_ent;
}
if (walker->level == PT_DIRECTORY_LEVEL) {
FNAME(set_pde)(vcpu, walker->pte, shadow_ent,
walker->inherited_ar, user_fault, write_fault,
ptwrite, walker, walker->gfn);
} else {
ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
FNAME(set_pte)(vcpu, walker->pte, shadow_ent,
walker->inherited_ar, user_fault, write_fault,
ptwrite, walker, walker->gfn);
}
mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
user_fault, write_fault,
walker->ptes[walker->level-1] & PT_DIRTY_MASK,
ptwrite, walker->gfn, page);
return shadow_ent;
}
......@@ -440,6 +369,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
u64 *shadow_pte;
int write_pt = 0;
int r;
struct page *page;
pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
kvm_mmu_audit(vcpu, "pre page fault");
......@@ -448,6 +378,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (r)
return r;
down_read(&current->mm->mmap_sem);
/*
* Look up the shadow pte for the faulting address.
*/
......@@ -460,26 +391,36 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (!r) {
pgprintk("%s: guest page fault\n", __FUNCTION__);
inject_page_fault(vcpu, addr, walker.error_code);
vcpu->last_pt_write_count = 0; /* reset fork detector */
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
up_read(&current->mm->mmap_sem);
return 0;
}
page = gfn_to_page(vcpu->kvm, walker.gfn);
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
&write_pt);
&write_pt, page);
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
shadow_pte, *shadow_pte, write_pt);
if (!write_pt)
vcpu->last_pt_write_count = 0; /* reset fork detector */
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
/*
* mmio: emulate if accessible, otherwise it's a guest fault.
*/
if (is_io_pte(*shadow_pte))
if (shadow_pte && is_io_pte(*shadow_pte)) {
spin_unlock(&vcpu->kvm->mmu_lock);
up_read(&current->mm->mmap_sem);
return 1;
}
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, "post page fault (fixed)");
spin_unlock(&vcpu->kvm->mmu_lock);
up_read(&current->mm->mmap_sem);
return write_pt;
}
......@@ -493,13 +434,41 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
if (r) {
gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
}
return gpa;
}
static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
int i, offset = 0, r = 0;
pt_element_t pt;
if (sp->role.metaphysical
|| (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
nonpaging_prefetch_page(vcpu, sp);
return;
}
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
gpa_t pte_gpa = gfn_to_gpa(sp->gfn);
pte_gpa += (i+offset) * sizeof(pt_element_t);
r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt,
sizeof(pt_element_t));
if (r || is_present_pte(pt))
sp->spt[i] = shadow_trap_nonpresent_pte;
else
sp->spt[i] = shadow_notrap_nonpresent_pte;
}
}
#undef pt_element_t
#undef guest_walker
#undef FNAME
......@@ -508,4 +477,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
#undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_DIR_BASE_ADDR_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_pde
#undef CMPXCHG
#ifndef __SEGMENT_DESCRIPTOR_H
#define __SEGMENT_DESCRIPTOR_H
struct segment_descriptor {
u16 limit_low;
u16 base_low;
......@@ -14,4 +17,13 @@ struct segment_descriptor {
u8 base_high;
} __attribute__((packed));
#ifdef CONFIG_X86_64
/* LDT or TSS descriptor in the GDT. 16 bytes. */
struct segment_descriptor_64 {
struct segment_descriptor s;
u32 base_higher;
u32 pad_zero;
};
#endif
#endif
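The 64-bit LDT/TSS descriptor is simply the legacy 8-byte descriptor followed by the upper 32 bits of the base and a mandatory zero word, giving the 16 bytes the comment mentions. A compile-time size check with equivalent structs re-declared using <stdint.h> types; the middle field names are stand-ins for the ones elided above:

#include <stdint.h>

struct seg_desc {
	uint16_t limit_low;
	uint16_t base_low;
	uint8_t  base_mid;          /* field names here are illustrative */
	uint8_t  type_dpl_present;
	uint8_t  limit_high_flags;
	uint8_t  base_high;
} __attribute__((packed));

struct seg_desc_64 {
	struct seg_desc s;
	uint32_t base_higher;
	uint32_t pad_zero;
};

_Static_assert(sizeof(struct seg_desc) == 8, "legacy descriptor is 8 bytes");
_Static_assert(sizeof(struct seg_desc_64) == 16, "LDT or TSS descriptor is 16 bytes");

int main(void)
{
	return 0;
}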
......@@ -13,10 +13,11 @@
* the COPYING file in the top-level directory.
*
*/
#include <linux/kvm_host.h>
#include "kvm_svm.h"
#include "x86_emulate.h"
#include "irq.h"
#include "mmu.h"
#include <linux/module.h>
#include <linux/kernel.h>
......@@ -42,9 +43,6 @@ MODULE_LICENSE("GPL");
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
#define KVM_EFER_LMA (1 << 10)
#define KVM_EFER_LME (1 << 8)
#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_DEATURE_SVML (1 << 2)
......@@ -102,20 +100,20 @@ static inline u32 svm_has(u32 feat)
static inline u8 pop_irq(struct kvm_vcpu *vcpu)
{
int word_index = __ffs(vcpu->irq_summary);
int bit_index = __ffs(vcpu->irq_pending[word_index]);
int word_index = __ffs(vcpu->arch.irq_summary);
int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
int irq = word_index * BITS_PER_LONG + bit_index;
clear_bit(bit_index, &vcpu->irq_pending[word_index]);
if (!vcpu->irq_pending[word_index])
clear_bit(word_index, &vcpu->irq_summary);
clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
if (!vcpu->arch.irq_pending[word_index])
clear_bit(word_index, &vcpu->arch.irq_summary);
return irq;
}
static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
{
set_bit(irq, vcpu->irq_pending);
set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
set_bit(irq, vcpu->arch.irq_pending);
set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
}
static inline void clgi(void)
......@@ -184,35 +182,30 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
if (!(efer & KVM_EFER_LMA))
efer &= ~KVM_EFER_LME;
if (!(efer & EFER_LMA))
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
vcpu->shadow_efer = efer;
vcpu->arch.shadow_efer = efer;
}
static void svm_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
SVM_EVTINJ_VALID_ERR |
SVM_EVTINJ_TYPE_EXEPT |
GP_VECTOR;
svm->vmcb->control.event_inj = nr
| SVM_EVTINJ_VALID
| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
| SVM_EVTINJ_TYPE_EXEPT;
svm->vmcb->control.event_inj_err = error_code;
}
static void inject_ud(struct kvm_vcpu *vcpu)
static bool svm_exception_injected(struct kvm_vcpu *vcpu)
{
to_svm(vcpu)->vmcb->control.event_inj = SVM_EVTINJ_VALID |
SVM_EVTINJ_TYPE_EXEPT |
UD_VECTOR;
}
struct vcpu_svm *svm = to_svm(vcpu);
static int is_page_fault(uint32_t info)
{
info &= SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
return info == (PF_VECTOR | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT);
return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
}
static int is_external_interrupt(u32 info)
......@@ -229,17 +222,16 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
return;
}
if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) {
if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
__FUNCTION__,
svm->vmcb->save.rip,
svm->next_rip);
}
vcpu->rip = svm->vmcb->save.rip = svm->next_rip;
vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
vcpu->interrupt_window_open = 1;
vcpu->arch.interrupt_window_open = 1;
}
static int has_svm(void)
......@@ -312,7 +304,7 @@ static void svm_hardware_enable(void *garbage)
svm_data->next_asid = svm_data->max_asid + 1;
svm_features = cpuid_edx(SVM_CPUID_FUNC);
asm volatile ( "sgdt %0" : "=m"(gdt_descr) );
asm volatile ("sgdt %0" : "=m"(gdt_descr));
gdt = (struct desc_struct *)gdt_descr.address;
svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
......@@ -458,11 +450,13 @@ static void init_vmcb(struct vmcb *vmcb)
control->intercept_cr_read = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
INTERCEPT_CR4_MASK;
INTERCEPT_CR4_MASK |
INTERCEPT_CR8_MASK;
control->intercept_cr_write = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
INTERCEPT_CR4_MASK;
INTERCEPT_CR4_MASK |
INTERCEPT_CR8_MASK;
control->intercept_dr_read = INTERCEPT_DR0_MASK |
INTERCEPT_DR1_MASK |
......@@ -476,7 +470,8 @@ static void init_vmcb(struct vmcb *vmcb)
INTERCEPT_DR5_MASK |
INTERCEPT_DR7_MASK;
control->intercept_exceptions = 1 << PF_VECTOR;
control->intercept_exceptions = (1 << PF_VECTOR) |
(1 << UD_VECTOR);
control->intercept = (1ULL << INTERCEPT_INTR) |
......@@ -543,8 +538,7 @@ static void init_vmcb(struct vmcb *vmcb)
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
save->efer = MSR_EFER_SVME_MASK;
save->dr6 = 0xffff0ff0;
save->dr6 = 0xffff0ff0;
save->dr7 = 0x400;
save->rflags = 2;
save->rip = 0x0000fff0;
......@@ -558,7 +552,7 @@ static void init_vmcb(struct vmcb *vmcb)
/* rdx = ?? */
}
static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
......@@ -566,9 +560,11 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
if (vcpu->vcpu_id != 0) {
svm->vmcb->save.rip = 0;
svm->vmcb->save.cs.base = svm->vcpu.sipi_vector << 12;
svm->vmcb->save.cs.selector = svm->vcpu.sipi_vector << 8;
svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
}
return 0;
}
static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
......@@ -587,12 +583,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
if (err)
goto free_svm;
if (irqchip_in_kernel(kvm)) {
err = kvm_create_lapic(&svm->vcpu);
if (err < 0)
goto free_svm;
}
page = alloc_page(GFP_KERNEL);
if (!page) {
err = -ENOMEM;
......@@ -608,9 +598,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
fx_init(&svm->vcpu);
svm->vcpu.fpu_active = 1;
svm->vcpu.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (svm->vcpu.vcpu_id == 0)
svm->vcpu.apic_base |= MSR_IA32_APICBASE_BSP;
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
return &svm->vcpu;
......@@ -644,7 +634,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* increasing TSC.
*/
rdtscll(tsc_this);
delta = vcpu->host_tsc - tsc_this;
delta = vcpu->arch.host_tsc - tsc_this;
svm->vmcb->control.tsc_offset += delta;
vcpu->cpu = cpu;
kvm_migrate_apic_timer(vcpu);
......@@ -659,11 +649,11 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
int i;
++vcpu->stat.host_state_reload;
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
rdtscll(vcpu->host_tsc);
kvm_put_guest_fpu(vcpu);
rdtscll(vcpu->arch.host_tsc);
}
static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
......@@ -674,17 +664,17 @@ static void svm_cache_regs(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
vcpu->regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->rip = svm->vmcb->save.rip;
vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.rip = svm->vmcb->save.rip;
}
static void svm_decache_regs(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.rax = vcpu->regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->regs[VCPU_REGS_RSP];
svm->vmcb->save.rip = vcpu->rip;
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
svm->vmcb->save.rip = vcpu->arch.rip;
}
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
......@@ -782,24 +772,24 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
struct vcpu_svm *svm = to_svm(vcpu);
#ifdef CONFIG_X86_64
if (vcpu->shadow_efer & KVM_EFER_LME) {
if (vcpu->arch.shadow_efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
vcpu->shadow_efer |= KVM_EFER_LMA;
svm->vmcb->save.efer |= KVM_EFER_LMA | KVM_EFER_LME;
vcpu->arch.shadow_efer |= EFER_LMA;
svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
}
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG) ) {
vcpu->shadow_efer &= ~KVM_EFER_LMA;
svm->vmcb->save.efer &= ~(KVM_EFER_LMA | KVM_EFER_LME);
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
vcpu->arch.shadow_efer &= ~EFER_LMA;
svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
}
}
#endif
if ((vcpu->cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
vcpu->fpu_active = 1;
}
vcpu->cr0 = cr0;
vcpu->arch.cr0 = cr0;
cr0 |= X86_CR0_PG | X86_CR0_WP;
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
......@@ -807,7 +797,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
vcpu->cr4 = cr4;
vcpu->arch.cr4 = cr4;
to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
}
......@@ -912,7 +902,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
svm->db_regs[dr] = value;
return;
case 4 ... 5:
if (vcpu->cr4 & X86_CR4_DE) {
if (vcpu->arch.cr4 & X86_CR4_DE) {
*exception = UD_VECTOR;
return;
}
......@@ -938,51 +928,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
struct kvm *kvm = svm->vcpu.kvm;
u64 fault_address;
u32 error_code;
enum emulation_result er;
int r;
if (!irqchip_in_kernel(kvm) &&
is_external_interrupt(exit_int_info))
push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
mutex_lock(&kvm->lock);
fault_address = svm->vmcb->control.exit_info_2;
error_code = svm->vmcb->control.exit_info_1;
r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
if (r < 0) {
mutex_unlock(&kvm->lock);
return r;
}
if (!r) {
mutex_unlock(&kvm->lock);
return 1;
}
er = emulate_instruction(&svm->vcpu, kvm_run, fault_address,
error_code);
mutex_unlock(&kvm->lock);
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
}
switch (er) {
case EMULATE_DONE:
return 1;
case EMULATE_DO_MMIO:
++svm->vcpu.stat.mmio_exits;
return 0;
case EMULATE_FAIL:
kvm_report_emulation_failure(&svm->vcpu, "pagetable");
break;
default:
BUG();
}
static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
int er;
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
return 0;
er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
if (!(svm->vcpu.cr0 & X86_CR0_TS))
if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
svm->vmcb->save.cr0 &= ~X86_CR0_TS;
svm->vcpu.fpu_active = 1;
......@@ -1004,7 +973,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
u32 io_info = svm->vmcb->control.exit_info_1; //address size bug?
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, down, in, string, rep;
unsigned port;
......@@ -1015,7 +984,8 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
string = (io_info & SVM_IOIO_STR_MASK) != 0;
if (string) {
if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
if (emulate_instruction(&svm->vcpu,
kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
return 0;
return 1;
}
......@@ -1045,13 +1015,14 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
svm->next_rip = svm->vmcb->save.rip + 3;
skip_emulated_instruction(&svm->vcpu);
return kvm_hypercall(&svm->vcpu, kvm_run);
kvm_emulate_hypercall(&svm->vcpu);
return 1;
}
static int invalid_op_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
inject_ud(&svm->vcpu);
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
......@@ -1073,11 +1044,20 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
static int emulate_on_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
if (emulate_instruction(&svm->vcpu, NULL, 0, 0) != EMULATE_DONE)
if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
return 1;
}
static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
if (irqchip_in_kernel(svm->vcpu.kvm))
return 1;
kvm_run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
{
struct vcpu_svm *svm = to_svm(vcpu);
......@@ -1124,14 +1104,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data;
if (svm_get_msr(&svm->vcpu, ecx, &data))
svm_inject_gp(&svm->vcpu, 0);
kvm_inject_gp(&svm->vcpu, 0);
else {
svm->vmcb->save.rax = data & 0xffffffff;
svm->vcpu.regs[VCPU_REGS_RDX] = data >> 32;
svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
svm->next_rip = svm->vmcb->save.rip + 2;
skip_emulated_instruction(&svm->vcpu);
}
......@@ -1176,7 +1156,20 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
case MSR_IA32_SYSENTER_ESP:
svm->vmcb->save.sysenter_esp = data;
break;
case MSR_K7_EVNTSEL0:
case MSR_K7_EVNTSEL1:
case MSR_K7_EVNTSEL2:
case MSR_K7_EVNTSEL3:
/*
* only support writing 0 to the performance counters for now
* to make Windows happy. Should be replaced by a real
* performance counter emulation later.
*/
if (data != 0)
goto unhandled;
break;
default:
unhandled:
return kvm_set_msr_common(vcpu, ecx, data);
}
return 0;
......@@ -1184,12 +1177,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
u32 ecx = svm->vcpu.regs[VCPU_REGS_RCX];
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data = (svm->vmcb->save.rax & -1u)
| ((u64)(svm->vcpu.regs[VCPU_REGS_RDX] & -1u) << 32);
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
svm->next_rip = svm->vmcb->save.rip + 2;
if (svm_set_msr(&svm->vcpu, ecx, data))
svm_inject_gp(&svm->vcpu, 0);
kvm_inject_gp(&svm->vcpu, 0);
else
skip_emulated_instruction(&svm->vcpu);
return 1;
......@@ -1213,7 +1206,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
* possible
*/
if (kvm_run->request_interrupt_window &&
!svm->vcpu.irq_summary) {
!svm->vcpu.arch.irq_summary) {
++svm->vcpu.stat.irq_window_exits;
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
return 0;
......@@ -1227,10 +1220,12 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_READ_CR0] = emulate_on_interception,
[SVM_EXIT_READ_CR3] = emulate_on_interception,
[SVM_EXIT_READ_CR4] = emulate_on_interception,
[SVM_EXIT_READ_CR8] = emulate_on_interception,
/* for now: */
[SVM_EXIT_WRITE_CR0] = emulate_on_interception,
[SVM_EXIT_WRITE_CR3] = emulate_on_interception,
[SVM_EXIT_WRITE_CR4] = emulate_on_interception,
[SVM_EXIT_WRITE_CR8] = cr8_write_interception,
[SVM_EXIT_READ_DR0] = emulate_on_interception,
[SVM_EXIT_READ_DR1] = emulate_on_interception,
[SVM_EXIT_READ_DR2] = emulate_on_interception,
......@@ -1241,6 +1236,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_WRITE_DR3] = emulate_on_interception,
[SVM_EXIT_WRITE_DR5] = emulate_on_interception,
[SVM_EXIT_WRITE_DR7] = emulate_on_interception,
[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
[SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
[SVM_EXIT_INTR] = nop_on_interception,
......@@ -1293,7 +1289,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
exit_code);
if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
|| svm_exit_handlers[exit_code] == 0) {
|| !svm_exit_handlers[exit_code]) {
kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
kvm_run->hw.hardware_exit_reason = exit_code;
return 0;
......@@ -1307,7 +1303,7 @@ static void reload_tss(struct kvm_vcpu *vcpu)
int cpu = raw_smp_processor_id();
struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu);
svm_data->tss_desc->type = 9; //available 32/64-bit TSS
svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */
load_TR_desc();
}
......@@ -1348,7 +1344,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
struct vmcb *vmcb = svm->vmcb;
int intr_vector = -1;
kvm_inject_pending_timer_irqs(vcpu);
if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
intr_vector = vmcb->control.exit_int_info &
......@@ -1388,20 +1383,20 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
push_irq(&svm->vcpu, control->int_vector);
}
svm->vcpu.interrupt_window_open =
svm->vcpu.arch.interrupt_window_open =
!(control->int_state & SVM_INTERRUPT_SHADOW_MASK);
}
static void svm_do_inject_vector(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
int word_index = __ffs(vcpu->irq_summary);
int bit_index = __ffs(vcpu->irq_pending[word_index]);
int word_index = __ffs(vcpu->arch.irq_summary);
int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
int irq = word_index * BITS_PER_LONG + bit_index;
clear_bit(bit_index, &vcpu->irq_pending[word_index]);
if (!vcpu->irq_pending[word_index])
clear_bit(word_index, &vcpu->irq_summary);
clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
if (!vcpu->arch.irq_pending[word_index])
clear_bit(word_index, &vcpu->arch.irq_summary);
svm_inject_irq(svm, irq);
}
......@@ -1411,11 +1406,11 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control;
svm->vcpu.interrupt_window_open =
svm->vcpu.arch.interrupt_window_open =
(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
(svm->vmcb->save.rflags & X86_EFLAGS_IF));
if (svm->vcpu.interrupt_window_open && svm->vcpu.irq_summary)
if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
/*
* If interrupts enabled, and not blocked by sti or mov ss. Good.
*/
......@@ -1424,13 +1419,18 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
/*
* Interrupts blocked. Wait for unblock.
*/
if (!svm->vcpu.interrupt_window_open &&
(svm->vcpu.irq_summary || kvm_run->request_interrupt_window)) {
if (!svm->vcpu.arch.interrupt_window_open &&
(svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
control->intercept |= 1ULL << INTERCEPT_VINTR;
} else
else
control->intercept &= ~(1ULL << INTERCEPT_VINTR);
}
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
return 0;
}
static void save_db_regs(unsigned long *db_regs)
{
asm volatile ("mov %%dr0, %0" : "=r"(db_regs[0]));
......@@ -1472,7 +1472,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
svm->host_cr2 = kvm_read_cr2();
svm->host_dr6 = read_dr6();
svm->host_dr7 = read_dr7();
svm->vmcb->save.cr2 = vcpu->cr2;
svm->vmcb->save.cr2 = vcpu->arch.cr2;
if (svm->vmcb->save.dr7 & 0xff) {
write_dr7(0);
......@@ -1486,13 +1486,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
asm volatile (
#ifdef CONFIG_X86_64
"push %%rbx; push %%rcx; push %%rdx;"
"push %%rsi; push %%rdi; push %%rbp;"
"push %%r8; push %%r9; push %%r10; push %%r11;"
"push %%r12; push %%r13; push %%r14; push %%r15;"
"push %%rbp; \n\t"
#else
"push %%ebx; push %%ecx; push %%edx;"
"push %%esi; push %%edi; push %%ebp;"
"push %%ebp; \n\t"
#endif
#ifdef CONFIG_X86_64
......@@ -1554,10 +1550,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %%r14, %c[r14](%[svm]) \n\t"
"mov %%r15, %c[r15](%[svm]) \n\t"
"pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
"pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
"pop %%rbp; pop %%rdi; pop %%rsi;"
"pop %%rdx; pop %%rcx; pop %%rbx; \n\t"
"pop %%rbp; \n\t"
#else
"mov %%ebx, %c[rbx](%[svm]) \n\t"
"mov %%ecx, %c[rcx](%[svm]) \n\t"
......@@ -1566,34 +1559,40 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %%edi, %c[rdi](%[svm]) \n\t"
"mov %%ebp, %c[rbp](%[svm]) \n\t"
"pop %%ebp; pop %%edi; pop %%esi;"
"pop %%edx; pop %%ecx; pop %%ebx; \n\t"
"pop %%ebp; \n\t"
#endif
:
: [svm]"a"(svm),
[vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
[rbx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RCX])),
[rdx]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDX])),
[rsi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RSI])),
[rdi]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RDI])),
[rbp]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_RBP]))
[rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
[rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
[rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
[rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
[rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
#ifdef CONFIG_X86_64
,[r8 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R8])),
[r9 ]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R9 ])),
[r10]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R10])),
[r11]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R11])),
[r12]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R12])),
[r13]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R13])),
[r14]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R14])),
[r15]"i"(offsetof(struct vcpu_svm,vcpu.regs[VCPU_REGS_R15]))
, [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
[r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
[r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
[r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
[r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
[r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
[r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
[r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
#endif
: "cc", "memory" );
: "cc", "memory"
#ifdef CONFIG_X86_64
, "rbx", "rcx", "rdx", "rsi", "rdi"
, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
#else
, "ebx", "ecx", "edx" , "esi", "edi"
#endif
);
if ((svm->vmcb->save.dr7 & 0xff))
load_db_regs(svm->host_db_regs);
vcpu->cr2 = svm->vmcb->save.cr2;
vcpu->arch.cr2 = svm->vmcb->save.cr2;
write_dr6(svm->host_dr6);
write_dr7(svm->host_dr7);
......@@ -1627,34 +1626,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
}
}
static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
unsigned long addr,
uint32_t err_code)
{
struct vcpu_svm *svm = to_svm(vcpu);
uint32_t exit_int_info = svm->vmcb->control.exit_int_info;
++vcpu->stat.pf_guest;
if (is_page_fault(exit_int_info)) {
svm->vmcb->control.event_inj_err = 0;
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
SVM_EVTINJ_VALID_ERR |
SVM_EVTINJ_TYPE_EXEPT |
DF_VECTOR;
return;
}
vcpu->cr2 = addr;
svm->vmcb->save.cr2 = addr;
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID |
SVM_EVTINJ_VALID_ERR |
SVM_EVTINJ_TYPE_EXEPT |
PF_VECTOR;
svm->vmcb->control.event_inj_err = err_code;
}
static int is_disabled(void)
{
u64 vm_cr;
......@@ -1675,7 +1646,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[0] = 0x0f;
hypercall[1] = 0x01;
hypercall[2] = 0xd9;
hypercall[3] = 0xc3;
}
static void svm_check_processor_compat(void *rtn)
......@@ -1683,6 +1653,11 @@ static void svm_check_processor_compat(void *rtn)
*(int *)rtn = 0;
}
static bool svm_cpu_has_accelerated_tpr(void)
{
return false;
}
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
......@@ -1691,6 +1666,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.check_processor_compatibility = svm_check_processor_compat,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,
......@@ -1725,9 +1701,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_rflags = svm_set_rflags,
.tlb_flush = svm_flush_tlb,
.inject_page_fault = svm_inject_page_fault,
.inject_gp = svm_inject_gp,
.run = svm_vcpu_run,
.handle_exit = handle_exit,
......@@ -1735,19 +1708,23 @@ static struct kvm_x86_ops svm_x86_ops = {
.patch_hypercall = svm_patch_hypercall,
.get_irq = svm_get_irq,
.set_irq = svm_set_irq,
.queue_exception = svm_queue_exception,
.exception_injected = svm_exception_injected,
.inject_pending_irq = svm_intr_assist,
.inject_pending_vectors = do_interrupt_requests,
.set_tss_addr = svm_set_tss_addr,
};
static int __init svm_init(void)
{
return kvm_init_x86(&svm_x86_ops, sizeof(struct vcpu_svm),
return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
THIS_MODULE);
}
static void __exit svm_exit(void)
{
kvm_exit_x86();
kvm_exit();
}
module_init(svm_init)
......
......@@ -204,6 +204,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define INTERCEPT_CR0_MASK 1
#define INTERCEPT_CR3_MASK (1 << 3)
#define INTERCEPT_CR4_MASK (1 << 4)
#define INTERCEPT_CR8_MASK (1 << 8)
#define INTERCEPT_DR0_MASK 1
#define INTERCEPT_DR1_MASK (1 << 1)
......@@ -311,7 +312,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXIT_ERR -1
#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) // TS and MP
#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
......
......@@ -15,17 +15,18 @@
*
*/
#include "kvm.h"
#include "x86_emulate.h"
#include "irq.h"
#include "vmx.h"
#include "segment_descriptor.h"
#include "mmu.h"
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <asm/io.h>
#include <asm/desc.h>
......@@ -33,6 +34,9 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static int bypass_guest_pf = 1;
module_param(bypass_guest_pf, bool, 0);
struct vmcs {
u32 revision_id;
u32 abort;
......@@ -43,6 +47,7 @@ struct vcpu_vmx {
struct kvm_vcpu vcpu;
int launched;
u8 fail;
u32 idt_vectoring_info;
struct kvm_msr_entry *guest_msrs;
struct kvm_msr_entry *host_msrs;
int nmsrs;
......@@ -57,8 +62,15 @@ struct vcpu_vmx {
u16 fs_sel, gs_sel, ldt_sel;
int gs_ldt_reload_needed;
int fs_reload_needed;
}host_state;
int guest_efer_loaded;
} host_state;
struct {
struct {
bool pending;
u8 vector;
unsigned rip;
} irq;
} rmode;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
......@@ -74,14 +86,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static struct page *vmx_io_bitmap_a;
static struct page *vmx_io_bitmap_b;
#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
static struct vmcs_config {
int size;
int order;
u32 revision_id;
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
u32 cpu_based_2nd_exec_ctrl;
u32 vmexit_ctrl;
u32 vmentry_ctrl;
} vmcs_config;
......@@ -138,18 +149,6 @@ static void save_msrs(struct kvm_msr_entry *e, int n)
rdmsrl(e[i].index, e[i].data);
}
static inline u64 msr_efer_save_restore_bits(struct kvm_msr_entry msr)
{
return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
}
static inline int msr_efer_need_save_restore(struct vcpu_vmx *vmx)
{
int efer_offset = vmx->msr_offset_efer;
return msr_efer_save_restore_bits(vmx->host_msrs[efer_offset]) !=
msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
}
static inline int is_page_fault(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
......@@ -164,6 +163,13 @@ static inline int is_no_device(u32 intr_info)
(INTR_TYPE_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}
static inline int is_invalid_opcode(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
}
static inline int is_external_interrupt(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
......@@ -180,6 +186,24 @@ static inline int vm_need_tpr_shadow(struct kvm *kvm)
return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
}
static inline int cpu_has_secondary_exec_ctrls(void)
{
return (vmcs_config.cpu_based_exec_ctrl &
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
}
static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
return (vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
}
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
return ((cpu_has_vmx_virtualize_apic_accesses()) &&
(irqchip_in_kernel(kvm)));
}
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
int i;
......@@ -222,16 +246,14 @@ static void __vcpu_clear(void *arg)
vmcs_clear(vmx->vmcs);
if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
rdtscll(vmx->vcpu.host_tsc);
rdtscll(vmx->vcpu.arch.host_tsc);
}
static void vcpu_clear(struct vcpu_vmx *vmx)
{
if (vmx->vcpu.cpu != raw_smp_processor_id() && vmx->vcpu.cpu != -1)
smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear,
vmx, 0, 1);
else
__vcpu_clear(vmx);
if (vmx->vcpu.cpu == -1)
return;
smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1);
vmx->launched = 0;
}
......@@ -275,7 +297,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
u8 error;
asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
: "=q"(error) : "a"(value), "d"(field) : "cc" );
: "=q"(error) : "a"(value), "d"(field) : "cc");
if (unlikely(error))
vmwrite_error(field, value);
}
......@@ -315,12 +337,12 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
eb = 1u << PF_VECTOR;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
if (!vcpu->fpu_active)
eb |= 1u << NM_VECTOR;
if (vcpu->guest_debug.enabled)
eb |= 1u << 1;
if (vcpu->rmode.active)
if (vcpu->arch.rmode.active)
eb = ~0;
vmcs_write32(EXCEPTION_BITMAP, eb);
}
......@@ -344,16 +366,42 @@ static void reload_tss(void)
static void load_transition_efer(struct vcpu_vmx *vmx)
{
u64 trans_efer;
int efer_offset = vmx->msr_offset_efer;
u64 host_efer = vmx->host_msrs[efer_offset].data;
u64 guest_efer = vmx->guest_msrs[efer_offset].data;
u64 ignore_bits;
trans_efer = vmx->host_msrs[efer_offset].data;
trans_efer &= ~EFER_SAVE_RESTORE_BITS;
trans_efer |= msr_efer_save_restore_bits(vmx->guest_msrs[efer_offset]);
wrmsrl(MSR_EFER, trans_efer);
if (efer_offset < 0)
return;
/*
* NX is emulated; LMA and LME handled by hardware; SCE meaningless
* outside long mode
*/
ignore_bits = EFER_NX | EFER_SCE;
#ifdef CONFIG_X86_64
ignore_bits |= EFER_LMA | EFER_LME;
/* SCE is meaningful only in long mode on Intel */
if (guest_efer & EFER_LMA)
ignore_bits &= ~(u64)EFER_SCE;
#endif
if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
return;
vmx->host_state.guest_efer_loaded = 1;
guest_efer &= ~ignore_bits;
guest_efer |= host_efer & ignore_bits;
wrmsrl(MSR_EFER, guest_efer);
vmx->vcpu.stat.efer_reload++;
}
static void reload_host_efer(struct vcpu_vmx *vmx)
{
if (vmx->host_state.guest_efer_loaded) {
vmx->host_state.guest_efer_loaded = 0;
load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
}
}
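A minimal stand-alone sketch of the ignore-bits test introduced above, deciding only whether an EFER switch would be needed; the EFER_* constants below are assumptions written out explicitly rather than the kernel's own definitions:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-ins for the kernel's EFER_* bit definitions. */
#define EFER_SCE (1ULL << 0)   /* syscall enable */
#define EFER_LME (1ULL << 8)   /* long mode enable */
#define EFER_LMA (1ULL << 10)  /* long mode active */
#define EFER_NX  (1ULL << 11)  /* no-execute enable */

/* Returns nonzero if a guest/host EFER switch is actually required. */
static int efer_switch_needed(uint64_t host_efer, uint64_t guest_efer)
{
	/* NX is emulated; LMA/LME are handled by hardware; SCE only
	 * matters in long mode, mirroring the logic in the patch. */
	uint64_t ignore_bits = EFER_NX | EFER_SCE | EFER_LMA | EFER_LME;

	if (guest_efer & EFER_LMA)
		ignore_bits &= ~EFER_SCE;

	return (guest_efer & ~ignore_bits) != (host_efer & ~ignore_bits);
}

int main(void)
{
	/* 32-bit guest on a 64-bit host with SCE set: no switch needed. */
	printf("%d\n", efer_switch_needed(EFER_SCE | EFER_LMA | EFER_NX, 0));
	return 0;
}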
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
......@@ -393,14 +441,13 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
#endif
#ifdef CONFIG_X86_64
if (is_long_mode(&vmx->vcpu)) {
if (is_long_mode(&vmx->vcpu))
save_msrs(vmx->host_msrs +
vmx->msr_offset_kernel_gs_base, 1);
}
#endif
load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
if (msr_efer_need_save_restore(vmx))
load_transition_efer(vmx);
load_transition_efer(vmx);
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
......@@ -410,6 +457,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
if (!vmx->host_state.loaded)
return;
++vmx->vcpu.stat.host_state_reload;
vmx->host_state.loaded = 0;
if (vmx->host_state.fs_reload_needed)
load_fs(vmx->host_state.fs_sel);
......@@ -429,8 +477,7 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
reload_tss();
save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
load_msrs(vmx->host_msrs, vmx->save_nmsrs);
if (msr_efer_need_save_restore(vmx))
load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
reload_host_efer(vmx);
}
/*
......@@ -480,7 +527,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* Make sure the time stamp counter is monotonous.
*/
rdtscll(tsc_this);
delta = vcpu->host_tsc - tsc_this;
delta = vcpu->arch.host_tsc - tsc_this;
vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
}
}
......@@ -488,7 +535,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_load_host_state(to_vmx(vcpu));
kvm_put_guest_fpu(vcpu);
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
......@@ -497,7 +543,7 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
return;
vcpu->fpu_active = 1;
vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
if (vcpu->cr0 & X86_CR0_TS)
if (vcpu->arch.cr0 & X86_CR0_TS)
vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
update_exception_bitmap(vcpu);
}
......@@ -523,7 +569,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
if (vcpu->rmode.active)
if (vcpu->arch.rmode.active)
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, rflags);
}
......@@ -545,19 +591,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (interruptibility & 3)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
interruptibility & ~3);
vcpu->interrupt_window_open = 1;
vcpu->arch.interrupt_window_open = 1;
}
static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code)
{
printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
vmcs_readl(GUEST_RIP));
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
GP_VECTOR |
INTR_TYPE_EXCEPTION |
INTR_INFO_DELIEVER_CODE_MASK |
INTR_INFO_VALID_MASK);
nr | INTR_TYPE_EXCEPTION
| (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
| INTR_INFO_VALID_MASK);
if (has_error_code)
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
}
static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
}
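For orientation, vmx_queue_exception packs the VM-entry interruption-information field the way the Intel SDM lays it out: vector in bits 7:0, event type in bits 10:8, the deliver-error-code flag in bit 11, and the valid flag in bit 31. A small sketch of that encoding; the constant names here are illustrative, not the kernel's:

#include <stdint.h>
#include <stdio.h>

#define INTR_TYPE_HW_EXCEPTION  (3u << 8)   /* hardware exception */
#define INTR_INFO_DELIVER_CODE  (1u << 11)  /* push an error code */
#define INTR_INFO_VALID         (1u << 31)  /* entry is valid */

static uint32_t entry_intr_info(unsigned vector, int has_error_code)
{
	uint32_t info = vector | INTR_TYPE_HW_EXCEPTION | INTR_INFO_VALID;

	if (has_error_code)
		info |= INTR_INFO_DELIVER_CODE;
	return info;
}

int main(void)
{
	/* #GP (vector 13) with an error code -> 0x80000b0d */
	printf("0x%08x\n", entry_intr_info(13, 1));
	return 0;
}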
/*
......@@ -608,7 +660,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
* if efer.sce is enabled.
*/
index = __find_msr_index(vmx, MSR_K6_STAR);
if ((index >= 0) && (vmx->vcpu.shadow_efer & EFER_SCE))
if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
move_msr_up(vmx, index, save_nmsrs++);
}
#endif
......@@ -712,8 +764,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
#ifdef CONFIG_X86_64
case MSR_EFER:
ret = kvm_set_msr_common(vcpu, msr_index, data);
if (vmx->host_state.loaded)
if (vmx->host_state.loaded) {
reload_host_efer(vmx);
load_transition_efer(vmx);
}
break;
case MSR_FS_BASE:
vmcs_writel(GUEST_FS_BASE, data);
......@@ -750,12 +804,12 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
/*
* Sync the rsp and rip registers into the vcpu structure. This allows
* registers to be accessed by indexing vcpu->regs.
* registers to be accessed by indexing vcpu->arch.regs.
*/
static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
{
vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
vcpu->rip = vmcs_readl(GUEST_RIP);
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
vcpu->arch.rip = vmcs_readl(GUEST_RIP);
}
/*
......@@ -764,8 +818,8 @@ static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
*/
static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
{
vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
vmcs_writel(GUEST_RIP, vcpu->rip);
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
vmcs_writel(GUEST_RIP, vcpu->arch.rip);
}
static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
......@@ -808,14 +862,15 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
static int vmx_get_irq(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 idtv_info_field;
idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
idtv_info_field = vmx->idt_vectoring_info;
if (idtv_info_field & INTR_INFO_VALID_MASK) {
if (is_external_interrupt(idtv_info_field))
return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
else
printk("pending exception: not handled yet\n");
printk(KERN_DEBUG "pending exception: not handled yet\n");
}
return -1;
}
......@@ -863,7 +918,7 @@ static void hardware_disable(void *garbage)
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
u32 msr, u32* result)
u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
......@@ -887,6 +942,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
u32 min, opt;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
......@@ -904,11 +960,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING;
#ifdef CONFIG_X86_64
opt = CPU_BASED_TPR_SHADOW;
#else
opt = 0;
#endif
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
return -EIO;
......@@ -917,6 +970,19 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
~CPU_BASED_CR8_STORE_EXITING;
#endif
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
min = 0;
opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_WBINVD_EXITING;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
return -EIO;
}
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
min = 0;
#ifdef CONFIG_X86_64
......@@ -954,6 +1020,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
......@@ -1043,15 +1110,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
vcpu->rmode.active = 0;
vcpu->arch.rmode.active = 0;
vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
......@@ -1059,10 +1126,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
update_exception_bitmap(vcpu);
fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
vmcs_write16(GUEST_SS_SELECTOR, 0);
vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
......@@ -1072,10 +1139,14 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
}
static gva_t rmode_tss_base(struct kvm* kvm)
static gva_t rmode_tss_base(struct kvm *kvm)
{
gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
return base_gfn << PAGE_SHIFT;
if (!kvm->arch.tss_addr) {
gfn_t base_gfn = kvm->memslots[0].base_gfn +
kvm->memslots[0].npages - 3;
return base_gfn << PAGE_SHIFT;
}
return kvm->arch.tss_addr;
}
static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
......@@ -1086,7 +1157,8 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
save->base = vmcs_readl(sf->base);
save->limit = vmcs_read32(sf->limit);
save->ar = vmcs_read32(sf->ar_bytes);
vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
vmcs_write16(sf->selector, save->base >> 4);
vmcs_write32(sf->base, save->base & 0xfffff);
vmcs_write32(sf->limit, 0xffff);
vmcs_write32(sf->ar_bytes, 0xf3);
}
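The fix above preserves the vm86 invariant base == selector << 4 while keeping the base within the 20 bits real mode can express; a throwaway sketch of that arithmetic (plain user-space C, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long base = 0xf0000;          /* e.g. the BIOS segment */
	unsigned int selector = base >> 4;     /* 0xf000 */

	/* vm86 bases are 20-bit, hence the "& 0xfffff" in fix_rmode_seg(). */
	printf("selector=%#x base=%#lx\n", selector,
	       (unsigned long)selector << 4);
	return 0;
}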
......@@ -1095,19 +1167,20 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
vcpu->rmode.active = 1;
vcpu->arch.rmode.active = 1;
vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
vcpu->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
vcpu->arch.rmode.save_iopl
= (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
......@@ -1125,10 +1198,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CS_BASE, 0xf0000);
vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
kvm_mmu_reset_context(vcpu);
init_rmode_tss(vcpu->kvm);
......@@ -1149,7 +1222,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
| AR_TYPE_BUSY_64_TSS);
}
vcpu->shadow_efer |= EFER_LMA;
vcpu->arch.shadow_efer |= EFER_LMA;
find_msr_entry(to_vmx(vcpu), MSR_EFER)->data |= EFER_LMA | EFER_LME;
vmcs_write32(VM_ENTRY_CONTROLS,
......@@ -1159,7 +1232,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
static void exit_lmode(struct kvm_vcpu *vcpu)
{
vcpu->shadow_efer &= ~EFER_LMA;
vcpu->arch.shadow_efer &= ~EFER_LMA;
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS)
......@@ -1170,22 +1243,22 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
vcpu->cr4 &= KVM_GUEST_CR4_MASK;
vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
}
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
vmx_fpu_deactivate(vcpu);
if (vcpu->rmode.active && (cr0 & X86_CR0_PE))
if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
if (!vcpu->rmode.active && !(cr0 & X86_CR0_PE))
if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
enter_rmode(vcpu);
#ifdef CONFIG_X86_64
if (vcpu->shadow_efer & EFER_LME) {
if (vcpu->arch.shadow_efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
......@@ -1196,7 +1269,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0,
(cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
vcpu->cr0 = cr0;
vcpu->arch.cr0 = cr0;
if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
vmx_fpu_activate(vcpu);
......@@ -1205,16 +1278,16 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
vmcs_writel(GUEST_CR3, cr3);
if (vcpu->cr0 & X86_CR0_PE)
if (vcpu->arch.cr0 & X86_CR0_PE)
vmx_fpu_deactivate(vcpu);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
vcpu->cr4 = cr4;
vcpu->arch.cr4 = cr4;
}
#ifdef CONFIG_X86_64
......@@ -1224,7 +1297,7 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
vcpu->shadow_efer = efer;
vcpu->arch.shadow_efer = efer;
if (efer & EFER_LMA) {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) |
......@@ -1301,17 +1374,17 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
vcpu->rmode.tr.selector = var->selector;
vcpu->rmode.tr.base = var->base;
vcpu->rmode.tr.limit = var->limit;
vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
vcpu->arch.rmode.tr.selector = var->selector;
vcpu->arch.rmode.tr.base = var->base;
vcpu->arch.rmode.tr.limit = var->limit;
vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
return;
}
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
if (vcpu->rmode.active && var->s) {
if (vcpu->arch.rmode.active && var->s) {
/*
* Hack real-mode segments into vm86 compatibility.
*/
......@@ -1355,36 +1428,38 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
vmcs_writel(GUEST_GDTR_BASE, dt->base);
}
static int init_rmode_tss(struct kvm* kvm)
static int init_rmode_tss(struct kvm *kvm)
{
struct page *p1, *p2, *p3;
gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
char *page;
p1 = gfn_to_page(kvm, fn++);
p2 = gfn_to_page(kvm, fn++);
p3 = gfn_to_page(kvm, fn);
if (!p1 || !p2 || !p3) {
kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
return 0;
}
page = kmap_atomic(p1, KM_USER0);
clear_page(page);
*(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
kunmap_atomic(page, KM_USER0);
page = kmap_atomic(p2, KM_USER0);
clear_page(page);
kunmap_atomic(page, KM_USER0);
u16 data = 0;
int ret = 0;
int r;
page = kmap_atomic(p3, KM_USER0);
clear_page(page);
*(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
kunmap_atomic(page, KM_USER0);
down_read(&current->mm->mmap_sem);
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
goto out;
data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
if (r < 0)
goto out;
r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
if (r < 0)
goto out;
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
goto out;
data = ~0;
r = kvm_write_guest_page(kvm, fn, &data,
RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
sizeof(u8));
if (r < 0)
goto out;
return 1;
ret = 1;
out:
up_read(&current->mm->mmap_sem);
return ret;
}
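A note on the magic numbers: offset 0x66 is the I/O-map-base field of a 32-bit TSS, so the u16 written above points the CPU past the basic TSS at the redirection and I/O bitmaps, and the final ~0 byte terminates that bitmap. A sketch of the layout arithmetic, with sizes assumed to mirror the TSS_* constants used by init_rmode_tss():

#include <stdio.h>

/* Assumed sizes, mirroring the constants used by init_rmode_tss(). */
#define TSS_BASE_SIZE        0x68          /* 32-bit TSS proper */
#define TSS_REDIRECTION_SIZE (256 / 8)     /* vm86 int redirection bitmap */
#define RMODE_TSS_SIZE       (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + 65536 / 8 + 1)

int main(void)
{
	/* The value written at offset 0x66 and the total footprint. */
	printf("iomap base = %#x, total = %#x (%d pages)\n",
	       TSS_BASE_SIZE + TSS_REDIRECTION_SIZE, RMODE_TSS_SIZE,
	       (RMODE_TSS_SIZE + 4095) / 4096);
	return 0;
}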
static void seg_setup(int seg)
......@@ -1397,6 +1472,27 @@ static void seg_setup(int seg)
vmcs_write32(sf->ar_bytes, 0x93);
}
static int alloc_apic_access_page(struct kvm *kvm)
{
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
down_write(&current->mm->mmap_sem);
if (kvm->arch.apic_access_page)
goto out;
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
kvm_userspace_mem.flags = 0;
kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
kvm_userspace_mem.memory_size = PAGE_SIZE;
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
if (r)
goto out;
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
out:
up_write(&current->mm->mmap_sem);
return r;
}
/*
* Sets up the vmcs for emulated real mode.
*/
......@@ -1407,92 +1503,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
unsigned long a;
struct descriptor_table dt;
int i;
int ret = 0;
unsigned long kvm_vmx_return;
u64 msr;
u32 exec_control;
if (!init_rmode_tss(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
}
vmx->vcpu.rmode.active = 0;
vmx->vcpu.regs[VCPU_REGS_RDX] = get_rdx_init_val();
set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (vmx->vcpu.vcpu_id == 0)
msr |= MSR_IA32_APICBASE_BSP;
kvm_set_apic_base(&vmx->vcpu, msr);
fx_init(&vmx->vcpu);
/*
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
* insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
*/
if (vmx->vcpu.vcpu_id == 0) {
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_writel(GUEST_CS_BASE, 0x000f0000);
} else {
vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.sipi_vector << 8);
vmcs_writel(GUEST_CS_BASE, vmx->vcpu.sipi_vector << 12);
}
vmcs_write32(GUEST_CS_LIMIT, 0xffff);
vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
seg_setup(VCPU_SREG_DS);
seg_setup(VCPU_SREG_ES);
seg_setup(VCPU_SREG_FS);
seg_setup(VCPU_SREG_GS);
seg_setup(VCPU_SREG_SS);
vmcs_write16(GUEST_TR_SELECTOR, 0);
vmcs_writel(GUEST_TR_BASE, 0);
vmcs_write32(GUEST_TR_LIMIT, 0xffff);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
vmcs_write16(GUEST_LDTR_SELECTOR, 0);
vmcs_writel(GUEST_LDTR_BASE, 0);
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
vmcs_write32(GUEST_SYSENTER_CS, 0);
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
vmcs_writel(GUEST_RFLAGS, 0x02);
if (vmx->vcpu.vcpu_id == 0)
vmcs_writel(GUEST_RIP, 0xfff0);
else
vmcs_writel(GUEST_RIP, 0);
vmcs_writel(GUEST_RSP, 0);
//todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
vmcs_writel(GUEST_DR7, 0x400);
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_ACTIVITY_STATE, 0);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
/* I/O */
vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
guest_write_tsc(0);
vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
/* Special registers */
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
/* Control */
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
vmcs_config.pin_based_exec_ctrl);
......@@ -1507,8 +1526,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
}
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
if (cpu_has_secondary_exec_ctrls()) {
exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
exec_control &=
~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
......@@ -1536,7 +1563,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
get_idt(&dt);
vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
......@@ -1567,97 +1594,145 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
++vmx->nmsrs;
}
setup_msrs(vmx);
vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
/* 22.2.1, 20.8.1 */
vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
#ifdef CONFIG_X86_64
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
if (vm_need_tpr_shadow(vmx->vcpu.kvm))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
page_to_phys(vmx->vcpu.apic->regs_page));
vmcs_write32(TPR_THRESHOLD, 0);
#endif
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
vmx->vcpu.cr0 = 0x60000010;
vmx_set_cr0(&vmx->vcpu, vmx->vcpu.cr0); // enter rmode
vmx_set_cr4(&vmx->vcpu, 0);
#ifdef CONFIG_X86_64
vmx_set_efer(&vmx->vcpu, 0);
#endif
vmx_fpu_activate(&vmx->vcpu);
update_exception_bitmap(&vmx->vcpu);
if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
if (alloc_apic_access_page(vmx->vcpu.kvm) != 0)
return -ENOMEM;
return 0;
out:
return ret;
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 msr;
int ret;
vmx_vcpu_setup(vmx);
}
static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
{
u16 ent[2];
u16 cs;
u16 ip;
unsigned long flags;
unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
u16 sp = vmcs_readl(GUEST_RSP);
u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
if (sp > ss_limit || sp < 6 ) {
vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
__FUNCTION__,
vmcs_readl(GUEST_RSP),
vmcs_readl(GUEST_SS_BASE),
vmcs_read32(GUEST_SS_LIMIT));
return;
if (!init_rmode_tss(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
}
if (emulator_read_std(irq * sizeof(ent), &ent, sizeof(ent), vcpu) !=
X86EMUL_CONTINUE) {
vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
return;
vmx->vcpu.arch.rmode.active = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (vmx->vcpu.vcpu_id == 0)
msr |= MSR_IA32_APICBASE_BSP;
kvm_set_apic_base(&vmx->vcpu, msr);
fx_init(&vmx->vcpu);
/*
* GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
* insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
*/
if (vmx->vcpu.vcpu_id == 0) {
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_writel(GUEST_CS_BASE, 0x000f0000);
} else {
vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
}
vmcs_write32(GUEST_CS_LIMIT, 0xffff);
vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
seg_setup(VCPU_SREG_DS);
seg_setup(VCPU_SREG_ES);
seg_setup(VCPU_SREG_FS);
seg_setup(VCPU_SREG_GS);
seg_setup(VCPU_SREG_SS);
vmcs_write16(GUEST_TR_SELECTOR, 0);
vmcs_writel(GUEST_TR_BASE, 0);
vmcs_write32(GUEST_TR_LIMIT, 0xffff);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
cs = vmcs_readl(GUEST_CS_BASE) >> 4;
ip = vmcs_readl(GUEST_RIP);
vmcs_write16(GUEST_LDTR_SELECTOR, 0);
vmcs_writel(GUEST_LDTR_BASE, 0);
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
vmcs_write32(GUEST_SYSENTER_CS, 0);
vmcs_writel(GUEST_SYSENTER_ESP, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
if (emulator_write_emulated(ss_base + sp - 2, &flags, 2, vcpu) != X86EMUL_CONTINUE ||
emulator_write_emulated(ss_base + sp - 4, &cs, 2, vcpu) != X86EMUL_CONTINUE ||
emulator_write_emulated(ss_base + sp - 6, &ip, 2, vcpu) != X86EMUL_CONTINUE) {
vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
return;
vmcs_writel(GUEST_RFLAGS, 0x02);
if (vmx->vcpu.vcpu_id == 0)
vmcs_writel(GUEST_RIP, 0xfff0);
else
vmcs_writel(GUEST_RIP, 0);
vmcs_writel(GUEST_RSP, 0);
/* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
vmcs_writel(GUEST_DR7, 0x400);
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_ACTIVITY_STATE, 0);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
guest_write_tsc(0);
/* Special registers */
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
setup_msrs(vmx);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
if (cpu_has_vmx_tpr_shadow()) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
if (vm_need_tpr_shadow(vmx->vcpu.kvm))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
page_to_phys(vmx->vcpu.arch.apic->regs_page));
vmcs_write32(TPR_THRESHOLD, 0);
}
vmcs_writel(GUEST_RFLAGS, flags &
~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
vmcs_writel(GUEST_RIP, ent[0]);
vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
vmx->vcpu.arch.cr0 = 0x60000010;
vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
vmx_set_cr4(&vmx->vcpu, 0);
#ifdef CONFIG_X86_64
vmx_set_efer(&vmx->vcpu, 0);
#endif
vmx_fpu_activate(&vmx->vcpu);
update_exception_bitmap(&vmx->vcpu);
return 0;
out:
return ret;
}
static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
{
if (vcpu->rmode.active) {
inject_rmode_irq(vcpu, irq);
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (vcpu->arch.rmode.active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = irq;
vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
return;
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
......@@ -1666,13 +1741,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
{
int word_index = __ffs(vcpu->irq_summary);
int bit_index = __ffs(vcpu->irq_pending[word_index]);
int word_index = __ffs(vcpu->arch.irq_summary);
int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
int irq = word_index * BITS_PER_LONG + bit_index;
clear_bit(bit_index, &vcpu->irq_pending[word_index]);
if (!vcpu->irq_pending[word_index])
clear_bit(word_index, &vcpu->irq_summary);
clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
if (!vcpu->arch.irq_pending[word_index])
clear_bit(word_index, &vcpu->arch.irq_summary);
vmx_inject_irq(vcpu, irq);
}
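The pending-interrupt bookkeeping above is a two-level find-first-set: irq_summary has one bit per word of irq_pending, and the vector is word_index * BITS_PER_LONG + bit_index. A user-space sketch of the same lookup, with illustrative names:

#include <stdio.h>

#define BITS_PER_LONG (8 * (int)sizeof(long))

/* Index of the lowest set bit; caller must pass a nonzero word. */
static int first_set(unsigned long x)
{
	int i = 0;

	while (!(x & 1UL)) {
		x >>= 1;
		i++;
	}
	return i;
}

int main(void)
{
	unsigned long irq_pending[4] = { 0, 1UL << 7, 0, 0 };
	unsigned long irq_summary = 1UL << 1;   /* only word 1 has bits set */

	int word = first_set(irq_summary);
	int bit  = first_set(irq_pending[word]);

	/* Same arithmetic as kvm_do_inject_irq()/svm_do_inject_vector(). */
	printf("irq = %d\n", word * BITS_PER_LONG + bit);
	return 0;
}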
......@@ -1682,12 +1757,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
{
u32 cpu_based_vm_exec_control;
vcpu->interrupt_window_open =
vcpu->arch.interrupt_window_open =
((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
if (vcpu->interrupt_window_open &&
vcpu->irq_summary &&
if (vcpu->arch.interrupt_window_open &&
vcpu->arch.irq_summary &&
!(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
/*
* If interrupts enabled, and not blocked by sti or mov ss. Good.
......@@ -1695,8 +1770,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
kvm_do_inject_irq(vcpu);
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
if (!vcpu->interrupt_window_open &&
(vcpu->irq_summary || kvm_run->request_interrupt_window))
if (!vcpu->arch.interrupt_window_open &&
(vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
/*
* Interrupts blocked. Wait for unblock.
*/
......@@ -1706,6 +1781,23 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
}
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
int ret;
struct kvm_userspace_memory_region tss_mem = {
.slot = 8,
.guest_phys_addr = addr,
.memory_size = PAGE_SIZE * 3,
.flags = 0,
};
ret = kvm_set_memory_region(kvm, &tss_mem, 0);
if (ret)
return ret;
kvm->arch.tss_addr = addr;
return 0;
}
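vmx_set_tss_addr is what backs the new KVM_SET_TSS_ADDR ioctl wired up elsewhere in this series; userspace is expected to park the three identity pages somewhere below 4 GB that the guest will not otherwise touch. A hedged sketch of the call, with error handling trimmed; the 0xfffbd000 address is just the conventional choice QEMU makes, not something the API mandates:

/* Assumes a Linux host with /dev/kvm and linux/kvm.h available. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm  = ioctl(kvm, KVM_CREATE_VM, 0);

	/* Three pages for the real-mode TSS, below 4G and out of the
	 * guest's way. */
	if (ioctl(vm, KVM_SET_TSS_ADDR, 0xfffbd000UL) < 0)
		perror("KVM_SET_TSS_ADDR");
	return 0;
}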
static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
{
struct kvm_guest_debug *dbg = &vcpu->guest_debug;
......@@ -1727,7 +1819,7 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
int vec, u32 err_code)
{
if (!vcpu->rmode.active)
if (!vcpu->arch.rmode.active)
return 0;
/*
......@@ -1735,32 +1827,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
return 1;
return 0;
}
static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info, error_code;
unsigned long cr2, rip;
u32 vect_info;
enum emulation_result er;
int r;
vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vect_info = vmx->idt_vectoring_info;
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
if ((vect_info & VECTORING_INFO_VALID_MASK) &&
!is_page_fault(intr_info)) {
!is_page_fault(intr_info))
printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
"intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
}
if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
set_bit(irq, vcpu->irq_pending);
set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
set_bit(irq, vcpu->arch.irq_pending);
set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
}
if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
......@@ -1771,52 +1862,34 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
if (is_invalid_opcode(intr_info)) {
er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
error_code = 0;
rip = vmcs_readl(GUEST_RIP);
if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
cr2 = vmcs_readl(EXIT_QUALIFICATION);
mutex_lock(&vcpu->kvm->lock);
r = kvm_mmu_page_fault(vcpu, cr2, error_code);
if (r < 0) {
mutex_unlock(&vcpu->kvm->lock);
return r;
}
if (!r) {
mutex_unlock(&vcpu->kvm->lock);
return 1;
}
er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
mutex_unlock(&vcpu->kvm->lock);
switch (er) {
case EMULATE_DONE:
return 1;
case EMULATE_DO_MMIO:
++vcpu->stat.mmio_exits;
return 0;
case EMULATE_FAIL:
kvm_report_emulation_failure(vcpu, "pagetable");
break;
default:
BUG();
}
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
if (vcpu->rmode.active &&
if (vcpu->arch.rmode.active &&
handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
error_code)) {
if (vcpu->halt_request) {
vcpu->halt_request = 0;
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
return kvm_emulate_halt(vcpu);
}
return 1;
}
if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) ==
(INTR_TYPE_EXCEPTION | 1)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG;
return 0;
}
......@@ -1850,7 +1923,8 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
string = (exit_qualification & 16) != 0;
if (string) {
if (emulate_instruction(vcpu, kvm_run, 0, 0) == EMULATE_DO_MMIO)
if (emulate_instruction(vcpu,
kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
return 0;
return 1;
}
......@@ -1873,7 +1947,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[0] = 0x0f;
hypercall[1] = 0x01;
hypercall[2] = 0xc1;
hypercall[3] = 0xc3;
}
static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
......@@ -1890,23 +1963,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
switch (cr) {
case 0:
vcpu_load_rsp_rip(vcpu);
set_cr0(vcpu, vcpu->regs[reg]);
set_cr0(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 3:
vcpu_load_rsp_rip(vcpu);
set_cr3(vcpu, vcpu->regs[reg]);
set_cr3(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 4:
vcpu_load_rsp_rip(vcpu);
set_cr4(vcpu, vcpu->regs[reg]);
set_cr4(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 8:
vcpu_load_rsp_rip(vcpu);
set_cr8(vcpu, vcpu->regs[reg]);
set_cr8(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
return 1;
kvm_run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
};
......@@ -1914,8 +1989,8 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
case 2: /* clts */
vcpu_load_rsp_rip(vcpu);
vmx_fpu_deactivate(vcpu);
vcpu->cr0 &= ~X86_CR0_TS;
vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
vcpu->arch.cr0 &= ~X86_CR0_TS;
vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
vmx_fpu_activate(vcpu);
skip_emulated_instruction(vcpu);
return 1;
......@@ -1923,13 +1998,13 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
switch (cr) {
case 3:
vcpu_load_rsp_rip(vcpu);
vcpu->regs[reg] = vcpu->cr3;
vcpu->arch.regs[reg] = vcpu->arch.cr3;
vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
return 1;
case 8:
vcpu_load_rsp_rip(vcpu);
vcpu->regs[reg] = get_cr8(vcpu);
vcpu->arch.regs[reg] = get_cr8(vcpu);
vcpu_put_rsp_rip(vcpu);
skip_emulated_instruction(vcpu);
return 1;
......@@ -1975,7 +2050,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
default:
val = 0;
}
vcpu->regs[reg] = val;
vcpu->arch.regs[reg] = val;
} else {
/* mov to dr */
}
......@@ -1992,29 +2067,29 @@ static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
u32 ecx = vcpu->regs[VCPU_REGS_RCX];
u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
u64 data;
if (vmx_get_msr(vcpu, ecx, &data)) {
vmx_inject_gp(vcpu, 0);
kvm_inject_gp(vcpu, 0);
return 1;
}
/* FIXME: handling of bits 32:63 of rax, rdx */
vcpu->regs[VCPU_REGS_RAX] = data & -1u;
vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
skip_emulated_instruction(vcpu);
return 1;
}
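handle_rdmsr and handle_wrmsr split a 64-bit MSR value across EDX:EAX exactly as the rdmsr/wrmsr instructions do; a trivial sketch of the split and recombine:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t data = 0x1122334455667788ULL;

	/* What handle_rdmsr stores into the guest's RAX/RDX ... */
	uint32_t eax = data & -1u;
	uint32_t edx = (data >> 32) & -1u;

	/* ... and what handle_wrmsr reassembles. */
	uint64_t back = (uint64_t)eax | ((uint64_t)edx << 32);

	printf("eax=%#x edx=%#x back=%#llx\n", eax, edx,
	       (unsigned long long)back);
	return 0;
}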
static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
u32 ecx = vcpu->regs[VCPU_REGS_RCX];
u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
| ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
if (vmx_set_msr(vcpu, ecx, data) != 0) {
vmx_inject_gp(vcpu, 0);
kvm_inject_gp(vcpu, 0);
return 1;
}
......@@ -2042,7 +2117,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
* possible
*/
if (kvm_run->request_interrupt_window &&
!vcpu->irq_summary) {
!vcpu->arch.irq_summary) {
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.irq_window_exits;
return 0;
......@@ -2059,7 +2134,35 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
skip_emulated_instruction(vcpu);
return kvm_hypercall(vcpu, kvm_run);
kvm_emulate_hypercall(vcpu);
return 1;
}
static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
skip_emulated_instruction(vcpu);
/* TODO: Add support for VT-d/pass-through device */
return 1;
}
static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
u64 exit_qualification;
enum emulation_result er;
unsigned long offset;
exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
offset = exit_qualification & 0xffful;
er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
if (er != EMULATE_DONE) {
printk(KERN_ERR
"Fail to handle apic access vmexit! Offset is 0x%lx\n",
offset);
return -ENOTSUPP;
}
return 1;
}
/*
......@@ -2081,7 +2184,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
};
static const int kvm_vmx_max_exit_handlers =
......@@ -2093,9 +2198,9 @@ static const int kvm_vmx_max_exit_handlers =
*/
static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vectoring_info = vmx->idt_vectoring_info;
if (unlikely(vmx->fail)) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
......@@ -2104,8 +2209,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return 0;
}
if ( (vectoring_info & VECTORING_INFO_VALID_MASK) &&
exit_reason != EXIT_REASON_EXCEPTION_NMI )
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
exit_reason != EXIT_REASON_EXCEPTION_NMI)
printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
"exit reason is 0x%x\n", __FUNCTION__, exit_reason);
if (exit_reason < kvm_vmx_max_exit_handlers
......@@ -2150,26 +2255,38 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
static void vmx_intr_assist(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 idtv_info_field, intr_info_field;
int has_ext_irq, interrupt_window_open;
int vector;
kvm_inject_pending_timer_irqs(vcpu);
update_tpr_threshold(vcpu);
has_ext_irq = kvm_cpu_has_interrupt(vcpu);
intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
idtv_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
idtv_info_field = vmx->idt_vectoring_info;
if (intr_info_field & INTR_INFO_VALID_MASK) {
if (idtv_info_field & INTR_INFO_VALID_MASK) {
/* TODO: fault when IDT_Vectoring */
printk(KERN_ERR "Fault when IDT_Vectoring\n");
if (printk_ratelimit())
printk(KERN_ERR "Fault when IDT_Vectoring\n");
}
if (has_ext_irq)
enable_irq_window(vcpu);
return;
}
if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
== INTR_TYPE_EXT_INTR
&& vcpu->arch.rmode.active) {
u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
vmx_inject_irq(vcpu, vect);
if (unlikely(has_ext_irq))
enable_irq_window(vcpu);
return;
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
......@@ -2194,6 +2311,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
enable_irq_window(vcpu);
}
/*
* Failure to inject an interrupt should give us the information
* in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
* when fetching the interrupt redirection bitmap in the real-mode
* tss, this doesn't happen. So we do it ourselves.
*/
static void fixup_rmode_irq(struct vcpu_vmx *vmx)
{
vmx->rmode.irq.pending = 0;
if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
return;
vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
return;
}
vmx->idt_vectoring_info =
VECTORING_INFO_VALID_MASK
| INTR_TYPE_EXT_INTR
| vmx->rmode.irq.vector;
}
static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
......@@ -2204,50 +2344,47 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
*/
vmcs_writel(HOST_CR0, read_cr0());
asm (
asm(
/* Store host registers */
#ifdef CONFIG_X86_64
"push %%rax; push %%rbx; push %%rdx;"
"push %%rsi; push %%rdi; push %%rbp;"
"push %%r8; push %%r9; push %%r10; push %%r11;"
"push %%r12; push %%r13; push %%r14; push %%r15;"
"push %%rdx; push %%rbp;"
"push %%rcx \n\t"
ASM_VMX_VMWRITE_RSP_RDX "\n\t"
#else
"pusha; push %%ecx \n\t"
ASM_VMX_VMWRITE_RSP_RDX "\n\t"
"push %%edx; push %%ebp;"
"push %%ecx \n\t"
#endif
ASM_VMX_VMWRITE_RSP_RDX "\n\t"
/* Check if vmlaunch of vmresume is needed */
"cmp $0, %1 \n\t"
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
#ifdef CONFIG_X86_64
"mov %c[cr2](%3), %%rax \n\t"
"mov %c[cr2](%0), %%rax \n\t"
"mov %%rax, %%cr2 \n\t"
"mov %c[rax](%3), %%rax \n\t"
"mov %c[rbx](%3), %%rbx \n\t"
"mov %c[rdx](%3), %%rdx \n\t"
"mov %c[rsi](%3), %%rsi \n\t"
"mov %c[rdi](%3), %%rdi \n\t"
"mov %c[rbp](%3), %%rbp \n\t"
"mov %c[r8](%3), %%r8 \n\t"
"mov %c[r9](%3), %%r9 \n\t"
"mov %c[r10](%3), %%r10 \n\t"
"mov %c[r11](%3), %%r11 \n\t"
"mov %c[r12](%3), %%r12 \n\t"
"mov %c[r13](%3), %%r13 \n\t"
"mov %c[r14](%3), %%r14 \n\t"
"mov %c[r15](%3), %%r15 \n\t"
"mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
"mov %c[rax](%0), %%rax \n\t"
"mov %c[rbx](%0), %%rbx \n\t"
"mov %c[rdx](%0), %%rdx \n\t"
"mov %c[rsi](%0), %%rsi \n\t"
"mov %c[rdi](%0), %%rdi \n\t"
"mov %c[rbp](%0), %%rbp \n\t"
"mov %c[r8](%0), %%r8 \n\t"
"mov %c[r9](%0), %%r9 \n\t"
"mov %c[r10](%0), %%r10 \n\t"
"mov %c[r11](%0), %%r11 \n\t"
"mov %c[r12](%0), %%r12 \n\t"
"mov %c[r13](%0), %%r13 \n\t"
"mov %c[r14](%0), %%r14 \n\t"
"mov %c[r15](%0), %%r15 \n\t"
"mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
#else
"mov %c[cr2](%3), %%eax \n\t"
"mov %c[cr2](%0), %%eax \n\t"
"mov %%eax, %%cr2 \n\t"
"mov %c[rax](%3), %%eax \n\t"
"mov %c[rbx](%3), %%ebx \n\t"
"mov %c[rdx](%3), %%edx \n\t"
"mov %c[rsi](%3), %%esi \n\t"
"mov %c[rdi](%3), %%edi \n\t"
"mov %c[rbp](%3), %%ebp \n\t"
"mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
"mov %c[rax](%0), %%eax \n\t"
"mov %c[rbx](%0), %%ebx \n\t"
"mov %c[rdx](%0), %%edx \n\t"
"mov %c[rsi](%0), %%esi \n\t"
"mov %c[rdi](%0), %%edi \n\t"
"mov %c[rbp](%0), %%ebp \n\t"
"mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
#endif
/* Enter guest mode */
"jne .Llaunched \n\t"
......@@ -2257,72 +2394,79 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
".Lkvm_vmx_return: "
/* Save guest registers, load host registers, keep flags */
#ifdef CONFIG_X86_64
"xchg %3, (%%rsp) \n\t"
"mov %%rax, %c[rax](%3) \n\t"
"mov %%rbx, %c[rbx](%3) \n\t"
"pushq (%%rsp); popq %c[rcx](%3) \n\t"
"mov %%rdx, %c[rdx](%3) \n\t"
"mov %%rsi, %c[rsi](%3) \n\t"
"mov %%rdi, %c[rdi](%3) \n\t"
"mov %%rbp, %c[rbp](%3) \n\t"
"mov %%r8, %c[r8](%3) \n\t"
"mov %%r9, %c[r9](%3) \n\t"
"mov %%r10, %c[r10](%3) \n\t"
"mov %%r11, %c[r11](%3) \n\t"
"mov %%r12, %c[r12](%3) \n\t"
"mov %%r13, %c[r13](%3) \n\t"
"mov %%r14, %c[r14](%3) \n\t"
"mov %%r15, %c[r15](%3) \n\t"
"xchg %0, (%%rsp) \n\t"
"mov %%rax, %c[rax](%0) \n\t"
"mov %%rbx, %c[rbx](%0) \n\t"
"pushq (%%rsp); popq %c[rcx](%0) \n\t"
"mov %%rdx, %c[rdx](%0) \n\t"
"mov %%rsi, %c[rsi](%0) \n\t"
"mov %%rdi, %c[rdi](%0) \n\t"
"mov %%rbp, %c[rbp](%0) \n\t"
"mov %%r8, %c[r8](%0) \n\t"
"mov %%r9, %c[r9](%0) \n\t"
"mov %%r10, %c[r10](%0) \n\t"
"mov %%r11, %c[r11](%0) \n\t"
"mov %%r12, %c[r12](%0) \n\t"
"mov %%r13, %c[r13](%0) \n\t"
"mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t"
"mov %%cr2, %%rax \n\t"
"mov %%rax, %c[cr2](%3) \n\t"
"mov (%%rsp), %3 \n\t"
"mov %%rax, %c[cr2](%0) \n\t"
"pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
"pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
"pop %%rbp; pop %%rdi; pop %%rsi;"
"pop %%rdx; pop %%rbx; pop %%rax \n\t"
"pop %%rbp; pop %%rbp; pop %%rdx \n\t"
#else
"xchg %3, (%%esp) \n\t"
"mov %%eax, %c[rax](%3) \n\t"
"mov %%ebx, %c[rbx](%3) \n\t"
"pushl (%%esp); popl %c[rcx](%3) \n\t"
"mov %%edx, %c[rdx](%3) \n\t"
"mov %%esi, %c[rsi](%3) \n\t"
"mov %%edi, %c[rdi](%3) \n\t"
"mov %%ebp, %c[rbp](%3) \n\t"
"xchg %0, (%%esp) \n\t"
"mov %%eax, %c[rax](%0) \n\t"
"mov %%ebx, %c[rbx](%0) \n\t"
"pushl (%%esp); popl %c[rcx](%0) \n\t"
"mov %%edx, %c[rdx](%0) \n\t"
"mov %%esi, %c[rsi](%0) \n\t"
"mov %%edi, %c[rdi](%0) \n\t"
"mov %%ebp, %c[rbp](%0) \n\t"
"mov %%cr2, %%eax \n\t"
"mov %%eax, %c[cr2](%3) \n\t"
"mov (%%esp), %3 \n\t"
"mov %%eax, %c[cr2](%0) \n\t"
"pop %%ecx; popa \n\t"
"pop %%ebp; pop %%ebp; pop %%edx \n\t"
#endif
"setbe %c[fail](%0) \n\t"
: : "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
"setbe %0 \n\t"
: "=q" (vmx->fail)
: "r"(vmx->launched), "d"((unsigned long)HOST_RSP),
"c"(vcpu),
[rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
[rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
[rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
[rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
[rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
[rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
: "cc", "memory"
#ifdef CONFIG_X86_64
[r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
[r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
[r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
[r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
[r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
[r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
[r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
[r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
, "rbx", "rdi", "rsi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
, "ebx", "edi", "rsi"
#endif
[cr2]"i"(offsetof(struct kvm_vcpu, cr2))
: "cc", "memory" );
);
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
vcpu->arch.interrupt_window_open =
(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->launched = 1;
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
......@@ -2332,36 +2476,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
asm("int $2");
}
static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
unsigned long addr,
u32 err_code)
{
u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
++vcpu->stat.pf_guest;
if (is_page_fault(vect_info)) {
printk(KERN_DEBUG "inject_page_fault: "
"double fault 0x%lx @ 0x%lx\n",
addr, vmcs_readl(GUEST_RIP));
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
DF_VECTOR |
INTR_TYPE_EXCEPTION |
INTR_INFO_DELIEVER_CODE_MASK |
INTR_INFO_VALID_MASK);
return;
}
vcpu->cr2 = addr;
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
PF_VECTOR |
INTR_TYPE_EXCEPTION |
INTR_INFO_DELIEVER_CODE_MASK |
INTR_INFO_VALID_MASK);
}
static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
......@@ -2397,12 +2511,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (err)
goto free_vcpu;
if (irqchip_in_kernel(kvm)) {
err = kvm_create_lapic(&vmx->vcpu);
if (err < 0)
goto free_vcpu;
}
vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!vmx->guest_msrs) {
err = -ENOMEM;
......@@ -2464,6 +2572,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.check_processor_compatibility = vmx_check_processor_compat,
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
......@@ -2499,9 +2608,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_rflags = vmx_set_rflags,
.tlb_flush = vmx_flush_tlb,
.inject_page_fault = vmx_inject_page_fault,
.inject_gp = vmx_inject_gp,
.run = vmx_vcpu_run,
.handle_exit = kvm_handle_exit,
......@@ -2509,8 +2615,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
.patch_hypercall = vmx_patch_hypercall,
.get_irq = vmx_get_irq,
.set_irq = vmx_inject_irq,
.queue_exception = vmx_queue_exception,
.exception_injected = vmx_exception_injected,
.inject_pending_irq = vmx_intr_assist,
.inject_pending_vectors = do_interrupt_requests,
.set_tss_addr = vmx_set_tss_addr,
};
static int __init vmx_init(void)
......@@ -2541,10 +2651,13 @@ static int __init vmx_init(void)
memset(iova, 0xff, PAGE_SIZE);
kunmap(vmx_io_bitmap_b);
r = kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
if (r)
goto out1;
if (bypass_guest_pf)
kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
return 0;
out1:
......@@ -2559,7 +2672,7 @@ static void __exit vmx_exit(void)
__free_page(vmx_io_bitmap_b);
__free_page(vmx_io_bitmap_a);
kvm_exit_x86();
kvm_exit();
}
module_init(vmx_init)
......
......@@ -25,6 +25,9 @@
*
*/
/*
* Definitions of Primary Processor-Based VM-Execution Controls.
*/
#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
#define CPU_BASED_HLT_EXITING 0x00000080
......@@ -42,6 +45,12 @@
#define CPU_BASED_MONITOR_EXITING 0x20000000
#define CPU_BASED_PAUSE_EXITING 0x40000000
#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
/*
* Definitions of Secondary Processor-Based VM-Execution Controls.
*/
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
#define PIN_BASED_EXT_INTR_MASK 0x00000001
#define PIN_BASED_NMI_EXITING 0x00000008
......@@ -54,8 +63,6 @@
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
/* VMCS Encodings */
enum vmcs_field {
GUEST_ES_SELECTOR = 0x00000800,
......@@ -89,6 +96,8 @@ enum vmcs_field {
TSC_OFFSET_HIGH = 0x00002011,
VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
APIC_ACCESS_ADDR = 0x00002014,
APIC_ACCESS_ADDR_HIGH = 0x00002015,
VMCS_LINK_POINTER = 0x00002800,
VMCS_LINK_POINTER_HIGH = 0x00002801,
GUEST_IA32_DEBUGCTL = 0x00002802,
......@@ -214,6 +223,8 @@ enum vmcs_field {
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_MWAIT_INSTRUCTION 36
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
#define EXIT_REASON_APIC_ACCESS 44
#define EXIT_REASON_WBINVD 54
/*
* Interruption-information format
......@@ -230,13 +241,14 @@ enum vmcs_field {
#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
/*
* Exit Qualifications for MOV for Control Register Access
*/
#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control register */
#define CONTROL_REG_ACCESS_NUM 0x7 /* 2:0, number of control reg.*/
#define CONTROL_REG_ACCESS_TYPE 0x30 /* 5:4, access type */
#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose register */
#define CONTROL_REG_ACCESS_REG 0xf00 /* 10:8, general purpose reg. */
#define LMSW_SOURCE_DATA_SHIFT 16
#define LMSW_SOURCE_DATA (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
#define REG_EAX (0 << 8)
......@@ -259,11 +271,11 @@ enum vmcs_field {
/*
* Exit Qualifications for MOV for Debug Register Access
*/
#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug register */
#define DEBUG_REG_ACCESS_NUM 0x7 /* 2:0, number of debug reg. */
#define DEBUG_REG_ACCESS_TYPE 0x10 /* 4, direction of access */
#define TYPE_MOV_TO_DR (0 << 4)
#define TYPE_MOV_FROM_DR (1 << 4)
#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose register */
#define DEBUG_REG_ACCESS_REG 0xf00 /* 11:8, general purpose reg. */
/* segment AR */
......@@ -307,4 +319,6 @@ enum vmcs_field {
#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
#endif
......@@ -23,13 +23,13 @@
#include <stdio.h>
#include <stdint.h>
#include <public/xen.h>
#define DPRINTF(_f, _a ...) printf( _f , ## _a )
#define DPRINTF(_f, _a ...) printf(_f , ## _a)
#else
#include "kvm.h"
#include <linux/kvm_host.h>
#define DPRINTF(x...) do {} while (0)
#endif
#include "x86_emulate.h"
#include <linux/module.h>
#include <asm/kvm_x86_emulate.h>
/*
* Opcode effective-address decode tables.
......@@ -62,8 +62,11 @@
/* Destination is only written; never read. */
#define Mov (1<<7)
#define BitOp (1<<8)
#define MemAbs (1<<9) /* Memory operand is absolute displacement */
#define String (1<<10) /* String instruction (rep capable) */
#define Stack (1<<11) /* Stack instruction (push/pop) */
static u8 opcode_table[256] = {
static u16 opcode_table[256] = {
/* 0x00 - 0x07 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
......@@ -96,19 +99,21 @@ static u8 opcode_table[256] = {
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
0, 0, 0, 0,
/* 0x40 - 0x4F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x40 - 0x47 */
DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
/* 0x48 - 0x4F */
DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
/* 0x50 - 0x57 */
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
/* 0x58 - 0x5F */
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
/* 0x60 - 0x67 */
0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
0, 0, 0, 0,
/* 0x68 - 0x6F */
0, 0, ImplicitOps|Mov, 0,
0, 0, ImplicitOps | Mov | Stack, 0,
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
/* 0x70 - 0x77 */
......@@ -125,23 +130,24 @@ static u8 opcode_table[256] = {
/* 0x88 - 0x8F */
ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov,
0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
/* 0x90 - 0x9F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
/* 0xA0 - 0xA7 */
ByteOp | DstReg | SrcMem | Mov, DstReg | SrcMem | Mov,
ByteOp | DstMem | SrcReg | Mov, DstMem | SrcReg | Mov,
ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
ByteOp | ImplicitOps, ImplicitOps,
ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
ByteOp | ImplicitOps | String, ImplicitOps | String,
/* 0xA8 - 0xAF */
0, 0, ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
ByteOp | ImplicitOps | Mov, ImplicitOps | Mov,
ByteOp | ImplicitOps, ImplicitOps,
0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
ByteOp | ImplicitOps | String, ImplicitOps | String,
/* 0xB0 - 0xBF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xC0 - 0xC7 */
ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
0, ImplicitOps, 0, 0,
0, ImplicitOps | Stack, 0, 0,
ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
/* 0xC8 - 0xCF */
0, 0, 0, 0, 0, 0, 0, 0,
......@@ -154,13 +160,14 @@ static u8 opcode_table[256] = {
/* 0xE0 - 0xE7 */
0, 0, 0, 0, 0, 0, 0, 0,
/* 0xE8 - 0xEF */
ImplicitOps, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, 0, 0, 0, 0,
ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
0, 0, 0, 0,
/* 0xF0 - 0xF7 */
0, 0, 0, 0,
ImplicitOps, 0,
ImplicitOps, ImplicitOps,
ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
/* 0xF8 - 0xFF */
0, 0, 0, 0,
ImplicitOps, 0, ImplicitOps, ImplicitOps,
0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
};
......@@ -222,13 +229,6 @@ static u16 twobyte_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* Type, address-of, and value of an instruction's operand. */
struct operand {
enum { OP_REG, OP_MEM, OP_IMM } type;
unsigned int bytes;
unsigned long val, orig_val, *ptr;
};
/* EFLAGS bit definitions. */
#define EFLG_OF (1<<11)
#define EFLG_DF (1<<10)
......@@ -260,21 +260,21 @@ struct operand {
#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
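The _PRE_EFLAGS macro just below restores the emulated flag bits before the instruction runs; as its comment notes, it computes EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk) and then clears the masked bits in _sav, with _msk normally being EFLAGS_MASK above. A minimal sketch of that merge on ordinary integers (illustrative names only, not emulator code):

/* Merge the saved, emulated flag bits into the current EFLAGS image,
 * exactly as the _PRE_EFLAGS comment describes. */
static unsigned long pre_eflags_merge(unsigned long eflags,
                                      unsigned long sav, unsigned long msk)
{
        return (sav & msk) | (eflags & ~msk);
}

With msk = EFLAGS_MASK, the emulated instruction then executes under the guest's CF/PF/AF/ZF/SF/OF while the host's remaining flag bits are left untouched.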
/* Before executing instruction: restore necessary bits in EFLAGS. */
#define _PRE_EFLAGS(_sav, _msk, _tmp) \
/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); */ \
"push %"_sav"; " \
"movl %"_msk",%"_LO32 _tmp"; " \
"andl %"_LO32 _tmp",("_STK"); " \
"pushf; " \
"notl %"_LO32 _tmp"; " \
"andl %"_LO32 _tmp",("_STK"); " \
"pop %"_tmp"; " \
"orl %"_LO32 _tmp",("_STK"); " \
"popf; " \
/* _sav &= ~msk; */ \
"movl %"_msk",%"_LO32 _tmp"; " \
"notl %"_LO32 _tmp"; " \
"andl %"_LO32 _tmp",%"_sav"; "
#define _PRE_EFLAGS(_sav, _msk, _tmp) \
/* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \
"movl %"_sav",%"_LO32 _tmp"; " \
"push %"_tmp"; " \
"push %"_tmp"; " \
"movl %"_msk",%"_LO32 _tmp"; " \
"andl %"_LO32 _tmp",("_STK"); " \
"pushf; " \
"notl %"_LO32 _tmp"; " \
"andl %"_LO32 _tmp",("_STK"); " \
"andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \
"pop %"_tmp"; " \
"orl %"_LO32 _tmp",("_STK"); " \
"popf; " \
"pop %"_sav"; "
/* After executing instruction: write-back necessary bits in EFLAGS. */
#define _POST_EFLAGS(_sav, _msk, _tmp) \
......@@ -292,21 +292,21 @@ struct operand {
switch ((_dst).bytes) { \
case 2: \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0","4","2") \
_PRE_EFLAGS("0", "4", "2") \
_op"w %"_wx"3,%1; " \
_POST_EFLAGS("0","4","2") \
_POST_EFLAGS("0", "4", "2") \
: "=m" (_eflags), "=m" ((_dst).val), \
"=&r" (_tmp) \
: _wy ((_src).val), "i" (EFLAGS_MASK) ); \
: _wy ((_src).val), "i" (EFLAGS_MASK)); \
break; \
case 4: \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0","4","2") \
_PRE_EFLAGS("0", "4", "2") \
_op"l %"_lx"3,%1; " \
_POST_EFLAGS("0","4","2") \
_POST_EFLAGS("0", "4", "2") \
: "=m" (_eflags), "=m" ((_dst).val), \
"=&r" (_tmp) \
: _ly ((_src).val), "i" (EFLAGS_MASK) ); \
: _ly ((_src).val), "i" (EFLAGS_MASK)); \
break; \
case 8: \
__emulate_2op_8byte(_op, _src, _dst, \
......@@ -318,16 +318,15 @@ struct operand {
#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
do { \
unsigned long _tmp; \
switch ( (_dst).bytes ) \
{ \
switch ((_dst).bytes) { \
case 1: \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0","4","2") \
_PRE_EFLAGS("0", "4", "2") \
_op"b %"_bx"3,%1; " \
_POST_EFLAGS("0","4","2") \
_POST_EFLAGS("0", "4", "2") \
: "=m" (_eflags), "=m" ((_dst).val), \
"=&r" (_tmp) \
: _by ((_src).val), "i" (EFLAGS_MASK) ); \
: _by ((_src).val), "i" (EFLAGS_MASK)); \
break; \
default: \
__emulate_2op_nobyte(_op, _src, _dst, _eflags, \
......@@ -356,34 +355,33 @@ struct operand {
do { \
unsigned long _tmp; \
\
switch ( (_dst).bytes ) \
{ \
switch ((_dst).bytes) { \
case 1: \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0","3","2") \
_PRE_EFLAGS("0", "3", "2") \
_op"b %1; " \
_POST_EFLAGS("0","3","2") \
_POST_EFLAGS("0", "3", "2") \
: "=m" (_eflags), "=m" ((_dst).val), \
"=&r" (_tmp) \
: "i" (EFLAGS_MASK) ); \
: "i" (EFLAGS_MASK)); \
break; \
case 2: \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0","3","2") \
_PRE_EFLAGS("0", "3", "2") \
_op"w %1; " \
_POST_EFLAGS("0","3","2") \
_POST_EFLAGS("0", "3", "2") \
: "=m" (_eflags), "=m" ((_dst).val), \
"=&r" (_tmp) \
: "i" (EFLAGS_MASK) ); \
: "i" (EFLAGS_MASK)); \
break; \
case 4: \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0","3","2") \
_PRE_EFLAGS("0", "3", "2") \
_op"l %1; " \
_POST_EFLAGS("0","3","2") \
_POST_EFLAGS("0", "3", "2") \
: "=m" (_eflags), "=m" ((_dst).val), \
"=&r" (_tmp) \
: "i" (EFLAGS_MASK) ); \
: "i" (EFLAGS_MASK)); \
break; \
case 8: \
__emulate_1op_8byte(_op, _dst, _eflags); \
......@@ -396,21 +394,21 @@ struct operand {
#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
do { \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0","4","2") \
_PRE_EFLAGS("0", "4", "2") \
_op"q %"_qx"3,%1; " \
_POST_EFLAGS("0","4","2") \
_POST_EFLAGS("0", "4", "2") \
: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
: _qy ((_src).val), "i" (EFLAGS_MASK) ); \
: _qy ((_src).val), "i" (EFLAGS_MASK)); \
} while (0)
#define __emulate_1op_8byte(_op, _dst, _eflags) \
do { \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0","3","2") \
_PRE_EFLAGS("0", "3", "2") \
_op"q %1; " \
_POST_EFLAGS("0","3","2") \
_POST_EFLAGS("0", "3", "2") \
: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
: "i" (EFLAGS_MASK) ); \
: "i" (EFLAGS_MASK)); \
} while (0)
#elif defined(__i386__)
......@@ -421,9 +419,8 @@ struct operand {
/* Fetch next part of the instruction being emulated. */
#define insn_fetch(_type, _size, _eip) \
({ unsigned long _x; \
rc = ops->read_std((unsigned long)(_eip) + ctxt->cs_base, &_x, \
(_size), ctxt->vcpu); \
if ( rc != 0 ) \
rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \
if (rc != 0) \
goto done; \
(_eip) += (_size); \
(_type)_x; \
......@@ -431,26 +428,63 @@ struct operand {
/* Access/update address held in a register, based on addressing mode. */
#define address_mask(reg) \
((ad_bytes == sizeof(unsigned long)) ? \
(reg) : ((reg) & ((1UL << (ad_bytes << 3)) - 1)))
((c->ad_bytes == sizeof(unsigned long)) ? \
(reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
#define register_address(base, reg) \
((base) + address_mask(reg))
#define register_address_increment(reg, inc) \
do { \
/* signed type ensures sign extension to long */ \
int _inc = (inc); \
if ( ad_bytes == sizeof(unsigned long) ) \
if (c->ad_bytes == sizeof(unsigned long)) \
(reg) += _inc; \
else \
(reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \
(((reg) + _inc) & ((1UL << (ad_bytes << 3)) - 1)); \
(reg) = ((reg) & \
~((1UL << (c->ad_bytes << 3)) - 1)) | \
(((reg) + _inc) & \
((1UL << (c->ad_bytes << 3)) - 1)); \
} while (0)
#define JMP_REL(rel) \
do { \
register_address_increment(_eip, rel); \
register_address_increment(c->eip, rel); \
} while (0)
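The address_mask()/register_address_increment() macros above truncate register values to the current address size (c->ad_bytes), so a 16-bit or 32-bit guest wraps its effective addresses correctly. A stand-alone sketch of the same masking, with ad_bytes passed explicitly (illustrative only, not emulator code):

#include <stdio.h>

static unsigned long addr_mask(unsigned long reg, unsigned ad_bytes)
{
        /* Native word size: use the register as-is; otherwise keep
         * only the low ad_bytes * 8 bits, as address_mask() does. */
        if (ad_bytes == sizeof(unsigned long))
                return reg;
        return reg & ((1UL << (ad_bytes << 3)) - 1);
}

int main(void)
{
        printf("%lx\n", addr_mask(0x12345678UL, 2));    /* 5678: 16-bit wrap */
        printf("%lx\n", addr_mask(0x12345678UL, 4));    /* 12345678 */
        return 0;
}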
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned long linear, u8 *dest)
{
struct fetch_cache *fc = &ctxt->decode.fetch;
int rc;
int size;
if (linear < fc->start || linear >= fc->end) {
size = min(15UL, PAGE_SIZE - offset_in_page(linear));
rc = ops->read_std(linear, fc->data, size, ctxt->vcpu);
if (rc)
return rc;
fc->start = linear;
fc->end = linear + size;
}
*dest = fc->data[linear - fc->start];
return 0;
}
static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned long eip, void *dest, unsigned size)
{
int rc = 0;
eip += ctxt->cs_base;
while (size--) {
rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
if (rc)
return rc;
}
return 0;
}
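do_fetch_insn_byte() above keeps a small per-decode fetch cache: it reads up to 15 bytes of the instruction stream at once (an x86 instruction is at most 15 bytes long), never crossing a page boundary, and then serves later insn_fetch() bytes from the buffer, so decoding a long instruction costs one guest-memory read instead of one per byte. A minimal user-space sketch of the same pattern, with a hypothetical read_bytes() callback standing in for ops->read_std():

struct fetch_cache_model {
        unsigned char data[15];
        unsigned long start;            /* first cached linear address */
        unsigned long end;              /* one past the last cached byte */
};

/* Hypothetical backend; the emulator uses ops->read_std() here. */
typedef int (*read_bytes_fn)(unsigned long linear, void *dst, unsigned size);

static int fetch_byte(struct fetch_cache_model *fc, read_bytes_fn read_bytes,
                      unsigned long linear, unsigned char *dest)
{
        if (linear < fc->start || linear >= fc->end) {
                /* Refill: at most 15 bytes, never across a 4K page. */
                unsigned long room = 4096 - (linear & 4095);
                unsigned size = room < 15 ? (unsigned)room : 15;
                int rc = read_bytes(linear, fc->data, size);

                if (rc)
                        return rc;
                fc->start = linear;
                fc->end = linear + size;
        }
        *dest = fc->data[linear - fc->start];
        return 0;
}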
/*
* Given the 'reg' portion of a ModRM byte, and a register block, return a
* pointer into the block that addresses the relevant register.
......@@ -521,984 +555,1321 @@ static int test_cc(unsigned int condition, unsigned int flags)
return (!!rc ^ (condition & 1));
}
static void decode_register_operand(struct operand *op,
struct decode_cache *c,
int inhibit_bytereg)
{
unsigned reg = c->modrm_reg;
int highbyte_regs = c->rex_prefix == 0;
if (!(c->d & ModRM))
reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
op->type = OP_REG;
if ((c->d & ByteOp) && !inhibit_bytereg) {
op->ptr = decode_register(reg, c->regs, highbyte_regs);
op->val = *(u8 *)op->ptr;
op->bytes = 1;
} else {
op->ptr = decode_register(reg, c->regs, 0);
op->bytes = c->op_bytes;
switch (op->bytes) {
case 2:
op->val = *(u16 *)op->ptr;
break;
case 4:
op->val = *(u32 *)op->ptr;
break;
case 8:
op->val = *(u64 *) op->ptr;
break;
}
}
op->orig_val = op->val;
}
static int decode_modrm(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
u8 sib;
int index_reg = 0, base_reg = 0, scale, rip_relative = 0;
int rc = 0;
if (c->rex_prefix) {
c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REX.B */
}
c->modrm = insn_fetch(u8, 1, c->eip);
c->modrm_mod |= (c->modrm & 0xc0) >> 6;
c->modrm_reg |= (c->modrm & 0x38) >> 3;
c->modrm_rm |= (c->modrm & 0x07);
c->modrm_ea = 0;
c->use_modrm_ea = 1;
if (c->modrm_mod == 3) {
c->modrm_val = *(unsigned long *)
decode_register(c->modrm_rm, c->regs, c->d & ByteOp);
return rc;
}
if (c->ad_bytes == 2) {
unsigned bx = c->regs[VCPU_REGS_RBX];
unsigned bp = c->regs[VCPU_REGS_RBP];
unsigned si = c->regs[VCPU_REGS_RSI];
unsigned di = c->regs[VCPU_REGS_RDI];
/* 16-bit ModR/M decode. */
switch (c->modrm_mod) {
case 0:
if (c->modrm_rm == 6)
c->modrm_ea += insn_fetch(u16, 2, c->eip);
break;
case 1:
c->modrm_ea += insn_fetch(s8, 1, c->eip);
break;
case 2:
c->modrm_ea += insn_fetch(u16, 2, c->eip);
break;
}
switch (c->modrm_rm) {
case 0:
c->modrm_ea += bx + si;
break;
case 1:
c->modrm_ea += bx + di;
break;
case 2:
c->modrm_ea += bp + si;
break;
case 3:
c->modrm_ea += bp + di;
break;
case 4:
c->modrm_ea += si;
break;
case 5:
c->modrm_ea += di;
break;
case 6:
if (c->modrm_mod != 0)
c->modrm_ea += bp;
break;
case 7:
c->modrm_ea += bx;
break;
}
if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
(c->modrm_rm == 6 && c->modrm_mod != 0))
if (!c->override_base)
c->override_base = &ctxt->ss_base;
c->modrm_ea = (u16)c->modrm_ea;
} else {
/* 32/64-bit ModR/M decode. */
switch (c->modrm_rm) {
case 4:
case 12:
sib = insn_fetch(u8, 1, c->eip);
index_reg |= (sib >> 3) & 7;
base_reg |= sib & 7;
scale = sib >> 6;
switch (base_reg) {
case 5:
if (c->modrm_mod != 0)
c->modrm_ea += c->regs[base_reg];
else
c->modrm_ea +=
insn_fetch(s32, 4, c->eip);
break;
default:
c->modrm_ea += c->regs[base_reg];
}
switch (index_reg) {
case 4:
break;
default:
c->modrm_ea += c->regs[index_reg] << scale;
}
break;
case 5:
if (c->modrm_mod != 0)
c->modrm_ea += c->regs[c->modrm_rm];
else if (ctxt->mode == X86EMUL_MODE_PROT64)
rip_relative = 1;
break;
default:
c->modrm_ea += c->regs[c->modrm_rm];
break;
}
switch (c->modrm_mod) {
case 0:
if (c->modrm_rm == 5)
c->modrm_ea += insn_fetch(s32, 4, c->eip);
break;
case 1:
c->modrm_ea += insn_fetch(s8, 1, c->eip);
break;
case 2:
c->modrm_ea += insn_fetch(s32, 4, c->eip);
break;
}
}
if (rip_relative) {
c->modrm_ea += c->eip;
switch (c->d & SrcMask) {
case SrcImmByte:
c->modrm_ea += 1;
break;
case SrcImm:
if (c->d & ByteOp)
c->modrm_ea += 1;
else
if (c->op_bytes == 8)
c->modrm_ea += 4;
else
c->modrm_ea += c->op_bytes;
}
}
done:
return rc;
}
static int decode_abs(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
int rc = 0;
switch (c->ad_bytes) {
case 2:
c->modrm_ea = insn_fetch(u16, 2, c->eip);
break;
case 4:
c->modrm_ea = insn_fetch(u32, 4, c->eip);
break;
case 8:
c->modrm_ea = insn_fetch(u64, 8, c->eip);
break;
}
done:
return rc;
}
int
x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
unsigned d;
u8 b, sib, twobyte = 0, rex_prefix = 0;
u8 modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0;
unsigned long *override_base = NULL;
unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i;
struct decode_cache *c = &ctxt->decode;
int rc = 0;
struct operand src, dst;
unsigned long cr2 = ctxt->cr2;
int mode = ctxt->mode;
unsigned long modrm_ea;
int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
int no_wb = 0;
u64 msr_data;
int def_op_bytes, def_ad_bytes;
/* Shadow copy of register state. Committed on successful emulation. */
unsigned long _regs[NR_VCPU_REGS];
unsigned long _eip = ctxt->vcpu->rip, _eflags = ctxt->eflags;
unsigned long modrm_val = 0;
memcpy(_regs, ctxt->vcpu->regs, sizeof _regs);
memset(c, 0, sizeof(struct decode_cache));
c->eip = ctxt->vcpu->arch.rip;
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
switch (mode) {
case X86EMUL_MODE_REAL:
case X86EMUL_MODE_PROT16:
op_bytes = ad_bytes = 2;
def_op_bytes = def_ad_bytes = 2;
break;
case X86EMUL_MODE_PROT32:
op_bytes = ad_bytes = 4;
def_op_bytes = def_ad_bytes = 4;
break;
#ifdef CONFIG_X86_64
case X86EMUL_MODE_PROT64:
op_bytes = 4;
ad_bytes = 8;
def_op_bytes = 4;
def_ad_bytes = 8;
break;
#endif
default:
return -1;
}
c->op_bytes = def_op_bytes;
c->ad_bytes = def_ad_bytes;
/* Legacy prefixes. */
for (i = 0; i < 8; i++) {
switch (b = insn_fetch(u8, 1, _eip)) {
for (;;) {
switch (c->b = insn_fetch(u8, 1, c->eip)) {
case 0x66: /* operand-size override */
op_bytes ^= 6; /* switch between 2/4 bytes */
/* switch between 2/4 bytes */
c->op_bytes = def_op_bytes ^ 6;
break;
case 0x67: /* address-size override */
if (mode == X86EMUL_MODE_PROT64)
ad_bytes ^= 12; /* switch between 4/8 bytes */
/* switch between 4/8 bytes */
c->ad_bytes = def_ad_bytes ^ 12;
else
ad_bytes ^= 6; /* switch between 2/4 bytes */
/* switch between 2/4 bytes */
c->ad_bytes = def_ad_bytes ^ 6;
break;
case 0x2e: /* CS override */
override_base = &ctxt->cs_base;
c->override_base = &ctxt->cs_base;
break;
case 0x3e: /* DS override */
override_base = &ctxt->ds_base;
c->override_base = &ctxt->ds_base;
break;
case 0x26: /* ES override */
override_base = &ctxt->es_base;
c->override_base = &ctxt->es_base;
break;
case 0x64: /* FS override */
override_base = &ctxt->fs_base;
c->override_base = &ctxt->fs_base;
break;
case 0x65: /* GS override */
override_base = &ctxt->gs_base;
c->override_base = &ctxt->gs_base;
break;
case 0x36: /* SS override */
override_base = &ctxt->ss_base;
c->override_base = &ctxt->ss_base;
break;
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
goto done_prefixes;
c->rex_prefix = c->b;
continue;
case 0xf0: /* LOCK */
lock_prefix = 1;
c->lock_prefix = 1;
break;
case 0xf2: /* REPNE/REPNZ */
c->rep_prefix = REPNE_PREFIX;
break;
case 0xf3: /* REP/REPE/REPZ */
rep_prefix = 1;
c->rep_prefix = REPE_PREFIX;
break;
default:
goto done_prefixes;
}
/* Any legacy prefix after a REX prefix nullifies its effect. */
c->rex_prefix = 0;
}
done_prefixes:
/* REX prefix. */
if ((mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40)) {
rex_prefix = b;
if (b & 8)
op_bytes = 8; /* REX.W */
modrm_reg = (b & 4) << 1; /* REX.R */
index_reg = (b & 2) << 2; /* REX.X */
modrm_rm = base_reg = (b & 1) << 3; /* REX.B */
b = insn_fetch(u8, 1, _eip);
}
if (c->rex_prefix)
if (c->rex_prefix & 8)
c->op_bytes = 8; /* REX.W */
/* Opcode byte(s). */
d = opcode_table[b];
if (d == 0) {
c->d = opcode_table[c->b];
if (c->d == 0) {
/* Two-byte opcode? */
if (b == 0x0f) {
twobyte = 1;
b = insn_fetch(u8, 1, _eip);
d = twobyte_table[b];
if (c->b == 0x0f) {
c->twobyte = 1;
c->b = insn_fetch(u8, 1, c->eip);
c->d = twobyte_table[c->b];
}
/* Unrecognised? */
if (d == 0)
goto cannot_emulate;
if (c->d == 0) {
DPRINTF("Cannot emulate %02x\n", c->b);
return -1;
}
}
if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
c->op_bytes = 8;
/* ModRM and SIB bytes. */
if (d & ModRM) {
modrm = insn_fetch(u8, 1, _eip);
modrm_mod |= (modrm & 0xc0) >> 6;
modrm_reg |= (modrm & 0x38) >> 3;
modrm_rm |= (modrm & 0x07);
modrm_ea = 0;
use_modrm_ea = 1;
if (modrm_mod == 3) {
modrm_val = *(unsigned long *)
decode_register(modrm_rm, _regs, d & ByteOp);
goto modrm_done;
}
if (c->d & ModRM)
rc = decode_modrm(ctxt, ops);
else if (c->d & MemAbs)
rc = decode_abs(ctxt, ops);
if (rc)
goto done;
if (ad_bytes == 2) {
unsigned bx = _regs[VCPU_REGS_RBX];
unsigned bp = _regs[VCPU_REGS_RBP];
unsigned si = _regs[VCPU_REGS_RSI];
unsigned di = _regs[VCPU_REGS_RDI];
/* 16-bit ModR/M decode. */
switch (modrm_mod) {
case 0:
if (modrm_rm == 6)
modrm_ea += insn_fetch(u16, 2, _eip);
break;
case 1:
modrm_ea += insn_fetch(s8, 1, _eip);
break;
case 2:
modrm_ea += insn_fetch(u16, 2, _eip);
break;
}
switch (modrm_rm) {
case 0:
modrm_ea += bx + si;
break;
case 1:
modrm_ea += bx + di;
break;
case 2:
modrm_ea += bp + si;
break;
case 3:
modrm_ea += bp + di;
break;
case 4:
modrm_ea += si;
break;
case 5:
modrm_ea += di;
break;
case 6:
if (modrm_mod != 0)
modrm_ea += bp;
break;
case 7:
modrm_ea += bx;
break;
}
if (modrm_rm == 2 || modrm_rm == 3 ||
(modrm_rm == 6 && modrm_mod != 0))
if (!override_base)
override_base = &ctxt->ss_base;
modrm_ea = (u16)modrm_ea;
} else {
/* 32/64-bit ModR/M decode. */
switch (modrm_rm) {
case 4:
case 12:
sib = insn_fetch(u8, 1, _eip);
index_reg |= (sib >> 3) & 7;
base_reg |= sib & 7;
scale = sib >> 6;
switch (base_reg) {
case 5:
if (modrm_mod != 0)
modrm_ea += _regs[base_reg];
else
modrm_ea += insn_fetch(s32, 4, _eip);
break;
default:
modrm_ea += _regs[base_reg];
}
switch (index_reg) {
case 4:
break;
default:
modrm_ea += _regs[index_reg] << scale;
}
break;
case 5:
if (modrm_mod != 0)
modrm_ea += _regs[modrm_rm];
else if (mode == X86EMUL_MODE_PROT64)
rip_relative = 1;
break;
default:
modrm_ea += _regs[modrm_rm];
break;
}
switch (modrm_mod) {
case 0:
if (modrm_rm == 5)
modrm_ea += insn_fetch(s32, 4, _eip);
break;
case 1:
modrm_ea += insn_fetch(s8, 1, _eip);
break;
case 2:
modrm_ea += insn_fetch(s32, 4, _eip);
break;
}
}
if (!override_base)
override_base = &ctxt->ds_base;
if (mode == X86EMUL_MODE_PROT64 &&
override_base != &ctxt->fs_base &&
override_base != &ctxt->gs_base)
override_base = NULL;
if (override_base)
modrm_ea += *override_base;
if (rip_relative) {
modrm_ea += _eip;
switch (d & SrcMask) {
case SrcImmByte:
modrm_ea += 1;
break;
case SrcImm:
if (d & ByteOp)
modrm_ea += 1;
else
if (op_bytes == 8)
modrm_ea += 4;
else
modrm_ea += op_bytes;
}
}
if (ad_bytes != 8)
modrm_ea = (u32)modrm_ea;
cr2 = modrm_ea;
modrm_done:
;
}
if (!c->override_base)
c->override_base = &ctxt->ds_base;
if (mode == X86EMUL_MODE_PROT64 &&
c->override_base != &ctxt->fs_base &&
c->override_base != &ctxt->gs_base)
c->override_base = NULL;
if (c->override_base)
c->modrm_ea += *c->override_base;
if (c->ad_bytes != 8)
c->modrm_ea = (u32)c->modrm_ea;
/*
* Decode and fetch the source operand: register, memory
* or immediate.
*/
switch (d & SrcMask) {
switch (c->d & SrcMask) {
case SrcNone:
break;
case SrcReg:
src.type = OP_REG;
if (d & ByteOp) {
src.ptr = decode_register(modrm_reg, _regs,
(rex_prefix == 0));
src.val = src.orig_val = *(u8 *) src.ptr;
src.bytes = 1;
} else {
src.ptr = decode_register(modrm_reg, _regs, 0);
switch ((src.bytes = op_bytes)) {
case 2:
src.val = src.orig_val = *(u16 *) src.ptr;
break;
case 4:
src.val = src.orig_val = *(u32 *) src.ptr;
break;
case 8:
src.val = src.orig_val = *(u64 *) src.ptr;
break;
}
}
decode_register_operand(&c->src, c, 0);
break;
case SrcMem16:
src.bytes = 2;
c->src.bytes = 2;
goto srcmem_common;
case SrcMem32:
src.bytes = 4;
c->src.bytes = 4;
goto srcmem_common;
case SrcMem:
src.bytes = (d & ByteOp) ? 1 : op_bytes;
c->src.bytes = (c->d & ByteOp) ? 1 :
c->op_bytes;
/* Don't fetch the address for invlpg: it could be unmapped. */
if (twobyte && b == 0x01 && modrm_reg == 7)
if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
break;
srcmem_common:
srcmem_common:
/*
* For instructions with a ModR/M byte, switch to register
* access if Mod = 3.
*/
if ((d & ModRM) && modrm_mod == 3) {
src.type = OP_REG;
if ((c->d & ModRM) && c->modrm_mod == 3) {
c->src.type = OP_REG;
break;
}
src.type = OP_MEM;
src.ptr = (unsigned long *)cr2;
src.val = 0;
if ((rc = ops->read_emulated((unsigned long)src.ptr,
&src.val, src.bytes, ctxt->vcpu)) != 0)
goto done;
src.orig_val = src.val;
c->src.type = OP_MEM;
break;
case SrcImm:
src.type = OP_IMM;
src.ptr = (unsigned long *)_eip;
src.bytes = (d & ByteOp) ? 1 : op_bytes;
if (src.bytes == 8)
src.bytes = 4;
c->src.type = OP_IMM;
c->src.ptr = (unsigned long *)c->eip;
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
if (c->src.bytes == 8)
c->src.bytes = 4;
/* NB. Immediates are sign-extended as necessary. */
switch (src.bytes) {
switch (c->src.bytes) {
case 1:
src.val = insn_fetch(s8, 1, _eip);
c->src.val = insn_fetch(s8, 1, c->eip);
break;
case 2:
src.val = insn_fetch(s16, 2, _eip);
c->src.val = insn_fetch(s16, 2, c->eip);
break;
case 4:
src.val = insn_fetch(s32, 4, _eip);
c->src.val = insn_fetch(s32, 4, c->eip);
break;
}
break;
case SrcImmByte:
src.type = OP_IMM;
src.ptr = (unsigned long *)_eip;
src.bytes = 1;
src.val = insn_fetch(s8, 1, _eip);
c->src.type = OP_IMM;
c->src.ptr = (unsigned long *)c->eip;
c->src.bytes = 1;
c->src.val = insn_fetch(s8, 1, c->eip);
break;
}
/* Decode and fetch the destination operand: register or memory. */
switch (d & DstMask) {
switch (c->d & DstMask) {
case ImplicitOps:
/* Special instructions do their own operand decoding. */
goto special_insn;
return 0;
case DstReg:
dst.type = OP_REG;
if ((d & ByteOp)
&& !(twobyte && (b == 0xb6 || b == 0xb7))) {
dst.ptr = decode_register(modrm_reg, _regs,
(rex_prefix == 0));
dst.val = *(u8 *) dst.ptr;
dst.bytes = 1;
} else {
dst.ptr = decode_register(modrm_reg, _regs, 0);
switch ((dst.bytes = op_bytes)) {
case 2:
dst.val = *(u16 *)dst.ptr;
break;
case 4:
dst.val = *(u32 *)dst.ptr;
break;
case 8:
dst.val = *(u64 *)dst.ptr;
break;
}
}
decode_register_operand(&c->dst, c,
c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
break;
case DstMem:
dst.type = OP_MEM;
dst.ptr = (unsigned long *)cr2;
dst.bytes = (d & ByteOp) ? 1 : op_bytes;
dst.val = 0;
/*
* For instructions with a ModR/M byte, switch to register
* access if Mod = 3.
*/
if ((d & ModRM) && modrm_mod == 3) {
dst.type = OP_REG;
if ((c->d & ModRM) && c->modrm_mod == 3) {
c->dst.type = OP_REG;
break;
}
if (d & BitOp) {
unsigned long mask = ~(dst.bytes * 8 - 1);
dst.ptr = (void *)dst.ptr + (src.val & mask) / 8;
}
if (!(d & Mov) && /* optimisation - avoid slow emulated read */
((rc = ops->read_emulated((unsigned long)dst.ptr,
&dst.val, dst.bytes, ctxt->vcpu)) != 0))
goto done;
c->dst.type = OP_MEM;
break;
}
dst.orig_val = dst.val;
if (twobyte)
goto twobyte_insn;
done:
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
}
switch (b) {
case 0x00 ... 0x05:
add: /* add */
emulate_2op_SrcV("add", src, dst, _eflags);
break;
case 0x08 ... 0x0d:
or: /* or */
emulate_2op_SrcV("or", src, dst, _eflags);
break;
case 0x10 ... 0x15:
adc: /* adc */
emulate_2op_SrcV("adc", src, dst, _eflags);
static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
c->dst.type = OP_MEM;
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
c->dst.ptr = (void *) register_address(ctxt->ss_base,
c->regs[VCPU_REGS_RSP]);
}
static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
int rc;
rc = ops->read_std(register_address(ctxt->ss_base,
c->regs[VCPU_REGS_RSP]),
&c->dst.val, c->dst.bytes, ctxt->vcpu);
if (rc != 0)
return rc;
register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
return 0;
}
static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
{
struct decode_cache *c = &ctxt->decode;
switch (c->modrm_reg) {
case 0: /* rol */
emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
break;
case 0x18 ... 0x1d:
sbb: /* sbb */
emulate_2op_SrcV("sbb", src, dst, _eflags);
case 1: /* ror */
emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
break;
case 0x20 ... 0x23:
and: /* and */
emulate_2op_SrcV("and", src, dst, _eflags);
case 2: /* rcl */
emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
break;
case 0x24: /* and al imm8 */
dst.type = OP_REG;
dst.ptr = &_regs[VCPU_REGS_RAX];
dst.val = *(u8 *)dst.ptr;
dst.bytes = 1;
dst.orig_val = dst.val;
goto and;
case 0x25: /* and ax imm16, or eax imm32 */
dst.type = OP_REG;
dst.bytes = op_bytes;
dst.ptr = &_regs[VCPU_REGS_RAX];
if (op_bytes == 2)
dst.val = *(u16 *)dst.ptr;
else
dst.val = *(u32 *)dst.ptr;
dst.orig_val = dst.val;
goto and;
case 0x28 ... 0x2d:
sub: /* sub */
emulate_2op_SrcV("sub", src, dst, _eflags);
case 3: /* rcr */
emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
break;
case 0x30 ... 0x35:
xor: /* xor */
emulate_2op_SrcV("xor", src, dst, _eflags);
case 4: /* sal/shl */
case 6: /* sal/shl */
emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
break;
case 0x38 ... 0x3d:
cmp: /* cmp */
emulate_2op_SrcV("cmp", src, dst, _eflags);
case 5: /* shr */
emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
break;
case 0x63: /* movsxd */
if (mode != X86EMUL_MODE_PROT64)
goto cannot_emulate;
dst.val = (s32) src.val;
case 7: /* sar */
emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
break;
case 0x80 ... 0x83: /* Grp1 */
switch (modrm_reg) {
case 0:
goto add;
}
}
static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
int rc = 0;
switch (c->modrm_reg) {
case 0 ... 1: /* test */
/*
* Special case in Grp3: test has an immediate
* source operand.
*/
c->src.type = OP_IMM;
c->src.ptr = (unsigned long *)c->eip;
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
if (c->src.bytes == 8)
c->src.bytes = 4;
switch (c->src.bytes) {
case 1:
goto or;
c->src.val = insn_fetch(s8, 1, c->eip);
break;
case 2:
goto adc;
case 3:
goto sbb;
c->src.val = insn_fetch(s16, 2, c->eip);
break;
case 4:
goto and;
case 5:
goto sub;
case 6:
goto xor;
case 7:
goto cmp;
c->src.val = insn_fetch(s32, 4, c->eip);
break;
}
emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
break;
case 0x84 ... 0x85:
test: /* test */
emulate_2op_SrcV("test", src, dst, _eflags);
case 2: /* not */
c->dst.val = ~c->dst.val;
break;
case 0x86 ... 0x87: /* xchg */
/* Write back the register source. */
switch (dst.bytes) {
case 1:
*(u8 *) src.ptr = (u8) dst.val;
break;
case 2:
*(u16 *) src.ptr = (u16) dst.val;
break;
case 4:
*src.ptr = (u32) dst.val;
break; /* 64b reg: zero-extend */
case 8:
*src.ptr = dst.val;
break;
}
/*
* Write back the memory destination with implicit LOCK
* prefix.
*/
dst.val = src.val;
lock_prefix = 1;
break;
case 0x88 ... 0x8b: /* mov */
goto mov;
case 0x8d: /* lea r16/r32, m */
dst.val = modrm_val;
case 3: /* neg */
emulate_1op("neg", c->dst, ctxt->eflags);
break;
case 0x8f: /* pop (sole member of Grp1a) */
/* 64-bit mode: POP always pops a 64-bit operand. */
if (mode == X86EMUL_MODE_PROT64)
dst.bytes = 8;
if ((rc = ops->read_std(register_address(ctxt->ss_base,
_regs[VCPU_REGS_RSP]),
&dst.val, dst.bytes, ctxt->vcpu)) != 0)
goto done;
register_address_increment(_regs[VCPU_REGS_RSP], dst.bytes);
default:
DPRINTF("Cannot emulate %02x\n", c->b);
rc = X86EMUL_UNHANDLEABLE;
break;
case 0xa0 ... 0xa1: /* mov */
dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
dst.val = src.val;
_eip += ad_bytes; /* skip src displacement */
}
done:
return rc;
}
static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
int rc;
switch (c->modrm_reg) {
case 0: /* inc */
emulate_1op("inc", c->dst, ctxt->eflags);
break;
case 0xa2 ... 0xa3: /* mov */
dst.val = (unsigned long)_regs[VCPU_REGS_RAX];
_eip += ad_bytes; /* skip dst displacement */
case 1: /* dec */
emulate_1op("dec", c->dst, ctxt->eflags);
break;
case 0xc0 ... 0xc1:
grp2: /* Grp2 */
switch (modrm_reg) {
case 0: /* rol */
emulate_2op_SrcB("rol", src, dst, _eflags);
break;
case 1: /* ror */
emulate_2op_SrcB("ror", src, dst, _eflags);
break;
case 2: /* rcl */
emulate_2op_SrcB("rcl", src, dst, _eflags);
break;
case 3: /* rcr */
emulate_2op_SrcB("rcr", src, dst, _eflags);
break;
case 4: /* sal/shl */
case 6: /* sal/shl */
emulate_2op_SrcB("sal", src, dst, _eflags);
break;
case 5: /* shr */
emulate_2op_SrcB("shr", src, dst, _eflags);
break;
case 7: /* sar */
emulate_2op_SrcB("sar", src, dst, _eflags);
break;
case 4: /* jmp abs */
if (c->b == 0xff)
c->eip = c->dst.val;
else {
DPRINTF("Cannot emulate %02x\n", c->b);
return X86EMUL_UNHANDLEABLE;
}
break;
case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
mov:
dst.val = src.val;
break;
case 0xd0 ... 0xd1: /* Grp2 */
src.val = 1;
goto grp2;
case 0xd2 ... 0xd3: /* Grp2 */
src.val = _regs[VCPU_REGS_RCX];
goto grp2;
case 0xf6 ... 0xf7: /* Grp3 */
switch (modrm_reg) {
case 0 ... 1: /* test */
/*
* Special case in Grp3: test has an immediate
* source operand.
*/
src.type = OP_IMM;
src.ptr = (unsigned long *)_eip;
src.bytes = (d & ByteOp) ? 1 : op_bytes;
if (src.bytes == 8)
src.bytes = 4;
switch (src.bytes) {
case 1:
src.val = insn_fetch(s8, 1, _eip);
break;
case 2:
src.val = insn_fetch(s16, 2, _eip);
break;
case 4:
src.val = insn_fetch(s32, 4, _eip);
break;
}
goto test;
case 2: /* not */
dst.val = ~dst.val;
break;
case 3: /* neg */
emulate_1op("neg", dst, _eflags);
break;
default:
goto cannot_emulate;
case 6: /* push */
/* 64-bit mode: PUSH always pushes a 64-bit operand. */
if (ctxt->mode == X86EMUL_MODE_PROT64) {
c->dst.bytes = 8;
rc = ops->read_std((unsigned long)c->dst.ptr,
&c->dst.val, 8, ctxt->vcpu);
if (rc != 0)
return rc;
}
register_address_increment(c->regs[VCPU_REGS_RSP],
-c->dst.bytes);
rc = ops->write_emulated(register_address(ctxt->ss_base,
c->regs[VCPU_REGS_RSP]), &c->dst.val,
c->dst.bytes, ctxt->vcpu);
if (rc != 0)
return rc;
c->dst.type = OP_NONE;
break;
case 0xfe ... 0xff: /* Grp4/Grp5 */
switch (modrm_reg) {
case 0: /* inc */
emulate_1op("inc", dst, _eflags);
break;
case 1: /* dec */
emulate_1op("dec", dst, _eflags);
default:
DPRINTF("Cannot emulate %02x\n", c->b);
return X86EMUL_UNHANDLEABLE;
}
return 0;
}
static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned long memop)
{
struct decode_cache *c = &ctxt->decode;
u64 old, new;
int rc;
rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
if (rc != 0)
return rc;
if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
ctxt->eflags &= ~EFLG_ZF;
} else {
new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
(u32) c->regs[VCPU_REGS_RBX];
rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
if (rc != 0)
return rc;
ctxt->eflags |= EFLG_ZF;
}
return 0;
}
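emulate_grp9() above is the cmpxchg8b path: the 64-bit memory operand is compared against EDX:EAX; on a match it is replaced by ECX:EBX and ZF is set, otherwise the old value is loaded back into EDX:EAX and ZF is cleared. A small model of that flow on plain variables (illustrative names, not emulator code):

#include <stdint.h>
#include <stdio.h>

/* Returns the resulting ZF: 1 if the exchange happened, 0 otherwise. */
static int cmpxchg8b_model(uint64_t *mem, uint32_t *eax, uint32_t *edx,
                           uint32_t ebx, uint32_t ecx)
{
        uint64_t old = *mem;

        if ((uint32_t)old == *eax && (uint32_t)(old >> 32) == *edx) {
                *mem = ((uint64_t)ecx << 32) | ebx;     /* ZF = 1 */
                return 1;
        }
        *eax = (uint32_t)old;                           /* ZF = 0 */
        *edx = (uint32_t)(old >> 32);
        return 0;
}

int main(void)
{
        uint64_t mem = 0x1122334455667788ULL;
        uint32_t eax = 0x55667788, edx = 0x11223344;

        /* Matching compare: memory is replaced by ECX:EBX. */
        printf("zf=%d mem=%llx\n",
               cmpxchg8b_model(&mem, &eax, &edx, 0xdeadbeef, 0xcafebabe),
               (unsigned long long)mem);
        return 0;
}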
static inline int writeback(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
int rc;
struct decode_cache *c = &ctxt->decode;
switch (c->dst.type) {
case OP_REG:
/* The 4-byte case *is* correct:
* in 64-bit mode we zero-extend.
*/
switch (c->dst.bytes) {
case 1:
*(u8 *)c->dst.ptr = (u8)c->dst.val;
break;
case 4: /* jmp abs */
if (b == 0xff)
_eip = dst.val;
else
goto cannot_emulate;
case 2:
*(u16 *)c->dst.ptr = (u16)c->dst.val;
break;
case 6: /* push */
/* 64-bit mode: PUSH always pushes a 64-bit operand. */
if (mode == X86EMUL_MODE_PROT64) {
dst.bytes = 8;
if ((rc = ops->read_std((unsigned long)dst.ptr,
&dst.val, 8,
ctxt->vcpu)) != 0)
goto done;
}
register_address_increment(_regs[VCPU_REGS_RSP],
-dst.bytes);
if ((rc = ops->write_emulated(
register_address(ctxt->ss_base,
_regs[VCPU_REGS_RSP]),
&dst.val, dst.bytes, ctxt->vcpu)) != 0)
goto done;
no_wb = 1;
case 4:
*c->dst.ptr = (u32)c->dst.val;
break; /* 64b: zero-ext */
case 8:
*c->dst.ptr = c->dst.val;
break;
default:
goto cannot_emulate;
}
break;
case OP_MEM:
if (c->lock_prefix)
rc = ops->cmpxchg_emulated(
(unsigned long)c->dst.ptr,
&c->dst.orig_val,
&c->dst.val,
c->dst.bytes,
ctxt->vcpu);
else
rc = ops->write_emulated(
(unsigned long)c->dst.ptr,
&c->dst.val,
c->dst.bytes,
ctxt->vcpu);
if (rc != 0)
return rc;
break;
case OP_NONE:
/* no writeback */
break;
default:
break;
}
return 0;
}
writeback:
if (!no_wb) {
switch (dst.type) {
case OP_REG:
/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
switch (dst.bytes) {
case 1:
*(u8 *)dst.ptr = (u8)dst.val;
break;
case 2:
*(u16 *)dst.ptr = (u16)dst.val;
break;
case 4:
*dst.ptr = (u32)dst.val;
break; /* 64b: zero-ext */
case 8:
*dst.ptr = dst.val;
break;
int
x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
unsigned long memop = 0;
u64 msr_data;
unsigned long saved_eip = 0;
struct decode_cache *c = &ctxt->decode;
int rc = 0;
/* Shadow copy of register state. Committed on successful emulation.
* NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
* modify them.
*/
memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
saved_eip = c->eip;
if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
memop = c->modrm_ea;
if (c->rep_prefix && (c->d & String)) {
/* All REP prefixes have the same first termination condition */
if (c->regs[VCPU_REGS_RCX] == 0) {
ctxt->vcpu->arch.rip = c->eip;
goto done;
}
/* The second termination condition only applies to REPE
* and REPNE.  If the repeat prefix is REPE/REPZ or
* REPNE/REPNZ, check the corresponding termination
* condition:
* - if REPE/REPZ and ZF = 0 then done
* - if REPNE/REPNZ and ZF = 1 then done
*/
if ((c->b == 0xa6) || (c->b == 0xa7) ||
(c->b == 0xae) || (c->b == 0xaf)) {
if ((c->rep_prefix == REPE_PREFIX) &&
((ctxt->eflags & EFLG_ZF) == 0)) {
ctxt->vcpu->arch.rip = c->eip;
goto done;
}
break;
case OP_MEM:
if (lock_prefix)
rc = ops->cmpxchg_emulated((unsigned long)dst.
ptr, &dst.orig_val,
&dst.val, dst.bytes,
ctxt->vcpu);
else
rc = ops->write_emulated((unsigned long)dst.ptr,
&dst.val, dst.bytes,
ctxt->vcpu);
if (rc != 0)
if ((c->rep_prefix == REPNE_PREFIX) &&
((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
ctxt->vcpu->arch.rip = c->eip;
goto done;
default:
break;
}
}
c->regs[VCPU_REGS_RCX]--;
c->eip = ctxt->vcpu->arch.rip;
}
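So the REP handling above stops either when RCX reaches zero or, for cmps/scas (opcodes 0xa6/0xa7/0xae/0xaf), when ZF contradicts the prefix. The same test as a stand-alone predicate (a sketch with assumed parameter names, not emulator code):

/* Returns nonzero when a REP-prefixed string op should terminate.
 * rep_e is 1 for REPE/REPZ, 0 for REPNE/REPNZ; zf is the current ZF. */
static int rep_string_done(unsigned long rcx, int is_cmps_or_scas,
                           int rep_e, int zf)
{
        if (rcx == 0)
                return 1;               /* first condition: count exhausted */
        if (!is_cmps_or_scas)
                return 0;               /* second condition is cmps/scas only */
        return rep_e ? !zf : zf;        /* REPE stops on ZF=0, REPNE on ZF=1 */
}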
/* Commit shadow register state. */
memcpy(ctxt->vcpu->regs, _regs, sizeof _regs);
ctxt->eflags = _eflags;
ctxt->vcpu->rip = _eip;
if (c->src.type == OP_MEM) {
c->src.ptr = (unsigned long *)memop;
c->src.val = 0;
rc = ops->read_emulated((unsigned long)c->src.ptr,
&c->src.val,
c->src.bytes,
ctxt->vcpu);
if (rc != 0)
goto done;
c->src.orig_val = c->src.val;
}
done:
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
if ((c->d & DstMask) == ImplicitOps)
goto special_insn;
if (c->dst.type == OP_MEM) {
c->dst.ptr = (unsigned long *)memop;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.val = 0;
if (c->d & BitOp) {
unsigned long mask = ~(c->dst.bytes * 8 - 1);
c->dst.ptr = (void *)c->dst.ptr +
(c->src.val & mask) / 8;
}
if (!(c->d & Mov) &&
/* optimisation - avoid slow emulated read */
((rc = ops->read_emulated((unsigned long)c->dst.ptr,
&c->dst.val,
c->dst.bytes, ctxt->vcpu)) != 0))
goto done;
}
c->dst.orig_val = c->dst.val;
special_insn:
if (twobyte)
goto twobyte_special_insn;
switch(b) {
case 0x50 ... 0x57: /* push reg */
if (op_bytes == 2)
src.val = (u16) _regs[b & 0x7];
if (c->twobyte)
goto twobyte_insn;
switch (c->b) {
case 0x00 ... 0x05:
add: /* add */
emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
break;
case 0x08 ... 0x0d:
or: /* or */
emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
break;
case 0x10 ... 0x15:
adc: /* adc */
emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
break;
case 0x18 ... 0x1d:
sbb: /* sbb */
emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
break;
case 0x20 ... 0x23:
and: /* and */
emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
break;
case 0x24: /* and al imm8 */
c->dst.type = OP_REG;
c->dst.ptr = &c->regs[VCPU_REGS_RAX];
c->dst.val = *(u8 *)c->dst.ptr;
c->dst.bytes = 1;
c->dst.orig_val = c->dst.val;
goto and;
case 0x25: /* and ax imm16, or eax imm32 */
c->dst.type = OP_REG;
c->dst.bytes = c->op_bytes;
c->dst.ptr = &c->regs[VCPU_REGS_RAX];
if (c->op_bytes == 2)
c->dst.val = *(u16 *)c->dst.ptr;
else
src.val = (u32) _regs[b & 0x7];
dst.type = OP_MEM;
dst.bytes = op_bytes;
dst.val = src.val;
register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
dst.ptr = (void *) register_address(
ctxt->ss_base, _regs[VCPU_REGS_RSP]);
c->dst.val = *(u32 *)c->dst.ptr;
c->dst.orig_val = c->dst.val;
goto and;
case 0x28 ... 0x2d:
sub: /* sub */
emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
break;
case 0x30 ... 0x35:
xor: /* xor */
emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
break;
case 0x38 ... 0x3d:
cmp: /* cmp */
emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
break;
case 0x40 ... 0x47: /* inc r16/r32 */
emulate_1op("inc", c->dst, ctxt->eflags);
break;
case 0x48 ... 0x4f: /* dec r16/r32 */
emulate_1op("dec", c->dst, ctxt->eflags);
break;
case 0x50 ... 0x57: /* push reg */
c->dst.type = OP_MEM;
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
register_address_increment(c->regs[VCPU_REGS_RSP],
-c->op_bytes);
c->dst.ptr = (void *) register_address(
ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
break;
case 0x58 ... 0x5f: /* pop reg */
dst.ptr = (unsigned long *)&_regs[b & 0x7];
pop_instruction:
if ((rc = ops->read_std(register_address(ctxt->ss_base,
_regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt->vcpu))
!= 0)
c->regs[VCPU_REGS_RSP]), c->dst.ptr,
c->op_bytes, ctxt->vcpu)) != 0)
goto done;
register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
no_wb = 1; /* Disable writeback. */
register_address_increment(c->regs[VCPU_REGS_RSP],
c->op_bytes);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
goto cannot_emulate;
c->dst.val = (s32) c->src.val;
break;
case 0x6a: /* push imm8 */
src.val = 0L;
src.val = insn_fetch(s8, 1, _eip);
push:
dst.type = OP_MEM;
dst.bytes = op_bytes;
dst.val = src.val;
register_address_increment(_regs[VCPU_REGS_RSP], -op_bytes);
dst.ptr = (void *) register_address(ctxt->ss_base,
_regs[VCPU_REGS_RSP]);
c->src.val = 0L;
c->src.val = insn_fetch(s8, 1, c->eip);
emulate_push(ctxt);
break;
case 0x6c: /* insb */
case 0x6d: /* insw/insd */
if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
1, /* in */
(d & ByteOp) ? 1 : op_bytes, /* size */
rep_prefix ?
address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
(_eflags & EFLG_DF), /* down */
1,
(c->d & ByteOp) ? 1 : c->op_bytes,
c->rep_prefix ?
address_mask(c->regs[VCPU_REGS_RCX]) : 1,
(ctxt->eflags & EFLG_DF),
register_address(ctxt->es_base,
_regs[VCPU_REGS_RDI]), /* address */
rep_prefix,
_regs[VCPU_REGS_RDX] /* port */
) == 0)
c->regs[VCPU_REGS_RDI]),
c->rep_prefix,
c->regs[VCPU_REGS_RDX]) == 0) {
c->eip = saved_eip;
return -1;
}
return 0;
case 0x6e: /* outsb */
case 0x6f: /* outsw/outsd */
if (kvm_emulate_pio_string(ctxt->vcpu, NULL,
0, /* in */
(d & ByteOp) ? 1 : op_bytes, /* size */
rep_prefix ?
address_mask(_regs[VCPU_REGS_RCX]) : 1, /* count */
(_eflags & EFLG_DF), /* down */
register_address(override_base ?
*override_base : ctxt->ds_base,
_regs[VCPU_REGS_RSI]), /* address */
rep_prefix,
_regs[VCPU_REGS_RDX] /* port */
) == 0)
0,
(c->d & ByteOp) ? 1 : c->op_bytes,
c->rep_prefix ?
address_mask(c->regs[VCPU_REGS_RCX]) : 1,
(ctxt->eflags & EFLG_DF),
register_address(c->override_base ?
*c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]),
c->rep_prefix,
c->regs[VCPU_REGS_RDX]) == 0) {
c->eip = saved_eip;
return -1;
}
return 0;
case 0x70 ... 0x7f: /* jcc (short) */ {
int rel = insn_fetch(s8, 1, _eip);
int rel = insn_fetch(s8, 1, c->eip);
if (test_cc(b, _eflags))
JMP_REL(rel);
if (test_cc(c->b, ctxt->eflags))
JMP_REL(rel);
break;
}
case 0x80 ... 0x83: /* Grp1 */
switch (c->modrm_reg) {
case 0:
goto add;
case 1:
goto or;
case 2:
goto adc;
case 3:
goto sbb;
case 4:
goto and;
case 5:
goto sub;
case 6:
goto xor;
case 7:
goto cmp;
}
break;
case 0x84 ... 0x85:
emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
break;
case 0x86 ... 0x87: /* xchg */
/* Write back the register source. */
switch (c->dst.bytes) {
case 1:
*(u8 *) c->src.ptr = (u8) c->dst.val;
break;
case 2:
*(u16 *) c->src.ptr = (u16) c->dst.val;
break;
case 4:
*c->src.ptr = (u32) c->dst.val;
break; /* 64b reg: zero-extend */
case 8:
*c->src.ptr = c->dst.val;
break;
}
/*
* Write back the memory destination with implicit LOCK
* prefix.
*/
c->dst.val = c->src.val;
c->lock_prefix = 1;
break;
case 0x88 ... 0x8b: /* mov */
goto mov;
case 0x8d: /* lea r16/r32, m */
c->dst.val = c->modrm_val;
break;
case 0x8f: /* pop (sole member of Grp1a) */
rc = emulate_grp1a(ctxt, ops);
if (rc != 0)
goto done;
break;
case 0x9c: /* pushf */
src.val = (unsigned long) _eflags;
goto push;
c->src.val = (unsigned long) ctxt->eflags;
emulate_push(ctxt);
break;
case 0x9d: /* popf */
dst.ptr = (unsigned long *) &_eflags;
goto pop_instruction;
case 0xc3: /* ret */
dst.ptr = &_eip;
c->dst.ptr = (unsigned long *) &ctxt->eflags;
goto pop_instruction;
case 0xf4: /* hlt */
ctxt->vcpu->halt_request = 1;
goto done;
}
if (rep_prefix) {
if (_regs[VCPU_REGS_RCX] == 0) {
ctxt->vcpu->rip = _eip;
goto done;
}
_regs[VCPU_REGS_RCX]--;
_eip = ctxt->vcpu->rip;
}
switch (b) {
case 0xa0 ... 0xa1: /* mov */
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
c->dst.val = c->src.val;
break;
case 0xa2 ... 0xa3: /* mov */
c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
break;
case 0xa4 ... 0xa5: /* movs */
dst.type = OP_MEM;
dst.bytes = (d & ByteOp) ? 1 : op_bytes;
dst.ptr = (unsigned long *)register_address(ctxt->es_base,
_regs[VCPU_REGS_RDI]);
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = (unsigned long *)register_address(
ctxt->es_base,
c->regs[VCPU_REGS_RDI]);
if ((rc = ops->read_emulated(register_address(
override_base ? *override_base : ctxt->ds_base,
_regs[VCPU_REGS_RSI]), &dst.val, dst.bytes, ctxt->vcpu)) != 0)
c->override_base ? *c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]),
&c->dst.val,
c->dst.bytes, ctxt->vcpu)) != 0)
goto done;
register_address_increment(_regs[VCPU_REGS_RSI],
(_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
register_address_increment(_regs[VCPU_REGS_RDI],
(_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
register_address_increment(c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
register_address_increment(c->regs[VCPU_REGS_RDI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
break;
case 0xa6 ... 0xa7: /* cmps */
DPRINTF("Urk! I don't handle CMPS.\n");
goto cannot_emulate;
c->src.type = OP_NONE; /* Disable writeback. */
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->src.ptr = (unsigned long *)register_address(
c->override_base ? *c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]);
if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
&c->src.val,
c->src.bytes,
ctxt->vcpu)) != 0)
goto done;
c->dst.type = OP_NONE; /* Disable writeback. */
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = (unsigned long *)register_address(
ctxt->es_base,
c->regs[VCPU_REGS_RDI]);
if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
&c->dst.val,
c->dst.bytes,
ctxt->vcpu)) != 0)
goto done;
DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
register_address_increment(c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->src.bytes
: c->src.bytes);
register_address_increment(c->regs[VCPU_REGS_RDI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
break;
case 0xaa ... 0xab: /* stos */
dst.type = OP_MEM;
dst.bytes = (d & ByteOp) ? 1 : op_bytes;
dst.ptr = (unsigned long *)cr2;
dst.val = _regs[VCPU_REGS_RAX];
register_address_increment(_regs[VCPU_REGS_RDI],
(_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = (unsigned long *)register_address(
ctxt->es_base,
c->regs[VCPU_REGS_RDI]);
c->dst.val = c->regs[VCPU_REGS_RAX];
register_address_increment(c->regs[VCPU_REGS_RDI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
break;
case 0xac ... 0xad: /* lods */
dst.type = OP_REG;
dst.bytes = (d & ByteOp) ? 1 : op_bytes;
dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
if ((rc = ops->read_emulated(cr2, &dst.val, dst.bytes,
ctxt->vcpu)) != 0)
c->dst.type = OP_REG;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
if ((rc = ops->read_emulated(register_address(
c->override_base ? *c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]),
&c->dst.val,
c->dst.bytes,
ctxt->vcpu)) != 0)
goto done;
register_address_increment(_regs[VCPU_REGS_RSI],
(_eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
register_address_increment(c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
break;
case 0xae ... 0xaf: /* scas */
DPRINTF("Urk! I don't handle SCAS.\n");
goto cannot_emulate;
case 0xc0 ... 0xc1:
emulate_grp2(ctxt);
break;
case 0xc3: /* ret */
c->dst.ptr = &c->eip;
goto pop_instruction;
case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
mov:
c->dst.val = c->src.val;
break;
case 0xd0 ... 0xd1: /* Grp2 */
c->src.val = 1;
emulate_grp2(ctxt);
break;
case 0xd2 ... 0xd3: /* Grp2 */
c->src.val = c->regs[VCPU_REGS_RCX];
emulate_grp2(ctxt);
break;
case 0xe8: /* call (near) */ {
long int rel;
switch (op_bytes) {
switch (c->op_bytes) {
case 2:
rel = insn_fetch(s16, 2, _eip);
rel = insn_fetch(s16, 2, c->eip);
break;
case 4:
rel = insn_fetch(s32, 4, _eip);
break;
case 8:
rel = insn_fetch(s64, 8, _eip);
rel = insn_fetch(s32, 4, c->eip);
break;
default:
DPRINTF("Call: Invalid op_bytes\n");
goto cannot_emulate;
}
src.val = (unsigned long) _eip;
c->src.val = (unsigned long) c->eip;
JMP_REL(rel);
op_bytes = ad_bytes;
goto push;
c->op_bytes = c->ad_bytes;
emulate_push(ctxt);
break;
}
case 0xe9: /* jmp rel */
case 0xeb: /* jmp rel short */
JMP_REL(src.val);
no_wb = 1; /* Disable writeback. */
JMP_REL(c->src.val);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
ctxt->vcpu->arch.halt_request = 1;
goto done;
case 0xf5: /* cmc */
/* complement carry flag from eflags reg */
ctxt->eflags ^= EFLG_CF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf6 ... 0xf7: /* Grp3 */
rc = emulate_grp3(ctxt, ops);
if (rc != 0)
goto done;
break;
case 0xf8: /* clc */
ctxt->eflags &= ~EFLG_CF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfa: /* cli */
ctxt->eflags &= ~X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfb: /* sti */
ctxt->eflags |= X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfe ... 0xff: /* Grp4/Grp5 */
rc = emulate_grp45(ctxt, ops);
if (rc != 0)
goto done;
break;
}
writeback:
rc = writeback(ctxt, ops);
if (rc != 0)
goto done;
/* Commit shadow register state. */
memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
ctxt->vcpu->arch.rip = c->eip;
done:
if (rc == X86EMUL_UNHANDLEABLE) {
c->eip = saved_eip;
return -1;
}
goto writeback;
return 0;
twobyte_insn:
switch (b) {
switch (c->b) {
case 0x01: /* lgdt, lidt, lmsw */
/* Disable writeback. */
no_wb = 1;
switch (modrm_reg) {
switch (c->modrm_reg) {
u16 size;
unsigned long address;
case 2: /* lgdt */
rc = read_descriptor(ctxt, ops, src.ptr,
&size, &address, op_bytes);
case 0: /* vmcall */
if (c->modrm_mod != 3 || c->modrm_rm != 1)
goto cannot_emulate;
rc = kvm_fix_hypercall(ctxt->vcpu);
if (rc)
goto done;
realmode_lgdt(ctxt->vcpu, size, address);
kvm_emulate_hypercall(ctxt->vcpu);
break;
case 3: /* lidt */
rc = read_descriptor(ctxt, ops, src.ptr,
&size, &address, op_bytes);
case 2: /* lgdt */
rc = read_descriptor(ctxt, ops, c->src.ptr,
&size, &address, c->op_bytes);
if (rc)
goto done;
realmode_lidt(ctxt->vcpu, size, address);
realmode_lgdt(ctxt->vcpu, size, address);
break;
case 3: /* lidt/vmmcall */
if (c->modrm_mod == 3 && c->modrm_rm == 1) {
rc = kvm_fix_hypercall(ctxt->vcpu);
if (rc)
goto done;
kvm_emulate_hypercall(ctxt->vcpu);
} else {
rc = read_descriptor(ctxt, ops, c->src.ptr,
&size, &address,
c->op_bytes);
if (rc)
goto done;
realmode_lidt(ctxt->vcpu, size, address);
}
break;
case 4: /* smsw */
if (modrm_mod != 3)
if (c->modrm_mod != 3)
goto cannot_emulate;
*(u16 *)&_regs[modrm_rm]
*(u16 *)&c->regs[c->modrm_rm]
= realmode_get_cr(ctxt->vcpu, 0);
break;
case 6: /* lmsw */
if (modrm_mod != 3)
if (c->modrm_mod != 3)
goto cannot_emulate;
realmode_lmsw(ctxt->vcpu, (u16)modrm_val, &_eflags);
realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
&ctxt->eflags);
break;
case 7: /* invlpg*/
emulate_invlpg(ctxt->vcpu, cr2);
emulate_invlpg(ctxt->vcpu, memop);
break;
default:
goto cannot_emulate;
}
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
case 0x06:
emulate_clts(ctxt->vcpu);
c->dst.type = OP_NONE;
break;
case 0x08: /* invd */
case 0x09: /* wbinvd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
c->dst.type = OP_NONE;
break;
case 0x20: /* mov cr, reg */
if (c->modrm_mod != 3)
goto cannot_emulate;
c->regs[c->modrm_rm] =
realmode_get_cr(ctxt->vcpu, c->modrm_reg);
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x21: /* mov from dr to reg */
no_wb = 1;
if (modrm_mod != 3)
if (c->modrm_mod != 3)
goto cannot_emulate;
rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
if (rc)
goto cannot_emulate;
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x22: /* mov reg, cr */
if (c->modrm_mod != 3)
goto cannot_emulate;
rc = emulator_get_dr(ctxt, modrm_reg, &_regs[modrm_rm]);
realmode_set_cr(ctxt->vcpu,
c->modrm_reg, c->modrm_val, &ctxt->eflags);
c->dst.type = OP_NONE;
break;
case 0x23: /* mov from reg to dr */
no_wb = 1;
if (modrm_mod != 3)
if (c->modrm_mod != 3)
goto cannot_emulate;
rc = emulator_set_dr(ctxt, modrm_reg, _regs[modrm_rm]);
rc = emulator_set_dr(ctxt, c->modrm_reg,
c->regs[c->modrm_rm]);
if (rc)
goto cannot_emulate;
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x30:
/* wrmsr */
msr_data = (u32)c->regs[VCPU_REGS_RAX]
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
if (rc) {
kvm_inject_gp(ctxt->vcpu, 0);
c->eip = ctxt->vcpu->arch.rip;
}
rc = X86EMUL_CONTINUE;
c->dst.type = OP_NONE;
break;
case 0x32:
/* rdmsr */
rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
if (rc) {
kvm_inject_gp(ctxt->vcpu, 0);
c->eip = ctxt->vcpu->arch.rip;
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
c->regs[VCPU_REGS_RDX] = msr_data >> 32;
}
rc = X86EMUL_CONTINUE;
c->dst.type = OP_NONE;
break;
case 0x40 ... 0x4f: /* cmov */
dst.val = dst.orig_val = src.val;
no_wb = 1;
/*
* First, assume we're decoding an even cmov opcode
* (lsb == 0).
*/
switch ((b & 15) >> 1) {
case 0: /* cmovo */
no_wb = (_eflags & EFLG_OF) ? 0 : 1;
break;
case 1: /* cmovb/cmovc/cmovnae */
no_wb = (_eflags & EFLG_CF) ? 0 : 1;
break;
case 2: /* cmovz/cmove */
no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
break;
case 3: /* cmovbe/cmovna */
no_wb = (_eflags & (EFLG_CF | EFLG_ZF)) ? 0 : 1;
break;
case 4: /* cmovs */
no_wb = (_eflags & EFLG_SF) ? 0 : 1;
c->dst.val = c->dst.orig_val = c->src.val;
if (!test_cc(c->b, ctxt->eflags))
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x80 ... 0x8f: /* jnz rel, etc*/ {
long int rel;
switch (c->op_bytes) {
case 2:
rel = insn_fetch(s16, 2, c->eip);
break;
case 5: /* cmovp/cmovpe */
no_wb = (_eflags & EFLG_PF) ? 0 : 1;
case 4:
rel = insn_fetch(s32, 4, c->eip);
break;
case 7: /* cmovle/cmovng */
no_wb = (_eflags & EFLG_ZF) ? 0 : 1;
/* fall through */
case 6: /* cmovl/cmovnge */
no_wb &= (!(_eflags & EFLG_SF) !=
!(_eflags & EFLG_OF)) ? 0 : 1;
case 8:
rel = insn_fetch(s64, 8, c->eip);
break;
default:
DPRINTF("jnz: Invalid op_bytes\n");
goto cannot_emulate;
}
/* Odd cmov opcodes (lsb == 1) have inverted sense. */
no_wb ^= b & 1;
if (test_cc(c->b, ctxt->eflags))
JMP_REL(rel);
c->dst.type = OP_NONE;
break;
}
case 0xa3:
bt: /* bt */
src.val &= (dst.bytes << 3) - 1; /* only subword offset */
emulate_2op_SrcV_nobyte("bt", src, dst, _eflags);
c->dst.type = OP_NONE;
/* only subword offset */
c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
break;
case 0xab:
bts: /* bts */
src.val &= (dst.bytes << 3) - 1; /* only subword offset */
emulate_2op_SrcV_nobyte("bts", src, dst, _eflags);
/* only subword offset */
c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
break;
case 0xb0 ... 0xb1: /* cmpxchg */
/*
* Save real source value, then compare EAX against
* destination.
*/
src.orig_val = src.val;
src.val = _regs[VCPU_REGS_RAX];
emulate_2op_SrcV("cmp", src, dst, _eflags);
if (_eflags & EFLG_ZF) {
c->src.orig_val = c->src.val;
c->src.val = c->regs[VCPU_REGS_RAX];
emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
if (ctxt->eflags & EFLG_ZF) {
/* Success: write back to memory. */
dst.val = src.orig_val;
c->dst.val = c->src.orig_val;
} else {
/* Failure: write the value we saw to EAX. */
dst.type = OP_REG;
dst.ptr = (unsigned long *)&_regs[VCPU_REGS_RAX];
c->dst.type = OP_REG;
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
}
break;
case 0xb3:
btr: /* btr */
src.val &= (dst.bytes << 3) - 1; /* only subword offset */
emulate_2op_SrcV_nobyte("btr", src, dst, _eflags);
/* only subword offset */
c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
break;
case 0xb6 ... 0xb7: /* movzx */
dst.bytes = op_bytes;
dst.val = (d & ByteOp) ? (u8) src.val : (u16) src.val;
c->dst.bytes = c->op_bytes;
c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
: (u16) c->src.val;
break;
case 0xba: /* Grp8 */
switch (modrm_reg & 3) {
switch (c->modrm_reg & 3) {
case 0:
goto bt;
case 1:
......@@ -1511,152 +1882,31 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
break;
case 0xbb:
btc: /* btc */
src.val &= (dst.bytes << 3) - 1; /* only subword offset */
emulate_2op_SrcV_nobyte("btc", src, dst, _eflags);
/* only subword offset */
c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
break;
case 0xbe ... 0xbf: /* movsx */
dst.bytes = op_bytes;
dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
c->dst.bytes = c->op_bytes;
c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
(s16) c->src.val;
break;
case 0xc3: /* movnti */
dst.bytes = op_bytes;
dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
break;
}
goto writeback;
twobyte_special_insn:
/* Disable writeback. */
no_wb = 1;
switch (b) {
case 0x06:
emulate_clts(ctxt->vcpu);
break;
case 0x08: /* invd */
break;
case 0x09: /* wbinvd */
break;
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
break;
case 0x20: /* mov cr, reg */
if (modrm_mod != 3)
goto cannot_emulate;
_regs[modrm_rm] = realmode_get_cr(ctxt->vcpu, modrm_reg);
break;
case 0x22: /* mov reg, cr */
if (modrm_mod != 3)
goto cannot_emulate;
realmode_set_cr(ctxt->vcpu, modrm_reg, modrm_val, &_eflags);
break;
case 0x30:
/* wrmsr */
msr_data = (u32)_regs[VCPU_REGS_RAX]
| ((u64)_regs[VCPU_REGS_RDX] << 32);
rc = kvm_set_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], msr_data);
if (rc) {
kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
_eip = ctxt->vcpu->rip;
}
rc = X86EMUL_CONTINUE;
c->dst.bytes = c->op_bytes;
c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
(u64) c->src.val;
break;
case 0x32:
/* rdmsr */
rc = kvm_get_msr(ctxt->vcpu, _regs[VCPU_REGS_RCX], &msr_data);
if (rc) {
kvm_x86_ops->inject_gp(ctxt->vcpu, 0);
_eip = ctxt->vcpu->rip;
} else {
_regs[VCPU_REGS_RAX] = (u32)msr_data;
_regs[VCPU_REGS_RDX] = msr_data >> 32;
}
rc = X86EMUL_CONTINUE;
break;
case 0x80 ... 0x8f: /* jnz rel, etc*/ {
long int rel;
switch (op_bytes) {
case 2:
rel = insn_fetch(s16, 2, _eip);
break;
case 4:
rel = insn_fetch(s32, 4, _eip);
break;
case 8:
rel = insn_fetch(s64, 8, _eip);
break;
default:
DPRINTF("jnz: Invalid op_bytes\n");
goto cannot_emulate;
}
if (test_cc(b, _eflags))
JMP_REL(rel);
break;
}
case 0xc7: /* Grp9 (cmpxchg8b) */
{
u64 old, new;
if ((rc = ops->read_emulated(cr2, &old, 8, ctxt->vcpu))
!= 0)
goto done;
if (((u32) (old >> 0) != (u32) _regs[VCPU_REGS_RAX]) ||
((u32) (old >> 32) != (u32) _regs[VCPU_REGS_RDX])) {
_regs[VCPU_REGS_RAX] = (u32) (old >> 0);
_regs[VCPU_REGS_RDX] = (u32) (old >> 32);
_eflags &= ~EFLG_ZF;
} else {
new = ((u64)_regs[VCPU_REGS_RCX] << 32)
| (u32) _regs[VCPU_REGS_RBX];
if ((rc = ops->cmpxchg_emulated(cr2, &old,
&new, 8, ctxt->vcpu)) != 0)
goto done;
_eflags |= EFLG_ZF;
}
break;
}
rc = emulate_grp9(ctxt, ops, memop);
if (rc != 0)
goto done;
c->dst.type = OP_NONE;
break;
}
goto writeback;
cannot_emulate:
DPRINTF("Cannot emulate %02x\n", b);
DPRINTF("Cannot emulate %02x\n", c->b);
c->eip = saved_eip;
return -1;
}
#ifdef __XEN__
#include <asm/mm.h>
#include <asm/uaccess.h>
int
x86_emulate_read_std(unsigned long addr,
unsigned long *val,
unsigned int bytes, struct x86_emulate_ctxt *ctxt)
{
unsigned int rc;
*val = 0;
if ((rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0) {
propagate_page_fault(addr + bytes - rc, 0); /* read fault */
return X86EMUL_PROPAGATE_FAULT;
}
return X86EMUL_CONTINUE;
}
int
x86_emulate_write_std(unsigned long addr,
unsigned long val,
unsigned int bytes, struct x86_emulate_ctxt *ctxt)
{
unsigned int rc;
if ((rc = copy_to_user((void *)addr, (void *)&val, bytes)) != 0) {
propagate_page_fault(addr + bytes - rc, PGERR_write_access);
return X86EMUL_PROPAGATE_FAULT;
}
return X86EMUL_CONTINUE;
}
#endif
......@@ -90,8 +90,6 @@ source "drivers/dca/Kconfig"
source "drivers/auxdisplay/Kconfig"
source "drivers/kvm/Kconfig"
source "drivers/uio/Kconfig"
source "drivers/virtio/Kconfig"
......
......@@ -47,7 +47,6 @@ obj-$(CONFIG_SPI) += spi/
obj-$(CONFIG_PCCARD) += pcmcia/
obj-$(CONFIG_DIO) += dio/
obj-$(CONFIG_SBUS) += sbus/
obj-$(CONFIG_KVM) += kvm/
obj-$(CONFIG_ZORRO) += zorro/
obj-$(CONFIG_MAC) += macintosh/
obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/
......
......@@ -3,6 +3,7 @@ include include/asm-generic/Kbuild.asm
header-y += boot.h
header-y += bootparam.h
header-y += debugreg.h
header-y += kvm.h
header-y += ldt.h
header-y += msr-index.h
header-y += prctl.h
......
#ifndef __LINUX_KVM_X86_H
#define __LINUX_KVM_X86_H
/*
* KVM x86 specific structures and definitions
*
*/
#include <asm/types.h>
#include <linux/ioctl.h>
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
struct kvm_memory_alias {
__u32 slot; /* this has a different namespace than memory slots */
__u32 flags;
__u64 guest_phys_addr;
__u64 memory_size;
__u64 target_phys_addr;
};
/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
struct kvm_pic_state {
__u8 last_irr; /* edge detection */
__u8 irr; /* interrupt request register */
__u8 imr; /* interrupt mask register */
__u8 isr; /* interrupt service register */
__u8 priority_add; /* highest irq priority */
__u8 irq_base;
__u8 read_reg_select;
__u8 poll;
__u8 special_mask;
__u8 init_state;
__u8 auto_eoi;
__u8 rotate_on_auto_eoi;
__u8 special_fully_nested_mode;
__u8 init4; /* true if 4 byte init */
__u8 elcr; /* PIIX edge/trigger selection */
__u8 elcr_mask;
};
#define KVM_IOAPIC_NUM_PINS 24
struct kvm_ioapic_state {
__u64 base_address;
__u32 ioregsel;
__u32 id;
__u32 irr;
__u32 pad;
union {
__u64 bits;
struct {
__u8 vector;
__u8 delivery_mode:3;
__u8 dest_mode:1;
__u8 delivery_status:1;
__u8 polarity:1;
__u8 remote_irr:1;
__u8 trig_mode:1;
__u8 mask:1;
__u8 reserve:7;
__u8 reserved[4];
__u8 dest_id;
} fields;
} redirtbl[KVM_IOAPIC_NUM_PINS];
};
#define KVM_IRQCHIP_PIC_MASTER 0
#define KVM_IRQCHIP_PIC_SLAVE 1
#define KVM_IRQCHIP_IOAPIC 2
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
__u64 rax, rbx, rcx, rdx;
__u64 rsi, rdi, rsp, rbp;
__u64 r8, r9, r10, r11;
__u64 r12, r13, r14, r15;
__u64 rip, rflags;
};
/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
#define KVM_APIC_REG_SIZE 0x400
struct kvm_lapic_state {
char regs[KVM_APIC_REG_SIZE];
};
struct kvm_segment {
__u64 base;
__u32 limit;
__u16 selector;
__u8 type;
__u8 present, dpl, db, s, l, g, avl;
__u8 unusable;
__u8 padding;
};
struct kvm_dtable {
__u64 base;
__u16 limit;
__u16 padding[3];
};
/* for KVM_GET_SREGS and KVM_SET_SREGS */
struct kvm_sregs {
/* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
struct kvm_segment cs, ds, es, fs, gs, ss;
struct kvm_segment tr, ldt;
struct kvm_dtable gdt, idt;
__u64 cr0, cr2, cr3, cr4, cr8;
__u64 efer;
__u64 apic_base;
__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
};
/* for KVM_GET_FPU and KVM_SET_FPU */
struct kvm_fpu {
__u8 fpr[8][16];
__u16 fcw;
__u16 fsw;
__u8 ftwx; /* in fxsave format */
__u8 pad1;
__u16 last_opcode;
__u64 last_ip;
__u64 last_dp;
__u8 xmm[16][16];
__u32 mxcsr;
__u32 pad2;
};
struct kvm_msr_entry {
__u32 index;
__u32 reserved;
__u64 data;
};
/* for KVM_GET_MSRS and KVM_SET_MSRS */
struct kvm_msrs {
__u32 nmsrs; /* number of msrs in entries */
__u32 pad;
struct kvm_msr_entry entries[0];
};
/* for KVM_GET_MSR_INDEX_LIST */
struct kvm_msr_list {
__u32 nmsrs; /* number of msrs in entries */
__u32 indices[0];
};
struct kvm_cpuid_entry {
__u32 function;
__u32 eax;
__u32 ebx;
__u32 ecx;
__u32 edx;
__u32 padding;
};
/* for KVM_SET_CPUID */
struct kvm_cpuid {
__u32 nent;
__u32 padding;
struct kvm_cpuid_entry entries[0];
};
struct kvm_cpuid_entry2 {
__u32 function;
__u32 index;
__u32 flags;
__u32 eax;
__u32 ebx;
__u32 ecx;
__u32 edx;
__u32 padding[3];
};
#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
/* for KVM_SET_CPUID2 */
struct kvm_cpuid2 {
__u32 nent;
__u32 padding;
struct kvm_cpuid_entry2 entries[0];
};
#endif
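kvm_cpuid2 above ends in a zero-length array, so userspace sizes the buffer itself and passes its capacity in nent; the kernel writes back how many entries it actually filled. A rough userspace sketch of the intended round trip follows (error handling omitted; MAX_ENT is an illustrative cap, setup_vcpu_cpuid is not a real helper, and which fd KVM_GET_SUPPORTED_CPUID is issued on varies by kernel version, so fd is left generic):

/* Sketch: fetch the host-supported CPUID table and install it on a vcpu.
 * Needs <stdlib.h>, <sys/ioctl.h>, <linux/kvm.h>. */
#define MAX_ENT 64	/* illustrative capacity, not a kernel constant */

static void setup_vcpu_cpuid(int fd, int vcpu_fd)
{
	struct kvm_cpuid2 *cpuid;

	cpuid = calloc(1, sizeof(*cpuid) +
		       MAX_ENT * sizeof(struct kvm_cpuid_entry2));
	cpuid->nent = MAX_ENT;

	ioctl(fd, KVM_GET_SUPPORTED_CPUID, cpuid);	/* kernel trims nent */
	ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid);		/* vcpu_fd from KVM_CREATE_VCPU */
	free(cpuid);
}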
#ifndef __KVM_H
#define __KVM_H
/*
#/*
* Kernel-based Virtual Machine driver for Linux
*
* This header defines architecture specific interfaces, x86 version
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#ifndef ASM_KVM_HOST_H
#define ASM_KVM_HOST_H
#include <linux/types.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/preempt.h>
#include <asm/signal.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include <linux/kvm_types.h>
#include <asm/desc.h>
#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
......@@ -37,15 +38,8 @@
#define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0)
#define KVM_MAX_VCPUS 4
#define KVM_ALIAS_SLOTS 4
#define KVM_MEMORY_SLOTS 8
#define KVM_NUM_MMU_PAGES 1024
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 40
#define DE_VECTOR 0
#define UD_VECTOR 6
#define NM_VECTOR 7
#define DF_VECTOR 8
#define TS_VECTOR 10
......@@ -59,31 +53,66 @@
#define IOPL_SHIFT 12
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_ALIAS_SLOTS 4
/*
* vcpu->requests bit members
*/
#define KVM_TLB_FLUSH 0
#define KVM_PERMILLE_MMU_PAGES 20
#define KVM_MIN_ALLOC_MMU_PAGES 64
#define KVM_NUM_MMU_PAGES 1024
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 40
/*
* Address types:
*
* gva - guest virtual address
* gpa - guest physical address
* gfn - guest frame number
* hva - host virtual address
* hpa - host physical address
* hfn - host frame number
*/
extern spinlock_t kvm_lock;
extern struct list_head vm_list;
struct kvm_vcpu;
struct kvm;
enum {
VCPU_REGS_RAX = 0,
VCPU_REGS_RCX = 1,
VCPU_REGS_RDX = 2,
VCPU_REGS_RBX = 3,
VCPU_REGS_RSP = 4,
VCPU_REGS_RBP = 5,
VCPU_REGS_RSI = 6,
VCPU_REGS_RDI = 7,
#ifdef CONFIG_X86_64
VCPU_REGS_R8 = 8,
VCPU_REGS_R9 = 9,
VCPU_REGS_R10 = 10,
VCPU_REGS_R11 = 11,
VCPU_REGS_R12 = 12,
VCPU_REGS_R13 = 13,
VCPU_REGS_R14 = 14,
VCPU_REGS_R15 = 15,
#endif
NR_VCPU_REGS
};
enum {
VCPU_SREG_CS,
VCPU_SREG_DS,
VCPU_SREG_ES,
VCPU_SREG_FS,
VCPU_SREG_GS,
VCPU_SREG_SS,
VCPU_SREG_TR,
VCPU_SREG_LDTR,
};
typedef unsigned long gva_t;
typedef u64 gpa_t;
typedef unsigned long gfn_t;
#include <asm/kvm_x86_emulate.h>
typedef unsigned long hva_t;
typedef u64 hpa_t;
typedef unsigned long hfn_t;
#define KVM_NR_MEM_OBJS 40
/*
* We don't want allocation failures within the mmu code, so we preallocate
* enough memory for a single page fault in a cache.
*/
struct kvm_mmu_memory_cache {
int nobjs;
void *objects[KVM_NR_MEM_OBJS];
};
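The comment above captures the whole point of this structure: allocations that may sleep or fail happen before the MMU takes its locks, and the fault path only ever pops objects off the preallocated array. A minimal sketch of that pattern, with illustrative helper names rather than the ones mmu.c actually uses:

/* Illustrative only -- not the kernel's real helpers. 'min' must not
 * exceed KVM_NR_MEM_OBJS. Called with no locks held; may sleep.
 * Needs <linux/slab.h>. */
static int cache_topup(struct kvm_mmu_memory_cache *mc, size_t size, int min)
{
	void *obj;

	while (mc->nobjs < min) {
		obj = kzalloc(size, GFP_KERNEL);
		if (!obj)
			return -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

/* Fault path: cannot fail because cache_topup() ran beforehand. */
static void *cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	BUG_ON(!mc->nobjs);
	return mc->objects[--mc->nobjs];
}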
#define NR_PTE_CHAIN_ENTRIES 5
......@@ -99,7 +128,7 @@ struct kvm_pte_chain {
* bits 4:7 - page table level for this shadow (1-4)
* bits 8:9 - page table quadrant for 2-level guests
* bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
* bits 17:19 - "access" - the user, writable, and nx bits of a huge page pde
* bits 17:19 - common access permissions for all ptes in this shadow page
*/
union kvm_mmu_page_role {
unsigned word;
......@@ -109,7 +138,7 @@ union kvm_mmu_page_role {
unsigned quadrant : 2;
unsigned pad_for_nice_hex_output : 6;
unsigned metaphysical : 1;
unsigned hugepage_access : 3;
unsigned access : 3;
};
};
......@@ -125,6 +154,8 @@ struct kvm_mmu_page {
union kvm_mmu_page_role role;
u64 *spt;
/* holds the gfn of each spte inside spt */
gfn_t *gfns;
unsigned long slot_bitmap; /* One bit set per slot which has memory
* in this shadow page.
*/
......@@ -136,9 +167,6 @@ struct kvm_mmu_page {
};
};
struct kvm_vcpu;
extern struct kmem_cache *kvm_vcpu_cache;
/*
* x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
* 32-bit). The kvm_mmu structure abstracts the details of the current mmu
......@@ -149,6 +177,8 @@ struct kvm_mmu {
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
hpa_t root_hpa;
int root_level;
int shadow_root_level;
......@@ -156,159 +186,9 @@ struct kvm_mmu {
u64 *pae_root;
};
#define KVM_NR_MEM_OBJS 20
struct kvm_mmu_memory_cache {
int nobjs;
void *objects[KVM_NR_MEM_OBJS];
};
/*
* We don't want allocation failures within the mmu code, so we preallocate
* enough memory for a single page fault in a cache.
*/
struct kvm_guest_debug {
int enabled;
unsigned long bp[4];
int singlestep;
};
enum {
VCPU_REGS_RAX = 0,
VCPU_REGS_RCX = 1,
VCPU_REGS_RDX = 2,
VCPU_REGS_RBX = 3,
VCPU_REGS_RSP = 4,
VCPU_REGS_RBP = 5,
VCPU_REGS_RSI = 6,
VCPU_REGS_RDI = 7,
#ifdef CONFIG_X86_64
VCPU_REGS_R8 = 8,
VCPU_REGS_R9 = 9,
VCPU_REGS_R10 = 10,
VCPU_REGS_R11 = 11,
VCPU_REGS_R12 = 12,
VCPU_REGS_R13 = 13,
VCPU_REGS_R14 = 14,
VCPU_REGS_R15 = 15,
#endif
NR_VCPU_REGS
};
enum {
VCPU_SREG_CS,
VCPU_SREG_DS,
VCPU_SREG_ES,
VCPU_SREG_FS,
VCPU_SREG_GS,
VCPU_SREG_SS,
VCPU_SREG_TR,
VCPU_SREG_LDTR,
};
struct kvm_pio_request {
unsigned long count;
int cur_count;
struct page *guest_pages[2];
unsigned guest_page_offset;
int in;
int port;
int size;
int string;
int down;
int rep;
};
struct kvm_stat {
u32 pf_fixed;
u32 pf_guest;
u32 tlb_flush;
u32 invlpg;
u32 exits;
u32 io_exits;
u32 mmio_exits;
u32 signal_exits;
u32 irq_window_exits;
u32 halt_exits;
u32 halt_wakeup;
u32 request_irq_exits;
u32 irq_exits;
u32 light_exits;
u32 efer_reload;
};
struct kvm_io_device {
void (*read)(struct kvm_io_device *this,
gpa_t addr,
int len,
void *val);
void (*write)(struct kvm_io_device *this,
gpa_t addr,
int len,
const void *val);
int (*in_range)(struct kvm_io_device *this, gpa_t addr);
void (*destructor)(struct kvm_io_device *this);
void *private;
};
static inline void kvm_iodevice_read(struct kvm_io_device *dev,
gpa_t addr,
int len,
void *val)
{
dev->read(dev, addr, len, val);
}
static inline void kvm_iodevice_write(struct kvm_io_device *dev,
gpa_t addr,
int len,
const void *val)
{
dev->write(dev, addr, len, val);
}
static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
{
return dev->in_range(dev, addr);
}
static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
{
if (dev->destructor)
dev->destructor(dev);
}
/*
* It would be nice to use something smarter than a linear search, TBD...
* Thankfully we don't expect many devices to register (famous last words :),
* so until then it will suffice. At least it's abstracted so we can change
* it in one place.
*/
struct kvm_io_bus {
int dev_count;
#define NR_IOBUS_DEVS 6
struct kvm_io_device *devs[NR_IOBUS_DEVS];
};
void kvm_io_bus_init(struct kvm_io_bus *bus);
void kvm_io_bus_destroy(struct kvm_io_bus *bus);
struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
struct kvm_io_device *dev);
struct kvm_vcpu {
struct kvm *kvm;
struct preempt_notifier preempt_notifier;
int vcpu_id;
struct mutex mutex;
int cpu;
struct kvm_vcpu_arch {
u64 host_tsc;
struct kvm_run *run;
int interrupt_window_open;
int guest_mode;
unsigned long requests;
unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
......@@ -317,9 +197,6 @@ struct kvm_vcpu {
unsigned long cr0;
unsigned long cr2;
unsigned long cr3;
gpa_t para_state_gpa;
struct page *para_state_page;
gpa_t hypercall_gpa;
unsigned long cr4;
unsigned long cr8;
u64 pdptrs[4]; /* pae */
......@@ -334,6 +211,7 @@ struct kvm_vcpu {
int mp_state;
int sipi_vector;
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
struct kvm_mmu mmu;
......@@ -344,29 +222,26 @@ struct kvm_vcpu {
gfn_t last_pt_write_gfn;
int last_pt_write_count;
u64 *last_pte_updated;
struct kvm_guest_debug guest_debug;
struct {
gfn_t gfn; /* presumed gfn during guest pte update */
struct page *page; /* page corresponding to that gfn */
} update_pte;
struct i387_fxsave_struct host_fx_image;
struct i387_fxsave_struct guest_fx_image;
int fpu_active;
int guest_fpu_loaded;
int mmio_needed;
int mmio_read_completed;
int mmio_is_write;
int mmio_size;
unsigned char mmio_data[8];
gpa_t mmio_phys_addr;
gva_t mmio_fault_cr2;
struct kvm_pio_request pio;
void *pio_data;
wait_queue_head_t wq;
int sigset_active;
sigset_t sigset;
struct kvm_stat stat;
struct kvm_queued_exception {
bool pending;
bool has_error_code;
u8 nr;
u32 error_code;
} exception;
struct {
int active;
......@@ -381,7 +256,10 @@ struct kvm_vcpu {
int halt_request; /* real mode on Intel only */
int cpuid_nent;
struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
/* emulate context */
struct x86_emulate_ctxt emulate_ctxt;
};
struct kvm_mem_alias {
......@@ -390,51 +268,58 @@ struct kvm_mem_alias {
gfn_t target_gfn;
};
struct kvm_memory_slot {
gfn_t base_gfn;
unsigned long npages;
unsigned long flags;
struct page **phys_mem;
unsigned long *dirty_bitmap;
};
struct kvm {
struct mutex lock; /* protects everything except vcpus */
struct kvm_arch{
int naliases;
struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
int nmemslots;
struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
unsigned int n_free_mmu_pages;
unsigned int n_requested_mmu_pages;
unsigned int n_alloc_mmu_pages;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
*/
struct list_head active_mmu_pages;
int n_free_mmu_pages;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
unsigned long rmap_overflow;
struct list_head vm_list;
struct file *filp;
struct kvm_io_bus mmio_bus;
struct kvm_io_bus pio_bus;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
int round_robin_prev_vcpu;
unsigned int tss_addr;
struct page *apic_access_page;
};
static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
{
return kvm->vpic;
}
struct kvm_vm_stat {
u32 mmu_shadow_zapped;
u32 mmu_pte_write;
u32 mmu_pte_updated;
u32 mmu_pde_zapped;
u32 mmu_flooded;
u32 mmu_recycled;
u32 mmu_cache_miss;
u32 remote_tlb_flush;
};
static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
{
return kvm->vioapic;
}
struct kvm_vcpu_stat {
u32 pf_fixed;
u32 pf_guest;
u32 tlb_flush;
u32 invlpg;
static inline int irqchip_in_kernel(struct kvm *kvm)
{
return pic_irqchip(kvm) != 0;
}
u32 exits;
u32 io_exits;
u32 mmio_exits;
u32 signal_exits;
u32 irq_window_exits;
u32 halt_exits;
u32 halt_wakeup;
u32 request_irq_exits;
u32 irq_exits;
u32 host_state_reload;
u32 efer_reload;
u32 fpu_reload;
u32 insn_emulation;
u32 insn_emulation_fail;
};
struct descriptor_table {
u16 limit;
......@@ -449,11 +334,12 @@ struct kvm_x86_ops {
void (*check_processor_compatibility)(void *rtn);
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
bool (*cpu_has_accelerated_tpr)(void);
/* Create, but do not attach this VCPU */
struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
void (*vcpu_free)(struct kvm_vcpu *vcpu);
void (*vcpu_reset)(struct kvm_vcpu *vcpu);
int (*vcpu_reset)(struct kvm_vcpu *vcpu);
void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
......@@ -489,10 +375,6 @@ struct kvm_x86_ops {
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
void (*inject_page_fault)(struct kvm_vcpu *vcpu,
unsigned long addr, u32 err_code);
void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
......@@ -501,54 +383,31 @@ struct kvm_x86_ops {
unsigned char *hypercall_addr);
int (*get_irq)(struct kvm_vcpu *vcpu);
void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code);
bool (*exception_injected)(struct kvm_vcpu *vcpu);
void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
struct kvm_run *run);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
};
extern struct kvm_x86_ops *kvm_x86_ops;
/* The guest did something we don't support. */
#define pr_unimpl(vcpu, fmt, ...) \
do { \
if (printk_ratelimit()) \
printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
} while(0)
#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
struct module *module);
void kvm_exit_x86(void);
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
void kvm_mmu_zap_all(struct kvm *kvm);
hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva);
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
extern hpa_t bad_page_address;
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
enum emulation_result {
EMULATE_DONE, /* no further processing */
......@@ -556,8 +415,10 @@ enum emulation_result {
EMULATE_FAIL, /* can't emulate this instruction */
};
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
unsigned long cr2, u16 error_code);
unsigned long cr2, u16 error_code, int emulation_type);
void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
......@@ -572,7 +433,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
struct x86_emulate_ctxt;
int kvm_emulate_pio (struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned port);
int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned long count, int down,
......@@ -581,7 +442,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu);
int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long *dest);
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long value);
......@@ -597,15 +458,15 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
void fx_init(struct kvm_vcpu *vcpu);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
u32 error_code);
void kvm_resched(struct kvm_vcpu *vcpu);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_flush_remote_tlbs(struct kvm *kvm);
void fx_init(struct kvm_vcpu *vcpu);
int emulator_read_std(unsigned long addr,
void *val,
void *val,
unsigned int bytes,
struct kvm_vcpu *vcpu);
int emulator_write_emulated(unsigned long addr,
......@@ -615,6 +476,7 @@ int emulator_write_emulated(unsigned long addr,
unsigned long segment_base(u16 selector);
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
......@@ -622,66 +484,14 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
static inline void kvm_guest_enter(void)
{
current->flags |= PF_VCPU;
}
int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
static inline void kvm_guest_exit(void)
{
current->flags &= ~PF_VCPU;
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
return vcpu->mmu.page_fault(vcpu, gva, error_code);
}
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
__kvm_mmu_free_some_pages(vcpu);
}
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
if (likely(vcpu->mmu.root_hpa != INVALID_PAGE))
return 0;
return kvm_mmu_load(vcpu);
}
static inline int is_long_mode(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
return vcpu->shadow_efer & EFER_LME;
#else
return 0;
#endif
}
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return vcpu->cr4 & X86_CR4_PAE;
}
static inline int is_pse(struct kvm_vcpu *vcpu)
{
return vcpu->cr4 & X86_CR4_PSE;
}
static inline int is_paging(struct kvm_vcpu *vcpu)
{
return vcpu->cr0 & X86_CR0_PG;
}
static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
{
return slot - kvm->memslots;
}
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int complete_pio(struct kvm_vcpu *vcpu);
static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
{
......@@ -693,55 +503,55 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
static inline u16 read_fs(void)
{
u16 seg;
asm ("mov %%fs, %0" : "=g"(seg));
asm("mov %%fs, %0" : "=g"(seg));
return seg;
}
static inline u16 read_gs(void)
{
u16 seg;
asm ("mov %%gs, %0" : "=g"(seg));
asm("mov %%gs, %0" : "=g"(seg));
return seg;
}
static inline u16 read_ldt(void)
{
u16 ldt;
asm ("sldt %0" : "=g"(ldt));
asm("sldt %0" : "=g"(ldt));
return ldt;
}
static inline void load_fs(u16 sel)
{
asm ("mov %0, %%fs" : : "rm"(sel));
asm("mov %0, %%fs" : : "rm"(sel));
}
static inline void load_gs(u16 sel)
{
asm ("mov %0, %%gs" : : "rm"(sel));
asm("mov %0, %%gs" : : "rm"(sel));
}
#ifndef load_ldt
static inline void load_ldt(u16 sel)
{
asm ("lldt %0" : : "rm"(sel));
asm("lldt %0" : : "rm"(sel));
}
#endif
static inline void get_idt(struct descriptor_table *table)
{
asm ("sidt %0" : "=m"(*table));
asm("sidt %0" : "=m"(*table));
}
static inline void get_gdt(struct descriptor_table *table)
{
asm ("sgdt %0" : "=m"(*table));
asm("sgdt %0" : "=m"(*table));
}
static inline unsigned long read_tr_base(void)
{
u16 tr;
asm ("str %0" : "=g"(tr));
asm("str %0" : "=g"(tr));
return segment_base(tr);
}
......@@ -757,17 +567,17 @@ static inline unsigned long read_msr(unsigned long msr)
static inline void fx_save(struct i387_fxsave_struct *image)
{
asm ("fxsave (%0)":: "r" (image));
asm("fxsave (%0)":: "r" (image));
}
static inline void fx_restore(struct i387_fxsave_struct *image)
{
asm ("fxrstor (%0)":: "r" (image));
asm("fxrstor (%0)":: "r" (image));
}
static inline void fpu_init(void)
{
asm ("finit");
asm("finit");
}
static inline u32 get_rdx_init_val(void)
......@@ -775,6 +585,11 @@ static inline u32 get_rdx_init_val(void)
return 0x600; /* P6 family */
}
static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
{
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
......
#ifndef __X86_KVM_PARA_H
#define __X86_KVM_PARA_H
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
* should be used to determine that a VM is running under KVM.
*/
#define KVM_CPUID_SIGNATURE 0x40000000
/* This CPUID returns a feature bitmap in eax. Before enabling a particular
* paravirtualization, the appropriate feature bit should be checked.
*/
#define KVM_CPUID_FEATURES 0x40000001
#ifdef __KERNEL__
#include <asm/processor.h>
/* This instruction is vmcall. On non-VT architectures, it will generate a
* trap that we will then rewrite to the appropriate instruction.
*/
#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
* instruction. The hypervisor may replace it with something else, but only these
* instructions are guaranteed to be supported.
*
* Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
* The hypercall number should be placed in rax and the return value will be
* placed in rax. No other registers will be clobbered unless explicitly
* noted by the particular hypercall.
*/
static inline long kvm_hypercall0(unsigned int nr)
{
long ret;
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr));
return ret;
}
static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
{
long ret;
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1));
return ret;
}
static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
unsigned long p2)
{
long ret;
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2));
return ret;
}
static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
unsigned long p2, unsigned long p3)
{
long ret;
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3));
return ret;
}
static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
unsigned long p2, unsigned long p3,
unsigned long p4)
{
long ret;
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4));
return ret;
}
static inline int kvm_para_available(void)
{
unsigned int eax, ebx, ecx, edx;
char signature[13];
cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
memcpy(signature + 0, &ebx, 4);
memcpy(signature + 4, &ecx, 4);
memcpy(signature + 8, &edx, 4);
signature[12] = 0;
if (strcmp(signature, "KVMKVMKVM") == 0)
return 1;
return 0;
}
static inline unsigned int kvm_arch_para_features(void)
{
return cpuid_eax(KVM_CPUID_FEATURES);
}
#endif
#endif
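Putting the pieces of this header together, a guest detects KVM with the signature cpuid leaf, checks the feature bitmap, and only then starts issuing hypercalls. A guest-side sketch (example_probe_kvm and KVM_FEATURE_FOO are hypothetical names, not part of this header; KVM_ENOSYS comes from linux/kvm_para.h):

/* Sketch of guest-side probing; needs <linux/kernel.h> and <linux/kvm_para.h>. */
#define KVM_FEATURE_FOO 0	/* hypothetical feature bit */

static int example_probe_kvm(void)
{
	long ret;

	if (!kvm_para_available())
		return 0;			/* not running under KVM */

	if (!(kvm_arch_para_features() & (1 << KVM_FEATURE_FOO)))
		return 0;			/* host lacks the feature */

	ret = kvm_hypercall0(0);		/* nr 0: just exercise vmcall */
	if (ret == -KVM_ENOSYS)
		printk(KERN_INFO "kvm guest: hypercall 0 not implemented\n");
	return 1;
}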
......@@ -62,17 +62,6 @@ struct x86_emulate_ops {
int (*read_std)(unsigned long addr, void *val,
unsigned int bytes, struct kvm_vcpu *vcpu);
/*
* write_std: Write bytes of standard (non-emulated/special) memory.
* Used for stack operations, and others.
* @addr: [IN ] Linear address to which to write.
* @val: [IN ] Value to write to memory (low-order bytes used as
* required).
* @bytes: [IN ] Number of bytes to write to memory.
*/
int (*write_std)(unsigned long addr, const void *val,
unsigned int bytes, struct kvm_vcpu *vcpu);
/*
* read_emulated: Read bytes from emulated/special memory area.
* @addr: [IN ] Linear address from which to read.
......@@ -112,13 +101,50 @@ struct x86_emulate_ops {
};
/* Type, address-of, and value of an instruction's operand. */
struct operand {
enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
unsigned int bytes;
unsigned long val, orig_val, *ptr;
};
struct fetch_cache {
u8 data[15];
unsigned long start;
unsigned long end;
};
struct decode_cache {
u8 twobyte;
u8 b;
u8 lock_prefix;
u8 rep_prefix;
u8 op_bytes;
u8 ad_bytes;
u8 rex_prefix;
struct operand src;
struct operand dst;
unsigned long *override_base;
unsigned int d;
unsigned long regs[NR_VCPU_REGS];
unsigned long eip;
/* modrm */
u8 modrm;
u8 modrm_mod;
u8 modrm_reg;
u8 modrm_rm;
u8 use_modrm_ea;
unsigned long modrm_ea;
unsigned long modrm_val;
struct fetch_cache fetch;
};
struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
/* Linear faulting address (if emulating a page-faulting instruction). */
unsigned long eflags;
unsigned long cr2;
/* Emulated execution mode, represented by an X86EMUL_MODE value. */
int mode;
......@@ -129,8 +155,16 @@ struct x86_emulate_ctxt {
unsigned long ss_base;
unsigned long gs_base;
unsigned long fs_base;
/* decode cache */
struct decode_cache decode;
};
/* Repeat String Operation Prefix */
#define REPE_PREFIX 1
#define REPNE_PREFIX 2
/* Execution mode, passed to the emulator. */
#define X86EMUL_MODE_REAL 0 /* Real mode. */
#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
......@@ -144,12 +178,9 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
/*
* x86_emulate_memop: Emulate an instruction that faulted attempting to
* read/write a 'special' memory area.
* Returns -1 on failure, 0 on success.
*/
int x86_emulate_memop(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops);
int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops);
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops);
#endif /* __X86_EMULATE_H__ */
......@@ -100,7 +100,6 @@ header-y += iso_fs.h
header-y += ixjuser.h
header-y += jffs2.h
header-y += keyctl.h
header-y += kvm.h
header-y += limits.h
header-y += lock_dlm_plock.h
header-y += magic.h
......@@ -256,6 +255,7 @@ unifdef-y += kd.h
unifdef-y += kernelcapi.h
unifdef-y += kernel.h
unifdef-y += keyboard.h
unifdef-$(CONFIG_HAVE_KVM) += kvm.h
unifdef-y += llc.h
unifdef-y += loop.h
unifdef-y += lp.h
......
......@@ -9,12 +9,10 @@
#include <asm/types.h>
#include <linux/ioctl.h>
#include <asm/kvm.h>
#define KVM_API_VERSION 12
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
/* for KVM_CREATE_MEMORY_REGION */
struct kvm_memory_region {
__u32 slot;
......@@ -23,17 +21,19 @@ struct kvm_memory_region {
__u64 memory_size; /* bytes */
};
/* for kvm_memory_region::flags */
#define KVM_MEM_LOG_DIRTY_PAGES 1UL
struct kvm_memory_alias {
__u32 slot; /* this has a different namespace than memory slots */
/* for KVM_SET_USER_MEMORY_REGION */
struct kvm_userspace_memory_region {
__u32 slot;
__u32 flags;
__u64 guest_phys_addr;
__u64 memory_size;
__u64 target_phys_addr;
__u64 memory_size; /* bytes */
__u64 userspace_addr; /* start of the userspace allocated memory */
};
/* for kvm_memory_region::flags */
#define KVM_MEM_LOG_DIRTY_PAGES 1UL
/* for KVM_IRQ_LINE */
struct kvm_irq_level {
/*
......@@ -45,62 +45,18 @@ struct kvm_irq_level {
__u32 level;
};
/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
struct kvm_pic_state {
__u8 last_irr; /* edge detection */
__u8 irr; /* interrupt request register */
__u8 imr; /* interrupt mask register */
__u8 isr; /* interrupt service register */
__u8 priority_add; /* highest irq priority */
__u8 irq_base;
__u8 read_reg_select;
__u8 poll;
__u8 special_mask;
__u8 init_state;
__u8 auto_eoi;
__u8 rotate_on_auto_eoi;
__u8 special_fully_nested_mode;
__u8 init4; /* true if 4 byte init */
__u8 elcr; /* PIIX edge/trigger selection */
__u8 elcr_mask;
};
#define KVM_IOAPIC_NUM_PINS 24
struct kvm_ioapic_state {
__u64 base_address;
__u32 ioregsel;
__u32 id;
__u32 irr;
__u32 pad;
union {
__u64 bits;
struct {
__u8 vector;
__u8 delivery_mode:3;
__u8 dest_mode:1;
__u8 delivery_status:1;
__u8 polarity:1;
__u8 remote_irr:1;
__u8 trig_mode:1;
__u8 mask:1;
__u8 reserve:7;
__u8 reserved[4];
__u8 dest_id;
} fields;
} redirtbl[KVM_IOAPIC_NUM_PINS];
};
#define KVM_IRQCHIP_PIC_MASTER 0
#define KVM_IRQCHIP_PIC_SLAVE 1
#define KVM_IRQCHIP_IOAPIC 2
struct kvm_irqchip {
__u32 chip_id;
__u32 pad;
union {
char dummy[512]; /* reserving space */
#ifdef CONFIG_X86
struct kvm_pic_state pic;
#endif
#if defined(CONFIG_X86) || defined(CONFIG_IA64)
struct kvm_ioapic_state ioapic;
#endif
} chip;
};
......@@ -116,6 +72,7 @@ struct kvm_irqchip {
#define KVM_EXIT_FAIL_ENTRY 9
#define KVM_EXIT_INTR 10
#define KVM_EXIT_SET_TPR 11
#define KVM_EXIT_TPR_ACCESS 12
/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
struct kvm_run {
......@@ -174,90 +131,17 @@ struct kvm_run {
__u32 longmode;
__u32 pad;
} hypercall;
/* KVM_EXIT_TPR_ACCESS */
struct {
__u64 rip;
__u32 is_write;
__u32 pad;
} tpr_access;
/* Fix the size of the union. */
char padding[256];
};
};
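KVM_EXIT_TPR_ACCESS is the userspace-visible half of the new TPR access reporting: once enabled with KVM_TPR_ACCESS_REPORTING (defined further down), guest TPR reads and writes surface as ordinary exits from KVM_RUN. On the userspace side the handling slots into the usual run loop, roughly like this (a sketch; run is the mmap()ed kvm_run and handle_tpr_access() is a hypothetical VMM callback):

/* Sketch of a vcpu run loop with the new exit reason.
 * Needs <sys/ioctl.h> and <linux/kvm.h>. */
extern void handle_tpr_access(__u64 rip, int is_write);	/* hypothetical */

static void run_vcpu(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			break;

		switch (run->exit_reason) {
		case KVM_EXIT_TPR_ACCESS:
			/* Guest touched the local APIC TPR at run->tpr_access.rip. */
			handle_tpr_access(run->tpr_access.rip,
					  run->tpr_access.is_write);
			break;
		default:
			/* io, mmio, hlt, ... handled as before */
			break;
		}
	}
}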
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
__u64 rax, rbx, rcx, rdx;
__u64 rsi, rdi, rsp, rbp;
__u64 r8, r9, r10, r11;
__u64 r12, r13, r14, r15;
__u64 rip, rflags;
};
/* for KVM_GET_FPU and KVM_SET_FPU */
struct kvm_fpu {
__u8 fpr[8][16];
__u16 fcw;
__u16 fsw;
__u8 ftwx; /* in fxsave format */
__u8 pad1;
__u16 last_opcode;
__u64 last_ip;
__u64 last_dp;
__u8 xmm[16][16];
__u32 mxcsr;
__u32 pad2;
};
/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
#define KVM_APIC_REG_SIZE 0x400
struct kvm_lapic_state {
char regs[KVM_APIC_REG_SIZE];
};
struct kvm_segment {
__u64 base;
__u32 limit;
__u16 selector;
__u8 type;
__u8 present, dpl, db, s, l, g, avl;
__u8 unusable;
__u8 padding;
};
struct kvm_dtable {
__u64 base;
__u16 limit;
__u16 padding[3];
};
/* for KVM_GET_SREGS and KVM_SET_SREGS */
struct kvm_sregs {
/* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
struct kvm_segment cs, ds, es, fs, gs, ss;
struct kvm_segment tr, ldt;
struct kvm_dtable gdt, idt;
__u64 cr0, cr2, cr3, cr4, cr8;
__u64 efer;
__u64 apic_base;
__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
};
struct kvm_msr_entry {
__u32 index;
__u32 reserved;
__u64 data;
};
/* for KVM_GET_MSRS and KVM_SET_MSRS */
struct kvm_msrs {
__u32 nmsrs; /* number of msrs in entries */
__u32 pad;
struct kvm_msr_entry entries[0];
};
/* for KVM_GET_MSR_INDEX_LIST */
struct kvm_msr_list {
__u32 nmsrs; /* number of msrs in entries */
__u32 indices[0];
};
/* for KVM_TRANSLATE */
struct kvm_translation {
/* in */
......@@ -302,28 +186,24 @@ struct kvm_dirty_log {
};
};
struct kvm_cpuid_entry {
__u32 function;
__u32 eax;
__u32 ebx;
__u32 ecx;
__u32 edx;
__u32 padding;
};
/* for KVM_SET_CPUID */
struct kvm_cpuid {
__u32 nent;
__u32 padding;
struct kvm_cpuid_entry entries[0];
};
/* for KVM_SET_SIGNAL_MASK */
struct kvm_signal_mask {
__u32 len;
__u8 sigset[0];
};
/* for KVM_TPR_ACCESS_REPORTING */
struct kvm_tpr_access_ctl {
__u32 enabled;
__u32 flags;
__u32 reserved[8];
};
/* for KVM_SET_VAPIC_ADDR */
struct kvm_vapic_addr {
__u64 vapic_addr;
};
#define KVMIO 0xAE
/*
......@@ -347,11 +227,21 @@ struct kvm_signal_mask {
*/
#define KVM_CAP_IRQCHIP 0
#define KVM_CAP_HLT 1
#define KVM_CAP_MMU_SHADOW_CACHE_CONTROL 2
#define KVM_CAP_USER_MEMORY 3
#define KVM_CAP_SET_TSS_ADDR 4
#define KVM_CAP_EXT_CPUID 5
#define KVM_CAP_VAPIC 6
/*
* ioctls for VM fds
*/
#define KVM_SET_MEMORY_REGION _IOW(KVMIO, 0x40, struct kvm_memory_region)
#define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44)
#define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45)
#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\
struct kvm_userspace_memory_region)
#define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47)
/*
* KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
* a vcpu fd.
......@@ -359,6 +249,7 @@ struct kvm_signal_mask {
#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
#define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log)
#define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias)
#define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x48, struct kvm_cpuid2)
/* Device model IOC */
#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)
#define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level)
......@@ -384,5 +275,11 @@ struct kvm_signal_mask {
#define KVM_SET_FPU _IOW(KVMIO, 0x8d, struct kvm_fpu)
#define KVM_GET_LAPIC _IOR(KVMIO, 0x8e, struct kvm_lapic_state)
#define KVM_SET_LAPIC _IOW(KVMIO, 0x8f, struct kvm_lapic_state)
#define KVM_SET_CPUID2 _IOW(KVMIO, 0x90, struct kvm_cpuid2)
#define KVM_GET_CPUID2 _IOWR(KVMIO, 0x91, struct kvm_cpuid2)
/* Available with KVM_CAP_VAPIC */
#define KVM_TPR_ACCESS_REPORTING _IOWR(KVMIO, 0x92, struct kvm_tpr_access_ctl)
/* Available with KVM_CAP_VAPIC */
#define KVM_SET_VAPIC_ADDR _IOW(KVMIO, 0x93, struct kvm_vapic_addr)
#endif
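The most consequential entry in the list above is KVM_SET_USER_MEMORY_REGION: guest RAM is now plain userspace memory handed to the kernel, rather than pages kvm allocates on its own. A minimal VM bring-up sketch using the structures in this header (error handling omitted; KVM_CREATE_VM is part of the pre-existing /dev/kvm API, and RAM_SIZE plus the function name are illustrative):

/* Needs <fcntl.h>, <sys/ioctl.h>, <sys/mman.h>, <linux/kvm.h>. */
#define RAM_SIZE (16 * 1024 * 1024)	/* illustrative, page aligned */

static int create_vm_with_ram(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
	void *ram = mmap(NULL, RAM_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot            = 0,
		.flags           = 0,	/* or KVM_MEM_LOG_DIRTY_PAGES */
		.guest_phys_addr = 0,
		.memory_size     = RAM_SIZE,
		.userspace_addr  = (unsigned long)ram,
	};

	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
	return ioctl(vm_fd, KVM_CREATE_VCPU, 0);	/* vcpu fd for KVM_RUN */
}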
#ifndef __KVM_HOST_H
#define __KVM_HOST_H
/*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*/
#include <linux/types.h>
#include <linux/hardirq.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/preempt.h>
#include <asm/signal.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include <linux/kvm_types.h>
#include <asm/kvm_host.h>
#define KVM_MAX_VCPUS 4
#define KVM_MEMORY_SLOTS 8
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 4
#define KVM_PIO_PAGE_OFFSET 1
/*
* vcpu->requests bit members
*/
#define KVM_REQ_TLB_FLUSH 0
#define KVM_REQ_MIGRATE_TIMER 1
#define KVM_REQ_REPORT_TPR_ACCESS 2
struct kvm_vcpu;
extern struct kmem_cache *kvm_vcpu_cache;
struct kvm_guest_debug {
int enabled;
unsigned long bp[4];
int singlestep;
};
/*
* It would be nice to use something smarter than a linear search, TBD...
* Thankfully we don't expect many devices to register (famous last words :),
* so until then it will suffice. At least it's abstracted so we can change
* it in one place.
*/
struct kvm_io_bus {
int dev_count;
#define NR_IOBUS_DEVS 6
struct kvm_io_device *devs[NR_IOBUS_DEVS];
};
void kvm_io_bus_init(struct kvm_io_bus *bus);
void kvm_io_bus_destroy(struct kvm_io_bus *bus);
struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
struct kvm_io_device *dev);
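The linear search the comment above apologizes for lives behind kvm_io_bus_find_dev(), and with NR_IOBUS_DEVS capped at 6 it is cheap. The MMIO and PIO emulation paths consume the bus in a find-then-call pattern, roughly as below (a sketch; the helper name is illustrative, and dev->read is one of the function pointers in struct kvm_io_device):

/* Sketch of how an MMIO read is dispatched through the bus. */
static int example_mmio_read(struct kvm_io_bus *bus, gpa_t addr,
			     int len, void *val)
{
	struct kvm_io_device *dev = kvm_io_bus_find_dev(bus, addr);

	if (!dev)
		return -EOPNOTSUPP;	/* no in-kernel device: exit to userspace */

	dev->read(dev, addr, len, val);
	return 0;
}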
struct kvm_vcpu {
struct kvm *kvm;
struct preempt_notifier preempt_notifier;
int vcpu_id;
struct mutex mutex;
int cpu;
struct kvm_run *run;
int guest_mode;
unsigned long requests;
struct kvm_guest_debug guest_debug;
int fpu_active;
int guest_fpu_loaded;
wait_queue_head_t wq;
int sigset_active;
sigset_t sigset;
struct kvm_vcpu_stat stat;
#ifdef CONFIG_HAS_IOMEM
int mmio_needed;
int mmio_read_completed;
int mmio_is_write;
int mmio_size;
unsigned char mmio_data[8];
gpa_t mmio_phys_addr;
#endif
struct kvm_vcpu_arch arch;
};
struct kvm_memory_slot {
gfn_t base_gfn;
unsigned long npages;
unsigned long flags;
unsigned long *rmap;
unsigned long *dirty_bitmap;
unsigned long userspace_addr;
int user_alloc;
};
struct kvm {
struct mutex lock; /* protects the vcpus array and APIC accesses */
spinlock_t mmu_lock;
struct mm_struct *mm; /* userspace tied to this vm */
int nmemslots;
struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
KVM_PRIVATE_MEM_SLOTS];
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
struct list_head vm_list;
struct file *filp;
struct kvm_io_bus mmio_bus;
struct kvm_io_bus pio_bus;
struct kvm_vm_stat stat;
struct kvm_arch arch;
};
/* The guest did something we don't support. */
#define pr_unimpl(vcpu, fmt, ...) \
do { \
if (printk_ratelimit()) \
printk(KERN_ERR "kvm: %i: cpu%i " fmt, \
current->tgid, (vcpu)->vcpu_id , ## __VA_ARGS__); \
} while (0)
#define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
#define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
void vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);
void decache_vcpus_on_cpu(int cpu);
int kvm_init(void *opaque, unsigned int vcpu_size,
struct module *module);
void kvm_exit(void);
#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
extern struct page *bad_page;
int is_error_page(struct page *page);
int kvm_is_error_hva(unsigned long addr);
int kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc);
int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc);
int kvm_arch_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
struct kvm_memory_slot old,
int user_alloc);
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
void kvm_release_page_dirty(struct page *page);
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int len);
int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
unsigned long len);
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
int offset, int len);
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
unsigned long len);
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
void kvm_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_resched(struct kvm_vcpu *vcpu);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_flush_remote_tlbs(struct kvm *kvm);
long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
int kvm_dev_ioctl_check_extension(long ext);
int kvm_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log, int *is_dirty);
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log);
int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc);
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
void kvm_arch_destroy_vm(struct kvm *kvm);
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
struct kvm_translation *tr);
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs);
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs);
int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
struct kvm_debug_guest *dbg);
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
int kvm_arch_init(void *opaque);
void kvm_arch_exit(void);
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id);
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);
int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu);
void kvm_arch_hardware_enable(void *garbage);
void kvm_arch_hardware_disable(void *garbage);
int kvm_arch_hardware_setup(void);
void kvm_arch_hardware_unsetup(void);
void kvm_arch_check_processor_compat(void *rtn);
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
void kvm_free_physmem(struct kvm *kvm);
struct kvm *kvm_arch_create_vm(void);
void kvm_arch_destroy_vm(struct kvm *kvm);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
static inline void kvm_guest_enter(void)
{
account_system_vtime(current);
current->flags |= PF_VCPU;
}
static inline void kvm_guest_exit(void)
{
account_system_vtime(current);
current->flags &= ~PF_VCPU;
}
static inline int memslot_id(struct kvm *kvm, struct kvm_memory_slot *slot)
{
return slot - kvm->memslots;
}
static inline gpa_t gfn_to_gpa(gfn_t gfn)
{
return (gpa_t)gfn << PAGE_SHIFT;
}
static inline void kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
{
set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
}
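kvm_migrate_apic_timer() above is the producer half of the vcpu->requests protocol: set a KVM_REQ_* bit, then kick the vcpu so it notices. The consumer half runs in the vcpu loop just before guest entry; schematically it looks like this (a sketch, not the exact x86.c code; the function and handler names are illustrative):

/* Sketch of the consumer side; runs with the vcpu loaded, before entry. */
static void example_handle_requests(struct kvm_vcpu *vcpu,
				    struct kvm_run *kvm_run)
{
	if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
		migrate_apic_timer(vcpu);	/* illustrative handler */
	if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests))
		kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;	/* bail to userspace */
}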
enum kvm_stat_kind {
KVM_STAT_VM,
KVM_STAT_VCPU,
};
struct kvm_stats_debugfs_item {
const char *name;
int offset;
enum kvm_stat_kind kind;
struct dentry *dentry;
};
extern struct kvm_stats_debugfs_item debugfs_entries[];
#endif
......@@ -2,72 +2,30 @@
#define __LINUX_KVM_PARA_H
/*
* Guest OS interface for KVM paravirtualization
*
* Note: this interface is totally experimental, and is certain to change
* as we make progress.
* This header file provides a method for making a hypercall to the host.
* Architectures should define:
* - kvm_hypercall0, kvm_hypercall1...
* - kvm_arch_para_features
* - kvm_para_available
*/
/*
* Per-VCPU descriptor area shared between guest and host. Writable to
* both guest and host. Registered with the host by the guest when
* a guest acknowledges paravirtual mode.
*
* NOTE: all addresses are guest-physical addresses (gpa), to make it
* easier for the hypervisor to map between the various addresses.
*/
struct kvm_vcpu_para_state {
/*
* API version information for compatibility. If there's any support
* mismatch (too old host trying to execute too new guest) then
* the host will deny entry into paravirtual mode. Any other
* combination (new host + old guest and new host + new guest)
* is supposed to work - new host versions will support all old
* guest API versions.
*/
u32 guest_version;
u32 host_version;
u32 size;
u32 ret;
/*
* The address of the vm exit instruction (VMCALL or VMMCALL),
* which the host will patch according to the CPU model the
* VM runs on:
*/
u64 hypercall_gpa;
} __attribute__ ((aligned(PAGE_SIZE)));
#define KVM_PARA_API_VERSION 1
/*
* This is used for an RDMSR's ECX parameter to probe for a KVM host.
* Hopefully no CPU vendor will use up this number. This is placed well
* out of the way of the typical space occupied by CPU vendors' MSR indices,
* and we think (or at least hope) it won't be occupied in the future
* either.
*/
#define MSR_KVM_API_MAGIC 0x87655678
/* Return values for hypercalls */
#define KVM_ENOSYS 1000
#define KVM_EINVAL 1
#define KVM_HC_VAPIC_POLL_IRQ 1
/*
* Hypercall calling convention:
*
* Each hypercall may have 0-6 parameters.
*
* 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1
*
* 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention
* order: RDI, RSI, RDX, RCX, R8, R9.
*
* 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP.
* (the first 3 are according to the gcc regparm calling convention)
*
* No registers are clobbered by the hypercall, except that the
* return value is in RAX.
* hypercalls use an architecture-specific calling convention; see <asm/kvm_para.h>
*/
#define __NR_hypercalls 0
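/*
 * Hedged illustration of the (removed) 64-bit convention documented above:
 * index in RAX, first arguments in RDI/RSI, return value in RAX.  VMCALL is
 * used here only for concreteness; the old interface actually executed the
 * instruction patched in at hypercall_gpa (VMCALL or VMMCALL depending on
 * the CPU).  This is not the kernel's real wrapper, which lives in
 * asm/kvm_para.h.
 */
static inline long demo_hypercall2(unsigned long nr,
				   unsigned long p1, unsigned long p2)
{
	long ret;

	asm volatile("vmcall"
		     : "=a" (ret)
		     : "a" (nr), "D" (p1), "S" (p2)
		     : "memory");
	return ret;
}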
#include <asm/kvm_para.h>
#ifdef __KERNEL__
static inline int kvm_para_has_feature(unsigned int feature)
{
if (kvm_arch_para_features() & (1UL << feature))
return 1;
return 0;
}
#endif /* __KERNEL__ */
#endif /* __LINUX_KVM_PARA_H */
#endif
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
*/
#ifndef __KVM_TYPES_H__
#define __KVM_TYPES_H__
#include <asm/types.h>
/*
* Address types:
*
* gva - guest virtual address
* gpa - guest physical address
* gfn - guest frame number
* hva - host virtual address
* hpa - host physical address
* hfn - host frame number
*/
typedef unsigned long gva_t;
typedef u64 gpa_t;
typedef unsigned long gfn_t;
typedef unsigned long hva_t;
typedef u64 hpa_t;
typedef unsigned long hfn_t;
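/*
 * Hedged sketch, not in the patch: the types above differ only in what they
 * index, so conversions are shifts by PAGE_SHIFT, mirroring gfn_to_gpa() in
 * linux/kvm_host.h.  gpa_to_gfn() and hfn_to_hpa() are illustrative names.
 */
static inline gfn_t gpa_to_gfn(gpa_t gpa)
{
	return (gfn_t)(gpa >> PAGE_SHIFT);
}

static inline hpa_t hfn_to_hpa(hfn_t hfn)
{
	return (hpa_t)hfn << PAGE_SHIFT;
}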
struct kvm_pio_request {
unsigned long count;
int cur_count;
struct page *guest_pages[2];
unsigned guest_page_offset;
int in;
int port;
int size;
int string;
int down;
int rep;
};
#endif /* __KVM_TYPES_H__ */
......@@ -393,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
destroy_context(mm);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
/*
* Decrement the use count and release all resources for an mm.
......
......@@ -26,7 +26,7 @@
* Based on Xen 3.1 code.
*/
#include "kvm.h"
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/mm.h>
#include <linux/highmem.h>
......@@ -34,14 +34,17 @@
#include <linux/hrtimer.h>
#include <linux/io.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/page.h>
#include <asm/current.h>
#include <asm/apicdef.h>
#include <asm/io_apic.h>
#include "irq.h"
/* #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
#include "ioapic.h"
#include "lapic.h"
#if 0
#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
#else
#define ioapic_debug(fmt, arg...)
#endif
static void ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
......@@ -113,7 +116,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
default:
index = (ioapic->ioregsel - 0x10) >> 1;
ioapic_debug("change redir index %x val %x", index, val);
ioapic_debug("change redir index %x val %x\n", index, val);
if (index >= IOAPIC_NUM_PINS)
return;
if (ioapic->ioregsel & 1) {
......@@ -131,16 +134,16 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
}
static void ioapic_inj_irq(struct kvm_ioapic *ioapic,
struct kvm_lapic *target,
struct kvm_vcpu *vcpu,
u8 vector, u8 trig_mode, u8 delivery_mode)
{
ioapic_debug("irq %d trig %d deliv %d", vector, trig_mode,
ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
delivery_mode);
ASSERT((delivery_mode == dest_Fixed) ||
(delivery_mode == dest_LowestPrio));
ASSERT((delivery_mode == IOAPIC_FIXED) ||
(delivery_mode == IOAPIC_LOWEST_PRIORITY));
kvm_apic_set_irq(target, vector, trig_mode);
kvm_apic_set_irq(vcpu, vector, trig_mode);
}
static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
......@@ -151,12 +154,12 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
struct kvm *kvm = ioapic->kvm;
struct kvm_vcpu *vcpu;
ioapic_debug("dest %d dest_mode %d", dest, dest_mode);
ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
if (dest_mode == 0) { /* Physical mode. */
if (dest == 0xFF) { /* Broadcast. */
for (i = 0; i < KVM_MAX_VCPUS; ++i)
if (kvm->vcpus[i] && kvm->vcpus[i]->apic)
if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
mask |= 1 << i;
return mask;
}
......@@ -164,8 +167,8 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
vcpu = kvm->vcpus[i];
if (!vcpu)
continue;
if (kvm_apic_match_physical_addr(vcpu->apic, dest)) {
if (vcpu->apic)
if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
if (vcpu->arch.apic)
mask = 1 << i;
break;
}
......@@ -175,11 +178,11 @@ static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
vcpu = kvm->vcpus[i];
if (!vcpu)
continue;
if (vcpu->apic &&
kvm_apic_match_logical_addr(vcpu->apic, dest))
if (vcpu->arch.apic &&
kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
mask |= 1 << vcpu->vcpu_id;
}
ioapic_debug("mask %x", mask);
ioapic_debug("mask %x\n", mask);
return mask;
}
......@@ -191,41 +194,39 @@ static void ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
u8 vector = ioapic->redirtbl[irq].fields.vector;
u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
u32 deliver_bitmask;
struct kvm_lapic *target;
struct kvm_vcpu *vcpu;
int vcpu_id;
ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
"vector=%x trig_mode=%x",
"vector=%x trig_mode=%x\n",
dest, dest_mode, delivery_mode, vector, trig_mode);
deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
if (!deliver_bitmask) {
ioapic_debug("no target on destination");
ioapic_debug("no target on destination\n");
return;
}
switch (delivery_mode) {
case dest_LowestPrio:
target =
kvm_apic_round_robin(ioapic->kvm, vector, deliver_bitmask);
if (target != NULL)
ioapic_inj_irq(ioapic, target, vector,
case IOAPIC_LOWEST_PRIORITY:
vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
deliver_bitmask);
if (vcpu != NULL)
ioapic_inj_irq(ioapic, vcpu, vector,
trig_mode, delivery_mode);
else
ioapic_debug("null round robin: "
"mask=%x vector=%x delivery_mode=%x",
deliver_bitmask, vector, dest_LowestPrio);
ioapic_debug("null lowest prio vcpu: "
"mask=%x vector=%x delivery_mode=%x\n",
deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
break;
case dest_Fixed:
case IOAPIC_FIXED:
for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
if (!(deliver_bitmask & (1 << vcpu_id)))
continue;
deliver_bitmask &= ~(1 << vcpu_id);
vcpu = ioapic->kvm->vcpus[vcpu_id];
if (vcpu) {
target = vcpu->apic;
ioapic_inj_irq(ioapic, target, vector,
ioapic_inj_irq(ioapic, vcpu, vector,
trig_mode, delivery_mode);
}
}
......@@ -271,7 +272,7 @@ static int get_eoi_gsi(struct kvm_ioapic *ioapic, int vector)
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
{
struct kvm_ioapic *ioapic = kvm->vioapic;
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
union ioapic_redir_entry *ent;
int gsi;
......@@ -304,7 +305,7 @@ static void ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
u32 result;
ioapic_debug("addr %lx", (unsigned long)addr);
ioapic_debug("addr %lx\n", (unsigned long)addr);
ASSERT(!(addr & 0xf)); /* check alignment */
addr &= 0xff;
......@@ -341,8 +342,8 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
struct kvm_ioapic *ioapic = (struct kvm_ioapic *)this->private;
u32 data;
ioapic_debug("ioapic_mmio_write addr=%lx len=%d val=%p\n",
addr, len, val);
ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
(void*)addr, len, val);
ASSERT(!(addr & 0xf)); /* check alignment */
if (len == 4 || len == 8)
data = *(u32 *) val;
......@@ -360,24 +361,38 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
case IOAPIC_REG_WINDOW:
ioapic_write_indirect(ioapic, data);
break;
#ifdef CONFIG_IA64
case IOAPIC_REG_EOI:
kvm_ioapic_update_eoi(ioapic->kvm, data);
break;
#endif
default:
break;
}
}
void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
{
int i;
for (i = 0; i < IOAPIC_NUM_PINS; i++)
ioapic->redirtbl[i].fields.mask = 1;
ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
ioapic->ioregsel = 0;
ioapic->irr = 0;
ioapic->id = 0;
}
int kvm_ioapic_init(struct kvm *kvm)
{
struct kvm_ioapic *ioapic;
int i;
ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
if (!ioapic)
return -ENOMEM;
kvm->vioapic = ioapic;
for (i = 0; i < IOAPIC_NUM_PINS; i++)
ioapic->redirtbl[i].fields.mask = 1;
ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
kvm->arch.vioapic = ioapic;
kvm_ioapic_reset(ioapic);
ioapic->dev.read = ioapic_mmio_read;
ioapic->dev.write = ioapic_mmio_write;
ioapic->dev.in_range = ioapic_in_range;
......
#ifndef __KVM_IO_APIC_H
#define __KVM_IO_APIC_H
#include <linux/kvm_host.h>
#include "iodev.h"
struct kvm;
struct kvm_vcpu;
#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
#define IOAPIC_EDGE_TRIG 0
#define IOAPIC_LEVEL_TRIG 1
#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
#define IOAPIC_MEM_LENGTH 0x100
/* Direct registers. */
#define IOAPIC_REG_SELECT 0x00
#define IOAPIC_REG_WINDOW 0x10
#define IOAPIC_REG_EOI 0x40 /* IA64 IOSAPIC only */
/* Indirect registers. */
#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
#define IOAPIC_REG_VERSION 0x01
#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
/*ioapic delivery mode*/
#define IOAPIC_FIXED 0x0
#define IOAPIC_LOWEST_PRIORITY 0x1
#define IOAPIC_PMI 0x2
#define IOAPIC_NMI 0x4
#define IOAPIC_INIT 0x5
#define IOAPIC_EXTINT 0x7
struct kvm_ioapic {
u64 base_address;
u32 ioregsel;
u32 id;
u32 irr;
u32 pad;
union ioapic_redir_entry {
u64 bits;
struct {
u8 vector;
u8 delivery_mode:3;
u8 dest_mode:1;
u8 delivery_status:1;
u8 polarity:1;
u8 remote_irr:1;
u8 trig_mode:1;
u8 mask:1;
u8 reserve:7;
u8 reserved[4];
u8 dest_id;
} fields;
} redirtbl[IOAPIC_NUM_PINS];
struct kvm_io_device dev;
struct kvm *kvm;
};
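/*
 * Hedged illustration, not part of the patch: filling one redirection-table
 * entry through the bitfields above.  Pin and vector values are arbitrary.
 */
static inline void demo_set_redir_entry(struct kvm_ioapic *ioapic,
					int pin, u8 vector)
{
	union ioapic_redir_entry e = { .bits = 0 };

	e.fields.vector        = vector;
	e.fields.delivery_mode = IOAPIC_FIXED;
	e.fields.trig_mode     = IOAPIC_EDGE_TRIG;
	e.fields.mask          = 0;		/* pin unmasked */
	ioapic->redirtbl[pin]  = e;
}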
#ifdef DEBUG
#define ASSERT(x) \
do { \
if (!(x)) { \
printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
__FILE__, __LINE__, #x); \
BUG(); \
} \
} while (0)
#else
#define ASSERT(x) do { } while (0)
#endif
static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
{
return kvm->arch.vioapic;
}
#ifdef CONFIG_IA64
static inline int irqchip_in_kernel(struct kvm *kvm)
{
return 1;
}
#endif
struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
unsigned long bitmap);
void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
int kvm_ioapic_init(struct kvm *kvm);
void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
#endif
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#ifndef __KVM_IODEV_H__
#define __KVM_IODEV_H__
#include <linux/kvm_types.h>
struct kvm_io_device {
void (*read)(struct kvm_io_device *this,
gpa_t addr,
int len,
void *val);
void (*write)(struct kvm_io_device *this,
gpa_t addr,
int len,
const void *val);
int (*in_range)(struct kvm_io_device *this, gpa_t addr);
void (*destructor)(struct kvm_io_device *this);
void *private;
};
static inline void kvm_iodevice_read(struct kvm_io_device *dev,
gpa_t addr,
int len,
void *val)
{
dev->read(dev, addr, len, val);
}
static inline void kvm_iodevice_write(struct kvm_io_device *dev,
gpa_t addr,
int len,
const void *val)
{
dev->write(dev, addr, len, val);
}
static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
{
return dev->in_range(dev, addr);
}
static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
{
if (dev->destructor)
dev->destructor(dev);
}
#endif /* __KVM_IODEV_H__ */
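/*
 * Hedged sketch, not part of the patch: a minimal in-kernel MMIO device
 * wired through struct kvm_io_device.  The address range and behaviour are
 * made up; real devices (e.g. the ioapic above) register the same way via
 * kvm_io_bus_register_dev(&kvm->mmio_bus, &demo_dev).
 */
#include <linux/string.h>

#define DEMO_MMIO_BASE	0xfee10000ULL
#define DEMO_MMIO_LEN	0x100

static int demo_in_range(struct kvm_io_device *this, gpa_t addr)
{
	return addr >= DEMO_MMIO_BASE && addr < DEMO_MMIO_BASE + DEMO_MMIO_LEN;
}

static void demo_read(struct kvm_io_device *this, gpa_t addr, int len, void *val)
{
	memset(val, 0, len);			/* reads return zero */
}

static void demo_write(struct kvm_io_device *this, gpa_t addr, int len,
		       const void *val)
{
	/* writes are silently dropped */
}

static struct kvm_io_device demo_dev = {
	.read     = demo_read,
	.write    = demo_write,
	.in_range = demo_in_range,
};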
/*
* Kernel-based Virtual Machine driver for Linux
*
* This module enables machines with Intel VT-x extensions to run virtual
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include "iodev.h"
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);
static cpumask_t cpus_hardware_enabled;
struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
static __read_mostly struct preempt_ops kvm_preempt_ops;
static struct dentry *debugfs_dir;
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg);
static inline int valid_vcpu(int n)
{
return likely(n >= 0 && n < KVM_MAX_VCPUS);
}
/*
* Switches to specified vcpu, until a matching vcpu_put()
*/
void vcpu_load(struct kvm_vcpu *vcpu)
{
int cpu;
mutex_lock(&vcpu->mutex);
cpu = get_cpu();
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
}
void vcpu_put(struct kvm_vcpu *vcpu)
{
preempt_disable();
kvm_arch_vcpu_put(vcpu);
preempt_notifier_unregister(&vcpu->preempt_notifier);
preempt_enable();
mutex_unlock(&vcpu->mutex);
}
static void ack_flush(void *_completed)
{
}
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
int i, cpu;
cpumask_t cpus;
struct kvm_vcpu *vcpu;
cpus_clear(cpus);
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
vcpu = kvm->vcpus[i];
if (!vcpu)
continue;
if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
continue;
cpu = vcpu->cpu;
if (cpu != -1 && cpu != raw_smp_processor_id())
cpu_set(cpu, cpus);
}
if (cpus_empty(cpus))
return;
++kvm->stat.remote_tlb_flush;
smp_call_function_mask(cpus, ack_flush, NULL, 1);
}
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
struct page *page;
int r;
mutex_init(&vcpu->mutex);
vcpu->cpu = -1;
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
init_waitqueue_head(&vcpu->wq);
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
r = -ENOMEM;
goto fail;
}
vcpu->run = page_address(page);
r = kvm_arch_vcpu_init(vcpu);
if (r < 0)
goto fail_free_run;
return 0;
fail_free_run:
free_page((unsigned long)vcpu->run);
fail:
return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
kvm_arch_vcpu_uninit(vcpu);
free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
static struct kvm *kvm_create_vm(void)
{
struct kvm *kvm = kvm_arch_create_vm();
if (IS_ERR(kvm))
goto out;
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
spin_lock_init(&kvm->mmu_lock);
kvm_io_bus_init(&kvm->pio_bus);
mutex_init(&kvm->lock);
kvm_io_bus_init(&kvm->mmio_bus);
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock);
out:
return kvm;
}
/*
* Free any memory in @free but not in @dont.
*/
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
if (!dont || free->rmap != dont->rmap)
vfree(free->rmap);
if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
vfree(free->dirty_bitmap);
free->npages = 0;
free->dirty_bitmap = NULL;
free->rmap = NULL;
}
void kvm_free_physmem(struct kvm *kvm)
{
int i;
for (i = 0; i < kvm->nmemslots; ++i)
kvm_free_physmem_slot(&kvm->memslots[i], NULL);
}
static void kvm_destroy_vm(struct kvm *kvm)
{
struct mm_struct *mm = kvm->mm;
spin_lock(&kvm_lock);
list_del(&kvm->vm_list);
spin_unlock(&kvm_lock);
kvm_io_bus_destroy(&kvm->pio_bus);
kvm_io_bus_destroy(&kvm->mmio_bus);
kvm_arch_destroy_vm(kvm);
mmdrop(mm);
}
static int kvm_vm_release(struct inode *inode, struct file *filp)
{
struct kvm *kvm = filp->private_data;
kvm_destroy_vm(kvm);
return 0;
}
/*
* Allocate some memory and give it an address in the guest physical address
* space.
*
* Discontiguous memory is allowed, mostly for framebuffers.
*
* Must be called holding mmap_sem for write.
*/
int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc)
{
int r;
gfn_t base_gfn;
unsigned long npages;
unsigned long i;
struct kvm_memory_slot *memslot;
struct kvm_memory_slot old, new;
r = -EINVAL;
/* General sanity checks */
if (mem->memory_size & (PAGE_SIZE - 1))
goto out;
if (mem->guest_phys_addr & (PAGE_SIZE - 1))
goto out;
if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
goto out;
if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
goto out;
memslot = &kvm->memslots[mem->slot];
base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
npages = mem->memory_size >> PAGE_SHIFT;
if (!npages)
mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
new = old = *memslot;
new.base_gfn = base_gfn;
new.npages = npages;
new.flags = mem->flags;
/* Disallow changing a memory slot's size. */
r = -EINVAL;
if (npages && old.npages && npages != old.npages)
goto out_free;
/* Check for overlaps */
r = -EEXIST;
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
struct kvm_memory_slot *s = &kvm->memslots[i];
if (s == memslot)
continue;
if (!((base_gfn + npages <= s->base_gfn) ||
(base_gfn >= s->base_gfn + s->npages)))
goto out_free;
}
/* Free page dirty bitmap if unneeded */
if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
new.dirty_bitmap = NULL;
r = -ENOMEM;
/* Allocate if a slot is being created */
if (npages && !new.rmap) {
new.rmap = vmalloc(npages * sizeof(struct page *));
if (!new.rmap)
goto out_free;
memset(new.rmap, 0, npages * sizeof(*new.rmap));
new.user_alloc = user_alloc;
new.userspace_addr = mem->userspace_addr;
}
/* Allocate page dirty bitmap if needed */
if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
new.dirty_bitmap = vmalloc(dirty_bytes);
if (!new.dirty_bitmap)
goto out_free;
memset(new.dirty_bitmap, 0, dirty_bytes);
}
if (mem->slot >= kvm->nmemslots)
kvm->nmemslots = mem->slot + 1;
*memslot = new;
r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
if (r) {
*memslot = old;
goto out_free;
}
kvm_free_physmem_slot(&old, &new);
return 0;
out_free:
kvm_free_physmem_slot(&new, &old);
out:
return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
int kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc)
{
int r;
down_write(&current->mm->mmap_sem);
r = __kvm_set_memory_region(kvm, mem, user_alloc);
up_write(&current->mm->mmap_sem);
return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);
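/*
 * Hedged in-kernel usage sketch, not part of the patch: slot number, size
 * and flags are arbitrary.  kvm_set_memory_region() takes mmap_sem for
 * write itself; callers already holding it use __kvm_set_memory_region().
 */
static int demo_add_slot(struct kvm *kvm, unsigned long uaddr)
{
	struct kvm_userspace_memory_region mem = {
		.slot            = 0,
		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = 0,
		.memory_size     = 16UL << 20,	/* 16 MiB, page aligned */
		.userspace_addr  = uaddr,
	};

	return kvm_set_memory_region(kvm, &mem, 1);	/* user_alloc */
}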
int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
struct
kvm_userspace_memory_region *mem,
int user_alloc)
{
if (mem->slot >= KVM_MEMORY_SLOTS)
return -EINVAL;
return kvm_set_memory_region(kvm, mem, user_alloc);
}
int kvm_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log, int *is_dirty)
{
struct kvm_memory_slot *memslot;
int r, i;
int n;
unsigned long any = 0;
r = -EINVAL;
if (log->slot >= KVM_MEMORY_SLOTS)
goto out;
memslot = &kvm->memslots[log->slot];
r = -ENOENT;
if (!memslot->dirty_bitmap)
goto out;
n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
for (i = 0; !any && i < n/sizeof(long); ++i)
any = memslot->dirty_bitmap[i];
r = -EFAULT;
if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
goto out;
if (any)
*is_dirty = 1;
r = 0;
out:
return r;
}
int is_error_page(struct page *page)
{
return page == bad_page;
}
EXPORT_SYMBOL_GPL(is_error_page);
static inline unsigned long bad_hva(void)
{
return PAGE_OFFSET;
}
int kvm_is_error_hva(unsigned long addr)
{
return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);
static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
int i;
for (i = 0; i < kvm->nmemslots; ++i) {
struct kvm_memory_slot *memslot = &kvm->memslots[i];
if (gfn >= memslot->base_gfn
&& gfn < memslot->base_gfn + memslot->npages)
return memslot;
}
return NULL;
}
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
gfn = unalias_gfn(kvm, gfn);
return __gfn_to_memslot(kvm, gfn);
}
int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
int i;
gfn = unalias_gfn(kvm, gfn);
for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
struct kvm_memory_slot *memslot = &kvm->memslots[i];
if (gfn >= memslot->base_gfn
&& gfn < memslot->base_gfn + memslot->npages)
return 1;
}
return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
gfn = unalias_gfn(kvm, gfn);
slot = __gfn_to_memslot(kvm, gfn);
if (!slot)
return bad_hva();
return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
/*
* Requires current->mm->mmap_sem to be held
*/
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
struct page *page[1];
unsigned long addr;
int npages;
might_sleep();
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr)) {
get_page(bad_page);
return bad_page;
}
npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
NULL);
if (npages != 1) {
get_page(bad_page);
return bad_page;
}
return page[0];
}
EXPORT_SYMBOL_GPL(gfn_to_page);
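/*
 * Hedged usage sketch, not part of the patch: honouring the locking rule
 * stated above and pairing gfn_to_page() with the release helpers below.
 */
static void demo_touch_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct page *page;

	down_read(&current->mm->mmap_sem);
	page = gfn_to_page(kvm, gfn);		/* pins the page */
	up_read(&current->mm->mmap_sem);

	if (is_error_page(page)) {
		kvm_release_page_clean(page);
		return;
	}
	/* ... read or write the page contents here ... */
	kvm_release_page_dirty(page);		/* _clean() if only read */
}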
void kvm_release_page_clean(struct page *page)
{
put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
void kvm_release_page_dirty(struct page *page)
{
if (!PageReserved(page))
SetPageDirty(page);
put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
static int next_segment(unsigned long len, int offset)
{
if (len > PAGE_SIZE - offset)
return PAGE_SIZE - offset;
else
return len;
}
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int len)
{
int r;
unsigned long addr;
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
r = copy_from_user(data, (void __user *)addr + offset, len);
if (r)
return -EFAULT;
return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
int seg;
int offset = offset_in_page(gpa);
int ret;
while ((seg = next_segment(len, offset)) != 0) {
ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
if (ret < 0)
return ret;
offset = 0;
len -= seg;
data += seg;
++gfn;
}
return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);
int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
unsigned long len)
{
int r;
unsigned long addr;
gfn_t gfn = gpa >> PAGE_SHIFT;
int offset = offset_in_page(gpa);
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
if (r)
return -EFAULT;
return 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
int offset, int len)
{
int r;
unsigned long addr;
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
r = copy_to_user((void __user *)addr + offset, data, len);
if (r)
return -EFAULT;
mark_page_dirty(kvm, gfn);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
unsigned long len)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
int seg;
int offset = offset_in_page(gpa);
int ret;
while ((seg = next_segment(len, offset)) != 0) {
ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
if (ret < 0)
return ret;
offset = 0;
len -= seg;
data += seg;
++gfn;
}
return 0;
}
int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
int seg;
int offset = offset_in_page(gpa);
int ret;
while ((seg = next_segment(len, offset)) != 0) {
ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
if (ret < 0)
return ret;
offset = 0;
len -= seg;
++gfn;
}
return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *memslot;
gfn = unalias_gfn(kvm, gfn);
memslot = __gfn_to_memslot(kvm, gfn);
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
/* avoid RMW */
if (!test_bit(rel_gfn, memslot->dirty_bitmap))
set_bit(rel_gfn, memslot->dirty_bitmap);
}
}
/*
* The vCPU has executed a HLT instruction with in-kernel mode enabled.
*/
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(&vcpu->wq, &wait);
/*
* We will block until either an interrupt or a signal wakes us up
*/
while (!kvm_cpu_has_interrupt(vcpu)
&& !signal_pending(current)
&& !kvm_arch_vcpu_runnable(vcpu)) {
set_current_state(TASK_INTERRUPTIBLE);
vcpu_put(vcpu);
schedule();
vcpu_load(vcpu);
}
__set_current_state(TASK_RUNNING);
remove_wait_queue(&vcpu->wq, &wait);
}
void kvm_resched(struct kvm_vcpu *vcpu)
{
if (!need_resched())
return;
cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);
static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct kvm_vcpu *vcpu = vma->vm_file->private_data;
struct page *page;
if (vmf->pgoff == 0)
page = virt_to_page(vcpu->run);
else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
page = virt_to_page(vcpu->arch.pio_data);
else
return VM_FAULT_SIGBUS;
get_page(page);
vmf->page = page;
return 0;
}
static struct vm_operations_struct kvm_vcpu_vm_ops = {
.fault = kvm_vcpu_fault,
};
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
vma->vm_ops = &kvm_vcpu_vm_ops;
return 0;
}
static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
struct kvm_vcpu *vcpu = filp->private_data;
fput(vcpu->kvm->filp);
return 0;
}
static struct file_operations kvm_vcpu_fops = {
.release = kvm_vcpu_release,
.unlocked_ioctl = kvm_vcpu_ioctl,
.compat_ioctl = kvm_vcpu_ioctl,
.mmap = kvm_vcpu_mmap,
};
/*
* Allocates an inode for the vcpu.
*/
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
int fd, r;
struct inode *inode;
struct file *file;
r = anon_inode_getfd(&fd, &inode, &file,
"kvm-vcpu", &kvm_vcpu_fops, vcpu);
if (r)
return r;
atomic_inc(&vcpu->kvm->filp->f_count);
return fd;
}
/*
* Creates some virtual cpus. Good luck creating more than one.
*/
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
{
int r;
struct kvm_vcpu *vcpu;
if (!valid_vcpu(n))
return -EINVAL;
vcpu = kvm_arch_vcpu_create(kvm, n);
if (IS_ERR(vcpu))
return PTR_ERR(vcpu);
preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
r = kvm_arch_vcpu_setup(vcpu);
if (r)
goto vcpu_destroy;
mutex_lock(&kvm->lock);
if (kvm->vcpus[n]) {
r = -EEXIST;
mutex_unlock(&kvm->lock);
goto vcpu_destroy;
}
kvm->vcpus[n] = vcpu;
mutex_unlock(&kvm->lock);
/* Now it's all set up, let userspace reach it */
r = create_vcpu_fd(vcpu);
if (r < 0)
goto unlink;
return r;
unlink:
mutex_lock(&kvm->lock);
kvm->vcpus[n] = NULL;
mutex_unlock(&kvm->lock);
vcpu_destroy:
kvm_arch_vcpu_destroy(vcpu);
return r;
}
static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
if (sigset) {
sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
vcpu->sigset_active = 1;
vcpu->sigset = *sigset;
} else
vcpu->sigset_active = 0;
return 0;
}
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
if (vcpu->kvm->mm != current->mm)
return -EIO;
switch (ioctl) {
case KVM_RUN:
r = -EINVAL;
if (arg)
goto out;
r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
break;
case KVM_GET_REGS: {
struct kvm_regs kvm_regs;
memset(&kvm_regs, 0, sizeof kvm_regs);
r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
goto out;
r = 0;
break;
}
case KVM_SET_REGS: {
struct kvm_regs kvm_regs;
r = -EFAULT;
if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
goto out;
r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
if (r)
goto out;
r = 0;
break;
}
case KVM_GET_SREGS: {
struct kvm_sregs kvm_sregs;
memset(&kvm_sregs, 0, sizeof kvm_sregs);
r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
goto out;
r = 0;
break;
}
case KVM_SET_SREGS: {
struct kvm_sregs kvm_sregs;
r = -EFAULT;
if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
goto out;
r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
if (r)
goto out;
r = 0;
break;
}
case KVM_TRANSLATE: {
struct kvm_translation tr;
r = -EFAULT;
if (copy_from_user(&tr, argp, sizeof tr))
goto out;
r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &tr, sizeof tr))
goto out;
r = 0;
break;
}
case KVM_DEBUG_GUEST: {
struct kvm_debug_guest dbg;
r = -EFAULT;
if (copy_from_user(&dbg, argp, sizeof dbg))
goto out;
r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
if (r)
goto out;
r = 0;
break;
}
case KVM_SET_SIGNAL_MASK: {
struct kvm_signal_mask __user *sigmask_arg = argp;
struct kvm_signal_mask kvm_sigmask;
sigset_t sigset, *p;
p = NULL;
if (argp) {
r = -EFAULT;
if (copy_from_user(&kvm_sigmask, argp,
sizeof kvm_sigmask))
goto out;
r = -EINVAL;
if (kvm_sigmask.len != sizeof sigset)
goto out;
r = -EFAULT;
if (copy_from_user(&sigset, sigmask_arg->sigset,
sizeof sigset))
goto out;
p = &sigset;
}
r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
break;
}
case KVM_GET_FPU: {
struct kvm_fpu fpu;
memset(&fpu, 0, sizeof fpu);
r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &fpu, sizeof fpu))
goto out;
r = 0;
break;
}
case KVM_SET_FPU: {
struct kvm_fpu fpu;
r = -EFAULT;
if (copy_from_user(&fpu, argp, sizeof fpu))
goto out;
r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
if (r)
goto out;
r = 0;
break;
}
default:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}
out:
return r;
}
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
if (kvm->mm != current->mm)
return -EIO;
switch (ioctl) {
case KVM_CREATE_VCPU:
r = kvm_vm_ioctl_create_vcpu(kvm, arg);
if (r < 0)
goto out;
break;
case KVM_SET_USER_MEMORY_REGION: {
struct kvm_userspace_memory_region kvm_userspace_mem;
r = -EFAULT;
if (copy_from_user(&kvm_userspace_mem, argp,
sizeof kvm_userspace_mem))
goto out;
r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
if (r)
goto out;
break;
}
case KVM_GET_DIRTY_LOG: {
struct kvm_dirty_log log;
r = -EFAULT;
if (copy_from_user(&log, argp, sizeof log))
goto out;
r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
if (r)
goto out;
break;
}
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
}
out:
return r;
}
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct kvm *kvm = vma->vm_file->private_data;
struct page *page;
if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
return VM_FAULT_SIGBUS;
page = gfn_to_page(kvm, vmf->pgoff);
if (is_error_page(page)) {
kvm_release_page_clean(page);
return VM_FAULT_SIGBUS;
}
vmf->page = page;
return 0;
}
static struct vm_operations_struct kvm_vm_vm_ops = {
.fault = kvm_vm_fault,
};
static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
vma->vm_ops = &kvm_vm_vm_ops;
return 0;
}
static struct file_operations kvm_vm_fops = {
.release = kvm_vm_release,
.unlocked_ioctl = kvm_vm_ioctl,
.compat_ioctl = kvm_vm_ioctl,
.mmap = kvm_vm_mmap,
};
static int kvm_dev_ioctl_create_vm(void)
{
int fd, r;
struct inode *inode;
struct file *file;
struct kvm *kvm;
kvm = kvm_create_vm();
if (IS_ERR(kvm))
return PTR_ERR(kvm);
r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
if (r) {
kvm_destroy_vm(kvm);
return r;
}
kvm->filp = file;
return fd;
}
static long kvm_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
void __user *argp = (void __user *)arg;
long r = -EINVAL;
switch (ioctl) {
case KVM_GET_API_VERSION:
r = -EINVAL;
if (arg)
goto out;
r = KVM_API_VERSION;
break;
case KVM_CREATE_VM:
r = -EINVAL;
if (arg)
goto out;
r = kvm_dev_ioctl_create_vm();
break;
case KVM_CHECK_EXTENSION:
r = kvm_dev_ioctl_check_extension((long)argp);
break;
case KVM_GET_VCPU_MMAP_SIZE:
r = -EINVAL;
if (arg)
goto out;
r = 2 * PAGE_SIZE;
break;
default:
return kvm_arch_dev_ioctl(filp, ioctl, arg);
}
out:
return r;
}
static struct file_operations kvm_chardev_ops = {
.unlocked_ioctl = kvm_dev_ioctl,
.compat_ioctl = kvm_dev_ioctl,
};
static struct miscdevice kvm_dev = {
KVM_MINOR,
"kvm",
&kvm_chardev_ops,
};
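/*
 * Hedged userspace sketch, not part of the patch: the minimal ioctl dance
 * against the character device above.  Error handling is omitted and the
 * guest is given no memory or code, so this is only a smoke test.
 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int demo_kvm_smoke_test(void)
{
	int kvm_fd, vm_fd, vcpu_fd, map_sz;
	struct kvm_run *run;

	kvm_fd = open("/dev/kvm", O_RDWR);
	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
		return -1;
	vm_fd   = ioctl(kvm_fd, KVM_CREATE_VM, 0);
	vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
	map_sz  = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);	/* 2 pages here */
	run     = mmap(NULL, map_sz, PROT_READ | PROT_WRITE,
		       MAP_SHARED, vcpu_fd, 0);

	ioctl(vcpu_fd, KVM_RUN, 0);
	return run->exit_reason;	/* one of the KVM_EXIT_* codes */
}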
static void hardware_enable(void *junk)
{
int cpu = raw_smp_processor_id();
if (cpu_isset(cpu, cpus_hardware_enabled))
return;
cpu_set(cpu, cpus_hardware_enabled);
kvm_arch_hardware_enable(NULL);
}
static void hardware_disable(void *junk)
{
int cpu = raw_smp_processor_id();
if (!cpu_isset(cpu, cpus_hardware_enabled))
return;
cpu_clear(cpu, cpus_hardware_enabled);
decache_vcpus_on_cpu(cpu);
kvm_arch_hardware_disable(NULL);
}
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
void *v)
{
int cpu = (long)v;
val &= ~CPU_TASKS_FROZEN;
switch (val) {
case CPU_DYING:
printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
cpu);
hardware_disable(NULL);
break;
case CPU_UP_CANCELED:
printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
cpu);
smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
break;
case CPU_ONLINE:
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
cpu);
smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
break;
}
return NOTIFY_OK;
}
static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
void *v)
{
if (val == SYS_RESTART) {
/*
* Some (well, at least mine) BIOSes hang on reboot if
* in vmx root mode.
*/
printk(KERN_INFO "kvm: exiting hardware virtualization\n");
on_each_cpu(hardware_disable, NULL, 0, 1);
}
return NOTIFY_OK;
}
static struct notifier_block kvm_reboot_notifier = {
.notifier_call = kvm_reboot,
.priority = 0,
};
void kvm_io_bus_init(struct kvm_io_bus *bus)
{
memset(bus, 0, sizeof(*bus));
}
void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
int i;
for (i = 0; i < bus->dev_count; i++) {
struct kvm_io_device *pos = bus->devs[i];
kvm_iodevice_destructor(pos);
}
}
struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
{
int i;
for (i = 0; i < bus->dev_count; i++) {
struct kvm_io_device *pos = bus->devs[i];
if (pos->in_range(pos, addr))
return pos;
}
return NULL;
}
void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
{
BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
bus->devs[bus->dev_count++] = dev;
}
static struct notifier_block kvm_cpu_notifier = {
.notifier_call = kvm_cpu_hotplug,
.priority = 20, /* must be > scheduler priority */
};
static u64 vm_stat_get(void *_offset)
{
unsigned offset = (long)_offset;
u64 total = 0;
struct kvm *kvm;
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
total += *(u32 *)((void *)kvm + offset);
spin_unlock(&kvm_lock);
return total;
}
DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
static u64 vcpu_stat_get(void *_offset)
{
unsigned offset = (long)_offset;
u64 total = 0;
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i;
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
vcpu = kvm->vcpus[i];
if (vcpu)
total += *(u32 *)((void *)vcpu + offset);
}
spin_unlock(&kvm_lock);
return total;
}
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
static struct file_operations *stat_fops[] = {
[KVM_STAT_VCPU] = &vcpu_stat_fops,
[KVM_STAT_VM] = &vm_stat_fops,
};
static void kvm_init_debug(void)
{
struct kvm_stats_debugfs_item *p;
debugfs_dir = debugfs_create_dir("kvm", NULL);
for (p = debugfs_entries; p->name; ++p)
p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
(void *)(long)p->offset,
stat_fops[p->kind]);
}
static void kvm_exit_debug(void)
{
struct kvm_stats_debugfs_item *p;
for (p = debugfs_entries; p->name; ++p)
debugfs_remove(p->dentry);
debugfs_remove(debugfs_dir);
}
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
hardware_disable(NULL);
return 0;
}
static int kvm_resume(struct sys_device *dev)
{
hardware_enable(NULL);
return 0;
}
static struct sysdev_class kvm_sysdev_class = {
.name = "kvm",
.suspend = kvm_suspend,
.resume = kvm_resume,
};
static struct sys_device kvm_sysdev = {
.id = 0,
.cls = &kvm_sysdev_class,
};
struct page *bad_page;
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
return container_of(pn, struct kvm_vcpu, preempt_notifier);
}
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
kvm_arch_vcpu_load(vcpu, cpu);
}
static void kvm_sched_out(struct preempt_notifier *pn,
struct task_struct *next)
{
struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
kvm_arch_vcpu_put(vcpu);
}
int kvm_init(void *opaque, unsigned int vcpu_size,
struct module *module)
{
int r;
int cpu;
kvm_init_debug();
r = kvm_arch_init(opaque);
if (r)
goto out_fail;
bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (bad_page == NULL) {
r = -ENOMEM;
goto out;
}
r = kvm_arch_hardware_setup();
if (r < 0)
goto out_free_0;
for_each_online_cpu(cpu) {
smp_call_function_single(cpu,
kvm_arch_check_processor_compat,
&r, 0, 1);
if (r < 0)
goto out_free_1;
}
on_each_cpu(hardware_enable, NULL, 0, 1);
r = register_cpu_notifier(&kvm_cpu_notifier);
if (r)
goto out_free_2;
register_reboot_notifier(&kvm_reboot_notifier);
r = sysdev_class_register(&kvm_sysdev_class);
if (r)
goto out_free_3;
r = sysdev_register(&kvm_sysdev);
if (r)
goto out_free_4;
/* A kmem cache lets us meet the alignment requirements of fx_save. */
kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
__alignof__(struct kvm_vcpu),
0, NULL);
if (!kvm_vcpu_cache) {
r = -ENOMEM;
goto out_free_5;
}
kvm_chardev_ops.owner = module;
r = misc_register(&kvm_dev);
if (r) {
printk(KERN_ERR "kvm: misc device register failed\n");
goto out_free;
}
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;
return 0;
out_free:
kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
sysdev_unregister(&kvm_sysdev);
out_free_4:
sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
on_each_cpu(hardware_disable, NULL, 0, 1);
out_free_1:
kvm_arch_hardware_unsetup();
out_free_0:
__free_page(bad_page);
out:
kvm_arch_exit();
kvm_exit_debug();
out_fail:
return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
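/*
 * Hedged sketch, not part of the patch: how an architecture module might
 * wire itself up.  "demo_vcpu" and the NULL opaque pointer are placeholders;
 * the real callers pass their arch ops structure through to kvm_arch_init(),
 * and the vcpu size keeps arch state fx_save-aligned via the kmem cache.
 */
struct demo_vcpu {
	struct kvm_vcpu vcpu;
	/* arch-specific state would follow */
};

static int __init demo_arch_init(void)
{
	return kvm_init(NULL /* opaque, arch-defined */,
			sizeof(struct demo_vcpu), THIS_MODULE);
}

static void __exit demo_arch_exit(void)
{
	kvm_exit();
}

module_init(demo_arch_init);
module_exit(demo_arch_exit);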
void kvm_exit(void)
{
misc_deregister(&kvm_dev);
kmem_cache_destroy(kvm_vcpu_cache);
sysdev_unregister(&kvm_sysdev);
sysdev_class_unregister(&kvm_sysdev_class);
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
on_each_cpu(hardware_disable, NULL, 0, 1);
kvm_arch_hardware_unsetup();
kvm_arch_exit();
kvm_exit_debug();
__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);