Commit 49c13b51 authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (80 commits)
  KVM: Use CPU_DYING for disabling virtualization
  KVM: Tune hotplug/suspend IPIs
  KVM: Keep track of which cpus have virtualization enabled
  SMP: Allow smp_call_function_single() to current cpu
  i386: Allow smp_call_function_single() to current cpu
  x86_64: Allow smp_call_function_single() to current cpu
  HOTPLUG: Adapt thermal throttle to CPU_DYING
  HOTPLUG: Adapt cpuset hotplug callback to CPU_DYING
  HOTPLUG: Add CPU_DYING notifier
  KVM: Clean up #includes
  KVM: Remove kvmfs in favor of the anonymous inodes source
  KVM: SVM: Reliably detect if SVM was disabled by BIOS
  KVM: VMX: Remove unnecessary code in vmx_tlb_flush()
  KVM: MMU: Fix Wrong tlb flush order
  KVM: VMX: Reinitialize the real-mode tss when entering real mode
  KVM: Avoid useless memory write when possible
  KVM: Fix x86 emulator writeback
  KVM: Add support for in-kernel pio handlers
  KVM: VMX: Fix interrupt checking on lightweight exit
  KVM: Adds support for in-kernel mmio handlers
  ...
parents 492559af cec9ad27
...@@ -134,19 +134,21 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, ...@@ -134,19 +134,21 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
int err; int err;
sys_dev = get_cpu_sysdev(cpu); sys_dev = get_cpu_sysdev(cpu);
mutex_lock(&therm_cpu_lock);
switch (action) { switch (action) {
case CPU_ONLINE: case CPU_ONLINE:
case CPU_ONLINE_FROZEN: case CPU_ONLINE_FROZEN:
mutex_lock(&therm_cpu_lock);
err = thermal_throttle_add_dev(sys_dev); err = thermal_throttle_add_dev(sys_dev);
mutex_unlock(&therm_cpu_lock);
WARN_ON(err); WARN_ON(err);
break; break;
case CPU_DEAD: case CPU_DEAD:
case CPU_DEAD_FROZEN: case CPU_DEAD_FROZEN:
mutex_lock(&therm_cpu_lock);
thermal_throttle_remove_dev(sys_dev); thermal_throttle_remove_dev(sys_dev);
mutex_unlock(&therm_cpu_lock);
break; break;
} }
mutex_unlock(&therm_cpu_lock);
return NOTIFY_OK; return NOTIFY_OK;
} }
......
...@@ -47,7 +47,7 @@ int smp_call_function(void (*func) (void *info), void *info, int nonatomic, ...@@ -47,7 +47,7 @@ int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
EXPORT_SYMBOL(smp_call_function); EXPORT_SYMBOL(smp_call_function);
/** /**
* smp_call_function_single - Run a function on another CPU * smp_call_function_single - Run a function on a specific CPU
* @cpu: The target CPU. Cannot be the calling CPU. * @cpu: The target CPU. Cannot be the calling CPU.
* @func: The function to run. This must be fast and non-blocking. * @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function. * @info: An arbitrary pointer to pass to the function.
...@@ -66,9 +66,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, ...@@ -66,9 +66,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int ret; int ret;
int me = get_cpu(); int me = get_cpu();
if (cpu == me) { if (cpu == me) {
WARN_ON(1); local_irq_disable();
func(info);
local_irq_enable();
put_cpu(); put_cpu();
return -EBUSY; return 0;
} }
ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
......
...@@ -357,7 +357,7 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info, ...@@ -357,7 +357,7 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info,
} }
/* /*
* smp_call_function_single - Run a function on another CPU * smp_call_function_single - Run a function on a specific CPU
* @func: The function to run. This must be fast and non-blocking. * @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function. * @info: An arbitrary pointer to pass to the function.
* @nonatomic: Currently unused. * @nonatomic: Currently unused.
...@@ -374,14 +374,18 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, ...@@ -374,14 +374,18 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
{ {
/* prevent preemption and reschedule on another processor */ /* prevent preemption and reschedule on another processor */
int me = get_cpu(); int me = get_cpu();
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
if (cpu == me) { if (cpu == me) {
local_irq_disable();
func(info);
local_irq_enable();
put_cpu(); put_cpu();
return 0; return 0;
} }
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
spin_lock_bh(&call_lock); spin_lock_bh(&call_lock);
__smp_call_function_single(cpu, func, info, nonatomic, wait); __smp_call_function_single(cpu, func, info, nonatomic, wait);
spin_unlock_bh(&call_lock); spin_unlock_bh(&call_lock);
......
# #
# KVM configuration # KVM configuration
# #
menu "Virtualization" menuconfig VIRTUALIZATION
bool "Virtualization"
depends on X86 depends on X86
default y
if VIRTUALIZATION
config KVM config KVM
tristate "Kernel-based Virtual Machine (KVM) support" tristate "Kernel-based Virtual Machine (KVM) support"
depends on X86 && EXPERIMENTAL depends on X86 && EXPERIMENTAL
depends on X86_CMPXCHG64 || 64BIT
---help--- ---help---
Support hosting fully virtualized guest machines using hardware Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent virtualization extensions. You will need a fairly recent
...@@ -35,4 +40,4 @@ config KVM_AMD ...@@ -35,4 +40,4 @@ config KVM_AMD
Provides support for KVM on AMD processors equipped with the AMD-V Provides support for KVM on AMD processors equipped with the AMD-V
(SVM) extensions. (SVM) extensions.
endmenu endif # VIRTUALIZATION
...@@ -10,6 +10,8 @@ ...@@ -10,6 +10,8 @@
#include <linux/list.h> #include <linux/list.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <asm/signal.h> #include <asm/signal.h>
...@@ -18,6 +20,7 @@ ...@@ -18,6 +20,7 @@
#include <linux/kvm_para.h> #include <linux/kvm_para.h>
#define CR0_PE_MASK (1ULL << 0) #define CR0_PE_MASK (1ULL << 0)
#define CR0_MP_MASK (1ULL << 1)
#define CR0_TS_MASK (1ULL << 3) #define CR0_TS_MASK (1ULL << 3)
#define CR0_NE_MASK (1ULL << 5) #define CR0_NE_MASK (1ULL << 5)
#define CR0_WP_MASK (1ULL << 16) #define CR0_WP_MASK (1ULL << 16)
...@@ -42,7 +45,8 @@ ...@@ -42,7 +45,8 @@
(CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \ (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK \
| CR0_NW_MASK | CR0_CD_MASK) | CR0_NW_MASK | CR0_CD_MASK)
#define KVM_VM_CR0_ALWAYS_ON \ #define KVM_VM_CR0_ALWAYS_ON \
(CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK) (CR0_PG_MASK | CR0_PE_MASK | CR0_WP_MASK | CR0_NE_MASK | CR0_TS_MASK \
| CR0_MP_MASK)
#define KVM_GUEST_CR4_MASK \ #define KVM_GUEST_CR4_MASK \
(CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK) (CR4_PSE_MASK | CR4_PAE_MASK | CR4_PGE_MASK | CR4_VMXE_MASK | CR4_VME_MASK)
#define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK) #define KVM_PMODE_VM_CR4_ALWAYS_ON (CR4_VMXE_MASK | CR4_PAE_MASK)
...@@ -51,10 +55,10 @@ ...@@ -51,10 +55,10 @@
#define INVALID_PAGE (~(hpa_t)0) #define INVALID_PAGE (~(hpa_t)0)
#define UNMAPPED_GVA (~(gpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0)
#define KVM_MAX_VCPUS 1 #define KVM_MAX_VCPUS 4
#define KVM_ALIAS_SLOTS 4 #define KVM_ALIAS_SLOTS 4
#define KVM_MEMORY_SLOTS 4 #define KVM_MEMORY_SLOTS 4
#define KVM_NUM_MMU_PAGES 256 #define KVM_NUM_MMU_PAGES 1024
#define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25 #define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 40 #define KVM_MAX_CPUID_ENTRIES 40
...@@ -79,6 +83,11 @@ ...@@ -79,6 +83,11 @@
#define KVM_PIO_PAGE_OFFSET 1 #define KVM_PIO_PAGE_OFFSET 1
/*
* vcpu->requests bit members
*/
#define KVM_TLB_FLUSH 0
/* /*
* Address types: * Address types:
* *
...@@ -137,7 +146,7 @@ struct kvm_mmu_page { ...@@ -137,7 +146,7 @@ struct kvm_mmu_page {
gfn_t gfn; gfn_t gfn;
union kvm_mmu_page_role role; union kvm_mmu_page_role role;
hpa_t page_hpa; u64 *spt;
unsigned long slot_bitmap; /* One bit set per slot which has memory unsigned long slot_bitmap; /* One bit set per slot which has memory
* in this shadow page. * in this shadow page.
*/ */
...@@ -232,6 +241,7 @@ struct kvm_pio_request { ...@@ -232,6 +241,7 @@ struct kvm_pio_request {
struct page *guest_pages[2]; struct page *guest_pages[2];
unsigned guest_page_offset; unsigned guest_page_offset;
int in; int in;
int port;
int size; int size;
int string; int string;
int down; int down;
...@@ -252,8 +262,70 @@ struct kvm_stat { ...@@ -252,8 +262,70 @@ struct kvm_stat {
u32 halt_exits; u32 halt_exits;
u32 request_irq_exits; u32 request_irq_exits;
u32 irq_exits; u32 irq_exits;
u32 light_exits;
u32 efer_reload;
};
/*
 * An in-kernel I/O device: a table of callbacks plus an opaque pointer.
 *
 * Each callback receives the device itself as its first argument.  The
 * kvm_iodevice_*() inline helpers are the intended way to invoke them.
 */
struct kvm_io_device {
	/* Called by kvm_iodevice_read() with the address, length and buffer. */
	void (*read)(struct kvm_io_device *this, gpa_t addr, int len, void *val);
	/* Called by kvm_iodevice_write(); the buffer is read-only here. */
	void (*write)(struct kvm_io_device *this, gpa_t addr, int len,
		      const void *val);
	/*
	 * Called by kvm_iodevice_inrange().
	 * NOTE(review): presumably returns nonzero when @addr belongs to this
	 * device -- confirm against the bus lookup code.
	 */
	int (*in_range)(struct kvm_io_device *this, gpa_t addr);
	/* Optional: kvm_iodevice_destructor() skips it when it is NULL. */
	void (*destructor)(struct kvm_io_device *this);

	/* Opaque pointer; not interpreted by the helpers in this header. */
	void *private;
};
/*
 * Invoke @dev's read callback, passing through @addr, @len and the
 * destination buffer @val unchanged.  The callback must be non-NULL.
 */
static inline void kvm_iodevice_read(struct kvm_io_device *dev, gpa_t addr,
				     int len, void *val)
{
	(*dev->read)(dev, addr, len, val);
}
/*
 * Invoke @dev's write callback, passing through @addr, @len and the
 * source buffer @val unchanged.  The callback must be non-NULL.
 */
static inline void kvm_iodevice_write(struct kvm_io_device *dev, gpa_t addr,
				      int len, const void *val)
{
	(*dev->write)(dev, addr, len, val);
}
/*
 * Ask @dev's in_range callback about @addr and hand its result straight
 * back to the caller.  The callback must be non-NULL.
 */
static inline int kvm_iodevice_inrange(struct kvm_io_device *dev, gpa_t addr)
{
	return (*dev->in_range)(dev, addr);
}
/*
 * Run @dev's destructor callback if one was provided; a NULL destructor
 * means there is nothing to do.
 */
static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
{
	if (!dev->destructor)
		return;

	dev->destructor(dev);
}
/*
* It would be nice to use something smarter than a linear search, TBD...
* Thankfully we don't expect many devices to register (famous last words :),
* so until then it will suffice. At least it's abstracted so we can change
* in one place.
*/
struct kvm_io_bus {
int dev_count;
#define NR_IOBUS_DEVS 6
struct kvm_io_device *devs[NR_IOBUS_DEVS];
}; };
/*
 * Operations on a struct kvm_io_bus.  The implementations are not part of
 * this header; the notes below reflect only how the callers use them.
 *
 * kvm_io_bus_init() / kvm_io_bus_destroy(): called once per bus when a VM
 * is created and destroyed, respectively.
 *
 * kvm_io_bus_find_dev(): looks up the device for @addr on @bus.  Callers
 * treat a NULL result as "no in-kernel device handles this address".
 *
 * kvm_io_bus_register_dev(): adds @dev to @bus.
 * NOTE(review): the bus holds at most NR_IOBUS_DEVS entries; behavior when
 * full is not visible here -- confirm in the implementation.
 */
void kvm_io_bus_init(struct kvm_io_bus *bus);
void kvm_io_bus_destroy(struct kvm_io_bus *bus);
struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr);
void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
struct kvm_io_device *dev);
struct kvm_vcpu { struct kvm_vcpu {
struct kvm *kvm; struct kvm *kvm;
union { union {
...@@ -266,6 +338,8 @@ struct kvm_vcpu { ...@@ -266,6 +338,8 @@ struct kvm_vcpu {
u64 host_tsc; u64 host_tsc;
struct kvm_run *run; struct kvm_run *run;
int interrupt_window_open; int interrupt_window_open;
int guest_mode;
unsigned long requests;
unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
#define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long)
unsigned long irq_pending[NR_IRQ_WORDS]; unsigned long irq_pending[NR_IRQ_WORDS];
...@@ -285,15 +359,20 @@ struct kvm_vcpu { ...@@ -285,15 +359,20 @@ struct kvm_vcpu {
u64 apic_base; u64 apic_base;
u64 ia32_misc_enable_msr; u64 ia32_misc_enable_msr;
int nmsrs; int nmsrs;
int save_nmsrs;
int msr_offset_efer;
#ifdef CONFIG_X86_64
int msr_offset_kernel_gs_base;
#endif
struct vmx_msr_entry *guest_msrs; struct vmx_msr_entry *guest_msrs;
struct vmx_msr_entry *host_msrs; struct vmx_msr_entry *host_msrs;
struct list_head free_pages;
struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES];
struct kvm_mmu mmu; struct kvm_mmu mmu;
struct kvm_mmu_memory_cache mmu_pte_chain_cache; struct kvm_mmu_memory_cache mmu_pte_chain_cache;
struct kvm_mmu_memory_cache mmu_rmap_desc_cache; struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
struct kvm_mmu_memory_cache mmu_page_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
gfn_t last_pt_write_gfn; gfn_t last_pt_write_gfn;
int last_pt_write_count; int last_pt_write_count;
...@@ -305,6 +384,11 @@ struct kvm_vcpu { ...@@ -305,6 +384,11 @@ struct kvm_vcpu {
char *guest_fx_image; char *guest_fx_image;
int fpu_active; int fpu_active;
int guest_fpu_loaded; int guest_fpu_loaded;
struct vmx_host_state {
int loaded;
u16 fs_sel, gs_sel, ldt_sel;
int fs_gs_ldt_reload_needed;
} vmx_host_state;
int mmio_needed; int mmio_needed;
int mmio_read_completed; int mmio_read_completed;
...@@ -331,6 +415,7 @@ struct kvm_vcpu { ...@@ -331,6 +415,7 @@ struct kvm_vcpu {
u32 ar; u32 ar;
} tr, es, ds, fs, gs; } tr, es, ds, fs, gs;
} rmode; } rmode;
int halt_request; /* real mode on Intel only */
int cpuid_nent; int cpuid_nent;
struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES]; struct kvm_cpuid_entry cpuid_entries[KVM_MAX_CPUID_ENTRIES];
...@@ -362,12 +447,15 @@ struct kvm { ...@@ -362,12 +447,15 @@ struct kvm {
struct list_head active_mmu_pages; struct list_head active_mmu_pages;
int n_free_mmu_pages; int n_free_mmu_pages;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
int nvcpus;
struct kvm_vcpu vcpus[KVM_MAX_VCPUS]; struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
int memory_config_version; int memory_config_version;
int busy; int busy;
unsigned long rmap_overflow; unsigned long rmap_overflow;
struct list_head vm_list; struct list_head vm_list;
struct file *filp; struct file *filp;
struct kvm_io_bus mmio_bus;
struct kvm_io_bus pio_bus;
}; };
struct descriptor_table { struct descriptor_table {
...@@ -488,6 +576,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, ...@@ -488,6 +576,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned long count, int string, int down, int size, unsigned long count, int string, int down,
gva_t address, int rep, unsigned port); gva_t address, int rep, unsigned port);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
int emulate_clts(struct kvm_vcpu *vcpu); int emulate_clts(struct kvm_vcpu *vcpu);
int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr,
...@@ -511,6 +600,7 @@ void save_msrs(struct vmx_msr_entry *e, int n); ...@@ -511,6 +600,7 @@ void save_msrs(struct vmx_msr_entry *e, int n);
void kvm_resched(struct kvm_vcpu *vcpu); void kvm_resched(struct kvm_vcpu *vcpu);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_flush_remote_tlbs(struct kvm *kvm);
int kvm_read_guest(struct kvm_vcpu *vcpu, int kvm_read_guest(struct kvm_vcpu *vcpu,
gva_t addr, gva_t addr,
...@@ -524,10 +614,12 @@ int kvm_write_guest(struct kvm_vcpu *vcpu, ...@@ -524,10 +614,12 @@ int kvm_write_guest(struct kvm_vcpu *vcpu,
unsigned long segment_base(u16 selector); unsigned long segment_base(u16 selector);
void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); const u8 *old, const u8 *new, int bytes);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
...@@ -539,6 +631,14 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, ...@@ -539,6 +631,14 @@ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
return vcpu->mmu.page_fault(vcpu, gva, error_code); return vcpu->mmu.page_fault(vcpu, gva, error_code);
} }
/*
 * Make sure @vcpu has an MMU root.
 *
 * Returns 0 immediately when mmu.root_hpa is already valid (the expected
 * case); otherwise returns whatever kvm_mmu_load() returns.
 */
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
	return likely(vcpu->mmu.root_hpa != INVALID_PAGE)
		? 0
		: kvm_mmu_load(vcpu);
}
static inline int is_long_mode(struct kvm_vcpu *vcpu) static inline int is_long_mode(struct kvm_vcpu *vcpu)
{ {
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
......
...@@ -16,34 +16,33 @@ ...@@ -16,34 +16,33 @@
*/ */
#include "kvm.h" #include "kvm.h"
#include "x86_emulate.h"
#include "segment_descriptor.h"
#include <linux/kvm.h> #include <linux/kvm.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/errno.h> #include <linux/errno.h>
#include <linux/magic.h>
#include <asm/processor.h>
#include <linux/percpu.h> #include <linux/percpu.h>
#include <linux/gfp.h> #include <linux/gfp.h>
#include <asm/msr.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/miscdevice.h> #include <linux/miscdevice.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <linux/reboot.h> #include <linux/reboot.h>
#include <asm/io.h>
#include <linux/debugfs.h> #include <linux/debugfs.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/file.h> #include <linux/file.h>
#include <asm/desc.h>
#include <linux/sysdev.h> #include <linux/sysdev.h>
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include "x86_emulate.h" #include <asm/processor.h>
#include "segment_descriptor.h" #include <asm/msr.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/desc.h>
MODULE_AUTHOR("Qumranet"); MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
...@@ -51,8 +50,12 @@ MODULE_LICENSE("GPL"); ...@@ -51,8 +50,12 @@ MODULE_LICENSE("GPL");
static DEFINE_SPINLOCK(kvm_lock); static DEFINE_SPINLOCK(kvm_lock);
static LIST_HEAD(vm_list); static LIST_HEAD(vm_list);
static cpumask_t cpus_hardware_enabled;
struct kvm_arch_ops *kvm_arch_ops; struct kvm_arch_ops *kvm_arch_ops;
static void hardware_disable(void *ignored);
#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x) #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
static struct kvm_stats_debugfs_item { static struct kvm_stats_debugfs_item {
...@@ -72,13 +75,13 @@ static struct kvm_stats_debugfs_item { ...@@ -72,13 +75,13 @@ static struct kvm_stats_debugfs_item {
{ "halt_exits", STAT_OFFSET(halt_exits) }, { "halt_exits", STAT_OFFSET(halt_exits) },
{ "request_irq", STAT_OFFSET(request_irq_exits) }, { "request_irq", STAT_OFFSET(request_irq_exits) },
{ "irq_exits", STAT_OFFSET(irq_exits) }, { "irq_exits", STAT_OFFSET(irq_exits) },
{ "light_exits", STAT_OFFSET(light_exits) },
{ "efer_reload", STAT_OFFSET(efer_reload) },
{ NULL } { NULL }
}; };
static struct dentry *debugfs_dir; static struct dentry *debugfs_dir;
struct vfsmount *kvmfs_mnt;
#define MAX_IO_MSRS 256 #define MAX_IO_MSRS 256
#define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL #define CR0_RESEVED_BITS 0xffffffff1ffaffc0ULL
...@@ -100,55 +103,6 @@ struct segment_descriptor_64 { ...@@ -100,55 +103,6 @@ struct segment_descriptor_64 {
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg); unsigned long arg);
/*
 * Allocate an inode on the kvmfs mount and attach @fops to it.
 *
 * The inode is owned by the current task's fsuid/fsgid, is readable and
 * writable by the owner only, and has all three timestamps set to the
 * current time.
 *
 * Returns the new inode, or ERR_PTR(-ENOMEM) if new_inode() fails.
 */
static struct inode *kvmfs_inode(struct file_operations *fops)
{
	struct inode *inode;

	inode = new_inode(kvmfs_mnt->mnt_sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	inode->i_fop = fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because mark_inode_dirty() will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IRUSR | S_IWUSR;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;
}
/*
 * Build an open, read/write struct file backed by @inode on the kvmfs
 * mount, with @private_data stored in file->private_data.
 *
 * Returns the new file, ERR_PTR(-ENFILE) if no struct file could be
 * obtained, or ERR_PTR(-ENOMEM) if no dentry could be allocated.
 *
 * On the dentry-allocation failure path nothing is leaked: the dentry is
 * allocated before the mount reference is taken, and the half-built file
 * is released again.  @inode is not touched on failure, so the caller
 * keeps whatever reference it held.
 */
static struct file *kvmfs_file(struct inode *inode, void *private_data)
{
	struct dentry *dentry;
	struct file *file;

	file = get_empty_filp();
	if (!file)
		return ERR_PTR(-ENFILE);

	/*
	 * Allocate the dentry first so that a failure here does not leave a
	 * dangling mntget() reference behind.
	 */
	dentry = d_alloc_anon(inode);
	if (!dentry) {
		/* Drop the struct file obtained above instead of leaking it. */
		put_filp(file);
		return ERR_PTR(-ENOMEM);
	}

	file->f_path.mnt = mntget(kvmfs_mnt);
	file->f_path.dentry = dentry;
	file->f_mapping = inode->i_mapping;

	file->f_pos = 0;
	file->f_flags = O_RDWR;
	file->f_op = inode->i_fop;
	file->f_mode = FMODE_READ | FMODE_WRITE;
	file->f_version = 0;
	file->private_data = private_data;

	return file;
}
unsigned long segment_base(u16 selector) unsigned long segment_base(u16 selector)
{ {
struct descriptor_table gdt; struct descriptor_table gdt;
...@@ -307,6 +261,48 @@ static void vcpu_put(struct kvm_vcpu *vcpu) ...@@ -307,6 +261,48 @@ static void vcpu_put(struct kvm_vcpu *vcpu)
mutex_unlock(&vcpu->mutex); mutex_unlock(&vcpu->mutex);
} }
static void ack_flush(void *_completed)
{
atomic_t *completed = _completed;
atomic_inc(completed);
}
/*
 * Flag a TLB-flush request on every vcpu of @kvm and interrupt the other
 * physical CPUs those vcpus were last recorded on, then spin until each
 * interrupted CPU has run ack_flush().
 *
 * NOTE(review): ack_flush() only increments a counter.  Presumably the
 * interrupt itself is what makes the target notice the KVM_TLB_FLUSH
 * request bit -- confirm against the arch-specific run loop.
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
int i, cpu, needed;
cpumask_t cpus;
struct kvm_vcpu *vcpu;
atomic_t completed;
/* 'completed' counts ack_flush() calls; 'needed' counts CPUs to signal. */
atomic_set(&completed, 0);
cpus_clear(cpus);
needed = 0;
for (i = 0; i < kvm->nvcpus; ++i) {
vcpu = &kvm->vcpus[i];
/* Set the request bit; if it was already set, skip this vcpu. */
if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
continue;
cpu = vcpu->cpu;
/*
 * Skip vcpus whose cpu is -1 (the value kvm_create_vm() initializes it
 * to) or is the CPU we are running on.  Each remaining CPU is added to
 * the mask, and counted, only once.
 */
if (cpu != -1 && cpu != raw_smp_processor_id())
if (!cpu_isset(cpu, cpus)) {
cpu_set(cpu, cpus);
++needed;
}
}
/*
* We really want smp_call_function_mask() here. But that's not
* available, so ipi all cpus in parallel and wait for them
* to complete.
*/
/* NOTE(review): the final 0 is presumably the 'wait' flag -- confirm. */
for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
/* Busy-wait until every signalled CPU has incremented the counter. */
while (atomic_read(&completed) != needed) {
cpu_relax();
barrier();
}
}
static struct kvm *kvm_create_vm(void) static struct kvm *kvm_create_vm(void)
{ {
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
...@@ -315,8 +311,13 @@ static struct kvm *kvm_create_vm(void) ...@@ -315,8 +311,13 @@ static struct kvm *kvm_create_vm(void)
if (!kvm) if (!kvm)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
kvm_io_bus_init(&kvm->pio_bus);
spin_lock_init(&kvm->lock); spin_lock_init(&kvm->lock);
INIT_LIST_HEAD(&kvm->active_mmu_pages); INIT_LIST_HEAD(&kvm->active_mmu_pages);
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock);
kvm_io_bus_init(&kvm->mmio_bus);
for (i = 0; i < KVM_MAX_VCPUS; ++i) { for (i = 0; i < KVM_MAX_VCPUS; ++i) {
struct kvm_vcpu *vcpu = &kvm->vcpus[i]; struct kvm_vcpu *vcpu = &kvm->vcpus[i];
...@@ -324,10 +325,6 @@ static struct kvm *kvm_create_vm(void) ...@@ -324,10 +325,6 @@ static struct kvm *kvm_create_vm(void)
vcpu->cpu = -1; vcpu->cpu = -1;
vcpu->kvm = kvm; vcpu->kvm = kvm;
vcpu->mmu.root_hpa = INVALID_PAGE; vcpu->mmu.root_hpa = INVALID_PAGE;
INIT_LIST_HEAD(&vcpu->free_pages);
spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock);
} }
return kvm; return kvm;
} }
...@@ -380,6 +377,16 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu) ...@@ -380,6 +377,16 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
} }
} }
/*
 * Unload the MMU state of @vcpu.  A vcpu without a vmcs is left alone;
 * otherwise the vcpu is loaded, kvm_mmu_unload() is run on it, and it is
 * put again.
 */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	if (vcpu->vmcs) {
		vcpu_load(vcpu);
		kvm_mmu_unload(vcpu);
		vcpu_put(vcpu);
	}
}
static void kvm_free_vcpu(struct kvm_vcpu *vcpu) static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
{ {
if (!vcpu->vmcs) if (!vcpu->vmcs)
...@@ -400,6 +407,11 @@ static void kvm_free_vcpus(struct kvm *kvm) ...@@ -400,6 +407,11 @@ static void kvm_free_vcpus(struct kvm *kvm)
{ {
unsigned int i; unsigned int i;
/*
* Unpin any mmu pages first.
*/
for (i = 0; i < KVM_MAX_VCPUS; ++i)
kvm_unload_vcpu_mmu(&kvm->vcpus[i]);
for (i = 0; i < KVM_MAX_VCPUS; ++i) for (i = 0; i < KVM_MAX_VCPUS; ++i)
kvm_free_vcpu(&kvm->vcpus[i]); kvm_free_vcpu(&kvm->vcpus[i]);
} }
...@@ -414,6 +426,8 @@ static void kvm_destroy_vm(struct kvm *kvm) ...@@ -414,6 +426,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
spin_lock(&kvm_lock); spin_lock(&kvm_lock);
list_del(&kvm->vm_list); list_del(&kvm->vm_list);
spin_unlock(&kvm_lock); spin_unlock(&kvm_lock);
kvm_io_bus_destroy(&kvm->pio_bus);
kvm_io_bus_destroy(&kvm->mmio_bus);
kvm_free_vcpus(kvm); kvm_free_vcpus(kvm);
kvm_free_physmem(kvm); kvm_free_physmem(kvm);
kfree(kvm); kfree(kvm);
...@@ -969,7 +983,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page); ...@@ -969,7 +983,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn) void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{ {
int i; int i;
struct kvm_memory_slot *memslot = NULL; struct kvm_memory_slot *memslot;
unsigned long rel_gfn; unsigned long rel_gfn;
for (i = 0; i < kvm->nmemslots; ++i) { for (i = 0; i < kvm->nmemslots; ++i) {
...@@ -978,7 +992,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) ...@@ -978,7 +992,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
if (gfn >= memslot->base_gfn if (gfn >= memslot->base_gfn
&& gfn < memslot->base_gfn + memslot->npages) { && gfn < memslot->base_gfn + memslot->npages) {
if (!memslot || !memslot->dirty_bitmap) if (!memslot->dirty_bitmap)
return; return;
rel_gfn = gfn - memslot->base_gfn; rel_gfn = gfn - memslot->base_gfn;
...@@ -1037,12 +1051,31 @@ static int emulator_write_std(unsigned long addr, ...@@ -1037,12 +1051,31 @@ static int emulator_write_std(unsigned long addr,
return X86EMUL_UNHANDLEABLE; return X86EMUL_UNHANDLEABLE;
} }
/*
 * Look up the in-kernel device for guest physical address @addr on the
 * MMIO bus of the VM that owns @vcpu.  Returns whatever
 * kvm_io_bus_find_dev() returns.
 *
 * This is deliberately a separate wrapper: the lookup is expected to
 * check the LAPIC as well as the general MMIO bus in the near future.
 */
static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
						gpa_t addr)
{
	struct kvm_io_bus *mmio = &vcpu->kvm->mmio_bus;

	return kvm_io_bus_find_dev(mmio, addr);
}
/*
 * Look up the in-kernel device for port address @addr on the PIO bus of
 * the VM that owns @vcpu.  Returns whatever kvm_io_bus_find_dev() returns.
 */
static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
					       gpa_t addr)
{
	struct kvm_io_bus *pio = &vcpu->kvm->pio_bus;

	return kvm_io_bus_find_dev(pio, addr);
}
static int emulator_read_emulated(unsigned long addr, static int emulator_read_emulated(unsigned long addr,
void *val, void *val,
unsigned int bytes, unsigned int bytes,
struct x86_emulate_ctxt *ctxt) struct x86_emulate_ctxt *ctxt)
{ {
struct kvm_vcpu *vcpu = ctxt->vcpu; struct kvm_vcpu *vcpu = ctxt->vcpu;
struct kvm_io_device *mmio_dev;
gpa_t gpa;
if (vcpu->mmio_read_completed) { if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes); memcpy(val, vcpu->mmio_data, bytes);
...@@ -1051,18 +1084,26 @@ static int emulator_read_emulated(unsigned long addr, ...@@ -1051,18 +1084,26 @@ static int emulator_read_emulated(unsigned long addr,
} else if (emulator_read_std(addr, val, bytes, ctxt) } else if (emulator_read_std(addr, val, bytes, ctxt)
== X86EMUL_CONTINUE) == X86EMUL_CONTINUE)
return X86EMUL_CONTINUE; return X86EMUL_CONTINUE;
else {
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
if (gpa == UNMAPPED_GVA) gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
return X86EMUL_PROPAGATE_FAULT; if (gpa == UNMAPPED_GVA)
vcpu->mmio_needed = 1; return X86EMUL_PROPAGATE_FAULT;
vcpu->mmio_phys_addr = gpa;
vcpu->mmio_size = bytes;
vcpu->mmio_is_write = 0;
return X86EMUL_UNHANDLEABLE; /*
* Is this MMIO handled locally?
*/
mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
if (mmio_dev) {
kvm_iodevice_read(mmio_dev, gpa, bytes, val);
return X86EMUL_CONTINUE;
} }
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
vcpu->mmio_size = bytes;
vcpu->mmio_is_write = 0;
return X86EMUL_UNHANDLEABLE;
} }
static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
...@@ -1070,18 +1111,20 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ...@@ -1070,18 +1111,20 @@ static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
{ {
struct page *page; struct page *page;
void *virt; void *virt;
unsigned offset = offset_in_page(gpa);
if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT)) if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
return 0; return 0;
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
if (!page) if (!page)
return 0; return 0;
kvm_mmu_pre_write(vcpu, gpa, bytes);
mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
virt = kmap_atomic(page, KM_USER0); virt = kmap_atomic(page, KM_USER0);
memcpy(virt + offset_in_page(gpa), val, bytes); if (memcmp(virt + offset_in_page(gpa), val, bytes)) {
kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
memcpy(virt + offset_in_page(gpa), val, bytes);
}
kunmap_atomic(virt, KM_USER0); kunmap_atomic(virt, KM_USER0);
kvm_mmu_post_write(vcpu, gpa, bytes);
return 1; return 1;
} }
...@@ -1090,8 +1133,9 @@ static int emulator_write_emulated(unsigned long addr, ...@@ -1090,8 +1133,9 @@ static int emulator_write_emulated(unsigned long addr,
unsigned int bytes, unsigned int bytes,
struct x86_emulate_ctxt *ctxt) struct x86_emulate_ctxt *ctxt)
{ {
struct kvm_vcpu *vcpu = ctxt->vcpu; struct kvm_vcpu *vcpu = ctxt->vcpu;
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); struct kvm_io_device *mmio_dev;
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
if (gpa == UNMAPPED_GVA) { if (gpa == UNMAPPED_GVA) {
kvm_arch_ops->inject_page_fault(vcpu, addr, 2); kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
...@@ -1101,6 +1145,15 @@ static int emulator_write_emulated(unsigned long addr, ...@@ -1101,6 +1145,15 @@ static int emulator_write_emulated(unsigned long addr,
if (emulator_write_phys(vcpu, gpa, val, bytes)) if (emulator_write_phys(vcpu, gpa, val, bytes))
return X86EMUL_CONTINUE; return X86EMUL_CONTINUE;
/*
* Is this MMIO handled locally?
*/
mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
if (mmio_dev) {
kvm_iodevice_write(mmio_dev, gpa, bytes, val);
return X86EMUL_CONTINUE;
}
vcpu->mmio_needed = 1; vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa; vcpu->mmio_phys_addr = gpa;
vcpu->mmio_size = bytes; vcpu->mmio_size = bytes;
...@@ -1269,6 +1322,17 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ...@@ -1269,6 +1322,17 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
} }
EXPORT_SYMBOL_GPL(emulate_instruction); EXPORT_SYMBOL_GPL(emulate_instruction);
/*
 * Handle a guest halt.
 *
 * Returns 1 without further action when @vcpu has any bit set in
 * irq_summary.  Otherwise records KVM_EXIT_HLT as the exit reason, counts
 * the halt exit in the vcpu statistics, and returns 0.
 */
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
	if (!vcpu->irq_summary) {
		vcpu->run->exit_reason = KVM_EXIT_HLT;
		vcpu->stat.halt_exits++;
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
{ {
unsigned long nr, a0, a1, a2, a3, a4, a5, ret; unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
...@@ -1469,6 +1533,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) ...@@ -1469,6 +1533,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MC0_MISC+16: case MSR_IA32_MC0_MISC+16:
case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_REV:
case MSR_IA32_PERF_STATUS: case MSR_IA32_PERF_STATUS:
case MSR_IA32_EBL_CR_POWERON:
/* MTRR registers */ /* MTRR registers */
case 0xfe: case 0xfe:
case 0x200 ... 0x2ff: case 0x200 ... 0x2ff:
...@@ -1727,6 +1792,20 @@ static int complete_pio(struct kvm_vcpu *vcpu) ...@@ -1727,6 +1792,20 @@ static int complete_pio(struct kvm_vcpu *vcpu)
return 0; return 0;
} }
/*
 * Carry out the port I/O request recorded in vcpu->pio against the
 * in-kernel device @pio_dev, using vcpu->pio_data as the data buffer.
 * The direction is taken from vcpu->pio.in: nonzero reads from the
 * device, zero writes to it.
 */
void kernel_pio(struct kvm_io_device *pio_dev, struct kvm_vcpu *vcpu)
{
	/* TODO: String I/O for in kernel device */

	if (!vcpu->pio.in) {
		kvm_iodevice_write(pio_dev, vcpu->pio.port, vcpu->pio.size,
				   vcpu->pio_data);
		return;
	}

	kvm_iodevice_read(pio_dev, vcpu->pio.port, vcpu->pio.size,
			  vcpu->pio_data);
}
int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int size, unsigned long count, int string, int down, int size, unsigned long count, int string, int down,
gva_t address, int rep, unsigned port) gva_t address, int rep, unsigned port)
...@@ -1735,6 +1814,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, ...@@ -1735,6 +1814,7 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
int i; int i;
int nr_pages = 1; int nr_pages = 1;
struct page *page; struct page *page;
struct kvm_io_device *pio_dev;
vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
...@@ -1746,17 +1826,27 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, ...@@ -1746,17 +1826,27 @@ int kvm_setup_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->pio.cur_count = count; vcpu->pio.cur_count = count;
vcpu->pio.size = size; vcpu->pio.size = size;
vcpu->pio.in = in; vcpu->pio.in = in;
vcpu->pio.port = port;
vcpu->pio.string = string; vcpu->pio.string = string;
vcpu->pio.down = down; vcpu->pio.down = down;
vcpu->pio.guest_page_offset = offset_in_page(address); vcpu->pio.guest_page_offset = offset_in_page(address);
vcpu->pio.rep = rep; vcpu->pio.rep = rep;
pio_dev = vcpu_find_pio_dev(vcpu, port);
if (!string) { if (!string) {
kvm_arch_ops->cache_regs(vcpu); kvm_arch_ops->cache_regs(vcpu);
memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4); memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
kvm_arch_ops->decache_regs(vcpu); kvm_arch_ops->decache_regs(vcpu);
if (pio_dev) {
kernel_pio(pio_dev, vcpu);
complete_pio(vcpu);
return 1;
}
return 0; return 0;
} }
/* TODO: String I/O for in kernel device */
if (pio_dev)
printk(KERN_ERR "kvm_setup_pio: no string io support\n");
if (!count) { if (!count) {
kvm_arch_ops->skip_emulated_instruction(vcpu); kvm_arch_ops->skip_emulated_instruction(vcpu);
...@@ -2273,34 +2363,12 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) ...@@ -2273,34 +2363,12 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
struct inode *inode; struct inode *inode;
struct file *file; struct file *file;
r = anon_inode_getfd(&fd, &inode, &file,
"kvm-vcpu", &kvm_vcpu_fops, vcpu);
if (r)
return r;
atomic_inc(&vcpu->kvm->filp->f_count); atomic_inc(&vcpu->kvm->filp->f_count);
inode = kvmfs_inode(&kvm_vcpu_fops);
if (IS_ERR(inode)) {
r = PTR_ERR(inode);
goto out1;
}
file = kvmfs_file(inode, vcpu);
if (IS_ERR(file)) {
r = PTR_ERR(file);
goto out2;
}
r = get_unused_fd();
if (r < 0)
goto out3;
fd = r;
fd_install(fd, file);
return fd; return fd;
out3:
fput(file);
out2:
iput(inode);
out1:
fput(vcpu->kvm->filp);
return r;
} }
/* /*
...@@ -2363,6 +2431,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) ...@@ -2363,6 +2431,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
if (r < 0) if (r < 0)
goto out_free_vcpus; goto out_free_vcpus;
spin_lock(&kvm_lock);
if (n >= kvm->nvcpus)
kvm->nvcpus = n + 1;
spin_unlock(&kvm_lock);
return r; return r;
out_free_vcpus: out_free_vcpus:
...@@ -2376,6 +2449,27 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) ...@@ -2376,6 +2449,27 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
return r; return r;
} }
/*
 * Strip the NX capability from the guest's CPUID table when the host
 * does not have NX enabled in its EFER MSR.
 *
 * Looks up the first cpuid entry for function 0x80000001 in
 * vcpu->cpuid_entries.  If that entry advertises NX (EDX bit 20) while
 * the host EFER has EFER_NX clear, the bit is removed and a message is
 * logged.  Nothing happens when no such entry exists.
 */
static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
{
	/*
	 * NX flag in CPUID 0x80000001 EDX.  This is NOT the same bit
	 * position as EFER_NX (bit 11 of the EFER MSR), so the two must
	 * not be used interchangeably.
	 */
	const unsigned int cpuid_edx_nx = 1u << 20;
	u64 efer;
	int i;
	struct kvm_cpuid_entry *e, *entry;

	rdmsrl(MSR_EFER, efer);
	entry = NULL;
	for (i = 0; i < vcpu->cpuid_nent; ++i) {
		e = &vcpu->cpuid_entries[i];
		if (e->function == 0x80000001) {
			entry = e;
			break;
		}
	}
	if (entry && (entry->edx & cpuid_edx_nx) && !(efer & EFER_NX)) {
		entry->edx &= ~cpuid_edx_nx;
		printk(KERN_INFO "kvm: guest NX capability removed\n");
	}
}
static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid *cpuid, struct kvm_cpuid *cpuid,
struct kvm_cpuid_entry __user *entries) struct kvm_cpuid_entry __user *entries)
...@@ -2390,6 +2484,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, ...@@ -2390,6 +2484,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
cpuid->nent * sizeof(struct kvm_cpuid_entry))) cpuid->nent * sizeof(struct kvm_cpuid_entry)))
goto out; goto out;
vcpu->cpuid_nent = cpuid->nent; vcpu->cpuid_nent = cpuid->nent;
cpuid_fix_nx_cap(vcpu);
return 0; return 0;
out: out:
...@@ -2738,41 +2833,18 @@ static int kvm_dev_ioctl_create_vm(void) ...@@ -2738,41 +2833,18 @@ static int kvm_dev_ioctl_create_vm(void)
struct file *file; struct file *file;
struct kvm *kvm; struct kvm *kvm;
inode = kvmfs_inode(&kvm_vm_fops);
if (IS_ERR(inode)) {
r = PTR_ERR(inode);
goto out1;
}
kvm = kvm_create_vm(); kvm = kvm_create_vm();
if (IS_ERR(kvm)) { if (IS_ERR(kvm))
r = PTR_ERR(kvm); return PTR_ERR(kvm);
goto out2; r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
if (r) {
kvm_destroy_vm(kvm);
return r;
} }
file = kvmfs_file(inode, kvm);
if (IS_ERR(file)) {
r = PTR_ERR(file);
goto out3;
}
kvm->filp = file; kvm->filp = file;
r = get_unused_fd();
if (r < 0)
goto out4;
fd = r;
fd_install(fd, file);
return fd; return fd;
out4:
fput(file);
out3:
kvm_destroy_vm(kvm);
out2:
iput(inode);
out1:
return r;
} }
static long kvm_dev_ioctl(struct file *filp, static long kvm_dev_ioctl(struct file *filp,
...@@ -2862,7 +2934,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val, ...@@ -2862,7 +2934,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
* in vmx root mode. * in vmx root mode.
*/ */
printk(KERN_INFO "kvm: exiting hardware virtualization\n"); printk(KERN_INFO "kvm: exiting hardware virtualization\n");
on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); on_each_cpu(hardware_disable, NULL, 0, 1);
} }
return NOTIFY_OK; return NOTIFY_OK;
} }
...@@ -2905,33 +2977,88 @@ static void decache_vcpus_on_cpu(int cpu) ...@@ -2905,33 +2977,88 @@ static void decache_vcpus_on_cpu(int cpu)
spin_unlock(&kvm_lock); spin_unlock(&kvm_lock);
} }
/*
 * Enable virtualization on the CPU this function runs on, unless that
 * CPU is already marked in cpus_hardware_enabled.  @junk is unused; it
 * is only there so the function fits the cross-CPU call signature.
 */
static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpu_isset(cpu, cpus_hardware_enabled)) {
		/* Mark first, then call into the arch backend. */
		cpu_set(cpu, cpus_hardware_enabled);
		kvm_arch_ops->hardware_enable(NULL);
	}
}
/*
 * Disable virtualization on the CPU this function runs on, but only if
 * that CPU is currently marked in cpus_hardware_enabled.  The CPU is
 * unmarked, its vcpus are decached, and then the arch backend is
 * called.  @junk is unused.
 */
static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (cpu_isset(cpu, cpus_hardware_enabled)) {
		cpu_clear(cpu, cpus_hardware_enabled);
		decache_vcpus_on_cpu(cpu);
		kvm_arch_ops->hardware_disable(NULL);
	}
}
static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
void *v) void *v)
{ {
int cpu = (long)v; int cpu = (long)v;
switch (val) { switch (val) {
case CPU_DOWN_PREPARE: case CPU_DYING:
case CPU_DOWN_PREPARE_FROZEN: case CPU_DYING_FROZEN:
case CPU_UP_CANCELED: case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN: case CPU_UP_CANCELED_FROZEN:
printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
cpu); cpu);
decache_vcpus_on_cpu(cpu); smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
NULL, 0, 1);
break; break;
case CPU_ONLINE: case CPU_ONLINE:
case CPU_ONLINE_FROZEN: case CPU_ONLINE_FROZEN:
printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
cpu); cpu);
smp_call_function_single(cpu, kvm_arch_ops->hardware_enable, smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
NULL, 0, 1);
break; break;
} }
return NOTIFY_OK; return NOTIFY_OK;
} }
/* Reset @bus to an all-zero state (no devices registered). */
void kvm_io_bus_init(struct kvm_io_bus *bus)
{
	memset(bus, 0, sizeof *bus);
}
/*
 * Run kvm_iodevice_destructor() on every device registered on @bus,
 * in registration order.  The bus structure itself is left as is.
 */
void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int idx = 0;

	while (idx < bus->dev_count) {
		kvm_iodevice_destructor(bus->devs[idx]);
		++idx;
	}
}
/*
 * Return the first device on @bus whose in_range() callback accepts
 * @addr, scanning in registration order, or NULL if none does.
 */
struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
{
	struct kvm_io_device *dev;
	int idx = 0;

	while (idx < bus->dev_count) {
		dev = bus->devs[idx];
		if (dev->in_range(dev, addr))
			return dev;
		++idx;
	}

	return NULL;
}
/*
 * Append @dev to @bus.  Hitting the NR_IOBUS_DEVS capacity limit is
 * treated as a fatal programming error (BUG_ON).
 */
void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
{
	BUG_ON(bus->dev_count >= NR_IOBUS_DEVS);

	bus->devs[bus->dev_count] = dev;
	bus->dev_count++;
}
static struct notifier_block kvm_cpu_notifier = { static struct notifier_block kvm_cpu_notifier = {
.notifier_call = kvm_cpu_hotplug, .notifier_call = kvm_cpu_hotplug,
.priority = 20, /* must be > scheduler priority */ .priority = 20, /* must be > scheduler priority */
...@@ -2983,14 +3110,13 @@ static void kvm_exit_debug(void) ...@@ -2983,14 +3110,13 @@ static void kvm_exit_debug(void)
static int kvm_suspend(struct sys_device *dev, pm_message_t state) static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{ {
decache_vcpus_on_cpu(raw_smp_processor_id()); hardware_disable(NULL);
on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1);
return 0; return 0;
} }
static int kvm_resume(struct sys_device *dev) static int kvm_resume(struct sys_device *dev)
{ {
on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); hardware_enable(NULL);
return 0; return 0;
} }
...@@ -3007,18 +3133,6 @@ static struct sys_device kvm_sysdev = { ...@@ -3007,18 +3133,6 @@ static struct sys_device kvm_sysdev = {
hpa_t bad_page_address; hpa_t bad_page_address;
static int kvmfs_get_sb(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data, struct vfsmount *mnt)
{
return get_sb_pseudo(fs_type, "kvm:", NULL, KVMFS_SUPER_MAGIC, mnt);
}
static struct file_system_type kvm_fs_type = {
.name = "kvmfs",
.get_sb = kvmfs_get_sb,
.kill_sb = kill_anon_super,
};
int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
{ {
int r; int r;
...@@ -3043,7 +3157,7 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) ...@@ -3043,7 +3157,7 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
if (r < 0) if (r < 0)
goto out; goto out;
on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); on_each_cpu(hardware_enable, NULL, 0, 1);
r = register_cpu_notifier(&kvm_cpu_notifier); r = register_cpu_notifier(&kvm_cpu_notifier);
if (r) if (r)
goto out_free_1; goto out_free_1;
...@@ -3075,7 +3189,7 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module) ...@@ -3075,7 +3189,7 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
unregister_reboot_notifier(&kvm_reboot_notifier); unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier); unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_1: out_free_1:
on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); on_each_cpu(hardware_disable, NULL, 0, 1);
kvm_arch_ops->hardware_unsetup(); kvm_arch_ops->hardware_unsetup();
out: out:
kvm_arch_ops = NULL; kvm_arch_ops = NULL;
...@@ -3089,7 +3203,7 @@ void kvm_exit_arch(void) ...@@ -3089,7 +3203,7 @@ void kvm_exit_arch(void)
sysdev_class_unregister(&kvm_sysdev_class); sysdev_class_unregister(&kvm_sysdev_class);
unregister_reboot_notifier(&kvm_reboot_notifier); unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier); unregister_cpu_notifier(&kvm_cpu_notifier);
on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); on_each_cpu(hardware_disable, NULL, 0, 1);
kvm_arch_ops->hardware_unsetup(); kvm_arch_ops->hardware_unsetup();
kvm_arch_ops = NULL; kvm_arch_ops = NULL;
} }
...@@ -3103,14 +3217,6 @@ static __init int kvm_init(void) ...@@ -3103,14 +3217,6 @@ static __init int kvm_init(void)
if (r) if (r)
goto out4; goto out4;
r = register_filesystem(&kvm_fs_type);
if (r)
goto out3;
kvmfs_mnt = kern_mount(&kvm_fs_type);
r = PTR_ERR(kvmfs_mnt);
if (IS_ERR(kvmfs_mnt))
goto out2;
kvm_init_debug(); kvm_init_debug();
kvm_init_msr_list(); kvm_init_msr_list();
...@@ -3127,10 +3233,6 @@ static __init int kvm_init(void) ...@@ -3127,10 +3233,6 @@ static __init int kvm_init(void)
out: out:
kvm_exit_debug(); kvm_exit_debug();
mntput(kvmfs_mnt);
out2:
unregister_filesystem(&kvm_fs_type);
out3:
kvm_mmu_module_exit(); kvm_mmu_module_exit();
out4: out4:
return r; return r;
...@@ -3140,8 +3242,6 @@ static __exit void kvm_exit(void) ...@@ -3140,8 +3242,6 @@ static __exit void kvm_exit(void)
{ {
kvm_exit_debug(); kvm_exit_debug();
__free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT)); __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
mntput(kvmfs_mnt);
unregister_filesystem(&kvm_fs_type);
kvm_mmu_module_exit(); kvm_mmu_module_exit();
} }
......
...@@ -16,15 +16,18 @@ ...@@ -16,15 +16,18 @@
* the COPYING file in the top-level directory. * the COPYING file in the top-level directory.
* *
*/ */
#include "vmx.h"
#include "kvm.h"
#include <linux/types.h> #include <linux/types.h>
#include <linux/string.h> #include <linux/string.h>
#include <asm/page.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/module.h> #include <linux/module.h>
#include "vmx.h" #include <asm/page.h>
#include "kvm.h" #include <asm/cmpxchg.h>
#undef MMU_DEBUG #undef MMU_DEBUG
...@@ -90,25 +93,11 @@ static int dbg = 1; ...@@ -90,25 +93,11 @@ static int dbg = 1;
#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
#define PT32_PTE_COPY_MASK \
(PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK)
#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK)
#define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)
#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))
#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)
#define VALID_PAGE(x) ((x) != INVALID_PAGE) #define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define PT64_LEVEL_BITS 9 #define PT64_LEVEL_BITS 9
...@@ -165,6 +154,8 @@ struct kvm_rmap_desc { ...@@ -165,6 +154,8 @@ struct kvm_rmap_desc {
static struct kmem_cache *pte_chain_cache; static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_cache;
static struct kmem_cache *mmu_page_header_cache;
static int is_write_protection(struct kvm_vcpu *vcpu) static int is_write_protection(struct kvm_vcpu *vcpu)
{ {
...@@ -202,6 +193,15 @@ static int is_rmap_pte(u64 pte) ...@@ -202,6 +193,15 @@ static int is_rmap_pte(u64 pte)
== (PT_WRITABLE_MASK | PT_PRESENT_MASK); == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
} }
/*
 * Store @spte into the shadow page table entry at @sptep via
 * set_64bit().  The pointer cast differs per build because the
 * set_64bit() argument type is not the same on 32-bit and 64-bit x86.
 */
static void set_shadow_pte(u64 *sptep, u64 spte)
{
#ifndef CONFIG_X86_64
	set_64bit((unsigned long long *)sptep, spte);
#else
	set_64bit((unsigned long *)sptep, spte);
#endif
}
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min, struct kmem_cache *base_cache, int min,
gfp_t gfp_flags) gfp_t gfp_flags)
...@@ -235,6 +235,14 @@ static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags) ...@@ -235,6 +235,14 @@ static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
goto out; goto out;
r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache, r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
rmap_desc_cache, 1, gfp_flags); rmap_desc_cache, 1, gfp_flags);
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->mmu_page_cache,
mmu_page_cache, 4, gfp_flags);
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
mmu_page_header_cache, 4, gfp_flags);
out: out:
return r; return r;
} }
...@@ -258,6 +266,8 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) ...@@ -258,6 +266,8 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{ {
mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache); mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache); mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
mmu_free_memory_cache(&vcpu->mmu_page_cache);
mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
} }
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
...@@ -433,19 +443,18 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) ...@@ -433,19 +443,18 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
BUG_ON(!(*spte & PT_WRITABLE_MASK)); BUG_ON(!(*spte & PT_WRITABLE_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
rmap_remove(vcpu, spte); rmap_remove(vcpu, spte);
kvm_arch_ops->tlb_flush(vcpu); set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
*spte &= ~(u64)PT_WRITABLE_MASK; kvm_flush_remote_tlbs(vcpu->kvm);
} }
} }
#ifdef MMU_DEBUG #ifdef MMU_DEBUG
static int is_empty_shadow_page(hpa_t page_hpa) static int is_empty_shadow_page(u64 *spt)
{ {
u64 *pos; u64 *pos;
u64 *end; u64 *end;
for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64); for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
pos != end; pos++)
if (*pos != 0) { if (*pos != 0) {
printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
pos, *pos); pos, *pos);
...@@ -455,13 +464,13 @@ static int is_empty_shadow_page(hpa_t page_hpa) ...@@ -455,13 +464,13 @@ static int is_empty_shadow_page(hpa_t page_hpa)
} }
#endif #endif
static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa) static void kvm_mmu_free_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page_head)
{ {
struct kvm_mmu_page *page_head = page_header(page_hpa); ASSERT(is_empty_shadow_page(page_head->spt));
list_del(&page_head->link);
ASSERT(is_empty_shadow_page(page_hpa)); mmu_memory_cache_free(&vcpu->mmu_page_cache, page_head->spt);
page_head->page_hpa = page_hpa; mmu_memory_cache_free(&vcpu->mmu_page_header_cache, page_head);
list_move(&page_head->link, &vcpu->free_pages);
++vcpu->kvm->n_free_mmu_pages; ++vcpu->kvm->n_free_mmu_pages;
} }
...@@ -475,12 +484,15 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, ...@@ -475,12 +484,15 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
{ {
struct kvm_mmu_page *page; struct kvm_mmu_page *page;
if (list_empty(&vcpu->free_pages)) if (!vcpu->kvm->n_free_mmu_pages)
return NULL; return NULL;
page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link); page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
list_move(&page->link, &vcpu->kvm->active_mmu_pages); sizeof *page);
ASSERT(is_empty_shadow_page(page->page_hpa)); page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
set_page_private(virt_to_page(page->spt), (unsigned long)page);
list_add(&page->link, &vcpu->kvm->active_mmu_pages);
ASSERT(is_empty_shadow_page(page->spt));
page->slot_bitmap = 0; page->slot_bitmap = 0;
page->multimapped = 0; page->multimapped = 0;
page->parent_pte = parent_pte; page->parent_pte = parent_pte;
...@@ -638,7 +650,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, ...@@ -638,7 +650,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
u64 *pt; u64 *pt;
u64 ent; u64 ent;
pt = __va(page->page_hpa); pt = page->spt;
if (page->role.level == PT_PAGE_TABLE_LEVEL) { if (page->role.level == PT_PAGE_TABLE_LEVEL) {
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
...@@ -646,7 +658,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, ...@@ -646,7 +658,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
rmap_remove(vcpu, &pt[i]); rmap_remove(vcpu, &pt[i]);
pt[i] = 0; pt[i] = 0;
} }
kvm_arch_ops->tlb_flush(vcpu); kvm_flush_remote_tlbs(vcpu->kvm);
return; return;
} }
...@@ -659,6 +671,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu, ...@@ -659,6 +671,7 @@ static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
ent &= PT64_BASE_ADDR_MASK; ent &= PT64_BASE_ADDR_MASK;
mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]); mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
} }
kvm_flush_remote_tlbs(vcpu->kvm);
} }
static void kvm_mmu_put_page(struct kvm_vcpu *vcpu, static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
...@@ -685,12 +698,12 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu, ...@@ -685,12 +698,12 @@ static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
} }
BUG_ON(!parent_pte); BUG_ON(!parent_pte);
kvm_mmu_put_page(vcpu, page, parent_pte); kvm_mmu_put_page(vcpu, page, parent_pte);
*parent_pte = 0; set_shadow_pte(parent_pte, 0);
} }
kvm_mmu_page_unlink_children(vcpu, page); kvm_mmu_page_unlink_children(vcpu, page);
if (!page->root_count) { if (!page->root_count) {
hlist_del(&page->hash_link); hlist_del(&page->hash_link);
kvm_mmu_free_page(vcpu, page->page_hpa); kvm_mmu_free_page(vcpu, page);
} else } else
list_move(&page->link, &vcpu->kvm->active_mmu_pages); list_move(&page->link, &vcpu->kvm->active_mmu_pages);
} }
...@@ -717,6 +730,17 @@ static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn) ...@@ -717,6 +730,17 @@ static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
return r; return r;
} }
/*
 * Zap every shadow page that kvm_mmu_lookup_page() finds for @gfn,
 * repeating the lookup until none remains.
 */
static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm_mmu_page *shadow;

	for (;;) {
		shadow = kvm_mmu_lookup_page(vcpu, gfn);
		if (shadow == NULL)
			break;

		pgprintk("%s: zap %lx %x\n",
			 __FUNCTION__, gfn, shadow->role.word);
		kvm_mmu_zap_page(vcpu, shadow);
	}
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa) static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
{ {
int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT)); int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
...@@ -805,7 +829,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) ...@@ -805,7 +829,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
return -ENOMEM; return -ENOMEM;
} }
table[index] = new_table->page_hpa | PT_PRESENT_MASK table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
| PT_WRITABLE_MASK | PT_USER_MASK; | PT_WRITABLE_MASK | PT_USER_MASK;
} }
table_addr = table[index] & PT64_BASE_ADDR_MASK; table_addr = table[index] & PT64_BASE_ADDR_MASK;
...@@ -817,11 +841,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) ...@@ -817,11 +841,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
int i; int i;
struct kvm_mmu_page *page; struct kvm_mmu_page *page;
if (!VALID_PAGE(vcpu->mmu.root_hpa))
return;
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) { if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->mmu.root_hpa; hpa_t root = vcpu->mmu.root_hpa;
ASSERT(VALID_PAGE(root));
page = page_header(root); page = page_header(root);
--page->root_count; --page->root_count;
vcpu->mmu.root_hpa = INVALID_PAGE; vcpu->mmu.root_hpa = INVALID_PAGE;
...@@ -832,7 +857,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) ...@@ -832,7 +857,6 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->mmu.pae_root[i]; hpa_t root = vcpu->mmu.pae_root[i];
if (root) { if (root) {
ASSERT(VALID_PAGE(root));
root &= PT64_BASE_ADDR_MASK; root &= PT64_BASE_ADDR_MASK;
page = page_header(root); page = page_header(root);
--page->root_count; --page->root_count;
...@@ -857,7 +881,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ...@@ -857,7 +881,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root)); ASSERT(!VALID_PAGE(root));
page = kvm_mmu_get_page(vcpu, root_gfn, 0, page = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, 0, 0, NULL); PT64_ROOT_LEVEL, 0, 0, NULL);
root = page->page_hpa; root = __pa(page->spt);
++page->root_count; ++page->root_count;
vcpu->mmu.root_hpa = root; vcpu->mmu.root_hpa = root;
return; return;
...@@ -878,7 +902,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ...@@ -878,7 +902,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
page = kvm_mmu_get_page(vcpu, root_gfn, i << 30, page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, !is_paging(vcpu), PT32_ROOT_LEVEL, !is_paging(vcpu),
0, NULL); 0, NULL);
root = page->page_hpa; root = __pa(page->spt);
++page->root_count; ++page->root_count;
vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
} }
...@@ -928,9 +952,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) ...@@ -928,9 +952,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
context->free = nonpaging_free; context->free = nonpaging_free;
context->root_level = 0; context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL;
mmu_alloc_roots(vcpu); context->root_hpa = INVALID_PAGE;
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
return 0; return 0;
} }
...@@ -944,59 +966,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) ...@@ -944,59 +966,6 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
{ {
pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3); pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
mmu_free_roots(vcpu); mmu_free_roots(vcpu);
if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
}
static inline void set_pte_common(struct kvm_vcpu *vcpu,
u64 *shadow_pte,
gpa_t gaddr,
int dirty,
u64 access_bits,
gfn_t gfn)
{
hpa_t paddr;
*shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
if (!dirty)
access_bits &= ~PT_WRITABLE_MASK;
paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
*shadow_pte |= access_bits;
if (is_error_hpa(paddr)) {
*shadow_pte |= gaddr;
*shadow_pte |= PT_SHADOW_IO_MARK;
*shadow_pte &= ~PT_PRESENT_MASK;
return;
}
*shadow_pte |= paddr;
if (access_bits & PT_WRITABLE_MASK) {
struct kvm_mmu_page *shadow;
shadow = kvm_mmu_lookup_page(vcpu, gfn);
if (shadow) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
access_bits &= ~PT_WRITABLE_MASK;
if (is_writeble_pte(*shadow_pte)) {
*shadow_pte &= ~PT_WRITABLE_MASK;
kvm_arch_ops->tlb_flush(vcpu);
}
}
}
if (access_bits & PT_WRITABLE_MASK)
mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
rmap_add(vcpu, shadow_pte);
} }
static void inject_page_fault(struct kvm_vcpu *vcpu, static void inject_page_fault(struct kvm_vcpu *vcpu,
...@@ -1006,23 +975,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, ...@@ -1006,23 +975,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
kvm_arch_ops->inject_page_fault(vcpu, addr, err_code); kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
} }
static inline int fix_read_pf(u64 *shadow_ent)
{
if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
!(*shadow_ent & PT_USER_MASK)) {
/*
* If supervisor write protect is disabled, we shadow kernel
* pages as user pages so we can trap the write access.
*/
*shadow_ent |= PT_USER_MASK;
*shadow_ent &= ~PT_WRITABLE_MASK;
return 1;
}
return 0;
}
static void paging_free(struct kvm_vcpu *vcpu) static void paging_free(struct kvm_vcpu *vcpu)
{ {
nonpaging_free(vcpu); nonpaging_free(vcpu);
...@@ -1047,10 +999,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) ...@@ -1047,10 +999,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
context->free = paging_free; context->free = paging_free;
context->root_level = level; context->root_level = level;
context->shadow_root_level = level; context->shadow_root_level = level;
mmu_alloc_roots(vcpu); context->root_hpa = INVALID_PAGE;
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
(vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
return 0; return 0;
} }
...@@ -1069,10 +1018,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) ...@@ -1069,10 +1018,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
context->free = paging_free; context->free = paging_free;
context->root_level = PT32_ROOT_LEVEL; context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL;
mmu_alloc_roots(vcpu); context->root_hpa = INVALID_PAGE;
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
(vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
return 0; return 0;
} }
...@@ -1106,19 +1052,34 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) ...@@ -1106,19 +1052,34 @@ static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
} }
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
destroy_kvm_mmu(vcpu);
return init_kvm_mmu(vcpu);
}
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{ {
int r; int r;
destroy_kvm_mmu(vcpu); spin_lock(&vcpu->kvm->lock);
r = init_kvm_mmu(vcpu);
if (r < 0)
goto out;
r = mmu_topup_memory_caches(vcpu); r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
mmu_alloc_roots(vcpu);
kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
out: out:
spin_unlock(&vcpu->kvm->lock);
return r; return r;
} }
EXPORT_SYMBOL_GPL(kvm_mmu_load);
/*
 * Release the MMU roots currently held by @vcpu by delegating to
 * mmu_free_roots().
 */
void kvm_mmu_unload(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
}
static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page, struct kvm_mmu_page *page,
u64 *spte) u64 *spte)
{ {
...@@ -1135,9 +1096,25 @@ static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, ...@@ -1135,9 +1096,25 @@ static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu,
} }
} }
*spte = 0; *spte = 0;
kvm_flush_remote_tlbs(vcpu->kvm);
}
/*
 * Hand a guest pte write over to the matching paging-mode updater.
 *
 * Only shadow pages at PT_PAGE_TABLE_LEVEL are handled; any other
 * level is ignored.  Pages whose role.glevels equals PT32_ROOT_LEVEL
 * go to paging32_update_pte(), all others to paging64_update_pte().
 */
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
				  struct kvm_mmu_page *page,
				  u64 *spte,
				  const void *new, int bytes)
{
	if (page->role.level == PT_PAGE_TABLE_LEVEL) {
		if (page->role.glevels != PT32_ROOT_LEVEL)
			paging64_update_pte(vcpu, page, spte, new, bytes);
		else
			paging32_update_pte(vcpu, page, spte, new, bytes);
	}
}
void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *old, const u8 *new, int bytes)
{ {
gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *page; struct kvm_mmu_page *page;
...@@ -1149,6 +1126,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) ...@@ -1149,6 +1126,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
unsigned pte_size; unsigned pte_size;
unsigned page_offset; unsigned page_offset;
unsigned misaligned; unsigned misaligned;
unsigned quadrant;
int level; int level;
int flooded = 0; int flooded = 0;
int npte; int npte;
...@@ -1169,6 +1147,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) ...@@ -1169,6 +1147,7 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
continue; continue;
pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
misaligned |= bytes < 4;
if (misaligned || flooded) { if (misaligned || flooded) {
/* /*
* Misaligned accesses are too much trouble to fix * Misaligned accesses are too much trouble to fix
...@@ -1200,21 +1179,20 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) ...@@ -1200,21 +1179,20 @@ void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
page_offset <<= 1; page_offset <<= 1;
npte = 2; npte = 2;
} }
quadrant = page_offset >> PAGE_SHIFT;
page_offset &= ~PAGE_MASK; page_offset &= ~PAGE_MASK;
if (quadrant != page->role.quadrant)
continue;
} }
spte = __va(page->page_hpa); spte = &page->spt[page_offset / sizeof(*spte)];
spte += page_offset / sizeof(*spte);
while (npte--) { while (npte--) {
mmu_pre_write_zap_pte(vcpu, page, spte); mmu_pte_write_zap_pte(vcpu, page, spte);
mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
++spte; ++spte;
} }
} }
} }
void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
{
}
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{ {
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva); gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
...@@ -1243,13 +1221,6 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu) ...@@ -1243,13 +1221,6 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu)
struct kvm_mmu_page, link); struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu, page); kvm_mmu_zap_page(vcpu, page);
} }
while (!list_empty(&vcpu->free_pages)) {
page = list_entry(vcpu->free_pages.next,
struct kvm_mmu_page, link);
list_del(&page->link);
__free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
page->page_hpa = INVALID_PAGE;
}
free_page((unsigned long)vcpu->mmu.pae_root); free_page((unsigned long)vcpu->mmu.pae_root);
} }
...@@ -1260,18 +1231,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) ...@@ -1260,18 +1231,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
ASSERT(vcpu); ASSERT(vcpu);
for (i = 0; i < KVM_NUM_MMU_PAGES; i++) { vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];
INIT_LIST_HEAD(&page_header->link);
if ((page = alloc_page(GFP_KERNEL)) == NULL)
goto error_1;
set_page_private(page, (unsigned long)page_header);
page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
list_add(&page_header->link, &vcpu->free_pages);
++vcpu->kvm->n_free_mmu_pages;
}
/* /*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
...@@ -1296,7 +1256,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) ...@@ -1296,7 +1256,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
{ {
ASSERT(vcpu); ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
ASSERT(list_empty(&vcpu->free_pages));
return alloc_mmu_pages(vcpu); return alloc_mmu_pages(vcpu);
} }
...@@ -1305,7 +1264,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) ...@@ -1305,7 +1264,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{ {
ASSERT(vcpu); ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa)); ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
ASSERT(!list_empty(&vcpu->free_pages));
return init_kvm_mmu(vcpu); return init_kvm_mmu(vcpu);
} }
...@@ -1331,7 +1289,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot) ...@@ -1331,7 +1289,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
if (!test_bit(slot, &page->slot_bitmap)) if (!test_bit(slot, &page->slot_bitmap))
continue; continue;
pt = __va(page->page_hpa); pt = page->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */ /* avoid RMW */
if (pt[i] & PT_WRITABLE_MASK) { if (pt[i] & PT_WRITABLE_MASK) {
...@@ -1354,7 +1312,7 @@ void kvm_mmu_zap_all(struct kvm_vcpu *vcpu) ...@@ -1354,7 +1312,7 @@ void kvm_mmu_zap_all(struct kvm_vcpu *vcpu)
} }
mmu_free_memory_caches(vcpu); mmu_free_memory_caches(vcpu);
kvm_arch_ops->tlb_flush(vcpu); kvm_flush_remote_tlbs(vcpu->kvm);
init_kvm_mmu(vcpu); init_kvm_mmu(vcpu);
} }
...@@ -1364,6 +1322,10 @@ void kvm_mmu_module_exit(void) ...@@ -1364,6 +1322,10 @@ void kvm_mmu_module_exit(void)
kmem_cache_destroy(pte_chain_cache); kmem_cache_destroy(pte_chain_cache);
if (rmap_desc_cache) if (rmap_desc_cache)
kmem_cache_destroy(rmap_desc_cache); kmem_cache_destroy(rmap_desc_cache);
if (mmu_page_cache)
kmem_cache_destroy(mmu_page_cache);
if (mmu_page_header_cache)
kmem_cache_destroy(mmu_page_header_cache);
} }
int kvm_mmu_module_init(void) int kvm_mmu_module_init(void)
...@@ -1379,6 +1341,18 @@ int kvm_mmu_module_init(void) ...@@ -1379,6 +1341,18 @@ int kvm_mmu_module_init(void)
if (!rmap_desc_cache) if (!rmap_desc_cache)
goto nomem; goto nomem;
mmu_page_cache = kmem_cache_create("kvm_mmu_page",
PAGE_SIZE,
PAGE_SIZE, 0, NULL, NULL);
if (!mmu_page_cache)
goto nomem;
mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
sizeof(struct kvm_mmu_page),
0, 0, NULL, NULL);
if (!mmu_page_header_cache)
goto nomem;
return 0; return 0;
nomem: nomem:
...@@ -1482,7 +1456,7 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu) ...@@ -1482,7 +1456,7 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu)
int i; int i;
list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) { list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
u64 *pt = __va(page->page_hpa); u64 *pt = page->spt;
if (page->role.level != PT_PAGE_TABLE_LEVEL) if (page->role.level != PT_PAGE_TABLE_LEVEL)
continue; continue;
......
...@@ -31,7 +31,6 @@ ...@@ -31,7 +31,6 @@
#define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS 4 #define PT_MAX_FULL_LEVELS 4
#else #else
...@@ -46,7 +45,6 @@ ...@@ -46,7 +45,6 @@
#define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
#define PT_MAX_FULL_LEVELS 2 #define PT_MAX_FULL_LEVELS 2
#else #else
#error Invalid PTTYPE value #error Invalid PTTYPE value
...@@ -192,40 +190,143 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, ...@@ -192,40 +190,143 @@ static void FNAME(mark_pagetable_dirty)(struct kvm *kvm,
mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]);
} }
static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
u64 *shadow_pte, u64 access_bits, gfn_t gfn) u64 *shadow_pte,
gpa_t gaddr,
pt_element_t *gpte,
u64 access_bits,
int user_fault,
int write_fault,
int *ptwrite,
struct guest_walker *walker,
gfn_t gfn)
{ {
ASSERT(*shadow_pte == 0); hpa_t paddr;
access_bits &= guest_pte; int dirty = *gpte & PT_DIRTY_MASK;
*shadow_pte = (guest_pte & PT_PTE_COPY_MASK); u64 spte = *shadow_pte;
set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK, int was_rmapped = is_rmap_pte(spte);
guest_pte & PT_DIRTY_MASK, access_bits, gfn);
pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
" user_fault %d gfn %lx\n",
__FUNCTION__, spte, (u64)*gpte, access_bits,
write_fault, user_fault, gfn);
if (write_fault && !dirty) {
*gpte |= PT_DIRTY_MASK;
dirty = 1;
FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
}
spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
spte |= *gpte & PT64_NX_MASK;
if (!dirty)
access_bits &= ~PT_WRITABLE_MASK;
paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
spte |= PT_PRESENT_MASK;
if (access_bits & PT_USER_MASK)
spte |= PT_USER_MASK;
if (is_error_hpa(paddr)) {
spte |= gaddr;
spte |= PT_SHADOW_IO_MARK;
spte &= ~PT_PRESENT_MASK;
set_shadow_pte(shadow_pte, spte);
return;
}
spte |= paddr;
if ((access_bits & PT_WRITABLE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
struct kvm_mmu_page *shadow;
spte |= PT_WRITABLE_MASK;
if (user_fault) {
mmu_unshadow(vcpu, gfn);
goto unshadowed;
}
shadow = kvm_mmu_lookup_page(vcpu, gfn);
if (shadow) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
access_bits &= ~PT_WRITABLE_MASK;
if (is_writeble_pte(spte)) {
spte &= ~PT_WRITABLE_MASK;
kvm_arch_ops->tlb_flush(vcpu);
}
if (write_fault)
*ptwrite = 1;
}
}
unshadowed:
if (access_bits & PT_WRITABLE_MASK)
mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
set_shadow_pte(shadow_pte, spte);
page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
if (!was_rmapped)
rmap_add(vcpu, shadow_pte);
} }
static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde, static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t *gpte,
u64 *shadow_pte, u64 access_bits, gfn_t gfn) u64 *shadow_pte, u64 access_bits,
int user_fault, int write_fault, int *ptwrite,
struct guest_walker *walker, gfn_t gfn)
{
access_bits &= *gpte;
FNAME(set_pte_common)(vcpu, shadow_pte, *gpte & PT_BASE_ADDR_MASK,
gpte, access_bits, user_fault, write_fault,
ptwrite, walker, gfn);
}
/*
 * Refresh one shadow pte from a guest pte value supplied by the caller.
 *
 * @page is not referenced in this function.  @pte points at @bytes bytes
 * of guest pte data.  Nothing is done unless a whole pt_element_t is
 * available and that guest pte has both PT_PRESENT_MASK and
 * PT_ACCESSED_MASK set.
 */
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
			      u64 *spte, const void *pte, int bytes)
{
	pt_element_t new_gpte;
	pt_element_t missing;
	gfn_t target_gfn;

	/* Too few bytes to form a complete guest pte. */
	if (bytes < sizeof(pt_element_t))
		return;

	new_gpte = *(const pt_element_t *)pte;

	/* Required bits that are clear in the guest pte. */
	missing = ~new_gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK);
	if (missing)
		return;

	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)new_gpte, spte);

	target_gfn = (new_gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;

	/* user_fault and write_fault are 0; no ptwrite flag or walker is passed. */
	FNAME(set_pte)(vcpu, &new_gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK,
		       0, 0, NULL, NULL, target_gfn);
}
static void FNAME(set_pde)(struct kvm_vcpu *vcpu, pt_element_t *gpde,
u64 *shadow_pte, u64 access_bits,
int user_fault, int write_fault, int *ptwrite,
struct guest_walker *walker, gfn_t gfn)
{ {
gpa_t gaddr; gpa_t gaddr;
ASSERT(*shadow_pte == 0); access_bits &= *gpde;
access_bits &= guest_pde;
gaddr = (gpa_t)gfn << PAGE_SHIFT; gaddr = (gpa_t)gfn << PAGE_SHIFT;
if (PTTYPE == 32 && is_cpuid_PSE36()) if (PTTYPE == 32 && is_cpuid_PSE36())
gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) << gaddr |= (*gpde & PT32_DIR_PSE36_MASK) <<
(32 - PT32_DIR_PSE36_SHIFT); (32 - PT32_DIR_PSE36_SHIFT);
*shadow_pte = guest_pde & PT_PTE_COPY_MASK; FNAME(set_pte_common)(vcpu, shadow_pte, gaddr,
set_pte_common(vcpu, shadow_pte, gaddr, gpde, access_bits, user_fault, write_fault,
guest_pde & PT_DIRTY_MASK, access_bits, gfn); ptwrite, walker, gfn);
} }
/* /*
* Fetch a shadow pte for a specific level in the paging hierarchy. * Fetch a shadow pte for a specific level in the paging hierarchy.
*/ */
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *walker) struct guest_walker *walker,
int user_fault, int write_fault, int *ptwrite)
{ {
hpa_t shadow_addr; hpa_t shadow_addr;
int level; int level;
u64 *shadow_ent;
u64 *prev_shadow_ent = NULL; u64 *prev_shadow_ent = NULL;
pt_element_t *guest_ent = walker->ptep; pt_element_t *guest_ent = walker->ptep;
...@@ -242,37 +343,23 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, ...@@ -242,37 +343,23 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
for (; ; level--) { for (; ; level--) {
u32 index = SHADOW_PT_INDEX(addr, level); u32 index = SHADOW_PT_INDEX(addr, level);
u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
struct kvm_mmu_page *shadow_page; struct kvm_mmu_page *shadow_page;
u64 shadow_pte; u64 shadow_pte;
int metaphysical; int metaphysical;
gfn_t table_gfn; gfn_t table_gfn;
unsigned hugepage_access = 0; unsigned hugepage_access = 0;
shadow_ent = ((u64 *)__va(shadow_addr)) + index;
if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
if (level == PT_PAGE_TABLE_LEVEL) if (level == PT_PAGE_TABLE_LEVEL)
return shadow_ent; break;
shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
prev_shadow_ent = shadow_ent; prev_shadow_ent = shadow_ent;
continue; continue;
} }
if (level == PT_PAGE_TABLE_LEVEL) { if (level == PT_PAGE_TABLE_LEVEL)
break;
if (walker->level == PT_DIRECTORY_LEVEL) {
if (prev_shadow_ent)
*prev_shadow_ent |= PT_SHADOW_PS_MARK;
FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
walker->inherited_ar,
walker->gfn);
} else {
ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
FNAME(set_pte)(vcpu, *guest_ent, shadow_ent,
walker->inherited_ar,
walker->gfn);
}
return shadow_ent;
}
if (level - 1 == PT_PAGE_TABLE_LEVEL if (level - 1 == PT_PAGE_TABLE_LEVEL
&& walker->level == PT_DIRECTORY_LEVEL) { && walker->level == PT_DIRECTORY_LEVEL) {
...@@ -289,90 +376,24 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, ...@@ -289,90 +376,24 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
metaphysical, hugepage_access, metaphysical, hugepage_access,
shadow_ent); shadow_ent);
shadow_addr = shadow_page->page_hpa; shadow_addr = __pa(shadow_page->spt);
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK; | PT_WRITABLE_MASK | PT_USER_MASK;
*shadow_ent = shadow_pte; *shadow_ent = shadow_pte;
prev_shadow_ent = shadow_ent; prev_shadow_ent = shadow_ent;
} }
}
/* if (walker->level == PT_DIRECTORY_LEVEL) {
* The guest faulted for write. We need to FNAME(set_pde)(vcpu, guest_ent, shadow_ent,
* walker->inherited_ar, user_fault, write_fault,
* - check write permissions ptwrite, walker, walker->gfn);
* - update the guest pte dirty bit } else {
* - update our own dirty page tracking structures ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
*/ FNAME(set_pte)(vcpu, guest_ent, shadow_ent,
static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, walker->inherited_ar, user_fault, write_fault,
u64 *shadow_ent, ptwrite, walker, walker->gfn);
struct guest_walker *walker,
gva_t addr,
int user,
int *write_pt)
{
pt_element_t *guest_ent;
int writable_shadow;
gfn_t gfn;
struct kvm_mmu_page *page;
if (is_writeble_pte(*shadow_ent))
return !user || (*shadow_ent & PT_USER_MASK);
writable_shadow = *shadow_ent & PT_SHADOW_WRITABLE_MASK;
if (user) {
/*
* User mode access. Fail if it's a kernel page or a read-only
* page.
*/
if (!(*shadow_ent & PT_SHADOW_USER_MASK) || !writable_shadow)
return 0;
ASSERT(*shadow_ent & PT_USER_MASK);
} else
/*
* Kernel mode access. Fail if it's a read-only page and
* supervisor write protection is enabled.
*/
if (!writable_shadow) {
if (is_write_protection(vcpu))
return 0;
*shadow_ent &= ~PT_USER_MASK;
}
guest_ent = walker->ptep;
if (!is_present_pte(*guest_ent)) {
*shadow_ent = 0;
return 0;
} }
return shadow_ent;
gfn = walker->gfn;
if (user) {
/*
* Usermode page faults won't be for page table updates.
*/
while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
pgprintk("%s: zap %lx %x\n",
__FUNCTION__, gfn, page->role.word);
kvm_mmu_zap_page(vcpu, page);
}
} else if (kvm_mmu_lookup_page(vcpu, gfn)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
mark_page_dirty(vcpu->kvm, gfn);
FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
*guest_ent |= PT_DIRTY_MASK;
*write_pt = 1;
return 0;
}
mark_page_dirty(vcpu->kvm, gfn);
*shadow_ent |= PT_WRITABLE_MASK;
FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
*guest_ent |= PT_DIRTY_MASK;
rmap_add(vcpu, shadow_ent);
return 1;
} }
/* /*
...@@ -397,7 +418,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, ...@@ -397,7 +418,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int fetch_fault = error_code & PFERR_FETCH_MASK; int fetch_fault = error_code & PFERR_FETCH_MASK;
struct guest_walker walker; struct guest_walker walker;
u64 *shadow_pte; u64 *shadow_pte;
int fixed;
int write_pt = 0; int write_pt = 0;
int r; int r;
...@@ -421,27 +441,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, ...@@ -421,27 +441,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pgprintk("%s: guest page fault\n", __FUNCTION__); pgprintk("%s: guest page fault\n", __FUNCTION__);
inject_page_fault(vcpu, addr, walker.error_code); inject_page_fault(vcpu, addr, walker.error_code);
FNAME(release_walker)(&walker); FNAME(release_walker)(&walker);
vcpu->last_pt_write_count = 0; /* reset fork detector */
return 0; return 0;
} }
shadow_pte = FNAME(fetch)(vcpu, addr, &walker); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__, &write_pt);
shadow_pte, *shadow_pte); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
shadow_pte, *shadow_pte, write_pt);
/*
* Update the shadow pte.
*/
if (write_fault)
fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
user_fault, &write_pt);
else
fixed = fix_read_pf(shadow_pte);
pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__,
shadow_pte, *shadow_pte);
FNAME(release_walker)(&walker); FNAME(release_walker)(&walker);
if (!write_pt)
vcpu->last_pt_write_count = 0; /* reset fork detector */
/* /*
* mmio: emulate if accessible, otherwise its a guest fault. * mmio: emulate if accessible, otherwise its a guest fault.
*/ */
...@@ -478,7 +491,5 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) ...@@ -478,7 +491,5 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
#undef PT_INDEX #undef PT_INDEX
#undef SHADOW_PT_INDEX #undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK #undef PT_LEVEL_MASK
#undef PT_PTE_COPY_MASK
#undef PT_NON_PTE_COPY_MASK
#undef PT_DIR_BASE_ADDR_MASK #undef PT_DIR_BASE_ADDR_MASK
#undef PT_MAX_FULL_LEVELS #undef PT_MAX_FULL_LEVELS
...@@ -14,16 +14,17 @@ ...@@ -14,16 +14,17 @@
* *
*/ */
#include "kvm_svm.h"
#include "x86_emulate.h"
#include <linux/module.h> #include <linux/module.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/profile.h> #include <linux/profile.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <asm/desc.h>
#include "kvm_svm.h" #include <asm/desc.h>
#include "x86_emulate.h"
MODULE_AUTHOR("Qumranet"); MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
...@@ -378,7 +379,7 @@ static __init int svm_hardware_setup(void) ...@@ -378,7 +379,7 @@ static __init int svm_hardware_setup(void)
int cpu; int cpu;
struct page *iopm_pages; struct page *iopm_pages;
struct page *msrpm_pages; struct page *msrpm_pages;
void *msrpm_va; void *iopm_va, *msrpm_va;
int r; int r;
kvm_emulator_want_group7_invlpg(); kvm_emulator_want_group7_invlpg();
...@@ -387,8 +388,10 @@ static __init int svm_hardware_setup(void) ...@@ -387,8 +388,10 @@ static __init int svm_hardware_setup(void)
if (!iopm_pages) if (!iopm_pages)
return -ENOMEM; return -ENOMEM;
memset(page_address(iopm_pages), 0xff,
PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); iopm_va = page_address(iopm_pages);
memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
...@@ -579,7 +582,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) ...@@ -579,7 +582,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
goto out2; goto out2;
vcpu->svm->vmcb = page_address(page); vcpu->svm->vmcb = page_address(page);
memset(vcpu->svm->vmcb, 0, PAGE_SIZE); clear_page(vcpu->svm->vmcb);
vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; vcpu->svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
vcpu->svm->asid_generation = 0; vcpu->svm->asid_generation = 0;
memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs)); memset(vcpu->svm->db_regs, 0, sizeof(vcpu->svm->db_regs));
...@@ -587,9 +590,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) ...@@ -587,9 +590,9 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
fx_init(vcpu); fx_init(vcpu);
vcpu->fpu_active = 1; vcpu->fpu_active = 1;
vcpu->apic_base = 0xfee00000 | vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
/*for vcpu 0*/ MSR_IA32_APICBASE_BSP | if (vcpu == &vcpu->kvm->vcpus[0])
MSR_IA32_APICBASE_ENABLE; vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
return 0; return 0;
...@@ -955,7 +958,7 @@ static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -955,7 +958,7 @@ static int shutdown_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
* VMCB is undefined after a SHUTDOWN intercept * VMCB is undefined after a SHUTDOWN intercept
* so reinitialize it. * so reinitialize it.
*/ */
memset(vcpu->svm->vmcb, 0, PAGE_SIZE); clear_page(vcpu->svm->vmcb);
init_vmcb(vcpu->svm->vmcb); init_vmcb(vcpu->svm->vmcb);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
...@@ -1113,12 +1116,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1113,12 +1116,7 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{ {
vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1; vcpu->svm->next_rip = vcpu->svm->vmcb->save.rip + 1;
skip_emulated_instruction(vcpu); skip_emulated_instruction(vcpu);
if (vcpu->irq_summary) return kvm_emulate_halt(vcpu);
return 1;
kvm_run->exit_reason = KVM_EXIT_HLT;
++vcpu->stat.halt_exits;
return 0;
} }
static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
...@@ -1473,6 +1471,11 @@ static void load_db_regs(unsigned long *db_regs) ...@@ -1473,6 +1471,11 @@ static void load_db_regs(unsigned long *db_regs)
asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3])); asm volatile ("mov %0, %%dr3" : : "r"(db_regs[3]));
} }
/*
 * Flush the TLB for @vcpu.  Implemented by calling force_new_asid();
 * svm_vcpu_run() invokes this when the KVM_TLB_FLUSH request bit is set.
 */
static void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
force_new_asid(vcpu);
}
static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{ {
u16 fs_selector; u16 fs_selector;
...@@ -1481,11 +1484,20 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1481,11 +1484,20 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r; int r;
again: again:
r = kvm_mmu_reload(vcpu);
if (unlikely(r))
return r;
if (!vcpu->mmio_read_completed) if (!vcpu->mmio_read_completed)
do_interrupt_requests(vcpu, kvm_run); do_interrupt_requests(vcpu, kvm_run);
clgi(); clgi();
vcpu->guest_mode = 1;
if (vcpu->requests)
if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
svm_flush_tlb(vcpu);
pre_svm_run(vcpu); pre_svm_run(vcpu);
save_host_msrs(vcpu); save_host_msrs(vcpu);
...@@ -1617,6 +1629,8 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1617,6 +1629,8 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
#endif #endif
: "cc", "memory" ); : "cc", "memory" );
vcpu->guest_mode = 0;
if (vcpu->fpu_active) { if (vcpu->fpu_active) {
fx_save(vcpu->guest_fx_image); fx_save(vcpu->guest_fx_image);
fx_restore(vcpu->host_fx_image); fx_restore(vcpu->host_fx_image);
...@@ -1681,11 +1695,6 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1681,11 +1695,6 @@ static int svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return r; return r;
} }
static void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
force_new_asid(vcpu);
}
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{ {
vcpu->svm->vmcb->save.cr3 = root; vcpu->svm->vmcb->save.cr3 = root;
...@@ -1727,6 +1736,12 @@ static void svm_inject_page_fault(struct kvm_vcpu *vcpu, ...@@ -1727,6 +1736,12 @@ static void svm_inject_page_fault(struct kvm_vcpu *vcpu,
static int is_disabled(void) static int is_disabled(void)
{ {
u64 vm_cr;
rdmsrl(MSR_VM_CR, vm_cr);
if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
return 1;
return 0; return 0;
} }
......
...@@ -175,8 +175,11 @@ struct __attribute__ ((__packed__)) vmcb { ...@@ -175,8 +175,11 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_CPUID_FUNC 0x8000000a #define SVM_CPUID_FUNC 0x8000000a
#define MSR_EFER_SVME_MASK (1ULL << 12) #define MSR_EFER_SVME_MASK (1ULL << 12)
#define MSR_VM_CR 0xc0010114
#define MSR_VM_HSAVE_PA 0xc0010117ULL #define MSR_VM_HSAVE_PA 0xc0010117ULL
#define SVM_VM_CR_SVM_DISABLE 4
#define SVM_SELECTOR_S_SHIFT 4 #define SVM_SELECTOR_S_SHIFT 4
#define SVM_SELECTOR_DPL_SHIFT 5 #define SVM_SELECTOR_DPL_SHIFT 5
#define SVM_SELECTOR_P_SHIFT 7 #define SVM_SELECTOR_P_SHIFT 7
......
...@@ -17,28 +17,35 @@ ...@@ -17,28 +17,35 @@
#include "kvm.h" #include "kvm.h"
#include "vmx.h" #include "vmx.h"
#include "segment_descriptor.h"
#include <linux/module.h> #include <linux/module.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/highmem.h> #include <linux/highmem.h>
#include <linux/profile.h> #include <linux/profile.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <asm/io.h> #include <asm/io.h>
#include <asm/desc.h> #include <asm/desc.h>
#include "segment_descriptor.h"
MODULE_AUTHOR("Qumranet"); MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
static int init_rmode_tss(struct kvm *kvm);
static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static struct page *vmx_io_bitmap_a;
static struct page *vmx_io_bitmap_b;
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
#define HOST_IS_64 1 #define HOST_IS_64 1
#else #else
#define HOST_IS_64 0 #define HOST_IS_64 0
#endif #endif
#define EFER_SAVE_RESTORE_BITS ((u64)EFER_SCE)
static struct vmcs_descriptor { static struct vmcs_descriptor {
int size; int size;
...@@ -82,18 +89,17 @@ static const u32 vmx_msr_index[] = { ...@@ -82,18 +89,17 @@ static const u32 vmx_msr_index[] = {
}; };
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
#ifdef CONFIG_X86_64 static inline u64 msr_efer_save_restore_bits(struct vmx_msr_entry msr)
static unsigned msr_offset_kernel_gs_base; {
#define NR_64BIT_MSRS 4 return (u64)msr.data & EFER_SAVE_RESTORE_BITS;
/* }
* avoid save/load MSR_SYSCALL_MASK and MSR_LSTAR by std vt
* mechanism (cpu bug AA24) static inline int msr_efer_need_save_restore(struct kvm_vcpu *vcpu)
*/ {
#define NR_BAD_MSRS 2 int efer_offset = vcpu->msr_offset_efer;
#else return msr_efer_save_restore_bits(vcpu->host_msrs[efer_offset]) !=
#define NR_64BIT_MSRS 0 msr_efer_save_restore_bits(vcpu->guest_msrs[efer_offset]);
#define NR_BAD_MSRS 0 }
#endif
static inline int is_page_fault(u32 intr_info) static inline int is_page_fault(u32 intr_info)
{ {
...@@ -115,13 +121,23 @@ static inline int is_external_interrupt(u32 intr_info) ...@@ -115,13 +121,23 @@ static inline int is_external_interrupt(u32 intr_info)
== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
} }
static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr) static int __find_msr_index(struct kvm_vcpu *vcpu, u32 msr)
{ {
int i; int i;
for (i = 0; i < vcpu->nmsrs; ++i) for (i = 0; i < vcpu->nmsrs; ++i)
if (vcpu->guest_msrs[i].index == msr) if (vcpu->guest_msrs[i].index == msr)
return &vcpu->guest_msrs[i]; return i;
return -1;
}
static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr)
{
int i;
i = __find_msr_index(vcpu, msr);
if (i >= 0)
return &vcpu->guest_msrs[i];
return NULL; return NULL;
} }
...@@ -147,6 +163,7 @@ static void __vcpu_clear(void *arg) ...@@ -147,6 +163,7 @@ static void __vcpu_clear(void *arg)
vmcs_clear(vcpu->vmcs); vmcs_clear(vcpu->vmcs);
if (per_cpu(current_vmcs, cpu) == vcpu->vmcs) if (per_cpu(current_vmcs, cpu) == vcpu->vmcs)
per_cpu(current_vmcs, cpu) = NULL; per_cpu(current_vmcs, cpu) = NULL;
rdtscll(vcpu->host_tsc);
} }
static void vcpu_clear(struct kvm_vcpu *vcpu) static void vcpu_clear(struct kvm_vcpu *vcpu)
...@@ -234,6 +251,127 @@ static void vmcs_set_bits(unsigned long field, u32 mask) ...@@ -234,6 +251,127 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
vmcs_writel(field, vmcs_readl(field) | mask); vmcs_writel(field, vmcs_readl(field) | mask);
} }
/*
 * Write the VMCS EXCEPTION_BITMAP field from the vcpu's current state:
 *  - the PF_VECTOR bit is always set;
 *  - the NM_VECTOR bit is added while vcpu->fpu_active is clear;
 *  - bit 1 is added while vcpu->guest_debug.enabled is set;
 *  - all bits are set while vcpu->rmode.active is set.
 */
static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 bitmap;

	if (vcpu->rmode.active) {
		/* Overrides every other condition. */
		bitmap = ~0;
	} else {
		bitmap = 1u << PF_VECTOR;
		if (!vcpu->fpu_active)
			bitmap |= 1u << NM_VECTOR;
		if (vcpu->guest_debug.enabled)
			bitmap |= 1u << 1;
	}

	vmcs_write32(EXCEPTION_BITMAP, bitmap);
}
/*
 * Reload the host task register; called from vmx_load_host_state().
 *
 * Only does work when CONFIG_X86_64 is not defined; otherwise the body is
 * compiled out and the function is empty.
 */
static void reload_tss(void)
{
#ifndef CONFIG_X86_64
/*
* VT restores TR but not its size. Useless.
*/
struct descriptor_table gdt;
struct segment_descriptor *descs;
get_gdt(&gdt);
descs = (void *)gdt.base;
/*
 * Reset the TSS descriptor type in the GDT before TR is loaded again.
 * NOTE(review): presumably required because TR cannot be loaded from a
 * descriptor still marked busy -- confirm against the Intel SDM.
 */
descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
load_TR_desc();
#endif
}
/*
 * Write a "transition" value to MSR_EFER: the saved host EFER with the
 * bits in EFER_SAVE_RESTORE_BITS replaced by the guest's values for those
 * bits.  Each call is counted in vcpu->stat.efer_reload.
 */
static void load_transition_efer(struct kvm_vcpu *vcpu)
{
	int idx = vcpu->msr_offset_efer;
	u64 efer;

	/* Host value with the switched bits cleared ... */
	efer = vcpu->host_msrs[idx].data;
	efer &= ~EFER_SAVE_RESTORE_BITS;
	/* ... then filled in from the guest's EFER entry. */
	efer |= msr_efer_save_restore_bits(vcpu->guest_msrs[idx]);

	wrmsrl(MSR_EFER, efer);
	vcpu->stat.efer_reload++;
}
/*
 * Record host segment state in vcpu->vmx_host_state and in the VMCS host
 * fields, then load the guest's MSR values.
 *
 * Returns immediately while hs->loaded is already set; the flag is cleared
 * again by vmx_load_host_state().
 */
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
struct vmx_host_state *hs = &vcpu->vmx_host_state;
if (hs->loaded)
return;
hs->loaded = 1;
/*
* Set host fs and gs selectors. Unfortunately, 22.2.3 does not
* allow segment selectors with cpl > 0 or ti == 1.
*/
hs->ldt_sel = read_ldt();
/* A non-zero LDT selector by itself requests the reload path. */
hs->fs_gs_ldt_reload_needed = hs->ldt_sel;
hs->fs_sel = read_fs();
/*
 * Low three selector bits clear (presumably the cpl/ti bits named in
 * the comment above): the selector can go into the VMCS unchanged.
 * Otherwise write 0 and leave the real reload to vmx_load_host_state().
 */
if (!(hs->fs_sel & 7))
vmcs_write16(HOST_FS_SELECTOR, hs->fs_sel);
else {
vmcs_write16(HOST_FS_SELECTOR, 0);
hs->fs_gs_ldt_reload_needed = 1;
}
hs->gs_sel = read_gs();
/* Same treatment for gs as for fs above. */
if (!(hs->gs_sel & 7))
vmcs_write16(HOST_GS_SELECTOR, hs->gs_sel);
else {
vmcs_write16(HOST_GS_SELECTOR, 0);
hs->fs_gs_ldt_reload_needed = 1;
}
/*
 * Host fs/gs bases: read from the base MSRs on 64-bit builds, derived
 * from the selectors via segment_base() otherwise.
 */
#ifdef CONFIG_X86_64
vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
#else
vmcs_writel(HOST_FS_BASE, segment_base(hs->fs_sel));
vmcs_writel(HOST_GS_BASE, segment_base(hs->gs_sel));
#endif
#ifdef CONFIG_X86_64
/* Long-mode guest: save the one host MSR slot at msr_offset_kernel_gs_base. */
if (is_long_mode(vcpu)) {
save_msrs(vcpu->host_msrs + vcpu->msr_offset_kernel_gs_base, 1);
}
#endif
/* Load the first save_nmsrs entries of the guest MSR array. */
load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs);
/* EFER is only rewritten when host and guest differ in the switched bits. */
if (msr_efer_need_save_restore(vcpu))
load_transition_efer(vcpu);
}
/*
 * Undo vmx_save_host_state(): reload the host's LDT/fs/gs selectors if
 * that was flagged as necessary, save the guest MSR values and load the
 * host ones again.
 *
 * Returns immediately unless hs->loaded is set; clears the flag.
 */
static void vmx_load_host_state(struct kvm_vcpu *vcpu)
{
struct vmx_host_state *hs = &vcpu->vmx_host_state;
if (!hs->loaded)
return;
hs->loaded = 0;
if (hs->fs_gs_ldt_reload_needed) {
load_ldt(hs->ldt_sel);
load_fs(hs->fs_sel);
/*
* If we have to reload gs, we must take care to
* preserve our gs base.
*/
/* Interrupts stay off between loading gs and rewriting its base. */
local_irq_disable();
load_gs(hs->gs_sel);
#ifdef CONFIG_X86_64
/* Put back the gs base that was recorded in the VMCS host field. */
wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
#endif
local_irq_enable();
reload_tss();
}
/* Guest values are saved before the host values are loaded over them. */
save_msrs(vcpu->guest_msrs, vcpu->save_nmsrs);
load_msrs(vcpu->host_msrs, vcpu->save_nmsrs);
/* Restore the host EFER entry only if the switched bits differ. */
if (msr_efer_need_save_restore(vcpu))
load_msrs(vcpu->host_msrs + vcpu->msr_offset_efer, 1);
}
/* /*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes * Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken. * vcpu mutex is already taken.
...@@ -242,6 +380,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu) ...@@ -242,6 +380,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
{ {
u64 phys_addr = __pa(vcpu->vmcs); u64 phys_addr = __pa(vcpu->vmcs);
int cpu; int cpu;
u64 tsc_this, delta;
cpu = get_cpu(); cpu = get_cpu();
...@@ -275,15 +414,43 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu) ...@@ -275,15 +414,43 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
/*
* Make sure the time stamp counter is monotonous.
*/
rdtscll(tsc_this);
delta = vcpu->host_tsc - tsc_this;
vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
} }
} }
static void vmx_vcpu_put(struct kvm_vcpu *vcpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{ {
vmx_load_host_state(vcpu);
kvm_put_guest_fpu(vcpu); kvm_put_guest_fpu(vcpu);
put_cpu(); put_cpu();
} }
/*
 * Mark the guest FPU as active.  No-op if it already is.
 *
 * On activation the TS bit in the VMCS GUEST_CR0 field is made to match
 * the TS bit of vcpu->cr0, and the exception bitmap is recomputed.
 */
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
{
	if (!vcpu->fpu_active) {
		vcpu->fpu_active = 1;

		/* Clear TS first, then set it again only if vcpu->cr0 has it. */
		vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
		if (vcpu->cr0 & CR0_TS_MASK)
			vmcs_set_bits(GUEST_CR0, CR0_TS_MASK);

		update_exception_bitmap(vcpu);
	}
}
/*
 * Mark the guest FPU state as inactive for this vcpu.
 *
 * Does nothing when fpu_active is already clear.  Otherwise TS is forced
 * on in the VMCS GUEST_CR0 field, regardless of vcpu->cr0, and the
 * exception bitmap is recomputed.
 */
static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
{
	if (vcpu->fpu_active) {
		vcpu->fpu_active = 0;
		vmcs_set_bits(GUEST_CR0, CR0_TS_MASK);
		update_exception_bitmap(vcpu);
	}
}
static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
{ {
vcpu_clear(vcpu); vcpu_clear(vcpu);
...@@ -331,6 +498,20 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) ...@@ -331,6 +498,20 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
INTR_INFO_VALID_MASK); INTR_INFO_VALID_MASK);
} }
/*
* Swap MSR entry in host/guest MSR entry array.
*/
void move_msr_up(struct kvm_vcpu *vcpu, int from, int to)
{
struct vmx_msr_entry tmp;
tmp = vcpu->guest_msrs[to];
vcpu->guest_msrs[to] = vcpu->guest_msrs[from];
vcpu->guest_msrs[from] = tmp;
tmp = vcpu->host_msrs[to];
vcpu->host_msrs[to] = vcpu->host_msrs[from];
vcpu->host_msrs[from] = tmp;
}
/* /*
* Set up the vmcs to automatically save and restore system * Set up the vmcs to automatically save and restore system
* msrs. Don't touch the 64-bit msrs if the guest is in legacy * msrs. Don't touch the 64-bit msrs if the guest is in legacy
...@@ -338,35 +519,41 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code) ...@@ -338,35 +519,41 @@ static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
*/ */
static void setup_msrs(struct kvm_vcpu *vcpu) static void setup_msrs(struct kvm_vcpu *vcpu)
{ {
int nr_skip, nr_good_msrs; int save_nmsrs;
if (is_long_mode(vcpu))
nr_skip = NR_BAD_MSRS;
else
nr_skip = NR_64BIT_MSRS;
nr_good_msrs = vcpu->nmsrs - nr_skip;
/* save_nmsrs = 0;
* MSR_K6_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
*/
if (find_msr_entry(vcpu, MSR_K6_STAR)) {
--nr_good_msrs;
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
if (is_long_mode(vcpu) && (vcpu->shadow_efer & EFER_SCE)) if (is_long_mode(vcpu)) {
++nr_good_msrs; int index;
#endif
index = __find_msr_index(vcpu, MSR_SYSCALL_MASK);
if (index >= 0)
move_msr_up(vcpu, index, save_nmsrs++);
index = __find_msr_index(vcpu, MSR_LSTAR);
if (index >= 0)
move_msr_up(vcpu, index, save_nmsrs++);
index = __find_msr_index(vcpu, MSR_CSTAR);
if (index >= 0)
move_msr_up(vcpu, index, save_nmsrs++);
index = __find_msr_index(vcpu, MSR_KERNEL_GS_BASE);
if (index >= 0)
move_msr_up(vcpu, index, save_nmsrs++);
/*
* MSR_K6_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
*/
index = __find_msr_index(vcpu, MSR_K6_STAR);
if ((index >= 0) && (vcpu->shadow_efer & EFER_SCE))
move_msr_up(vcpu, index, save_nmsrs++);
} }
#endif
vcpu->save_nmsrs = save_nmsrs;
vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR, #ifdef CONFIG_X86_64
virt_to_phys(vcpu->guest_msrs + nr_skip)); vcpu->msr_offset_kernel_gs_base =
vmcs_writel(VM_EXIT_MSR_STORE_ADDR, __find_msr_index(vcpu, MSR_KERNEL_GS_BASE);
virt_to_phys(vcpu->guest_msrs + nr_skip)); #endif
vmcs_writel(VM_EXIT_MSR_LOAD_ADDR, vcpu->msr_offset_efer = __find_msr_index(vcpu, MSR_EFER);
virt_to_phys(vcpu->host_msrs + nr_skip));
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
} }
/* /*
...@@ -394,23 +581,6 @@ static void guest_write_tsc(u64 guest_tsc) ...@@ -394,23 +581,6 @@ static void guest_write_tsc(u64 guest_tsc)
vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
} }
static void reload_tss(void)
{
#ifndef CONFIG_X86_64
/*
* VT restores TR but not its size. Useless.
*/
struct descriptor_table gdt;
struct segment_descriptor *descs;
get_gdt(&gdt);
descs = (void *)gdt.base;
descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
load_TR_desc();
#endif
}
/* /*
* Reads an msr value (of 'msr_index') into 'pdata'. * Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise. * Returns 0 on success, non-0 otherwise.
...@@ -470,10 +640,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) ...@@ -470,10 +640,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{ {
struct vmx_msr_entry *msr; struct vmx_msr_entry *msr;
int ret = 0;
switch (msr_index) { switch (msr_index) {
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
case MSR_EFER: case MSR_EFER:
return kvm_set_msr_common(vcpu, msr_index, data); ret = kvm_set_msr_common(vcpu, msr_index, data);
if (vcpu->vmx_host_state.loaded)
load_transition_efer(vcpu);
break;
case MSR_FS_BASE: case MSR_FS_BASE:
vmcs_writel(GUEST_FS_BASE, data); vmcs_writel(GUEST_FS_BASE, data);
break; break;
...@@ -497,14 +672,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) ...@@ -497,14 +672,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
msr = find_msr_entry(vcpu, msr_index); msr = find_msr_entry(vcpu, msr_index);
if (msr) { if (msr) {
msr->data = data; msr->data = data;
if (vcpu->vmx_host_state.loaded)
load_msrs(vcpu->guest_msrs, vcpu->save_nmsrs);
break; break;
} }
return kvm_set_msr_common(vcpu, msr_index, data); ret = kvm_set_msr_common(vcpu, msr_index, data);
msr->data = data;
break;
} }
return 0; return ret;
} }
/* /*
...@@ -530,10 +705,8 @@ static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) ...@@ -530,10 +705,8 @@ static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
{ {
unsigned long dr7 = 0x400; unsigned long dr7 = 0x400;
u32 exception_bitmap;
int old_singlestep; int old_singlestep;
exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
old_singlestep = vcpu->guest_debug.singlestep; old_singlestep = vcpu->guest_debug.singlestep;
vcpu->guest_debug.enabled = dbg->enabled; vcpu->guest_debug.enabled = dbg->enabled;
...@@ -549,13 +722,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) ...@@ -549,13 +722,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
dr7 |= 0 << (i*4+16); /* execution breakpoint */ dr7 |= 0 << (i*4+16); /* execution breakpoint */
} }
exception_bitmap |= (1u << 1); /* Trap debug exceptions */
vcpu->guest_debug.singlestep = dbg->singlestep; vcpu->guest_debug.singlestep = dbg->singlestep;
} else { } else
exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
vcpu->guest_debug.singlestep = 0; vcpu->guest_debug.singlestep = 0;
}
if (old_singlestep && !vcpu->guest_debug.singlestep) { if (old_singlestep && !vcpu->guest_debug.singlestep) {
unsigned long flags; unsigned long flags;
...@@ -565,7 +734,7 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) ...@@ -565,7 +734,7 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_RFLAGS, flags);
} }
vmcs_write32(EXCEPTION_BITMAP, exception_bitmap); update_exception_bitmap(vcpu);
vmcs_writel(GUEST_DR7, dr7); vmcs_writel(GUEST_DR7, dr7);
return 0; return 0;
...@@ -679,14 +848,6 @@ static __exit void hardware_unsetup(void) ...@@ -679,14 +848,6 @@ static __exit void hardware_unsetup(void)
free_kvm_area(); free_kvm_area();
} }
/*
 * Program the VMCS exception bitmap for the vcpu's current mode:
 * every bit set while real-mode emulation is active (vcpu->rmode.active),
 * otherwise only the page-fault bit (PF_VECTOR).
 */
static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	vmcs_write32(EXCEPTION_BITMAP,
		     vcpu->rmode.active ? ~0 : 1 << PF_VECTOR);
}
static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
{ {
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
...@@ -793,6 +954,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu) ...@@ -793,6 +954,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds); fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs); fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs); fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
init_rmode_tss(vcpu->kvm);
} }
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
...@@ -837,6 +1000,8 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) ...@@ -837,6 +1000,8 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{ {
vmx_fpu_deactivate(vcpu);
if (vcpu->rmode.active && (cr0 & CR0_PE_MASK)) if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
enter_pmode(vcpu); enter_pmode(vcpu);
...@@ -852,26 +1017,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) ...@@ -852,26 +1017,20 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
} }
#endif #endif
if (!(cr0 & CR0_TS_MASK)) {
vcpu->fpu_active = 1;
vmcs_clear_bits(EXCEPTION_BITMAP, CR0_TS_MASK);
}
vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, vmcs_writel(GUEST_CR0,
(cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON); (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
vcpu->cr0 = cr0; vcpu->cr0 = cr0;
if (!(cr0 & CR0_TS_MASK) || !(cr0 & CR0_PE_MASK))
vmx_fpu_activate(vcpu);
} }
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{ {
vmcs_writel(GUEST_CR3, cr3); vmcs_writel(GUEST_CR3, cr3);
if (vcpu->cr0 & CR0_PE_MASK)
if (!(vcpu->cr0 & CR0_TS_MASK)) { vmx_fpu_deactivate(vcpu);
vcpu->fpu_active = 0;
vmcs_set_bits(GUEST_CR0, CR0_TS_MASK);
vmcs_set_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
}
} }
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
...@@ -937,23 +1096,11 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, ...@@ -937,23 +1096,11 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
var->unusable = (ar >> 16) & 1; var->unusable = (ar >> 16) & 1;
} }
static void vmx_set_segment(struct kvm_vcpu *vcpu, static u32 vmx_segment_access_rights(struct kvm_segment *var)
struct kvm_segment *var, int seg)
{ {
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar; u32 ar;
vmcs_writel(sf->base, var->base); if (var->unusable)
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
if (vcpu->rmode.active && var->s) {
/*
* Hack real-mode segments into vm86 compatibility.
*/
if (var->base == 0xffff0000 && var->selector == 0xf000)
vmcs_writel(sf->base, 0xf0000);
ar = 0xf3;
} else if (var->unusable)
ar = 1 << 16; ar = 1 << 16;
else { else {
ar = var->type & 15; ar = var->type & 15;
...@@ -967,6 +1114,35 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, ...@@ -967,6 +1114,35 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
} }
if (ar == 0) /* a 0 value means unusable */ if (ar == 0) /* a 0 value means unusable */
ar = AR_UNUSABLE_MASK; ar = AR_UNUSABLE_MASK;
return ar;
}
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
if (vcpu->rmode.active && seg == VCPU_SREG_TR) {
vcpu->rmode.tr.selector = var->selector;
vcpu->rmode.tr.base = var->base;
vcpu->rmode.tr.limit = var->limit;
vcpu->rmode.tr.ar = vmx_segment_access_rights(var);
return;
}
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
if (vcpu->rmode.active && var->s) {
/*
* Hack real-mode segments into vm86 compatibility.
*/
if (var->base == 0xffff0000 && var->selector == 0xf000)
vmcs_writel(sf->base, 0xf0000);
ar = 0xf3;
} else
ar = vmx_segment_access_rights(var);
vmcs_write32(sf->ar_bytes, ar); vmcs_write32(sf->ar_bytes, ar);
} }
...@@ -1018,16 +1194,16 @@ static int init_rmode_tss(struct kvm* kvm) ...@@ -1018,16 +1194,16 @@ static int init_rmode_tss(struct kvm* kvm)
} }
page = kmap_atomic(p1, KM_USER0); page = kmap_atomic(p1, KM_USER0);
memset(page, 0, PAGE_SIZE); clear_page(page);
*(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
kunmap_atomic(page, KM_USER0); kunmap_atomic(page, KM_USER0);
page = kmap_atomic(p2, KM_USER0); page = kmap_atomic(p2, KM_USER0);
memset(page, 0, PAGE_SIZE); clear_page(page);
kunmap_atomic(page, KM_USER0); kunmap_atomic(page, KM_USER0);
page = kmap_atomic(p3, KM_USER0); page = kmap_atomic(p3, KM_USER0);
memset(page, 0, PAGE_SIZE); clear_page(page);
*(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0; *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
kunmap_atomic(page, KM_USER0); kunmap_atomic(page, KM_USER0);
...@@ -1066,7 +1242,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) ...@@ -1066,7 +1242,7 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
struct descriptor_table dt; struct descriptor_table dt;
int i; int i;
int ret = 0; int ret = 0;
extern asmlinkage void kvm_vmx_return(void); unsigned long kvm_vmx_return;
if (!init_rmode_tss(vcpu->kvm)) { if (!init_rmode_tss(vcpu->kvm)) {
ret = -ENOMEM; ret = -ENOMEM;
...@@ -1076,9 +1252,9 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) ...@@ -1076,9 +1252,9 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
memset(vcpu->regs, 0, sizeof(vcpu->regs)); memset(vcpu->regs, 0, sizeof(vcpu->regs));
vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val(); vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
vcpu->cr8 = 0; vcpu->cr8 = 0;
vcpu->apic_base = 0xfee00000 | vcpu->apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
/*for vcpu 0*/ MSR_IA32_APICBASE_BSP | if (vcpu == &vcpu->kvm->vcpus[0])
MSR_IA32_APICBASE_ENABLE; vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
fx_init(vcpu); fx_init(vcpu);
...@@ -1129,8 +1305,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) ...@@ -1129,8 +1305,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
/* I/O */ /* I/O */
vmcs_write64(IO_BITMAP_A, 0); vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
vmcs_write64(IO_BITMAP_B, 0); vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
guest_write_tsc(0); guest_write_tsc(0);
...@@ -1150,12 +1326,11 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) ...@@ -1150,12 +1326,11 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
CPU_BASED_HLT_EXITING /* 20.6.2 */ CPU_BASED_HLT_EXITING /* 20.6.2 */
| CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */ | CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
| CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */ | CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */
| CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */ | CPU_BASED_ACTIVATE_IO_BITMAP /* 20.6.2 */
| CPU_BASED_MOV_DR_EXITING | CPU_BASED_MOV_DR_EXITING
| CPU_BASED_USE_TSC_OFFSETING /* 21.3 */ | CPU_BASED_USE_TSC_OFFSETING /* 21.3 */
); );
vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
...@@ -1185,8 +1360,11 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) ...@@ -1185,8 +1360,11 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
get_idt(&dt); get_idt(&dt);
vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
asm ("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */ vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
...@@ -1210,10 +1388,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) ...@@ -1210,10 +1388,6 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->host_msrs[j].reserved = 0; vcpu->host_msrs[j].reserved = 0;
vcpu->host_msrs[j].data = data; vcpu->host_msrs[j].data = data;
vcpu->guest_msrs[j] = vcpu->host_msrs[j]; vcpu->guest_msrs[j] = vcpu->host_msrs[j];
#ifdef CONFIG_X86_64
if (index == MSR_KERNEL_GS_BASE)
msr_offset_kernel_gs_base = j;
#endif
++vcpu->nmsrs; ++vcpu->nmsrs;
} }
...@@ -1241,6 +1415,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) ...@@ -1241,6 +1415,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
vmx_set_efer(vcpu, 0); vmx_set_efer(vcpu, 0);
#endif #endif
vmx_fpu_activate(vcpu);
update_exception_bitmap(vcpu);
return 0; return 0;
...@@ -1365,7 +1541,11 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, ...@@ -1365,7 +1541,11 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
if (!vcpu->rmode.active) if (!vcpu->rmode.active)
return 0; return 0;
if (vec == GP_VECTOR && err_code == 0) /*
* Instruction with address size override prefix opcode 0x67
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE) if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
return 1; return 1;
return 0; return 0;
...@@ -1400,10 +1580,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1400,10 +1580,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
} }
if (is_no_device(intr_info)) { if (is_no_device(intr_info)) {
vcpu->fpu_active = 1; vmx_fpu_activate(vcpu);
vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
if (!(vcpu->cr0 & CR0_TS_MASK))
vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
return 1; return 1;
} }
...@@ -1445,8 +1622,13 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1445,8 +1622,13 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->rmode.active && if (vcpu->rmode.active &&
handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
error_code)) error_code)) {
if (vcpu->halt_request) {
vcpu->halt_request = 0;
return kvm_emulate_halt(vcpu);
}
return 1; return 1;
}
if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) { if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->exit_reason = KVM_EXIT_DEBUG;
...@@ -1595,11 +1777,10 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1595,11 +1777,10 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
break; break;
case 2: /* clts */ case 2: /* clts */
vcpu_load_rsp_rip(vcpu); vcpu_load_rsp_rip(vcpu);
vcpu->fpu_active = 1; vmx_fpu_deactivate(vcpu);
vmcs_clear_bits(EXCEPTION_BITMAP, 1 << NM_VECTOR);
vmcs_clear_bits(GUEST_CR0, CR0_TS_MASK);
vcpu->cr0 &= ~CR0_TS_MASK; vcpu->cr0 &= ~CR0_TS_MASK;
vmcs_writel(CR0_READ_SHADOW, vcpu->cr0); vmcs_writel(CR0_READ_SHADOW, vcpu->cr0);
vmx_fpu_activate(vcpu);
skip_emulated_instruction(vcpu); skip_emulated_instruction(vcpu);
return 1; return 1;
case 1: /*mov from cr*/ case 1: /*mov from cr*/
...@@ -1734,12 +1915,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, ...@@ -1734,12 +1915,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{ {
skip_emulated_instruction(vcpu); skip_emulated_instruction(vcpu);
if (vcpu->irq_summary) return kvm_emulate_halt(vcpu);
return 1;
kvm_run->exit_reason = KVM_EXIT_HLT;
++vcpu->stat.halt_exits;
return 0;
} }
static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
...@@ -1770,7 +1946,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, ...@@ -1770,7 +1946,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
}; };
static const int kvm_vmx_max_exit_handlers = static const int kvm_vmx_max_exit_handlers =
sizeof(kvm_vmx_exit_handlers) / sizeof(*kvm_vmx_exit_handlers); ARRAY_SIZE(kvm_vmx_exit_handlers);
/* /*
* The guest has exited. See if we can fix it or if we need userspace * The guest has exited. See if we can fix it or if we need userspace
...@@ -1810,61 +1986,44 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, ...@@ -1810,61 +1986,44 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
} }
/*
 * TLB flush hook for VMX.  The body is deliberately empty: no VMCS field
 * is touched here.  vmx_vcpu_run() calls this when the KVM_TLB_FLUSH bit
 * is found set in vcpu->requests.
 * NOTE(review): presumably no explicit flush is needed because the
 * guest/host transition already invalidates the TLB -- confirm.
 */
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
	/* nothing to do */
}
static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{ {
u8 fail; u8 fail;
u16 fs_sel, gs_sel, ldt_sel;
int fs_gs_ldt_reload_needed;
int r; int r;
again: preempted:
/* if (vcpu->guest_debug.enabled)
* Set host fs and gs selectors. Unfortunately, 22.2.3 does not kvm_guest_debug_pre(vcpu);
* allow segment selectors with cpl > 0 or ti == 1.
*/
fs_sel = read_fs();
gs_sel = read_gs();
ldt_sel = read_ldt();
fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
if (!fs_gs_ldt_reload_needed) {
vmcs_write16(HOST_FS_SELECTOR, fs_sel);
vmcs_write16(HOST_GS_SELECTOR, gs_sel);
} else {
vmcs_write16(HOST_FS_SELECTOR, 0);
vmcs_write16(HOST_GS_SELECTOR, 0);
}
#ifdef CONFIG_X86_64
vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
#else
vmcs_writel(HOST_FS_BASE, segment_base(fs_sel));
vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
#endif
again:
if (!vcpu->mmio_read_completed) if (!vcpu->mmio_read_completed)
do_interrupt_requests(vcpu, kvm_run); do_interrupt_requests(vcpu, kvm_run);
if (vcpu->guest_debug.enabled) vmx_save_host_state(vcpu);
kvm_guest_debug_pre(vcpu);
kvm_load_guest_fpu(vcpu); kvm_load_guest_fpu(vcpu);
r = kvm_mmu_reload(vcpu);
if (unlikely(r))
goto out;
/* /*
* Loading guest fpu may have cleared host cr0.ts * Loading guest fpu may have cleared host cr0.ts
*/ */
vmcs_writel(HOST_CR0, read_cr0()); vmcs_writel(HOST_CR0, read_cr0());
#ifdef CONFIG_X86_64 local_irq_disable();
if (is_long_mode(vcpu)) {
save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1); vcpu->guest_mode = 1;
load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); if (vcpu->requests)
} if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
#endif vmx_flush_tlb(vcpu);
asm ( asm (
/* Store host registers */ /* Store host registers */
"pushf \n\t"
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
"push %%rax; push %%rbx; push %%rdx;" "push %%rax; push %%rbx; push %%rdx;"
"push %%rsi; push %%rdi; push %%rbp;" "push %%rsi; push %%rdi; push %%rbp;"
...@@ -1909,12 +2068,11 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1909,12 +2068,11 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */ "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
#endif #endif
/* Enter guest mode */ /* Enter guest mode */
"jne launched \n\t" "jne .Llaunched \n\t"
ASM_VMX_VMLAUNCH "\n\t" ASM_VMX_VMLAUNCH "\n\t"
"jmp kvm_vmx_return \n\t" "jmp .Lkvm_vmx_return \n\t"
"launched: " ASM_VMX_VMRESUME "\n\t" ".Llaunched: " ASM_VMX_VMRESUME "\n\t"
".globl kvm_vmx_return \n\t" ".Lkvm_vmx_return: "
"kvm_vmx_return: "
/* Save guest registers, load host registers, keep flags */ /* Save guest registers, load host registers, keep flags */
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
"xchg %3, (%%rsp) \n\t" "xchg %3, (%%rsp) \n\t"
...@@ -1957,7 +2115,6 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1957,7 +2115,6 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
"pop %%ecx; popa \n\t" "pop %%ecx; popa \n\t"
#endif #endif
"setbe %0 \n\t" "setbe %0 \n\t"
"popf \n\t"
: "=q" (fail) : "=q" (fail)
: "r"(vcpu->launched), "d"((unsigned long)HOST_RSP), : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP),
"c"(vcpu), "c"(vcpu),
...@@ -1981,84 +2138,61 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -1981,84 +2138,61 @@ static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
[cr2]"i"(offsetof(struct kvm_vcpu, cr2)) [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
: "cc", "memory" ); : "cc", "memory" );
/* vcpu->guest_mode = 0;
* Reload segment selectors ASAP. (it's needed for a functional local_irq_enable();
* kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64
* relies on having 0 in %gs for the CPU PDA to work.)
*/
if (fs_gs_ldt_reload_needed) {
load_ldt(ldt_sel);
load_fs(fs_sel);
/*
* If we have to reload gs, we must take care to
* preserve our gs base.
*/
local_irq_disable();
load_gs(gs_sel);
#ifdef CONFIG_X86_64
wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
#endif
local_irq_enable();
reload_tss();
}
++vcpu->stat.exits; ++vcpu->stat.exits;
#ifdef CONFIG_X86_64
if (is_long_mode(vcpu)) {
save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
}
#endif
vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
if (fail) { if (unlikely(fail)) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason kvm_run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR); = vmcs_read32(VM_INSTRUCTION_ERROR);
r = 0; r = 0;
} else { goto out;
/* }
* Profile KVM exit RIPs: /*
*/ * Profile KVM exit RIPs:
if (unlikely(prof_on == KVM_PROFILING)) */
profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); if (unlikely(prof_on == KVM_PROFILING))
profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
vcpu->launched = 1;
r = kvm_handle_exit(kvm_run, vcpu); vcpu->launched = 1;
if (r > 0) { r = kvm_handle_exit(kvm_run, vcpu);
/* Give scheduler a change to reschedule. */ if (r > 0) {
if (signal_pending(current)) { /* Give scheduler a change to reschedule. */
++vcpu->stat.signal_exits; if (signal_pending(current)) {
post_kvm_run_save(vcpu, kvm_run); r = -EINTR;
kvm_run->exit_reason = KVM_EXIT_INTR; kvm_run->exit_reason = KVM_EXIT_INTR;
return -EINTR; ++vcpu->stat.signal_exits;
} goto out;
}
if (dm_request_for_irq_injection(vcpu, kvm_run)) {
++vcpu->stat.request_irq_exits; if (dm_request_for_irq_injection(vcpu, kvm_run)) {
post_kvm_run_save(vcpu, kvm_run); r = -EINTR;
kvm_run->exit_reason = KVM_EXIT_INTR; kvm_run->exit_reason = KVM_EXIT_INTR;
return -EINTR; ++vcpu->stat.request_irq_exits;
} goto out;
}
kvm_resched(vcpu); if (!need_resched()) {
++vcpu->stat.light_exits;
goto again; goto again;
} }
} }
out:
if (r > 0) {
kvm_resched(vcpu);
goto preempted;
}
post_kvm_run_save(vcpu, kvm_run); post_kvm_run_save(vcpu, kvm_run);
return r; return r;
} }
/*
 * TLB flush hook for VMX: read the current GUEST_CR3 value from the VMCS
 * and write the same value straight back.
 */
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
	unsigned long guest_cr3 = vmcs_readl(GUEST_CR3);

	vmcs_writel(GUEST_CR3, guest_cr3);
}
static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
unsigned long addr, unsigned long addr,
u32 err_code) u32 err_code)
...@@ -2122,7 +2256,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) ...@@ -2122,7 +2256,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
vmcs_clear(vmcs); vmcs_clear(vmcs);
vcpu->vmcs = vmcs; vcpu->vmcs = vmcs;
vcpu->launched = 0; vcpu->launched = 0;
vcpu->fpu_active = 1;
return 0; return 0;
...@@ -2188,11 +2321,50 @@ static struct kvm_arch_ops vmx_arch_ops = { ...@@ -2188,11 +2321,50 @@ static struct kvm_arch_ops vmx_arch_ops = {
static int __init vmx_init(void) static int __init vmx_init(void)
{ {
return kvm_init_arch(&vmx_arch_ops, THIS_MODULE); void *iova;
int r;
vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
if (!vmx_io_bitmap_a)
return -ENOMEM;
vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
if (!vmx_io_bitmap_b) {
r = -ENOMEM;
goto out;
}
/*
* Allow direct access to the PC debug port (it is often used for I/O
* delays, but the vmexits simply slow things down).
*/
iova = kmap(vmx_io_bitmap_a);
memset(iova, 0xff, PAGE_SIZE);
clear_bit(0x80, iova);
kunmap(vmx_io_bitmap_a);
iova = kmap(vmx_io_bitmap_b);
memset(iova, 0xff, PAGE_SIZE);
kunmap(vmx_io_bitmap_b);
r = kvm_init_arch(&vmx_arch_ops, THIS_MODULE);
if (r)
goto out1;
return 0;
out1:
__free_page(vmx_io_bitmap_b);
out:
__free_page(vmx_io_bitmap_a);
return r;
} }
static void __exit vmx_exit(void) static void __exit vmx_exit(void)
{ {
__free_page(vmx_io_bitmap_b);
__free_page(vmx_io_bitmap_a);
kvm_exit_arch(); kvm_exit_arch();
} }
......
...@@ -98,8 +98,11 @@ static u8 opcode_table[256] = { ...@@ -98,8 +98,11 @@ static u8 opcode_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x40 - 0x4F */ /* 0x40 - 0x4F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x50 - 0x5F */ /* 0x50 - 0x57 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x58 - 0x5F */
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
/* 0x60 - 0x6F */ /* 0x60 - 0x6F */
0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
...@@ -128,9 +131,9 @@ static u8 opcode_table[256] = { ...@@ -128,9 +131,9 @@ static u8 opcode_table[256] = {
/* 0xB0 - 0xBF */ /* 0xB0 - 0xBF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xC0 - 0xC7 */ /* 0xC0 - 0xC7 */
ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, 0, ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
0, 0, ByteOp | DstMem | SrcImm | ModRM | Mov, 0, ImplicitOps, 0, 0,
DstMem | SrcImm | ModRM | Mov, ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
/* 0xC8 - 0xCF */ /* 0xC8 - 0xCF */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xD0 - 0xD7 */ /* 0xD0 - 0xD7 */
...@@ -143,7 +146,8 @@ static u8 opcode_table[256] = { ...@@ -143,7 +146,8 @@ static u8 opcode_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xF0 - 0xF7 */ /* 0xF0 - 0xF7 */
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, ImplicitOps, 0,
ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
/* 0xF8 - 0xFF */ /* 0xF8 - 0xFF */
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
...@@ -152,7 +156,7 @@ static u8 opcode_table[256] = { ...@@ -152,7 +156,7 @@ static u8 opcode_table[256] = {
static u16 twobyte_table[256] = { static u16 twobyte_table[256] = {
/* 0x00 - 0x0F */ /* 0x00 - 0x0F */
0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
/* 0x10 - 0x1F */ /* 0x10 - 0x1F */
0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
/* 0x20 - 0x2F */ /* 0x20 - 0x2F */
...@@ -481,6 +485,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -481,6 +485,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
int mode = ctxt->mode; int mode = ctxt->mode;
unsigned long modrm_ea; unsigned long modrm_ea;
int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0; int use_modrm_ea, index_reg = 0, base_reg = 0, scale, rip_relative = 0;
int no_wb = 0;
/* Shadow copy of register state. Committed on successful emulation. */ /* Shadow copy of register state. Committed on successful emulation. */
unsigned long _regs[NR_VCPU_REGS]; unsigned long _regs[NR_VCPU_REGS];
...@@ -1047,7 +1052,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1047,7 +1052,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
_regs[VCPU_REGS_RSP]), _regs[VCPU_REGS_RSP]),
&dst.val, dst.bytes, ctxt)) != 0) &dst.val, dst.bytes, ctxt)) != 0)
goto done; goto done;
dst.val = dst.orig_val; /* skanky: disable writeback */ no_wb = 1;
break; break;
default: default:
goto cannot_emulate; goto cannot_emulate;
...@@ -1056,7 +1061,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1056,7 +1061,7 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
} }
writeback: writeback:
if ((d & Mov) || (dst.orig_val != dst.val)) { if (!no_wb) {
switch (dst.type) { switch (dst.type) {
case OP_REG: case OP_REG:
/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
...@@ -1149,6 +1154,23 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1149,6 +1154,23 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
case 0xae ... 0xaf: /* scas */ case 0xae ... 0xaf: /* scas */
DPRINTF("Urk! I don't handle SCAS.\n"); DPRINTF("Urk! I don't handle SCAS.\n");
goto cannot_emulate; goto cannot_emulate;
case 0xf4: /* hlt */
ctxt->vcpu->halt_request = 1;
goto done;
case 0xc3: /* ret */
dst.ptr = &_eip;
goto pop_instruction;
case 0x58 ... 0x5f: /* pop reg */
dst.ptr = (unsigned long *)&_regs[b & 0x7];
pop_instruction:
if ((rc = ops->read_std(register_address(ctxt->ss_base,
_regs[VCPU_REGS_RSP]), dst.ptr, op_bytes, ctxt)) != 0)
goto done;
register_address_increment(_regs[VCPU_REGS_RSP], op_bytes);
no_wb = 1; /* Disable writeback. */
break;
} }
goto writeback; goto writeback;
...@@ -1302,8 +1324,10 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ...@@ -1302,8 +1324,10 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
twobyte_special_insn: twobyte_special_insn:
/* Disable writeback. */ /* Disable writeback. */
dst.orig_val = dst.val; no_wb = 1;
switch (b) { switch (b) {
case 0x09: /* wbinvd */
break;
case 0x0d: /* GrpP (prefetch) */ case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */ case 0x18: /* Grp16 (prefetch/nop) */
break; break;
......
...@@ -139,6 +139,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, ...@@ -139,6 +139,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile,
put_filp(file); put_filp(file);
return error; return error;
} }
EXPORT_SYMBOL_GPL(anon_inode_getfd);
/* /*
* A single inode exists for all anon_inode files. Contrary to pipes, * A single inode exists for all anon_inode files. Contrary to pipes,
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
#define HPFS_SUPER_MAGIC 0xf995e849 #define HPFS_SUPER_MAGIC 0xf995e849
#define ISOFS_SUPER_MAGIC 0x9660 #define ISOFS_SUPER_MAGIC 0x9660
#define JFFS2_SUPER_MAGIC 0x72b6 #define JFFS2_SUPER_MAGIC 0x72b6
#define KVMFS_SUPER_MAGIC 0x19700426
#define ANON_INODE_FS_MAGIC 0x09041934 #define ANON_INODE_FS_MAGIC 0x09041934
#define MINIX_SUPER_MAGIC 0x137F /* original minix fs */ #define MINIX_SUPER_MAGIC 0x137F /* original minix fs */
......
...@@ -196,6 +196,8 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, ...@@ -196,6 +196,8 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
#define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */ #define CPU_DEAD 0x0007 /* CPU (unsigned)v dead */
#define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */ #define CPU_LOCK_ACQUIRE 0x0008 /* Acquire all hotcpu locks */
#define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */ #define CPU_LOCK_RELEASE 0x0009 /* Release all hotcpu locks */
#define CPU_DYING 0x000A /* CPU (unsigned)v not running any task,
* not handling interrupts, soon dead */
/* Used for CPU hotplug events occuring while tasks are frozen due to a suspend /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
* operation in progress * operation in progress
...@@ -208,6 +210,7 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, ...@@ -208,6 +210,7 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
#define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN) #define CPU_DOWN_PREPARE_FROZEN (CPU_DOWN_PREPARE | CPU_TASKS_FROZEN)
#define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN) #define CPU_DOWN_FAILED_FROZEN (CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
#define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN) #define CPU_DEAD_FROZEN (CPU_DEAD | CPU_TASKS_FROZEN)
#define CPU_DYING_FROZEN (CPU_DYING | CPU_TASKS_FROZEN)
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
#endif /* _LINUX_NOTIFIER_H */ #endif /* _LINUX_NOTIFIER_H */
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
*/ */
#include <linux/errno.h> #include <linux/errno.h>
#include <asm/system.h>
extern void cpu_idle(void); extern void cpu_idle(void);
...@@ -102,7 +103,11 @@ static inline void smp_send_reschedule(int cpu) { } ...@@ -102,7 +103,11 @@ static inline void smp_send_reschedule(int cpu) { }
static inline int smp_call_function_single(int cpuid, void (*func) (void *info), static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
void *info, int retry, int wait) void *info, int retry, int wait)
{ {
return -EBUSY; WARN_ON(cpuid != 0);
local_irq_disable();
func(info);
local_irq_enable();
return 0;
} }
#endif /* !SMP */ #endif /* !SMP */
......
...@@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu) ...@@ -103,11 +103,19 @@ static inline void check_for_tasks(int cpu)
write_unlock_irq(&tasklist_lock); write_unlock_irq(&tasklist_lock);
} }
struct take_cpu_down_param {
unsigned long mod;
void *hcpu;
};
/* Take this CPU down. */ /* Take this CPU down. */
static int take_cpu_down(void *unused) static int take_cpu_down(void *_param)
{ {
struct take_cpu_down_param *param = _param;
int err; int err;
raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
param->hcpu);
/* Ensure this CPU doesn't handle any more interrupts. */ /* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable(); err = __cpu_disable();
if (err < 0) if (err < 0)
...@@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) ...@@ -127,6 +135,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
cpumask_t old_allowed, tmp; cpumask_t old_allowed, tmp;
void *hcpu = (void *)(long)cpu; void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
.mod = mod,
.hcpu = hcpu,
};
if (num_online_cpus() == 1) if (num_online_cpus() == 1)
return -EBUSY; return -EBUSY;
...@@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) ...@@ -153,7 +165,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
set_cpus_allowed(current, tmp); set_cpus_allowed(current, tmp);
mutex_lock(&cpu_bitmask_lock); mutex_lock(&cpu_bitmask_lock);
p = __stop_machine_run(take_cpu_down, NULL, cpu); p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
mutex_unlock(&cpu_bitmask_lock); mutex_unlock(&cpu_bitmask_lock);
if (IS_ERR(p) || cpu_online(cpu)) { if (IS_ERR(p) || cpu_online(cpu)) {
......
...@@ -2138,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void) ...@@ -2138,6 +2138,9 @@ static void common_cpu_mem_hotplug_unplug(void)
static int cpuset_handle_cpuhp(struct notifier_block *nb, static int cpuset_handle_cpuhp(struct notifier_block *nb,
unsigned long phase, void *cpu) unsigned long phase, void *cpu)
{ {
if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
return NOTIFY_DONE;
common_cpu_mem_hotplug_unplug(); common_cpu_mem_hotplug_unplug();
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment