Commit 08d19f51 authored by Linus Torvalds

Merge branch 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm

* 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (134 commits)
  KVM: ia64: Add intel iommu support for guests.
  KVM: ia64: add directed mmio range support for kvm guests
  KVM: ia64: Make pmt table be able to hold physical mmio entries.
  KVM: Move irqchip_in_kernel() from ioapic.h to irq.h
  KVM: Separate irq ack notification out of arch/x86/kvm/irq.c
  KVM: Change is_mmio_pfn to kvm_is_mmio_pfn, and make it common for all archs
  KVM: Move device assignment logic to common code
  KVM: Device Assignment: Move vtd.c from arch/x86/kvm/ to virt/kvm/
  KVM: VMX: enable invlpg exiting if EPT is disabled
  KVM: x86: Silence various LAPIC-related host kernel messages
  KVM: Device Assignment: Map mmio pages into VT-d page table
  KVM: PIC: enhance IPI avoidance
  KVM: MMU: add "oos_shadow" parameter to disable oos
  KVM: MMU: speed up mmu_unsync_walk
  KVM: MMU: out of sync shadow core
  KVM: MMU: mmu_convert_notrap helper
  KVM: MMU: awareness of new kvm_mmu_zap_page behaviour
  KVM: MMU: mmu_parent_walk
  KVM: x86: trap invlpg
  KVM: MMU: sync roots on mmu reload
  ...
parents 1c95e1b6 2381ad24
......@@ -2448,7 +2448,14 @@ S: Supported
KERNEL VIRTUAL MACHINE (KVM)
P: Avi Kivity
M: avi@qumranet.com
M: avi@redhat.com
L: kvm@vger.kernel.org
W: http://kvm.qumranet.com
S: Supported
KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V
P: Joerg Roedel
M: joerg.roedel@amd.com
L: kvm@vger.kernel.org
W: http://kvm.qumranet.com
S: Supported
......
......@@ -132,7 +132,7 @@
#define GPFN_IOSAPIC (4UL << 60) /* IOSAPIC base */
#define GPFN_LEGACY_IO (5UL << 60) /* Legacy I/O base */
#define GPFN_GFW (6UL << 60) /* Guest Firmware */
#define GPFN_HIGH_MMIO (7UL << 60) /* High MMIO range */
#define GPFN_PHYS_MMIO (7UL << 60) /* Directed MMIO Range */
#define GPFN_IO_MASK (7UL << 60) /* Guest pfn is I/O type */
#define GPFN_INV_MASK (1UL << 63) /* Guest pfn is invalid */
......@@ -413,6 +413,10 @@ struct kvm_arch {
struct kvm_ioapic *vioapic;
struct kvm_vm_stat stat;
struct kvm_sal_data rdv_sal_data;
struct list_head assigned_dev_head;
struct dmar_domain *intel_iommu_domain;
struct hlist_head irq_ack_notifier_list;
};
union cpuid3_t {
......
......@@ -46,4 +46,6 @@ config KVM_INTEL
config KVM_TRACE
bool
source drivers/virtio/Kconfig
endif # VIRTUALIZATION
......@@ -44,7 +44,11 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
coalesced_mmio.o)
coalesced_mmio.o irq_comm.o)
ifeq ($(CONFIG_DMAR),y)
common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
endif
kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
obj-$(CONFIG_KVM) += kvm.o
......
/*
* irq.h: In-kernel interrupt controller related definitions
* Copyright (c) 2008, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
* Place - Suite 330, Boston, MA 02111-1307 USA.
*
* Authors:
* Xiantao Zhang <xiantao.zhang@intel.com>
*
*/
#ifndef __IRQ_H
#define __IRQ_H
static inline int irqchip_in_kernel(struct kvm *kvm)
{
return 1;
}
#endif
......@@ -31,6 +31,7 @@
#include <linux/bitops.h>
#include <linux/hrtimer.h>
#include <linux/uaccess.h>
#include <linux/intel-iommu.h>
#include <asm/pgtable.h>
#include <asm/gcc_intrin.h>
......@@ -45,6 +46,7 @@
#include "iodev.h"
#include "ioapic.h"
#include "lapic.h"
#include "irq.h"
static unsigned long kvm_vmm_base;
static unsigned long kvm_vsa_base;
......@@ -179,12 +181,16 @@ int kvm_dev_ioctl_check_extension(long ext)
switch (ext) {
case KVM_CAP_IRQCHIP:
case KVM_CAP_USER_MEMORY:
case KVM_CAP_MP_STATE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
r = KVM_COALESCED_MMIO_PAGE_OFFSET;
break;
case KVM_CAP_IOMMU:
r = intel_iommu_found();
break;
default:
r = 0;
}
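The new KVM_CAP_IOMMU case lets userspace probe for device-assignment/VT-d support with the generic KVM_CHECK_EXTENSION ioctl on the /dev/kvm descriptor. A minimal sketch, assuming an already-opened kvm_fd (the helper name is ours):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns nonzero when the host reported an Intel IOMMU via intel_iommu_found(). */
static int host_has_kvm_iommu(int kvm_fd)
{
	return ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IOMMU) > 0;
}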
......@@ -771,6 +777,7 @@ static void kvm_init_vm(struct kvm *kvm)
*/
kvm_build_io_pmt(kvm);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
}
struct kvm *kvm_arch_create_vm(void)
......@@ -1334,6 +1341,10 @@ static void kvm_release_vm_pages(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvm_iommu_unmap_guest(kvm);
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
kvm_free_all_assigned_devices(kvm);
#endif
kfree(kvm->arch.vioapic);
kvm_release_vm_pages(kvm);
kvm_free_physmem(kvm);
......@@ -1435,17 +1446,24 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
int user_alloc)
{
unsigned long i;
struct page *page;
unsigned long pfn;
int npages = mem->memory_size >> PAGE_SHIFT;
struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
unsigned long base_gfn = memslot->base_gfn;
for (i = 0; i < npages; i++) {
page = gfn_to_page(kvm, base_gfn + i);
kvm_set_pmt_entry(kvm, base_gfn + i,
page_to_pfn(page) << PAGE_SHIFT,
_PAGE_AR_RWX|_PAGE_MA_WB);
memslot->rmap[i] = (unsigned long)page;
pfn = gfn_to_pfn(kvm, base_gfn + i);
if (!kvm_is_mmio_pfn(pfn)) {
kvm_set_pmt_entry(kvm, base_gfn + i,
pfn << PAGE_SHIFT,
_PAGE_AR_RWX | _PAGE_MA_WB);
memslot->rmap[i] = (unsigned long)pfn_to_page(pfn);
} else {
kvm_set_pmt_entry(kvm, base_gfn + i,
GPFN_PHYS_MMIO | (pfn << PAGE_SHIFT),
_PAGE_MA_UC);
memslot->rmap[i] = 0;
}
}
return 0;
......@@ -1789,11 +1807,43 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
return -EINVAL;
vcpu_load(vcpu);
mp_state->mp_state = vcpu->arch.mp_state;
vcpu_put(vcpu);
return 0;
}
static int vcpu_reset(struct kvm_vcpu *vcpu)
{
int r;
long psr;
local_irq_save(psr);
r = kvm_insert_vmm_mapping(vcpu);
if (r)
goto fail;
vcpu->arch.launched = 0;
kvm_arch_vcpu_uninit(vcpu);
r = kvm_arch_vcpu_init(vcpu);
if (r)
goto fail;
kvm_purge_vmm_mapping(vcpu);
r = 0;
fail:
local_irq_restore(psr);
return r;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
return -EINVAL;
int r = 0;
vcpu_load(vcpu);
vcpu->arch.mp_state = mp_state->mp_state;
if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
r = vcpu_reset(vcpu);
vcpu_put(vcpu);
return r;
}
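These hooks back the generic KVM_GET_MP_STATE / KVM_SET_MP_STATE vcpu ioctls that ia64 now advertises through KVM_CAP_MP_STATE above. A hedged userspace sketch (vcpu_fd is an assumed, already-created vcpu descriptor); writing KVM_MP_STATE_UNINITIALIZED goes through the new vcpu_reset() path:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int reset_vcpu_mp_state(int vcpu_fd)
{
	struct kvm_mp_state st;

	if (ioctl(vcpu_fd, KVM_GET_MP_STATE, &st) < 0)	/* read current state */
		return -1;
	st.mp_state = KVM_MP_STATE_UNINITIALIZED;	/* request a vcpu reset */
	return ioctl(vcpu_fd, KVM_SET_MP_STATE, &st);
}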
......@@ -50,27 +50,18 @@
#define PAL_VSA_SYNC_READ \
/* begin to call pal vps sync_read */ \
{.mii; \
add r25 = VMM_VPD_BASE_OFFSET, r21; \
adds r20 = VMM_VCPU_VSA_BASE_OFFSET, r21; /* entry point */ \
nop 0x0; \
mov r24=ip; \
;; \
} \
{.mmb \
add r24=0x20, r24; \
ld8 r25 = [r25]; /* read vpd base */ \
ld8 r20 = [r20]; \
;; \
add r20 = PAL_VPS_SYNC_READ,r20; \
;; \
{ .mii; \
nop 0x0; \
mov r24 = ip; \
mov b0 = r20; \
br.cond.sptk kvm_vps_sync_read; /*call the service*/ \
;; \
}; \
{ .mmb; \
add r24 = 0x20, r24; \
nop 0x0; \
br.cond.sptk b0; /* call the service */ \
;; \
};
#define KVM_MINSTATE_GET_CURRENT(reg) mov reg=r21
......
/*
* arch/ia64/vmx/optvfault.S
* arch/ia64/kvm/optvfault.S
* optimize virtualization fault handler
*
* Copyright (C) 2006 Intel Co
* Xuefei Xu (Anthony Xu) <anthony.xu@intel.com>
* Copyright (C) 2008 Intel Co
* Add the support for Tukwila processors.
* Xiantao Zhang <xiantao.zhang@intel.com>
*/
#include <asm/asmmacro.h>
......@@ -20,6 +23,98 @@
#define ACCE_MOV_TO_PSR
#define ACCE_THASH
#define VMX_VPS_SYNC_READ \
add r16=VMM_VPD_BASE_OFFSET,r21; \
mov r17 = b0; \
mov r18 = r24; \
mov r19 = r25; \
mov r20 = r31; \
;; \
{.mii; \
ld8 r16 = [r16]; \
nop 0x0; \
mov r24 = ip; \
;; \
}; \
{.mmb; \
add r24=0x20, r24; \
mov r25 =r16; \
br.sptk.many kvm_vps_sync_read; \
}; \
mov b0 = r17; \
mov r24 = r18; \
mov r25 = r19; \
mov r31 = r20
ENTRY(kvm_vps_entry)
adds r29 = VMM_VCPU_VSA_BASE_OFFSET,r21
;;
ld8 r29 = [r29]
;;
add r29 = r29, r30
;;
mov b0 = r29
br.sptk.many b0
END(kvm_vps_entry)
/*
* Inputs:
* r24 : return address
* r25 : vpd
* r29 : scratch
*
*/
GLOBAL_ENTRY(kvm_vps_sync_read)
movl r30 = PAL_VPS_SYNC_READ
;;
br.sptk.many kvm_vps_entry
END(kvm_vps_sync_read)
/*
* Inputs:
* r24 : return address
* r25 : vpd
* r29 : scratch
*
*/
GLOBAL_ENTRY(kvm_vps_sync_write)
movl r30 = PAL_VPS_SYNC_WRITE
;;
br.sptk.many kvm_vps_entry
END(kvm_vps_sync_write)
/*
* Inputs:
* r23 : pr
* r24 : guest b0
* r25 : vpd
*
*/
GLOBAL_ENTRY(kvm_vps_resume_normal)
movl r30 = PAL_VPS_RESUME_NORMAL
;;
mov pr=r23,-2
br.sptk.many kvm_vps_entry
END(kvm_vps_resume_normal)
/*
* Inputs:
* r23 : pr
* r24 : guest b0
* r25 : vpd
* r17 : isr
*/
GLOBAL_ENTRY(kvm_vps_resume_handler)
movl r30 = PAL_VPS_RESUME_HANDLER
;;
ld8 r27=[r25]
shr r17=r17,IA64_ISR_IR_BIT
;;
dep r27=r17,r27,63,1 // bit 63 of r27 indicates whether CFLE is enabled
mov pr=r23,-2
br.sptk.many kvm_vps_entry
END(kvm_vps_resume_handler)
//mov r1=ar3
GLOBAL_ENTRY(kvm_asm_mov_from_ar)
#ifndef ACCE_MOV_FROM_AR
......@@ -157,11 +252,11 @@ GLOBAL_ENTRY(kvm_asm_rsm)
#ifndef ACCE_RSM
br.many kvm_virtualization_fault_back
#endif
add r16=VMM_VPD_BASE_OFFSET,r21
VMX_VPS_SYNC_READ
;;
extr.u r26=r25,6,21
extr.u r27=r25,31,2
;;
ld8 r16=[r16]
extr.u r28=r25,36,1
dep r26=r27,r26,21,2
;;
......@@ -196,7 +291,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
tbit.nz p6,p0=r23,0
;;
tbit.z.or p6,p0=r26,IA64_PSR_DT_BIT
(p6) br.dptk kvm_resume_to_guest
(p6) br.dptk kvm_resume_to_guest_with_sync
;;
add r26=VMM_VCPU_META_RR0_OFFSET,r21
add r27=VMM_VCPU_META_RR0_OFFSET+8,r21
......@@ -212,7 +307,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
mov rr[r28]=r27
;;
srlz.d
br.many kvm_resume_to_guest
br.many kvm_resume_to_guest_with_sync
END(kvm_asm_rsm)
......@@ -221,11 +316,11 @@ GLOBAL_ENTRY(kvm_asm_ssm)
#ifndef ACCE_SSM
br.many kvm_virtualization_fault_back
#endif
add r16=VMM_VPD_BASE_OFFSET,r21
VMX_VPS_SYNC_READ
;;
extr.u r26=r25,6,21
extr.u r27=r25,31,2
;;
ld8 r16=[r16]
extr.u r28=r25,36,1
dep r26=r27,r26,21,2
;; //r26 is imm24
......@@ -271,7 +366,7 @@ kvm_asm_ssm_1:
tbit.nz p6,p0=r29,IA64_PSR_I_BIT
;;
tbit.z.or p6,p0=r19,IA64_PSR_I_BIT
(p6) br.dptk kvm_resume_to_guest
(p6) br.dptk kvm_resume_to_guest_with_sync
;;
add r29=VPD_VTPR_START_OFFSET,r16
add r30=VPD_VHPI_START_OFFSET,r16
......@@ -286,7 +381,7 @@ kvm_asm_ssm_1:
;;
cmp.gt p6,p0=r30,r17
(p6) br.dpnt.few kvm_asm_dispatch_vexirq
br.many kvm_resume_to_guest
br.many kvm_resume_to_guest_with_sync
END(kvm_asm_ssm)
......@@ -295,10 +390,9 @@ GLOBAL_ENTRY(kvm_asm_mov_to_psr)
#ifndef ACCE_MOV_TO_PSR
br.many kvm_virtualization_fault_back
#endif
add r16=VMM_VPD_BASE_OFFSET,r21
extr.u r26=r25,13,7 //r2
VMX_VPS_SYNC_READ
;;
ld8 r16=[r16]
extr.u r26=r25,13,7 //r2
addl r20=@gprel(asm_mov_from_reg),gp
;;
adds r30=kvm_asm_mov_to_psr_back-asm_mov_from_reg,r20
......@@ -374,7 +468,7 @@ kvm_asm_mov_to_psr_1:
;;
tbit.nz.or p6,p0=r17,IA64_PSR_I_BIT
tbit.z.or p6,p0=r30,IA64_PSR_I_BIT
(p6) br.dpnt.few kvm_resume_to_guest
(p6) br.dpnt.few kvm_resume_to_guest_with_sync
;;
add r29=VPD_VTPR_START_OFFSET,r16
add r30=VPD_VHPI_START_OFFSET,r16
......@@ -389,13 +483,29 @@ kvm_asm_mov_to_psr_1:
;;
cmp.gt p6,p0=r30,r17
(p6) br.dpnt.few kvm_asm_dispatch_vexirq
br.many kvm_resume_to_guest
br.many kvm_resume_to_guest_with_sync
END(kvm_asm_mov_to_psr)
ENTRY(kvm_asm_dispatch_vexirq)
//increment iip
mov r17 = b0
mov r18 = r31
{.mii
add r25=VMM_VPD_BASE_OFFSET,r21
nop 0x0
mov r24 = ip
;;
}
{.mmb
add r24 = 0x20, r24
ld8 r25 = [r25]
br.sptk.many kvm_vps_sync_write
}
mov b0 =r17
mov r16=cr.ipsr
mov r31 = r18
mov r19 = 37
;;
extr.u r17=r16,IA64_PSR_RI_BIT,2
tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1
......@@ -435,25 +545,31 @@ GLOBAL_ENTRY(kvm_asm_thash)
;;
kvm_asm_thash_back1:
shr.u r23=r19,61 // get RR number
adds r25=VMM_VCPU_VRR0_OFFSET,r21 // get vcpu->arch.vrr[0]'s addr
adds r28=VMM_VCPU_VRR0_OFFSET,r21 // get vcpu->arch.vrr[0]'s addr
adds r16=VMM_VPD_VPTA_OFFSET,r16 // get vpta
;;
shladd r27=r23,3,r25 // get vcpu->arch.vrr[r23]'s addr
shladd r27=r23,3,r28 // get vcpu->arch.vrr[r23]'s addr
ld8 r17=[r16] // get PTA
mov r26=1
;;
extr.u r29=r17,2,6 // get pta.size
ld8 r25=[r27] // get vcpu->arch.vrr[r23]'s value
extr.u r29=r17,2,6 // get pta.size
ld8 r28=[r27] // get vcpu->arch.vrr[r23]'s value
;;
extr.u r25=r25,2,6 // get rr.ps
mov b0=r24
//Fallback to C if pta.vf is set
tbit.nz p6,p0=r17, 8
;;
(p6) mov r24=EVENT_THASH
(p6) br.cond.dpnt.many kvm_virtualization_fault_back
extr.u r28=r28,2,6 // get rr.ps
shl r22=r26,r29 // 1UL << pta.size
;;
shr.u r23=r19,r25 // vaddr >> rr.ps
shr.u r23=r19,r28 // vaddr >> rr.ps
adds r26=3,r29 // pta.size + 3
shl r27=r17,3 // pta << 3
;;
shl r23=r23,3 // (vaddr >> rr.ps) << 3
shr.u r27=r27,r26 // (pta << 3) >> (pta.size+3)
shr.u r27=r27,r26 // (pta << 3) >> (pta.size+3)
movl r16=7<<61
;;
adds r22=-1,r22 // (1UL << pta.size) - 1
......@@ -724,6 +840,29 @@ END(asm_mov_from_reg)
* r31: pr
* r24: b0
*/
ENTRY(kvm_resume_to_guest_with_sync)
adds r19=VMM_VPD_BASE_OFFSET,r21
mov r16 = r31
mov r17 = r24
;;
{.mii
ld8 r25 =[r19]
nop 0x0
mov r24 = ip
;;
}
{.mmb
add r24 =0x20, r24
nop 0x0
br.sptk.many kvm_vps_sync_write
}
mov r31 = r16
mov r24 =r17
;;
br.sptk.many kvm_resume_to_guest
END(kvm_resume_to_guest_with_sync)
ENTRY(kvm_resume_to_guest)
adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
;;
......
......@@ -962,9 +962,9 @@ static void kvm_do_resume_op(struct kvm_vcpu *vcpu)
void vmm_transition(struct kvm_vcpu *vcpu)
{
ia64_call_vsa(PAL_VPS_SAVE, (unsigned long)vcpu->arch.vpd,
0, 0, 0, 0, 0, 0);
1, 0, 0, 0, 0, 0);
vmm_trampoline(&vcpu->arch.guest, &vcpu->arch.host);
ia64_call_vsa(PAL_VPS_RESTORE, (unsigned long)vcpu->arch.vpd,
0, 0, 0, 0, 0, 0);
1, 0, 0, 0, 0, 0);
kvm_do_resume_op(vcpu);
}
......@@ -313,21 +313,21 @@ static inline void vcpu_set_tr(struct thash_data *trp, u64 pte, u64 itir,
trp->rid = rid;
}
extern u64 kvm_lookup_mpa(u64 gpfn);
extern u64 kvm_gpa_to_mpa(u64 gpa);
/* Return I/O type if trye */
#define __gpfn_is_io(gpfn) \
({ \
u64 pte, ret = 0; \
pte = kvm_lookup_mpa(gpfn); \
if (!(pte & GPFN_INV_MASK)) \
ret = pte & GPFN_IO_MASK; \
ret; \
})
extern u64 kvm_get_mpt_entry(u64 gpfn);
/* Return I/O type if true */
static inline u64 __gpfn_is_io(u64 gpfn)
{
u64 pte;
pte = kvm_get_mpt_entry(gpfn);
if (!(pte & GPFN_INV_MASK)) {
pte = pte & GPFN_IO_MASK;
if (pte != GPFN_PHYS_MMIO)
return pte;
}
return 0;
}
#endif
#define IA64_NO_FAULT 0
#define IA64_FAULT 1
......
......@@ -1261,11 +1261,6 @@ kvm_rse_clear_invalid:
adds r19=VMM_VPD_VPSR_OFFSET,r18
;;
ld8 r19=[r19] //vpsr
adds r20=VMM_VCPU_VSA_BASE_OFFSET,r21
;;
ld8 r20=[r20]
;;
//vsa_sync_write_start
mov r25=r18
adds r16= VMM_VCPU_GP_OFFSET,r21
;;
......@@ -1274,10 +1269,7 @@ kvm_rse_clear_invalid:
;;
add r24=r24,r16
;;
add r16=PAL_VPS_SYNC_WRITE,r20
;;
mov b0=r16
br.cond.sptk b0 // call the service
br.sptk.many kvm_vps_sync_write // call the service
;;
END(ia64_leave_hypervisor)
// fall through
......@@ -1288,28 +1280,15 @@ GLOBAL_ENTRY(ia64_vmm_entry)
* r17:cr.isr
* r18:vpd
* r19:vpsr
* r20:__vsa_base
* r22:b0
* r23:predicate
*/
mov r24=r22
mov r25=r18
tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT // p1=vpsr.ic
(p1) br.cond.sptk.few kvm_vps_resume_normal
(p2) br.cond.sptk.many kvm_vps_resume_handler
;;
(p1) add r29=PAL_VPS_RESUME_NORMAL,r20
(p1) br.sptk.many ia64_vmm_entry_out
;;
tbit.nz p1,p2 = r17,IA64_ISR_IR_BIT //p1=cr.isr.ir
;;
(p1) add r29=PAL_VPS_RESUME_NORMAL,r20
(p2) add r29=PAL_VPS_RESUME_HANDLER,r20
(p2) ld8 r26=[r25]
;;
ia64_vmm_entry_out:
mov pr=r23,-2
mov b0=r29
;;
br.cond.sptk b0 // call pal service
END(ia64_vmm_entry)
......@@ -1376,6 +1355,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
//set up ipsr, iip, vpd.vpsr, dcr
// For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
// For DCR: all bits 0
bsw.0
;;
mov r21 =r13
adds r14=-VMM_PT_REGS_SIZE, r12
;;
movl r6=0x501008826000 // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
......@@ -1387,12 +1369,6 @@ GLOBAL_ENTRY(vmm_reset_entry)
;;
srlz.i
;;
bsw.0
;;
mov r21 =r13
;;
bsw.1
;;
mov ar.rsc = 0
;;
flushrs
......@@ -1406,12 +1382,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
ld8 r1 = [r20]
;;
mov cr.iip=r4
;;
adds r16=VMM_VPD_BASE_OFFSET,r13
adds r20=VMM_VCPU_VSA_BASE_OFFSET,r13
;;
ld8 r18=[r16]
ld8 r20=[r20]
;;
adds r19=VMM_VPD_VPSR_OFFSET,r18
;;
......
......@@ -390,7 +390,7 @@ void thash_purge_entries_remote(struct kvm_vcpu *v, u64 va, u64 ps)
u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
{
u64 ps, ps_mask, paddr, maddr;
u64 ps, ps_mask, paddr, maddr, io_mask;
union pte_flags phy_pte;
ps = itir_ps(itir);
......@@ -398,8 +398,9 @@ u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
phy_pte.val = *pte;
paddr = *pte;
paddr = ((paddr & _PAGE_PPN_MASK) & ps_mask) | (va & ~ps_mask);
maddr = kvm_lookup_mpa(paddr >> PAGE_SHIFT);
if (maddr & GPFN_IO_MASK) {
maddr = kvm_get_mpt_entry(paddr >> PAGE_SHIFT);
io_mask = maddr & GPFN_IO_MASK;
if (io_mask && (io_mask != GPFN_PHYS_MMIO)) {
*pte |= VTLB_PTE_IO;
return -1;
}
......@@ -418,7 +419,7 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
u64 ifa, int type)
{
u64 ps;
u64 phy_pte;
u64 phy_pte, io_mask, index;
union ia64_rr vrr, mrr;
int ret = 0;
......@@ -426,13 +427,16 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
vrr.val = vcpu_get_rr(v, ifa);
mrr.val = ia64_get_rr(ifa);
index = (pte & _PAGE_PPN_MASK) >> PAGE_SHIFT;
io_mask = kvm_get_mpt_entry(index) & GPFN_IO_MASK;
phy_pte = translate_phy_pte(&pte, itir, ifa);
/* Ensure WB attribute if pte is related to a normal mem page,
* which is required by vga acceleration since qemu maps shared
* vram buffer with WB.
*/
if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT)) {
if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT) &&
io_mask != GPFN_PHYS_MMIO) {
pte &= ~_PAGE_MA_MASK;
phy_pte &= ~_PAGE_MA_MASK;
}
......@@ -566,12 +570,19 @@ void thash_init(struct thash_cb *hcb, u64 sz)
}
}
u64 kvm_lookup_mpa(u64 gpfn)
u64 kvm_get_mpt_entry(u64 gpfn)
{
u64 *base = (u64 *) KVM_P2M_BASE;
return *(base + gpfn);
}
u64 kvm_lookup_mpa(u64 gpfn)
{
u64 maddr;
maddr = kvm_get_mpt_entry(gpfn);
return maddr&_PAGE_PPN_MASK;
}
u64 kvm_gpa_to_mpa(u64 gpa)
{
u64 pte = kvm_lookup_mpa(gpa >> PAGE_SHIFT);
......
......@@ -81,11 +81,17 @@ struct kvm_vcpu_arch {
struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
/* Pages which are referenced in the shadow TLB. */
struct page *shadow_pages[PPC44x_TLB_SIZE];
/* Copy of the host's TLB. */
struct tlbe host_tlb[PPC44x_TLB_SIZE];
/* Track which TLB entries we've modified in the current exit. */
u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
u32 host_stack;
u32 host_pid;
u32 host_dbcr0;
u32 host_dbcr1;
u32 host_dbcr2;
u32 host_iac[4];
u32 host_msr;
u64 fpr[32];
u32 gpr[32];
......@@ -123,7 +129,11 @@ struct kvm_vcpu_arch {
u32 ivor[16];
u32 ivpr;
u32 pir;
u32 shadow_pid;
u32 pid;
u32 swap_pid;
u32 pvr;
u32 ccr0;
u32 ccr1;
......
......@@ -64,6 +64,10 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
gva_t eend, u32 asid);
extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
/* XXX Book E specific */
extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu);
......@@ -92,4 +96,12 @@ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
kvm_vcpu_block(vcpu);
}
static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
{
if (vcpu->arch.pid != new_pid) {
vcpu->arch.pid = new_pid;
vcpu->arch.swap_pid = 1;
}
}
#endif /* __POWERPC_KVM_PPC_H__ */
......@@ -359,8 +359,8 @@ int main(void)
DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
DEFINE(VCPU_HOST_TLB, offsetof(struct kvm_vcpu, arch.host_tlb));
DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
DEFINE(VCPU_SHADOW_MOD, offsetof(struct kvm_vcpu, arch.shadow_tlb_mod));
DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
......@@ -372,7 +372,7 @@ int main(void)
DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
DEFINE(VCPU_PID, offsetof(struct kvm_vcpu, arch.pid));
DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
......
......@@ -19,6 +19,7 @@
#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <asm/mmu-44x.h>
......@@ -109,7 +110,6 @@ static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe)
return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW);
}
/* Must be called with mmap_sem locked for writing. */
static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
unsigned int index)
{
......@@ -124,6 +124,11 @@ static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
}
}
void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
{
vcpu->arch.shadow_tlb_mod[i] = 1;
}
/* Caller must ensure that the specified guest TLB entry is safe to insert into
* the shadow TLB. */
void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
......@@ -142,19 +147,16 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
stlbe = &vcpu->arch.shadow_tlb[victim];
/* Get reference to new page. */
down_read(&current->mm->mmap_sem);
new_page = gfn_to_page(vcpu->kvm, gfn);
if (is_error_page(new_page)) {
printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
kvm_release_page_clean(new_page);
up_read(&current->mm->mmap_sem);
return;
}
hpaddr = page_to_phys(new_page);
/* Drop reference to old page. */
kvmppc_44x_shadow_release(vcpu, victim);
up_read(&current->mm->mmap_sem);
vcpu->arch.shadow_pages[victim] = new_page;
......@@ -164,27 +166,30 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
/* XXX what about AS? */
stlbe->tid = asid & 0xff;
stlbe->tid = !(asid & 0xff);
/* Force TS=1 for all guest mappings. */
/* For now we hardcode 4KB mappings, but it will be important to
* use host large pages in the future. */
stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS
| PPC44x_TLB_4K;
stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
vcpu->arch.msr & MSR_PR);
kvmppc_tlbe_set_modified(vcpu, victim);
KVMTRACE_5D(STLB_WRITE, vcpu, victim,
stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2,
handler);
}
void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
gva_t eend, u32 asid)
{
unsigned int pid = asid & 0xff;
unsigned int pid = !(asid & 0xff);
int i;
/* XXX Replace loop with fancy data structures. */
down_write(&current->mm->mmap_sem);
for (i = 0; i <= tlb_44x_hwater; i++) {
struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
unsigned int tid;
......@@ -204,21 +209,35 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
kvmppc_44x_shadow_release(vcpu, i);
stlbe->word0 = 0;
kvmppc_tlbe_set_modified(vcpu, i);
KVMTRACE_5D(STLB_INVAL, vcpu, i,
stlbe->tid, stlbe->word0, stlbe->word1,
stlbe->word2, handler);
}
up_write(&current->mm->mmap_sem);
}
/* Invalidate all mappings, so that when they fault back in they will get the
* proper permission bits. */
/* Invalidate all mappings on the privilege switch after PID has been changed.
* The guest always runs with PID=1, so we must clear the entire TLB when
* switching address spaces. */
void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
{
int i;
/* XXX Replace loop with fancy data structures. */
down_write(&current->mm->mmap_sem);
for (i = 0; i <= tlb_44x_hwater; i++) {
kvmppc_44x_shadow_release(vcpu, i);
vcpu->arch.shadow_tlb[i].word0 = 0;
if (vcpu->arch.swap_pid) {
/* XXX Replace loop with fancy data structures. */
for (i = 0; i <= tlb_44x_hwater; i++) {
struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
/* Future optimization: clear only userspace mappings. */
kvmppc_44x_shadow_release(vcpu, i);
stlbe->word0 = 0;
kvmppc_tlbe_set_modified(vcpu, i);
KVMTRACE_5D(STLB_INVAL, vcpu, i,
stlbe->tid, stlbe->word0, stlbe->word1,
stlbe->word2, handler);
}
vcpu->arch.swap_pid = 0;
}
up_write(&current->mm->mmap_sem);
vcpu->arch.shadow_pid = !usermode;
}
......@@ -37,6 +37,17 @@ config KVM_BOOKE_HOST
Provides host support for KVM on Book E PowerPC processors. Currently
this works on 440 processors only.
config KVM_TRACE
bool "KVM trace support"
depends on KVM && MARKERS && SYSFS
select RELAY
select DEBUG_FS
default n
---help---
This option allows reading a trace of kvm-related events through
relayfs. Note the ABI is not considered stable and will be
modified in future updates.
source drivers/virtio/Kconfig
endif # VIRTUALIZATION
......@@ -4,9 +4,11 @@
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm
common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
kvm-objs := $(common-objs) powerpc.o emulate.o booke_guest.o
common-objs-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o)
kvm-objs := $(common-objs-y) powerpc.o emulate.o booke_guest.o
obj-$(CONFIG_KVM) += kvm.o
AFLAGS_booke_interrupts.o := -I$(obj)
......
......@@ -410,6 +410,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
}
case BOOKE_INTERRUPT_DEBUG: {
u32 dbsr;
vcpu->arch.pc = mfspr(SPRN_CSRR0);
/* clear IAC events in DBSR register */
dbsr = mfspr(SPRN_DBSR);
dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4;
mtspr(SPRN_DBSR, dbsr);
run->exit_reason = KVM_EXIT_DEBUG;
r = RESUME_HOST;
break;
}
default:
printk(KERN_EMERG "exit_nr %d\n", exit_nr);
BUG();
......@@ -471,6 +486,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu->arch.msr = 0;
vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
vcpu->arch.shadow_pid = 1;
/* Eye-catching number so we know if the guest takes an interrupt
* before it's programmed its own IVPR. */
vcpu->arch.ivpr = 0x55550000;
......
......@@ -42,7 +42,8 @@
#define HOST_STACK_LR (HOST_STACK_SIZE + 4) /* In caller stack frame. */
#define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \
(1<<BOOKE_INTERRUPT_DTLB_MISS))
(1<<BOOKE_INTERRUPT_DTLB_MISS) | \
(1<<BOOKE_INTERRUPT_DEBUG))
#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
(1<<BOOKE_INTERRUPT_DTLB_MISS))
......@@ -331,51 +332,57 @@ lightweight_exit:
mfspr r3, SPRN_PID
stw r3, VCPU_HOST_PID(r4)
lwz r3, VCPU_PID(r4)
lwz r3, VCPU_SHADOW_PID(r4)
mtspr SPRN_PID, r3
/* Prevent all TLB updates. */
/* Prevent all asynchronous TLB updates. */
mfmsr r5
lis r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h
ori r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
andc r6, r5, r6
mtmsr r6
/* Save the host's non-pinned TLB mappings, and load the guest mappings
* over them. Leave the host's "pinned" kernel mappings in place. */
/* XXX optimization: use generation count to avoid swapping unmodified
* entries. */
/* Load the guest mappings, leaving the host's "pinned" kernel mappings
* in place. */
mfspr r10, SPRN_MMUCR /* Save host MMUCR. */
lis r8, tlb_44x_hwater@ha
lwz r8, tlb_44x_hwater@l(r8)
addi r3, r4, VCPU_HOST_TLB - 4
addi r9, r4, VCPU_SHADOW_TLB - 4
li r6, 0
li r5, PPC44x_TLB_SIZE
lis r5, tlb_44x_hwater@ha
lwz r5, tlb_44x_hwater@l(r5)
mtctr r5
addi r9, r4, VCPU_SHADOW_TLB
addi r5, r4, VCPU_SHADOW_MOD
li r3, 0
1:
/* Save host entry. */
tlbre r7, r6, PPC44x_TLB_PAGEID
mfspr r5, SPRN_MMUCR
stwu r5, 4(r3)
stwu r7, 4(r3)
tlbre r7, r6, PPC44x_TLB_XLAT
stwu r7, 4(r3)
tlbre r7, r6, PPC44x_TLB_ATTRIB
stwu r7, 4(r3)
lbzx r7, r3, r5
cmpwi r7, 0
beq 3f
/* Load guest entry. */
lwzu r7, 4(r9)
mulli r11, r3, TLBE_BYTES
add r11, r11, r9
lwz r7, 0(r11)
mtspr SPRN_MMUCR, r7
lwzu r7, 4(r9)
tlbwe r7, r6, PPC44x_TLB_PAGEID
lwzu r7, 4(r9)
tlbwe r7, r6, PPC44x_TLB_XLAT
lwzu r7, 4(r9)
tlbwe r7, r6, PPC44x_TLB_ATTRIB
/* Increment index. */
addi r6, r6, 1
cmpw r6, r8
blt 1b
lwz r7, 4(r11)
tlbwe r7, r3, PPC44x_TLB_PAGEID
lwz r7, 8(r11)
tlbwe r7, r3, PPC44x_TLB_XLAT
lwz r7, 12(r11)
tlbwe r7, r3, PPC44x_TLB_ATTRIB
3:
addi r3, r3, 1 /* Increment index. */
bdnz 1b
mtspr SPRN_MMUCR, r10 /* Restore host MMUCR. */
/* Clear bitmap of modified TLB entries */
li r5, PPC44x_TLB_SIZE>>2
mtctr r5
addi r5, r4, VCPU_SHADOW_MOD - 4
li r6, 0
1:
stwu r6, 4(r5)
bdnz 1b
iccci 0, 0 /* XXX hack */
/* Load some guest volatiles. */
......@@ -431,6 +438,14 @@ lightweight_exit:
oris r3, r3, KVMPPC_MSR_MASK@h
ori r3, r3, KVMPPC_MSR_MASK@l
mtsrr1 r3
/* Clear any debug events which occurred since we disabled MSR[DE].
* XXX This gives us a 3-instruction window in which a breakpoint
* intended for guest context could fire in the host instead. */
lis r3, 0xffff
ori r3, r3, 0xffff
mtspr SPRN_DBSR, r3
lwz r3, VCPU_GPR(r3)(r4)
lwz r4, VCPU_GPR(r4)(r4)
rfi
......@@ -170,6 +170,10 @@ static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst)
kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
}
KVMTRACE_5D(GTLB_WRITE, vcpu, index,
tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2,
handler);
return EMULATE_DONE;
}
......@@ -504,7 +508,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
case SPRN_MMUCR:
vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
case SPRN_PID:
vcpu->arch.pid = vcpu->arch.gpr[rs]; break;
kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
case SPRN_CCR0:
vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
case SPRN_CCR1:
......@@ -765,6 +769,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
break;
}
KVMTRACE_3D(PPC_INSTR, vcpu, inst, vcpu->arch.pc, emulated, entryexit);
if (advance)
vcpu->arch.pc += 4; /* Advance past emulated instruction. */
......
......@@ -27,6 +27,7 @@
#include <asm/cputable.h>
#include <asm/uaccess.h>
#include <asm/kvm_ppc.h>
#include <asm/tlbflush.h>
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
......@@ -239,18 +240,114 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
}
/* Note: clearing MSR[DE] just means that the debug interrupt will not be
* delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
* If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
* will be delivered as an "imprecise debug event" (which is indicated by
* DBSR[IDE]).
*/
static void kvmppc_disable_debug_interrupts(void)
{
mtmsr(mfmsr() & ~MSR_DE);
}
static void kvmppc_restore_host_debug_state(struct kvm_vcpu *vcpu)
{
kvmppc_disable_debug_interrupts();
mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
mtmsr(vcpu->arch.host_msr);
}
static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu)
{
struct kvm_guest_debug *dbg = &vcpu->guest_debug;
u32 dbcr0 = 0;
vcpu->arch.host_msr = mfmsr();
kvmppc_disable_debug_interrupts();
/* Save host debug register state. */
vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
/* set registers up for guest */
if (dbg->bp[0]) {
mtspr(SPRN_IAC1, dbg->bp[0]);
dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
}
if (dbg->bp[1]) {
mtspr(SPRN_IAC2, dbg->bp[1]);
dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
}
if (dbg->bp[2]) {
mtspr(SPRN_IAC3, dbg->bp[2]);
dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
}
if (dbg->bp[3]) {
mtspr(SPRN_IAC4, dbg->bp[3]);
dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
}
mtspr(SPRN_DBCR0, dbcr0);
mtspr(SPRN_DBCR1, 0);
mtspr(SPRN_DBCR2, 0);
}
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
int i;
if (vcpu->guest_debug.enabled)
kvmppc_load_guest_debug_registers(vcpu);
/* Mark every guest entry in the shadow TLB as modified, so that they
* will all be reloaded on the next vcpu run (instead of being
* demand-faulted). */
for (i = 0; i <= tlb_44x_hwater; i++)
kvmppc_tlbe_set_modified(vcpu, i);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
if (vcpu->guest_debug.enabled)
kvmppc_restore_host_debug_state(vcpu);
/* Don't leave guest TLB entries resident when being de-scheduled. */
/* XXX It would be nice to differentiate between heavyweight exit and
* sched_out here, since we could avoid the TLB flush for heavyweight
* exits. */
_tlbia();
}
int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
struct kvm_debug_guest *dbg)
{
return -ENOTSUPP;
int i;
vcpu->guest_debug.enabled = dbg->enabled;
if (vcpu->guest_debug.enabled) {
for (i=0; i < ARRAY_SIZE(vcpu->guest_debug.bp); i++) {
if (dbg->breakpoints[i].enabled)
vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
else
vcpu->guest_debug.bp[i] = 0;
}
}
return 0;
}
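With the -ENOTSUPP stub replaced, userspace can arm the BookE instruction breakpoints through the existing KVM_DEBUG_GUEST vcpu ioctl; each enabled bp[] slot is loaded into IAC1-IAC4 by kvmppc_load_guest_debug_registers() above. A sketch under the assumption of an open vcpu_fd and a caller-chosen guest_addr:

#include <sys/ioctl.h>
#include <linux/kvm.h>

static int set_guest_breakpoint(int vcpu_fd, __u64 guest_addr)
{
	struct kvm_debug_guest dbg = { .enabled = 1 };

	dbg.breakpoints[0].enabled = 1;		/* ends up in IAC1 with DBCR0_IAC1|DBCR0_IDM */
	dbg.breakpoints[0].address = guest_addr;
	return ioctl(vcpu_fd, KVM_DEBUG_GUEST, &dbg);
}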
static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
......
......@@ -565,13 +565,16 @@ config ZFCPDUMP
Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this.
config S390_GUEST
bool "s390 guest support (EXPERIMENTAL)"
bool "s390 guest support for KVM (EXPERIMENTAL)"
depends on 64BIT && EXPERIMENTAL
select VIRTIO
select VIRTIO_RING
select VIRTIO_CONSOLE
help
Select this option if you want to run the kernel under s390 linux
Select this option if you want to run the kernel as a guest under
the KVM hypervisor. This will add detection for KVM as well as a
virtio transport. If KVM is detected, the virtio console will be
the default console.
endmenu
source "net/Kconfig"
......
......@@ -157,8 +157,8 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
int rc;
vcpu->stat.instruction_stfl++;
facility_list &= ~(1UL<<24); /* no stfle */
facility_list &= ~(1UL<<23); /* no large pages */
/* only pass the facility bits, which we can handle */
facility_list &= 0xfe00fff3;
rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
&facility_list, sizeof(facility_list));
......
......@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
return ret;
}
/*
* If we don't do this, the guest may calibrate its delay loop under
* heavy load - thus getting a lower lpj - and then execute its delays
* without load, which is wrong because no delay loop can finish early.
* Any heuristic is bound to fail, because ultimately a large pool of
* guests can be running and disturb each other. So we preset lpj here.
*/
static unsigned long kvm_get_tsc_khz(void)
{
return preset_lpj;
}
static void kvm_get_preset_lpj(void)
{
struct pvclock_vcpu_time_info *src;
unsigned long khz;
u64 lpj;
src = &per_cpu(hv_clock, 0);
khz = pvclock_tsc_khz(src);
lpj = ((u64)khz * 1000);
do_div(lpj, HZ);
preset_lpj = lpj;
}
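As a worked example with assumed numbers: if pvclock reports a guest TSC of 2,000,000 kHz and the kernel runs with HZ=250, the preset becomes lpj = 2,000,000 * 1000 / 250 = 8,000,000 loops per jiffy, so the guest never has to calibrate its delay loop under load.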
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_read,
......@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock;
pv_time_ops.sched_clock = kvm_clock_read;
pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
#ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif
......@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj();
clocksource_register(&kvm_clock);
}
}
......@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
return dst->version;
}
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
{
u64 pv_tsc_khz = 1000000ULL << 32;
do_div(pv_tsc_khz, src->tsc_to_system_mul);
if (src->tsc_shift < 0)
pv_tsc_khz <<= -src->tsc_shift;
else
pv_tsc_khz >>= src->tsc_shift;
return pv_tsc_khz;
}
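The inversion follows from the pvclock scaling rule: a TSC delta is turned into nanoseconds as ns = ((delta << tsc_shift) * tsc_to_system_mul) >> 32 (a right shift when tsc_shift is negative), so the TSC frequency in kHz is 10^6 * 2^32 / (tsc_to_system_mul * 2^tsc_shift), which is exactly the divide followed by the shift computed above.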
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
struct pvclock_shadow_time shadow;
......
......@@ -3,10 +3,13 @@
#
common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
coalesced_mmio.o)
coalesced_mmio.o irq_comm.o)
ifeq ($(CONFIG_KVM_TRACE),y)
common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
endif
ifeq ($(CONFIG_DMAR),y)
common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
......
......@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
if (!atomic_inc_and_test(&pt->pending))
set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (vcpu0 && waitqueue_active(&vcpu0->wq))
wake_up_interruptible(&vcpu0->wq);
}
pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
pt->scheduled = ktime_to_ns(pt->timer.expires);
if (pt->period)
ps->channels[0].count_load_time = pt->timer.expires;
return (pt->period == 0 ? 0 : 1);
}
......@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
{
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
return atomic_read(&pit->pit_state.pit_timer.pending);
return 0;
}
static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
spin_lock(&ps->inject_lock);
if (atomic_dec_return(&ps->pit_timer.pending) < 0)
atomic_inc(&ps->pit_timer.pending);
ps->irq_ack = 1;
spin_unlock(&ps->inject_lock);
}
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
struct kvm_kpit_state *ps;
......@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
hrtimer_cancel(&pt->timer);
}
static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
{
struct kvm_kpit_timer *pt = &ps->pit_timer;
s64 interval;
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
......@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
pt->period = (is_period == 0) ? 0 : interval;
pt->timer.function = pit_timer_fn;
atomic_set(&pt->pending, 0);
ps->irq_ack = 1;
hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
......@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
create_pit_timer(&ps->pit_timer, val, 0);
create_pit_timer(ps, val, 0);
break;
case 2:
case 3:
create_pit_timer(&ps->pit_timer, val, 1);
create_pit_timer(ps, val, 1);
break;
default:
destroy_pit_timer(&ps->pit_timer);
......@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
mutex_unlock(&pit->pit_state.lock);
atomic_set(&pit->pit_state.pit_timer.pending, 0);
pit->pit_state.inject_pending = 1;
pit->pit_state.irq_ack = 1;
}
struct kvm_pit *kvm_create_pit(struct kvm *kvm)
......@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
spin_lock_init(&pit->pit_state.inject_lock);
/* Initialize PIO device */
pit->dev.read = pit_ioport_read;
......@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
pit_state->pit = pit;
hrtimer_init(&pit_state->pit_timer.timer,
CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
mutex_unlock(&pit->pit_state.lock);
kvm_pit_reset(pit);
......@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm)
static void __inject_pit_timer_intr(struct kvm *kvm)
{
mutex_lock(&kvm->lock);
kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
kvm_set_irq(kvm, 0, 1);
kvm_set_irq(kvm, 0, 0);
mutex_unlock(&kvm->lock);
}
......@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
struct kvm_kpit_state *ps;
if (vcpu && pit) {
int inject = 0;
ps = &pit->pit_state;
/* Try to inject pending interrupts when:
* 1. Pending exists
* 2. Last interrupt was accepted or waited for too long time*/
if (atomic_read(&ps->pit_timer.pending) &&
(ps->inject_pending ||
(jiffies - ps->last_injected_time
>= KVM_MAX_PIT_INTR_INTERVAL))) {
ps->inject_pending = 0;
__inject_pit_timer_intr(kvm);
ps->last_injected_time = jiffies;
}
}
}
void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
struct kvm_arch *arch = &vcpu->kvm->arch;
struct kvm_kpit_state *ps;
if (vcpu && arch->vpit) {
ps = &arch->vpit->pit_state;
if (atomic_read(&ps->pit_timer.pending) &&
(((arch->vpic->pics[0].imr & 1) == 0 &&
arch->vpic->pics[0].irq_base == vec) ||
(arch->vioapic->redirtbl[0].fields.vector == vec &&
arch->vioapic->redirtbl[0].fields.mask != 1))) {
ps->inject_pending = 1;
atomic_dec(&ps->pit_timer.pending);
ps->channels[0].count_load_time = ktime_get();
/* Try to inject pending interrupts when
* the last one has been acked.
*/
spin_lock(&ps->inject_lock);
if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
ps->irq_ack = 0;
inject = 1;
}
spin_unlock(&ps->inject_lock);
if (inject)
__inject_pit_timer_intr(kvm);
}
}
......@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
int irq;
s64 period; /* unit: ns */
s64 scheduled;
ktime_t last_update;
atomic_t pending;
};
......@@ -34,8 +33,9 @@ struct kvm_kpit_state {
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
bool inject_pending; /* if inject pending interrupts */
unsigned long last_injected_time;
spinlock_t inject_lock;
unsigned long irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
struct kvm_pit {
......@@ -54,7 +54,6 @@ struct kvm_pit {
#define KVM_PIT_CHANNEL_MASK 0x3
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
struct kvm_pit *kvm_create_pit(struct kvm *kvm);
void kvm_free_pit(struct kvm *kvm);
......
......@@ -30,6 +30,19 @@
#include <linux/kvm_host.h>
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
s->isr_ack |= (1 << irq);
}
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
}
/*
* set irq level. If an edge is detected, then the IRR is set to 1
*/
......@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
*/
static inline void pic_intack(struct kvm_kpic_state *s, int irq)
{
s->isr |= 1 << irq;
if (s->auto_eoi) {
if (s->rotate_on_auto_eoi)
s->priority_add = (irq + 1) & 7;
} else
s->isr |= (1 << irq);
pic_clear_isr(s, irq);
}
/*
* We don't clear a level sensitive interrupt here
*/
......@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
s->irr &= ~(1 << irq);
}
int kvm_pic_read_irq(struct kvm_pic *s)
int kvm_pic_read_irq(struct kvm *kvm)
{
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
......@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
kvm_notify_acked_irq(kvm, irq);
return intno;
}
void kvm_pic_reset(struct kvm_kpic_state *s)
{
int irq, irqbase;
struct kvm *kvm = s->pics_state->irq_request_opaque;
struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
if (s == &s->pics_state->pics[0])
irqbase = 0;
else
irqbase = 8;
for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
if (s->irr & (1 << irq) || s->isr & (1 << irq))
kvm_notify_acked_irq(kvm, irq+irqbase);
}
s->last_irr = 0;
s->irr = 0;
s->imr = 0;
s->isr = 0;
s->isr_ack = 0xff;
s->priority_add = 0;
s->irq_base = 0;
s->read_reg_select = 0;
......@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
priority = get_priority(s, s->isr);
if (priority != 8) {
irq = (priority + s->priority_add) & 7;
s->isr &= ~(1 << irq);
pic_clear_isr(s, irq);
if (cmd == 5)
s->priority_add = (irq + 1) & 7;
pic_update_irq(s->pics_state);
......@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
break;
case 3:
irq = val & 7;
s->isr &= ~(1 << irq);
pic_clear_isr(s, irq);
pic_update_irq(s->pics_state);
break;
case 6:
......@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
break;
case 7:
irq = val & 7;
s->isr &= ~(1 << irq);
s->priority_add = (irq + 1) & 7;
pic_clear_isr(s, irq);
pic_update_irq(s->pics_state);
break;
default:
......@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
s->pics_state->pics[0].irr &= ~(1 << 2);
}
s->irr &= ~(1 << ret);
s->isr &= ~(1 << ret);
pic_clear_isr(s, ret);
if (addr1 >> 7 || ret != 2)
pic_update_irq(s->pics_state);
} else {
......@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
{
struct kvm *kvm = opaque;
struct kvm_vcpu *vcpu = kvm->vcpus[0];
struct kvm_pic *s = pic_irqchip(kvm);
int irq = pic_get_irq(&s->pics[0]);
pic_irqchip(kvm)->output = level;
if (vcpu)
s->output = level;
if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
s->pics[0].isr_ack &= ~(1 << irq);
kvm_vcpu_kick(vcpu);
}
}
struct kvm_pic *kvm_create_pic(struct kvm *kvm)
......
......@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
if (kvm_apic_accept_pic_intr(v)) {
s = pic_irqchip(v->kvm);
s->output = 0; /* PIC */
vector = kvm_pic_read_irq(s);
vector = kvm_pic_read_irq(v->kvm);
}
}
return vector;
......@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
kvm_apic_timer_intr_post(vcpu, vec);
kvm_pit_timer_intr_post(vcpu, vec);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
......
......@@ -42,6 +42,7 @@ struct kvm_kpic_state {
u8 irr; /* interrupt request register */
u8 imr; /* interrupt mask register */
u8 isr; /* interrupt service register */
u8 isr_ack; /* interrupt ack detection */
u8 priority_add; /* highest irq priority */
u8 irq_base;
u8 read_reg_select;
......@@ -63,12 +64,13 @@ struct kvm_pic {
void *irq_request_opaque;
int output; /* intr from master PIC */
struct kvm_io_device dev;
void (*ack_notifier)(void *opaque, int irq);
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
void kvm_pic_set_irq(void *opaque, int irq, int level);
int kvm_pic_read_irq(struct kvm_pic *s);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
void kvm_pic_clear_isr_ack(struct kvm *kvm);
static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
{
......
#ifndef ASM_KVM_CACHE_REGS_H
#define ASM_KVM_CACHE_REGS_H
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
kvm_x86_ops->cache_reg(vcpu, reg);
return vcpu->arch.regs[reg];
}
static inline void kvm_register_write(struct kvm_vcpu *vcpu,
enum kvm_reg reg,
unsigned long val)
{
vcpu->arch.regs[reg] = val;
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}
static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
{
return kvm_register_read(vcpu, VCPU_REGS_RIP);
}
static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
{
kvm_register_write(vcpu, VCPU_REGS_RIP, val);
}
#endif
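A hedged usage sketch of the new register cache (skip_insn() is a hypothetical helper, not part of this patch): callers read and write individual registers, and the regs_avail/regs_dirty bitmaps let the vendor code defer the expensive VMCS/VMCB accesses until they are actually needed.

/* Hypothetical helper built on the accessors above. */
static void skip_insn(struct kvm_vcpu *vcpu, unsigned int insn_len)
{
	unsigned long rip = kvm_rip_read(vcpu);	/* may invoke ->cache_reg() once */

	kvm_rip_write(vcpu, rip + insn_len);	/* marks RIP dirty for later writeback */
}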
......@@ -32,6 +32,7 @@
#include <asm/current.h>
#include <asm/apicdef.h>
#include <asm/atomic.h>
#include "kvm_cache_regs.h"
#include "irq.h"
#define PRId64 "d"
......@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
kvm_vcpu_kick(vcpu);
else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
}
kvm_vcpu_kick(vcpu);
result = (orig_irr == 0);
break;
......@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
kvm_vcpu_kick(vcpu);
} else {
printk(KERN_DEBUG
"Ignoring de-assert INIT to vcpu %d\n",
vcpu->vcpu_id);
apic_debug("Ignoring de-assert INIT to vcpu %d\n",
vcpu->vcpu_id);
}
break;
case APIC_DM_STARTUP:
printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
vcpu->vcpu_id, vector);
apic_debug("SIPI to vcpu %d vector 0x%02x\n",
vcpu->vcpu_id, vector);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
vcpu->arch.sipi_vector = vector;
vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
kvm_vcpu_kick(vcpu);
}
break;
......@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
static void apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
int trigger_mode;
/*
* Not every EOI write has a corresponding ISR;
* one example is when the kernel checks the timer in setup_IO_APIC
......@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
apic_update_ppr(apic);
if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
trigger_mode = IOAPIC_LEVEL_TRIG;
else
trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
}
static void apic_send_ipi(struct kvm_lapic *apic)
......@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
struct kvm_run *run = vcpu->run;
set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
kvm_x86_ops->cache_regs(vcpu);
run->tpr_access.rip = vcpu->arch.rip;
run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.is_write = write;
}
......@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
* Refer SDM 8.4.1
*/
if (len != 4 || alignment) {
if (printk_ratelimit())
printk(KERN_ERR "apic write: bad size=%d %lx\n",
len, (long)address);
/* Don't shout loud, $infamous_os would cause only noise. */
apic_debug("apic write: bad size=%d %lx\n",
len, (long)address);
return;
}
......@@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
if(!atomic_inc_and_test(&apic->timer.pending))
set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
if (waitqueue_active(q)) {
apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (waitqueue_active(q))
wake_up_interruptible(q);
}
if (apic_lvtt_period(apic)) {
result = 1;
apic->timer.dev.expires = ktime_add_ns(
......
......@@ -25,11 +25,11 @@
#if PTTYPE == 64
#define pt_element_t u64
#define guest_walker guest_walker64
#define shadow_walker shadow_walker64
#define FNAME(name) paging##64_##name
#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
#ifdef CONFIG_X86_64
......@@ -42,11 +42,11 @@
#elif PTTYPE == 32
#define pt_element_t u32
#define guest_walker guest_walker32
#define shadow_walker shadow_walker32
#define FNAME(name) paging##32_##name
#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
#define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
......@@ -73,6 +73,17 @@ struct guest_walker {
u32 error_code;
};
struct shadow_walker {
struct kvm_shadow_walk walker;
struct guest_walker *guest_walker;
int user_fault;
int write_fault;
int largepage;
int *ptwrite;
pfn_t pfn;
u64 *sptep;
};
static gfn_t gpte_to_gfn(pt_element_t gpte)
{
return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
......@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
pt_element_t *table;
struct page *page;
down_read(&current->mm->mmap_sem);
page = gfn_to_page(kvm, table_gfn);
up_read(&current->mm->mmap_sem);
table = kmap_atomic(page, KM_USER0);
ret = CMPXCHG(&table[index], orig_pte, new_pte);
kunmap_atomic(table, KM_USER0);
kvm_release_page_dirty(page);
......@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *walker,
int user_fault, int write_fault, int largepage,
int *ptwrite, pfn_t pfn)
static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
struct kvm_vcpu *vcpu, u64 addr,
u64 *sptep, int level)
{
hpa_t shadow_addr;
int level;
u64 *shadow_ent;
unsigned access = walker->pt_access;
if (!is_present_pte(walker->ptes[walker->level - 1]))
return NULL;
shadow_addr = vcpu->arch.mmu.root_hpa;
level = vcpu->arch.mmu.shadow_root_level;
if (level == PT32E_ROOT_LEVEL) {
shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
shadow_addr &= PT64_BASE_ADDR_MASK;
--level;
struct shadow_walker *sw =
container_of(_sw, struct shadow_walker, walker);
struct guest_walker *gw = sw->guest_walker;
unsigned access = gw->pt_access;
struct kvm_mmu_page *shadow_page;
u64 spte;
int metaphysical;
gfn_t table_gfn;
int r;
pt_element_t curr_pte;
if (level == PT_PAGE_TABLE_LEVEL
|| (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
sw->user_fault, sw->write_fault,
gw->ptes[gw->level-1] & PT_DIRTY_MASK,
sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
false);
sw->sptep = sptep;
return 1;
}
for (; ; level--) {
u32 index = SHADOW_PT_INDEX(addr, level);
struct kvm_mmu_page *shadow_page;
u64 shadow_pte;
int metaphysical;
gfn_t table_gfn;
shadow_ent = ((u64 *)__va(shadow_addr)) + index;
if (level == PT_PAGE_TABLE_LEVEL)
break;
if (largepage && level == PT_DIRECTORY_LEVEL)
break;
if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
return 0;
if (is_shadow_present_pte(*shadow_ent)
&& !is_large_pte(*shadow_ent)) {
shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
continue;
}
if (is_large_pte(*sptep)) {
set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
rmap_remove(vcpu->kvm, sptep);
}
if (is_large_pte(*shadow_ent))
rmap_remove(vcpu->kvm, shadow_ent);
if (level - 1 == PT_PAGE_TABLE_LEVEL
&& walker->level == PT_DIRECTORY_LEVEL) {
metaphysical = 1;
if (!is_dirty_pte(walker->ptes[level - 1]))
access &= ~ACC_WRITE_MASK;
table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
} else {
metaphysical = 0;
table_gfn = walker->table_gfn[level - 2];
}
shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
metaphysical, access,
shadow_ent);
if (!metaphysical) {
int r;
pt_element_t curr_pte;
r = kvm_read_guest_atomic(vcpu->kvm,
walker->pte_gpa[level - 2],
&curr_pte, sizeof(curr_pte));
if (r || curr_pte != walker->ptes[level - 2]) {
kvm_release_pfn_clean(pfn);
return NULL;
}
if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
metaphysical = 1;
if (!is_dirty_pte(gw->ptes[level - 1]))
access &= ~ACC_WRITE_MASK;
table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
} else {
metaphysical = 0;
table_gfn = gw->table_gfn[level - 2];
}
shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
metaphysical, access, sptep);
if (!metaphysical) {
r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
&curr_pte, sizeof(curr_pte));
if (r || curr_pte != gw->ptes[level - 2]) {
kvm_release_pfn_clean(sw->pfn);
sw->sptep = NULL;
return 1;
}
shadow_addr = __pa(shadow_page->spt);
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
set_shadow_pte(shadow_ent, shadow_pte);
}
mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
user_fault, write_fault,
walker->ptes[walker->level-1] & PT_DIRTY_MASK,
ptwrite, largepage, walker->gfn, pfn, false);
spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
*sptep = spte;
return 0;
}
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *guest_walker,
int user_fault, int write_fault, int largepage,
int *ptwrite, pfn_t pfn)
{
struct shadow_walker walker = {
.walker = { .entry = FNAME(shadow_walk_entry), },
.guest_walker = guest_walker,
.user_fault = user_fault,
.write_fault = write_fault,
.largepage = largepage,
.ptwrite = ptwrite,
.pfn = pfn,
};
if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
return NULL;
walk_shadow(&walker.walker, vcpu, addr);
return shadow_ent;
return walker.sptep;
}
/*
......@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return 0;
}
down_read(&current->mm->mmap_sem);
if (walker.level == PT_DIRECTORY_LEVEL) {
gfn_t large_gfn;
large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
......@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
}
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
/* implicit mb(), we'll read before PT lock is unlocked */
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
up_read(&current->mm->mmap_sem);
/* mmio */
if (is_error_pfn(pfn)) {
......@@ -453,6 +461,31 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
return 0;
}
static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
struct kvm_vcpu *vcpu, u64 addr,
u64 *sptep, int level)
{
if (level == PT_PAGE_TABLE_LEVEL) {
if (is_shadow_present_pte(*sptep))
rmap_remove(vcpu->kvm, sptep);
set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
return 1;
}
if (!is_shadow_present_pte(*sptep))
return 1;
return 0;
}
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
struct shadow_walker walker = {
.walker = { .entry = FNAME(shadow_invlpg_entry), },
};
walk_shadow(&walker.walker, vcpu, gva);
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
{
struct guest_walker walker;
......@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
}
}
/*
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
* - Alias changes zap the entire shadow cache.
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
offset = nr_present = 0;
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
unsigned pte_access;
pt_element_t gpte;
gpa_t pte_gpa;
gfn_t gfn = sp->gfns[i];
if (!is_shadow_present_pte(sp->spt[i]))
continue;
pte_gpa = gfn_to_gpa(sp->gfn);
pte_gpa += (i+offset) * sizeof(pt_element_t);
if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
sizeof(pt_element_t)))
return -EINVAL;
if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
!(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;
rmap_remove(vcpu->kvm, &sp->spt[i]);
if (is_present_pte(gpte))
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
set_shadow_pte(&sp->spt[i], nonpresent);
continue;
}
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_pte(gpte), 0, gfn,
spte_to_pfn(sp->spt[i]), true, false);
}
return !nr_present;
}
#undef pt_element_t
#undef guest_walker
#undef shadow_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef SHADOW_PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_DIR_BASE_ADDR_MASK
#undef PT_LEVEL_BITS
......
......@@ -331,9 +331,6 @@ enum vmcs_field {
#define AR_RESERVD_MASK 0xfffe0f00
#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
......
#ifndef ARCH_X86_KVM_X86_H
#define ARCH_X86_KVM_X86_H
#include <linux/kvm_host.h>
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;
}
static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
{
vcpu->arch.interrupt.pending = true;
vcpu->arch.interrupt.nr = vector;
}
static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.interrupt.pending = false;
}
#endif
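A minimal illustration of the interrupt-queue helpers (the surrounding function is a simplified, hypothetical injection-path fragment):

/* Hypothetical fragment: latch a vector until the vendor code delivers it. */
static void queue_external_irq(struct kvm_vcpu *vcpu, u8 vector)
{
	if (!vcpu->arch.interrupt.pending)
		kvm_queue_interrupt(vcpu, vector);
	/* vendor code later reads vcpu->arch.interrupt.nr and calls
	 * kvm_clear_interrupt_queue() once the interrupt is injected. */
}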
......@@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void)
/* Get the TSC speed from Xen */
unsigned long xen_tsc_khz(void)
{
u64 xen_khz = 1000000ULL << 32;
const struct pvclock_vcpu_time_info *info =
struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
do_div(xen_khz, info->tsc_to_system_mul);
if (info->tsc_shift < 0)
xen_khz <<= -info->tsc_shift;
else
xen_khz >>= info->tsc_shift;
return xen_khz;
return pvclock_tsc_khz(info);
}
cycle_t xen_clocksource_read(void)
......
#include "intel-iommu.h"
#include <linux/intel-iommu.h>
struct ioapic_scope {
struct intel_iommu *iommu;
......