Commit 5c5ddf71 authored by Paolo Bonzini's avatar Paolo Bonzini

Merge tag 'kvm-x86-mtrrs-6.11' of https://github.com/kvm-x86/linux into HEAD

KVM x86 MTRR virtualization removal

Remove support for virtualizing MTRRs on Intel CPUs, along with a nasty CR0.CD
hack, and instead always honor guest PAT on CPUs that support self-snoop.
parents 34b69ede 377b2f35
......@@ -8025,7 +8025,11 @@ The valid bits in cap.args[0] are:
When this quirk is disabled, the reset value
is 0x10000 (APIC_LVT_MASKED).
KVM_X86_QUIRK_CD_NW_CLEARED By default, KVM clears CR0.CD and CR0.NW.
KVM_X86_QUIRK_CD_NW_CLEARED By default, KVM clears CR0.CD and CR0.NW on
AMD CPUs to workaround buggy guest firmware
that runs in perpetuity with CR0.CD, i.e.
with caches in "no fill" mode.
When this quirk is disabled, KVM does not
change the value of CR0.CD and CR0.NW.
......
......@@ -48,3 +48,21 @@ have the same physical APIC ID, KVM will deliver events targeting that APIC ID
only to the vCPU with the lowest vCPU ID. If KVM_X2APIC_API_USE_32BIT_IDS is
not enabled, KVM follows x86 architecture when processing interrupts (all vCPUs
matching the target APIC ID receive the interrupt).
MTRRs
-----
KVM does not virtualize guest MTRR memory types. KVM emulates accesses to MTRR
MSRs, i.e. {RD,WR}MSR in the guest will behave as expected, but KVM does not
honor guest MTRRs when determining the effective memory type, and instead
treats all of guest memory as having Writeback (WB) MTRRs.
CR0.CD
------
KVM does not virtualize CR0.CD on Intel CPUs. Similar to MTRR MSRs, KVM
emulates CR0.CD accesses so that loads and stores from/to CR0 behave as
expected, but setting CR0.CD=1 has no impact on the cachaeability of guest
memory.
Note, this erratum does not affect AMD CPUs, which fully virtualize CR0.CD in
hardware, i.e. put the CPU caches into "no fill" mode when CR0.CD=1, even when
running in the guest.
\ No newline at end of file
......@@ -160,7 +160,6 @@
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 256
#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
#define ASYNC_PF_PER_VCPU 64
......@@ -605,18 +604,12 @@ enum {
KVM_DEBUGREG_WONT_EXIT = 2,
};
struct kvm_mtrr_range {
u64 base;
u64 mask;
struct list_head node;
};
struct kvm_mtrr {
struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
u64 var[KVM_NR_VAR_MTRR * 2];
u64 fixed_64k;
u64 fixed_16k[2];
u64 fixed_4k[8];
u64 deftype;
struct list_head head;
};
/* Hyper-V SynIC timer */
......
......@@ -221,12 +221,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma);
static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
{
return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm));
}
bool kvm_mmu_may_ignore_guest_pat(void);
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
......
......@@ -4671,38 +4671,23 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
}
#endif
bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
bool kvm_mmu_may_ignore_guest_pat(void)
{
/*
* If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
* VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
* to honor the memtype from the guest's MTRRs so that guest accesses
* to memory that is DMA'd aren't cached against the guest's wishes.
*
* Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
* e.g. KVM will force UC memtype for host MMIO.
* When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does
* not support self-snoop (or is affected by an erratum), and the VM
* has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
* honor the memtype from the guest's PAT so that guest accesses to
* memory that is DMA'd aren't cached against the guest's wishes. As a
* result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
* KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE
* bits in response to non-coherent device (un)registration.
*/
return vm_has_noncoherent_dma && shadow_memtype_mask;
return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask;
}
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
/*
* If the guest's MTRRs may be used to compute the "real" memtype,
* restrict the mapping level to ensure KVM uses a consistent memtype
* across the entire mapping.
*/
if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
gfn_t base = gfn_round_for_level(fault->gfn,
fault->max_level);
if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
break;
}
}
#ifdef CONFIG_X86_64
if (tdp_mmu_enabled)
return kvm_tdp_mmu_page_fault(vcpu, fault);
......
This diff is collapsed.
......@@ -7670,39 +7670,25 @@ int vmx_vm_init(struct kvm *kvm)
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
* memory aliases with conflicting memory types and sometimes MCEs.
* We have to be careful as to what are honored and when.
*
* For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
* UC. The effective memory type is UC or WC depending on guest PAT.
* This was historically the source of MCEs and we want to be
* conservative.
*
* When there is no need to deal with noncoherent DMA (e.g., no VT-d
* or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
* EPT memory type is set to WB. The effective memory type is forced
* WB.
*
* Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
* EPT memory type is used to emulate guest CD/MTRR.
/*
* Force UC for host MMIO regions, as allowing the guest to access MMIO
* with cacheable accesses will result in Machine Checks.
*/
if (is_mmio)
return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
/*
* Force WB and ignore guest PAT if the VM does NOT have a non-coherent
* device attached and the CPU doesn't support self-snoop. Letting the
* guest control memory types on Intel CPUs without self-snoop may
* result in unexpected behavior, and so KVM's (historical) ABI is to
* trust the guest to behave only as a last resort.
*/
if (!static_cpu_has(X86_FEATURE_SELFSNOOP) &&
!kvm_arch_has_noncoherent_dma(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
else
return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
VMX_EPT_IPAT_BIT;
}
return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
}
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
......
......@@ -946,11 +946,6 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
......@@ -11181,6 +11176,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_srcu_read_lock(vcpu);
/*
* Call this to ensure WC buffers in guest are evicted after each VM
* Exit, so that the evicted WC writes can be snooped across all cpus
*/
smp_mb__after_srcu_read_lock();
/*
* Profile KVM exit RIPs:
*/
......@@ -12264,7 +12265,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_xen_init_vcpu(vcpu);
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
kvm_vcpu_reset(vcpu, false);
......@@ -13528,13 +13528,13 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
/*
* Non-coherent DMA assignment and de-assignment will affect
* whether KVM honors guest MTRRs and cause changes in memtypes
* in TDP.
* So, pass %true unconditionally to indicate non-coherent DMA was,
* or will be involved, and that zapping SPTEs might be necessary.
* Non-coherent DMA assignment and de-assignment may affect whether or
* not KVM honors guest PAT, and thus may cause changes in EPT SPTEs
* due to toggling the "ignore PAT" bit. Zap all SPTEs when the first
* (or last) non-coherent device is (un)registered to so that new SPTEs
* with the correct "ignore guest PAT" setting are created.
*/
if (__kvm_mmu_honors_guest_mtrrs(true))
if (kvm_mmu_may_ignore_guest_pat())
kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
}
......
......@@ -325,12 +325,8 @@ int handle_ud(struct kvm_vcpu *vcpu);
void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
struct kvm_queued_exception *ex);
void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
bool kvm_vector_hashing_enabled(void);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
......
......@@ -343,6 +343,20 @@ static inline void smp_mb__after_srcu_read_unlock(void)
/* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
}
/**
* smp_mb__after_srcu_read_lock - ensure full ordering after srcu_read_lock
*
* Converts the preceding srcu_read_lock into a two-way memory barrier.
*
* Call this after srcu_read_lock, to guarantee that all memory operations
* that occur after smp_mb__after_srcu_read_lock will appear to happen after
* the preceding srcu_read_lock.
*/
static inline void smp_mb__after_srcu_read_lock(void)
{
/* __srcu_read_lock has smp_mb() internally so nothing to do here. */
}
DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
_T->idx = srcu_read_lock(_T->lock),
srcu_read_unlock(_T->lock, _T->idx),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment