Commit b6b26489 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "ARM:

   - Take care of faults occuring between the PARange and IPA range by
     injecting an exception

   - Fix S2 faults taken from a host EL0 in protected mode

   - Work around Oops caused by a PMU access from a 32bit guest when PMU
     has been created. This is a temporary bodge until we fix it for
     good.

  x86:

   - Fix potential races when walking host page table

   - Fix shadow page table leak when KVM runs nested

   - Work around bug in userspace when KVM synthesizes leaf 0x80000021
     on older (pre-EPYC) or Intel processors

  Generic (but affects only RISC-V):

   - Fix bad user ABI for KVM_EXIT_SYSTEM_EVENT"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86: work around QEMU issue with synthetic CPUID leaves
  Revert "x86/mm: Introduce lookup_address_in_mm()"
  KVM: x86/mmu: fix potential races when walking host page table
  KVM: fix bad user ABI for KVM_EXIT_SYSTEM_EVENT
  KVM: x86/mmu: Do not create SPTEs for GFNs that exceed host.MAXPHYADDR
  KVM: arm64: Inject exception on out-of-IPA-range translation fault
  KVM/arm64: Don't emulate a PMU for 32-bit guests if feature not set
  KVM: arm64: Handle host stage-2 faults from 32-bit EL0
parents b2da7df5 f751d8ea
......@@ -5986,16 +5986,16 @@ should put the acknowledged interrupt vector into the 'epr' field.
#define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3
__u32 type;
__u64 flags;
__u32 ndata;
__u64 data[16];
} system_event;
If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered
a system-level event using some architecture specific mechanism (hypercall
or some special instruction). In case of ARM64, this is triggered using
HVC instruction based PSCI call from the vcpu. The 'type' field describes
the system-level event type. The 'flags' field describes architecture
specific flags for the system-level event.
HVC instruction based PSCI call from the vcpu.
The 'type' field describes the system-level event type.
Valid values for 'type' are:
- KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the
......@@ -6010,10 +6010,20 @@ Valid values for 'type' are:
to ignore the request, or to gather VM memory core dump and/or
reset/shutdown of the VM.
Valid flags are:
If KVM_CAP_SYSTEM_EVENT_DATA is present, the 'data' field can contain
architecture specific information for the system-level event. Only
the first `ndata` items (possibly zero) of the data array are valid.
- KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 (arm64 only) -- the guest issued
a SYSTEM_RESET2 call according to v1.1 of the PSCI specification.
- for arm64, data[0] is set to KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2 if
the guest issued a SYSTEM_RESET2 call according to v1.1 of the PSCI
specification.
- for RISC-V, data[0] is set to the value of the second argument of the
``sbi_system_reset`` call.
Previous versions of Linux defined a `flags` member in this struct. The
field is now aliased to `data[0]`. Userspace can assume that it is only
written if ndata is greater than 0.
::
......
......@@ -40,6 +40,7 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu);
void kvm_inject_vabt(struct kvm_vcpu *vcpu);
void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_size_fault(struct kvm_vcpu *vcpu);
void kvm_vcpu_wfi(struct kvm_vcpu *vcpu);
......
......@@ -198,15 +198,15 @@ SYM_CODE_START(__kvm_hyp_host_vector)
invalid_host_el2_vect // FIQ EL2h
invalid_host_el2_vect // Error EL2h
host_el1_sync_vect // Synchronous 64-bit EL1
invalid_host_el1_vect // IRQ 64-bit EL1
invalid_host_el1_vect // FIQ 64-bit EL1
invalid_host_el1_vect // Error 64-bit EL1
invalid_host_el1_vect // Synchronous 32-bit EL1
invalid_host_el1_vect // IRQ 32-bit EL1
invalid_host_el1_vect // FIQ 32-bit EL1
invalid_host_el1_vect // Error 32-bit EL1
host_el1_sync_vect // Synchronous 64-bit EL1/EL0
invalid_host_el1_vect // IRQ 64-bit EL1/EL0
invalid_host_el1_vect // FIQ 64-bit EL1/EL0
invalid_host_el1_vect // Error 64-bit EL1/EL0
host_el1_sync_vect // Synchronous 32-bit EL1/EL0
invalid_host_el1_vect // IRQ 32-bit EL1/EL0
invalid_host_el1_vect // FIQ 32-bit EL1/EL0
invalid_host_el1_vect // Error 32-bit EL1/EL0
SYM_CODE_END(__kvm_hyp_host_vector)
/*
......
......@@ -145,6 +145,34 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
inject_abt64(vcpu, true, addr);
}
void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
{
unsigned long addr, esr;
addr = kvm_vcpu_get_fault_ipa(vcpu);
addr |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
if (kvm_vcpu_trap_is_iabt(vcpu))
kvm_inject_pabt(vcpu, addr);
else
kvm_inject_dabt(vcpu, addr);
/*
* If AArch64 or LPAE, set FSC to 0 to indicate an Address
* Size Fault at level 0, as if exceeding PARange.
*
* Non-LPAE guests will only get the external abort, as there
* is no way to to describe the ASF.
*/
if (vcpu_el1_is_32bit(vcpu) &&
!(vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE))
return;
esr = vcpu_read_sys_reg(vcpu, ESR_EL1);
esr &= ~GENMASK_ULL(5, 0);
vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
}
/**
* kvm_inject_undefined - inject an undefined instruction into the guest
* @vcpu: The vCPU in which to inject the exception
......
......@@ -1337,6 +1337,25 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
if (fault_status == FSC_FAULT) {
/* Beyond sanitised PARange (which is the IPA limit) */
if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
kvm_inject_size_fault(vcpu);
return 1;
}
/* Falls between the IPA range and the PARange? */
if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
if (is_iabt)
kvm_inject_pabt(vcpu, fault_ipa);
else
kvm_inject_dabt(vcpu, fault_ipa);
return 1;
}
}
/* Synchronous External Abort? */
if (kvm_vcpu_abt_issea(vcpu)) {
/*
......
......@@ -177,6 +177,9 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
struct kvm_pmu *pmu = &vcpu->arch.pmu;
struct kvm_pmc *pmc = &pmu->pmc[select_idx];
if (!kvm_vcpu_has_pmu(vcpu))
return 0;
counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
if (kvm_pmu_pmc_is_chained(pmc) &&
......@@ -198,6 +201,9 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
{
u64 reg;
if (!kvm_vcpu_has_pmu(vcpu))
return;
reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
__vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
......@@ -322,6 +328,9 @@ void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
struct kvm_pmu *pmu = &vcpu->arch.pmu;
struct kvm_pmc *pmc;
if (!kvm_vcpu_has_pmu(vcpu))
return;
if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val)
return;
......@@ -357,7 +366,7 @@ void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
struct kvm_pmu *pmu = &vcpu->arch.pmu;
struct kvm_pmc *pmc;
if (!val)
if (!kvm_vcpu_has_pmu(vcpu) || !val)
return;
for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
......@@ -527,6 +536,9 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
struct kvm_pmu *pmu = &vcpu->arch.pmu;
int i;
if (!kvm_vcpu_has_pmu(vcpu))
return;
if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
return;
......@@ -576,6 +588,9 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
{
int i;
if (!kvm_vcpu_has_pmu(vcpu))
return;
if (val & ARMV8_PMU_PMCR_E) {
kvm_pmu_enable_counter_mask(vcpu,
__vcpu_sys_reg(vcpu, PMCNTENSET_EL0));
......@@ -739,6 +754,9 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
{
u64 reg, mask;
if (!kvm_vcpu_has_pmu(vcpu))
return;
mask = ARMV8_PMU_EVTYPE_MASK;
mask &= ~ARMV8_PMU_EVTYPE_EVENT;
mask |= kvm_pmu_event_mask(vcpu->kvm);
......@@ -827,6 +845,9 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
u64 val, mask = 0;
int base, i, nr_events;
if (!kvm_vcpu_has_pmu(vcpu))
return 0;
if (!pmceid1) {
val = read_sysreg(pmceid0_el0);
base = 0;
......
......@@ -181,7 +181,8 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)
memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
vcpu->run->system_event.type = type;
vcpu->run->system_event.flags = flags;
vcpu->run->system_event.ndata = 1;
vcpu->run->system_event.data[0] = flags;
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
}
......
......@@ -83,7 +83,7 @@ void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run)
void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
struct kvm_run *run,
u32 type, u64 flags)
u32 type, u64 reason)
{
unsigned long i;
struct kvm_vcpu *tmp;
......@@ -94,7 +94,8 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu,
memset(&run->system_event, 0, sizeof(run->system_event));
run->system_event.type = type;
run->system_event.flags = flags;
run->system_event.ndata = 1;
run->system_event.data[0] = reason;
run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
}
......
......@@ -559,10 +559,6 @@ static inline void update_page_count(int level, unsigned long pages) { }
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
struct mm_struct;
extern pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
unsigned int *level);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
......
......@@ -1085,12 +1085,21 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
case 0x80000000:
entry->eax = min(entry->eax, 0x80000021);
/*
* Serializing LFENCE is reported in a multitude of ways,
* and NullSegClearsBase is not reported in CPUID on Zen2;
* help userspace by providing the CPUID leaf ourselves.
* Serializing LFENCE is reported in a multitude of ways, and
* NullSegClearsBase is not reported in CPUID on Zen2; help
* userspace by providing the CPUID leaf ourselves.
*
* However, only do it if the host has CPUID leaf 0x8000001d.
* QEMU thinks that it can query the host blindly for that
* CPUID leaf if KVM reports that it supports 0x8000001d or
* above. The processor merrily returns values from the
* highest Intel leaf which QEMU tries to use as the guest's
* 0x8000001d. Even worse, this can result in an infinite
* loop if said highest leaf has no subleaves indexed by ECX.
*/
if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
|| !static_cpu_has_bug(X86_BUG_NULL_SEG))
if (entry->eax >= 0x8000001d &&
(static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
|| !static_cpu_has_bug(X86_BUG_NULL_SEG)))
entry->eax = max(entry->eax, 0x80000021);
break;
case 0x80000001:
......
......@@ -65,6 +65,30 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s;
}
/*
* The number of non-reserved physical address bits irrespective of features
* that repurpose legal bits, e.g. MKTME.
*/
extern u8 __read_mostly shadow_phys_bits;
static inline gfn_t kvm_mmu_max_gfn(void)
{
/*
* Note that this uses the host MAXPHYADDR, not the guest's.
* EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
* assuming KVM is running on bare metal, guest accesses beyond
* host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
* (either EPT Violation/Misconfig or #NPF), and so KVM will never
* install a SPTE for such addresses. If KVM is running as a VM
* itself, on the other hand, it might see a MAXPHYADDR that is less
* than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
* disallows such SPTEs entirely and simplifies the TDP MMU.
*/
int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
}
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
......
......@@ -2804,8 +2804,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
const struct kvm_memory_slot *slot)
{
unsigned long hva;
pte_t *pte;
int level;
unsigned long flags;
int level = PG_LEVEL_4K;
pgd_t pgd;
p4d_t p4d;
pud_t pud;
pmd_t pmd;
if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
return PG_LEVEL_4K;
......@@ -2820,10 +2824,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
*/
hva = __gfn_to_hva_memslot(slot, gfn);
pte = lookup_address_in_mm(kvm->mm, hva, &level);
if (unlikely(!pte))
return PG_LEVEL_4K;
/*
* Lookup the mapping level in the current mm. The information
* may become stale soon, but it is safe to use as long as
* 1) mmu_notifier_retry was checked after taking mmu_lock, and
* 2) mmu_lock is taken now.
*
* We still need to disable IRQs to prevent concurrent tear down
* of page tables.
*/
local_irq_save(flags);
pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
if (pgd_none(pgd))
goto out;
p4d = READ_ONCE(*p4d_offset(&pgd, hva));
if (p4d_none(p4d) || !p4d_present(p4d))
goto out;
pud = READ_ONCE(*pud_offset(&p4d, hva));
if (pud_none(pud) || !pud_present(pud))
goto out;
if (pud_large(pud)) {
level = PG_LEVEL_1G;
goto out;
}
pmd = READ_ONCE(*pmd_offset(&pud, hva));
if (pmd_none(pmd) || !pmd_present(pmd))
goto out;
if (pmd_large(pmd))
level = PG_LEVEL_2M;
out:
local_irq_restore(flags);
return level;
}
......@@ -2992,9 +3029,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
/*
* If MMIO caching is disabled, emulate immediately without
* touching the shadow page tables as attempting to install an
* MMIO SPTE will just be an expensive nop.
* MMIO SPTE will just be an expensive nop. Do not cache MMIO
* whose gfn is greater than host.MAXPHYADDR, any guest that
* generates such gfns is running nested and is being tricked
* by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
* and only if L1's MAXPHYADDR is inaccurate with respect to
* the hardware's).
*/
if (unlikely(!shadow_mmio_value)) {
if (unlikely(!shadow_mmio_value) ||
unlikely(fault->gfn > kvm_mmu_max_gfn())) {
*ret_val = RET_PF_EMULATE;
return true;
}
......
......@@ -201,12 +201,6 @@ static inline bool is_removed_spte(u64 spte)
*/
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
/*
* The number of non-reserved physical address bits irrespective of features
* that repurpose legal bits, e.g. MKTME.
*/
extern u8 __read_mostly shadow_phys_bits;
static inline bool is_mmio_spte(u64 spte)
{
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
......
......@@ -815,14 +815,15 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
return iter->yielded;
}
static inline gfn_t tdp_mmu_max_gfn_host(void)
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
/*
* Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
* will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
* and so KVM will never install a SPTE for such addresses.
* Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
* a gpa range that would exceed the max gfn, and KVM does not create
* MMIO SPTEs for "impossible" gfns, instead sending such accesses down
* the slow emulation path every time.
*/
return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
return kvm_mmu_max_gfn() + 1;
}
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
......@@ -830,7 +831,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
gfn_t end = tdp_mmu_max_gfn_host();
gfn_t end = tdp_mmu_max_gfn_exclusive();
gfn_t start = 0;
for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
......@@ -923,7 +924,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
end = min(end, tdp_mmu_max_gfn_host());
end = min(end, tdp_mmu_max_gfn_exclusive());
lockdep_assert_held_write(&kvm->mmu_lock);
......
......@@ -10020,12 +10020,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
vcpu->run->system_event.ndata = 0;
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
vcpu->run->system_event.ndata = 0;
r = 0;
goto out;
}
......@@ -12009,8 +12011,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
return -EINVAL;
return kvm_alloc_memslot_metadata(kvm, new);
}
if (change == KVM_MR_FLAGS_ONLY)
memcpy(&new->arch, &old->arch, sizeof(old->arch));
......
......@@ -638,17 +638,6 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
}
EXPORT_SYMBOL_GPL(lookup_address);
/*
* Lookup the page table entry for a virtual address in a given mm. Return a
* pointer to the entry and the level of the mapping.
*/
pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
unsigned int *level)
{
return lookup_address_in_pgd(pgd_offset(mm, address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address_in_mm);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
......
......@@ -445,7 +445,13 @@ struct kvm_run {
#define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3
__u32 type;
__u64 flags;
__u32 ndata;
union {
#ifndef __KERNEL__
__u64 flags;
#endif
__u64 data[16];
};
} system_event;
/* KVM_EXIT_S390_STSI */
struct {
......@@ -1144,6 +1150,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_S390_MEM_OP_EXTENSION 211
#define KVM_CAP_PMU_CAPABILITY 212
#define KVM_CAP_DISABLE_QUIRKS2 213
/* #define KVM_CAP_VM_TSC_CONTROL 214 */
#define KVM_CAP_SYSTEM_EVENT_DATA 215
#ifdef KVM_CAP_IRQ_ROUTING
......
......@@ -4354,6 +4354,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
return 0;
#endif
case KVM_CAP_BINARY_STATS_FD:
case KVM_CAP_SYSTEM_EVENT_DATA:
return 1;
default:
break;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment