Commit 8607daa2 authored by Paolo Bonzini's avatar Paolo Bonzini

Merge tag 'kvmarm-fixes-6.3-2' of...

Merge tag 'kvmarm-fixes-6.3-2' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD

KVM/arm64 fixes for 6.3, part #2

Fixes for a rather interesting set of bugs relating to the MMU:

 - Read the MMU notifier seq before dropping the mmap lock to guard
   against reading a potentially stale VMA

 - Disable interrupts when walking user page tables to protect against
   the page table being freed

 - Read the MTE permissions for the VMA within the mmap lock critical
   section, avoiding the use of a potentally stale VMA pointer

Additionally, some fixes targeting the vPMU:

 - Return the sum of the current perf event value and PMC snapshot for
   reads from userspace

 - Don't save the value of guest writes to PMCR_EL0.{C,P}, which could
   otherwise lead to userspace erroneously resetting the vPMU during VM
   save/restore
parents f3e70741 8c2e8ac8
No related merge requests found
...@@ -666,14 +666,33 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr) ...@@ -666,14 +666,33 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr)
CONFIG_PGTABLE_LEVELS), CONFIG_PGTABLE_LEVELS),
.mm_ops = &kvm_user_mm_ops, .mm_ops = &kvm_user_mm_ops,
}; };
unsigned long flags;
kvm_pte_t pte = 0; /* Keep GCC quiet... */ kvm_pte_t pte = 0; /* Keep GCC quiet... */
u32 level = ~0; u32 level = ~0;
int ret; int ret;
/*
* Disable IRQs so that we hazard against a concurrent
* teardown of the userspace page tables (which relies on
* IPI-ing threads).
*/
local_irq_save(flags);
ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
VM_BUG_ON(ret); local_irq_restore(flags);
VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS);
VM_BUG_ON(!(pte & PTE_VALID)); if (ret)
return ret;
/*
* Not seeing an error, but not updating level? Something went
* deeply wrong...
*/
if (WARN_ON(level >= KVM_PGTABLE_MAX_LEVELS))
return -EFAULT;
/* Oops, the userspace PTs are gone... Replay the fault */
if (!kvm_pte_valid(pte))
return -EAGAIN;
return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
} }
...@@ -1079,7 +1098,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, ...@@ -1079,7 +1098,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
* *
* Returns the size of the mapping. * Returns the size of the mapping.
*/ */
static unsigned long static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long hva, kvm_pfn_t *pfnp, unsigned long hva, kvm_pfn_t *pfnp,
phys_addr_t *ipap) phys_addr_t *ipap)
...@@ -1091,8 +1110,15 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, ...@@ -1091,8 +1110,15 @@ transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
* sure that the HVA and IPA are sufficiently aligned and that the * sure that the HVA and IPA are sufficiently aligned and that the
* block map is contained within the memslot. * block map is contained within the memslot.
*/ */
if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
get_user_mapping_size(kvm, hva) >= PMD_SIZE) { int sz = get_user_mapping_size(kvm, hva);
if (sz < 0)
return sz;
if (sz < PMD_SIZE)
return PAGE_SIZE;
/* /*
* The address we faulted on is backed by a transparent huge * The address we faulted on is backed by a transparent huge
* page. However, because we map the compound huge page and * page. However, because we map the compound huge page and
...@@ -1192,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -1192,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
{ {
int ret = 0; int ret = 0;
bool write_fault, writable, force_pte = false; bool write_fault, writable, force_pte = false;
bool exec_fault; bool exec_fault, mte_allowed;
bool device = false; bool device = false;
unsigned long mmu_seq; unsigned long mmu_seq;
struct kvm *kvm = vcpu->kvm; struct kvm *kvm = vcpu->kvm;
...@@ -1203,7 +1229,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -1203,7 +1229,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
kvm_pfn_t pfn; kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot); bool logging_active = memslot_is_logging(memslot);
unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
unsigned long vma_pagesize, fault_granule; long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt; struct kvm_pgtable *pgt;
...@@ -1217,6 +1243,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -1217,6 +1243,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT; return -EFAULT;
} }
/*
* Permission faults just need to update the existing leaf entry,
* and so normally don't require allocations from the memcache. The
* only exception to this is when dirty logging is enabled at runtime
* and a write fault needs to collapse a block entry into a table.
*/
if (fault_status != ESR_ELx_FSC_PERM ||
(logging_active && write_fault)) {
ret = kvm_mmu_topup_memory_cache(memcache,
kvm_mmu_cache_min_pages(kvm));
if (ret)
return ret;
}
/* /*
* Let's check if we will get back a huge page backed by hugetlbfs, or * Let's check if we will get back a huge page backed by hugetlbfs, or
* get block mapping for device MMIO region. * get block mapping for device MMIO region.
...@@ -1269,37 +1309,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -1269,37 +1309,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
fault_ipa &= ~(vma_pagesize - 1); fault_ipa &= ~(vma_pagesize - 1);
gfn = fault_ipa >> PAGE_SHIFT; gfn = fault_ipa >> PAGE_SHIFT;
mmap_read_unlock(current->mm); mte_allowed = kvm_vma_mte_allowed(vma);
/* /* Don't use the VMA after the unlock -- it may have vanished */
* Permission faults just need to update the existing leaf entry, vma = NULL;
* and so normally don't require allocations from the memcache. The
* only exception to this is when dirty logging is enabled at runtime
* and a write fault needs to collapse a block entry into a table.
*/
if (fault_status != ESR_ELx_FSC_PERM ||
(logging_active && write_fault)) {
ret = kvm_mmu_topup_memory_cache(memcache,
kvm_mmu_cache_min_pages(kvm));
if (ret)
return ret;
}
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
/* /*
* Ensure the read of mmu_invalidate_seq happens before we call * Read mmu_invalidate_seq so that KVM can detect if the results of
* gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
* the page we just got a reference to gets unmapped before we have a * acquiring kvm->mmu_lock.
* chance to grab the mmu_lock, which ensure that if the page gets
* unmapped afterwards, the call to kvm_unmap_gfn will take it away
* from us again properly. This smp_rmb() interacts with the smp_wmb()
* in kvm_mmu_notifier_invalidate_<page|range_end>.
* *
* Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
* used to avoid unnecessary overhead introduced to locate the memory * with the smp_wmb() in kvm_mmu_invalidate_end().
* slot because it's always fixed even @gfn is adjusted for huge pages.
*/ */
smp_rmb(); mmu_seq = vcpu->kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm);
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
write_fault, &writable, NULL); write_fault, &writable, NULL);
...@@ -1350,11 +1374,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -1350,11 +1374,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vma_pagesize = transparent_hugepage_adjust(kvm, memslot, vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
hva, &pfn, hva, &pfn,
&fault_ipa); &fault_ipa);
if (vma_pagesize < 0) {
ret = vma_pagesize;
goto out_unlock;
}
} }
if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) { if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) {
/* Check the VMM hasn't introduced a new disallowed VMA */ /* Check the VMM hasn't introduced a new disallowed VMA */
if (kvm_vma_mte_allowed(vma)) { if (mte_allowed) {
sanitise_mte_tags(kvm, pfn, vma_pagesize); sanitise_mte_tags(kvm, pfn, vma_pagesize);
} else { } else {
ret = -EFAULT; ret = -EFAULT;
......
...@@ -538,7 +538,8 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) ...@@ -538,7 +538,8 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
if (!kvm_pmu_is_3p5(vcpu)) if (!kvm_pmu_is_3p5(vcpu))
val &= ~ARMV8_PMU_PMCR_LP; val &= ~ARMV8_PMU_PMCR_LP;
__vcpu_sys_reg(vcpu, PMCR_EL0) = val; /* The reset bits don't indicate any state, and shouldn't be saved. */
__vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P);
if (val & ARMV8_PMU_PMCR_E) { if (val & ARMV8_PMU_PMCR_E) {
kvm_pmu_enable_counter_mask(vcpu, kvm_pmu_enable_counter_mask(vcpu,
......
...@@ -856,6 +856,22 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) ...@@ -856,6 +856,22 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
return true; return true;
} }
static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
u64 *val)
{
u64 idx;
if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0)
/* PMCCNTR_EL0 */
idx = ARMV8_PMU_CYCLE_IDX;
else
/* PMEVCNTRn_EL0 */
idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
*val = kvm_pmu_get_counter_value(vcpu, idx);
return 0;
}
static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
struct sys_reg_params *p, struct sys_reg_params *p,
const struct sys_reg_desc *r) const struct sys_reg_desc *r)
...@@ -1072,7 +1088,7 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, ...@@ -1072,7 +1088,7 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
/* Macro to expand the PMEVCNTRn_EL0 register */ /* Macro to expand the PMEVCNTRn_EL0 register */
#define PMU_PMEVCNTR_EL0(n) \ #define PMU_PMEVCNTR_EL0(n) \
{ PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \ { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \
.reset = reset_pmevcntr, \ .reset = reset_pmevcntr, .get_user = get_pmu_evcntr, \
.access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }
/* Macro to expand the PMEVTYPERn_EL0 register */ /* Macro to expand the PMEVTYPERn_EL0 register */
...@@ -1982,7 +1998,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { ...@@ -1982,7 +1998,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ PMU_SYS_REG(SYS_PMCEID1_EL0), { PMU_SYS_REG(SYS_PMCEID1_EL0),
.access = access_pmceid, .reset = NULL }, .access = access_pmceid, .reset = NULL },
{ PMU_SYS_REG(SYS_PMCCNTR_EL0), { PMU_SYS_REG(SYS_PMCCNTR_EL0),
.access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 }, .access = access_pmu_evcntr, .reset = reset_unknown,
.reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr},
{ PMU_SYS_REG(SYS_PMXEVTYPER_EL0), { PMU_SYS_REG(SYS_PMXEVTYPER_EL0),
.access = access_pmu_evtyper, .reset = NULL }, .access = access_pmu_evtyper, .reset = NULL },
{ PMU_SYS_REG(SYS_PMXEVCNTR_EL0), { PMU_SYS_REG(SYS_PMXEVCNTR_EL0),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment