Commit 43d8ac22 authored by Marc Zyngier

Merge branch kvm-arm64/pkvm-hyp-sharing into kvmarm-master/next

* kvm-arm64/pkvm-hyp-sharing:
  : .
  : Series from Quentin Perret, implementing HYP page share/unshare:
  :
  : This series implements an unshare hypercall at EL2 in nVHE
  : protected mode, and makes use of it to unmap guest-specific
  : data-structures from EL2 stage-1 during guest tear-down.
  : Crucially, the implementation of the share and unshare
  : routines use page refcounts in the host kernel to avoid
  : accidentally unmapping data-structures that overlap a common
  : page.
  : [...]
  : .
  KVM: arm64: pkvm: Unshare guest structs during teardown
  KVM: arm64: Expose unshare hypercall to the host
  KVM: arm64: Implement do_unshare() helper for unsharing memory
  KVM: arm64: Implement __pkvm_host_share_hyp() using do_share()
  KVM: arm64: Implement do_share() helper for sharing memory
  KVM: arm64: Introduce wrappers for host and hyp spin lock accessors
  KVM: arm64: Extend pkvm_page_state enumeration to handle absent pages
  KVM: arm64: pkvm: Refcount the pages shared with EL2
  KVM: arm64: Introduce kvm_share_hyp()
  KVM: arm64: Implement kvm_pgtable_hyp_unmap() at EL2
  KVM: arm64: Hook up ->page_count() for hypervisor stage-1 page-table
  KVM: arm64: Fixup hyp stage-1 refcount
  KVM: arm64: Refcount hyp stage-1 pgtable pages
  KVM: arm64: Provide {get,put}_page() stubs for early hyp allocator
Signed-off-by: Marc Zyngier <maz@kernel.org>
parents ce5b5b05 52b28657
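The whole series hinges on one host-side invariant: each physical page shared with EL2 carries a reference count, the share hypercall is issued only when that count goes from zero to one, and the unshare hypercall only when it drops back to zero. That way two data-structures which happen to overlap the same page can never unmap each other's data. A minimal sketch of the pattern follows (illustrative only; get_count_for() is a hypothetical lookup, and the real code further down in this diff keeps the counts in an rbtree of struct hyp_shared_pfn under hyp_shared_pfns_lock):

/*
 * Sketch, not the code from this series: get_count_for() is hypothetical
 * and the hypercall invocations are simplified.
 */
static int sketch_share_pfn(u64 pfn)
{
	int *count = get_count_for(pfn);

	if ((*count)++ == 0)	/* first user of this page: map it at EL2 */
		return kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn);

	return 0;
}

static void sketch_unshare_pfn(u64 pfn)
{
	int *count = get_count_for(pfn);

	if (--(*count) == 0)	/* last user gone: safe to unmap at EL2 */
		WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn));
}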
......@@ -63,6 +63,7 @@ enum __kvm_host_smccc_func {
/* Hypercalls available after pKVM finalisation */
__KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
__KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
__KVM_HOST_SMCCC_FUNC___kvm_adjust_pc,
__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
......
......@@ -321,6 +321,7 @@ struct kvm_vcpu_arch {
struct kvm_guest_debug_arch external_debug_state;
struct user_fpsimd_state *host_fpsimd_state; /* hyp VA */
struct task_struct *parent_task;
struct {
/* {Break,watch}point registers */
......@@ -737,6 +738,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu);
void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu);
static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr)
{
......
......@@ -150,6 +150,8 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>
int kvm_share_hyp(void *from, void *to);
void kvm_unshare_hyp(void *from, void *to);
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
void __iomem **kaddr,
......
......@@ -251,6 +251,27 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt);
int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
enum kvm_pgtable_prot prot);
/**
* kvm_pgtable_hyp_unmap() - Remove a mapping from a hypervisor stage-1 page-table.
* @pgt: Page-table structure initialised by kvm_pgtable_hyp_init().
* @addr: Virtual address from which to remove the mapping.
* @size: Size of the mapping.
*
* The offset of @addr within a page is ignored and @size is rounded-up to
* the next page boundary.
*
* TLB invalidation is performed for each page-table entry cleared during the
* unmapping operation and the reference count for the page-table page
* containing the cleared entry is decremented, with unreferenced pages being
* freed. The unmapping operation will stop early if it encounters either an
* invalid page-table entry or a valid block mapping which maps beyond the range
* being unmapped.
*
* Return: Number of bytes unmapped, which may be 0.
*/
u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
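Note that kvm_pgtable_hyp_unmap() reports progress as a byte count rather than an error code, so a caller that expects the whole range to be torn down must compare the return value against the requested size. A hedged sketch of that caller-side check (the wrapper name is illustrative, not from this series):

/* Treat a partial unmap (invalid entry or oversized block) as an error. */
static int hyp_unmap_or_fail(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	return kvm_pgtable_hyp_unmap(pgt, addr, size) == size ? 0 : -EINVAL;
}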
/**
* kvm_get_vtcr() - Helper to construct VTCR_EL2
* @mmfr0: Sanitized value of SYS_ID_AA64MMFR0_EL1 register.
......
......@@ -146,7 +146,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
return ret;
ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
ret = kvm_share_hyp(kvm, kvm + 1);
if (ret)
goto out_free_stage2_pgd;
......@@ -188,6 +188,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
}
}
atomic_set(&kvm->online_vcpus, 0);
kvm_unshare_hyp(kvm, kvm + 1);
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
......@@ -342,7 +344,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (err)
return err;
return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
return kvm_share_hyp(vcpu, vcpu + 1);
}
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
......
......@@ -14,6 +14,19 @@
#include <asm/kvm_mmu.h>
#include <asm/sysreg.h>
void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu)
{
struct task_struct *p = vcpu->arch.parent_task;
struct user_fpsimd_state *fpsimd;
if (!is_protected_kvm_enabled() || !p)
return;
fpsimd = &p->thread.uw.fpsimd_state;
kvm_unshare_hyp(fpsimd, fpsimd + 1);
put_task_struct(p);
}
/*
* Called on entry to KVM_RUN unless this vcpu previously ran at least
* once and the most recent prior KVM_RUN for this vcpu was called from
......@@ -29,12 +42,27 @@ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu)
struct user_fpsimd_state *fpsimd = &current->thread.uw.fpsimd_state;
kvm_vcpu_unshare_task_fp(vcpu);
/* Make sure the host task fpsimd state is visible to hyp: */
ret = create_hyp_mappings(fpsimd, fpsimd + 1, PAGE_HYP);
if (!ret)
vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd);
ret = kvm_share_hyp(fpsimd, fpsimd + 1);
if (ret)
return ret;
vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd);
/*
* We need to keep current's task_struct pinned until its data has been
* unshared with the hypervisor to make sure it is not re-used by the
* kernel and donated to someone else while already shared -- see
* kvm_vcpu_unshare_task_fp() for the matching put_task_struct().
*/
if (is_protected_kvm_enabled()) {
get_task_struct(current);
vcpu->arch.parent_task = current;
}
return ret;
return 0;
}
/*
......
......@@ -24,6 +24,11 @@ enum pkvm_page_state {
PKVM_PAGE_OWNED = 0ULL,
PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0,
PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1,
__PKVM_PAGE_RESERVED = KVM_PGTABLE_PROT_SW0 |
KVM_PGTABLE_PROT_SW1,
/* Meta-states which aren't encoded directly in the PTE's SW bits */
PKVM_NOPAGE,
};
#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1)
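__PKVM_PAGE_RESERVED accounts for the one remaining combination of the two software bits, while PKVM_NOPAGE is a meta-state that is never written to a PTE. For context, the pre-existing helpers in this header that fold a state into, and extract it from, a page-table permission value look roughly like this:

static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot,
						 enum pkvm_page_state state)
{
	return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state;
}

static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
{
	return prot & PKVM_PAGE_STATE_PROT_MASK;
}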
......@@ -50,6 +55,7 @@ extern const u8 pkvm_hyp_id;
int __pkvm_prot_finalize(void);
int __pkvm_host_share_hyp(u64 pfn);
int __pkvm_host_unshare_hyp(u64 pfn);
bool addr_is_memory(phys_addr_t phys);
int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot);
......
......@@ -43,6 +43,9 @@ void *hyp_early_alloc_page(void *arg)
return hyp_early_alloc_contig(1);
}
static void hyp_early_alloc_get_page(void *addr) { }
static void hyp_early_alloc_put_page(void *addr) { }
void hyp_early_alloc_init(void *virt, unsigned long size)
{
base = cur = (unsigned long)virt;
......@@ -51,4 +54,6 @@ void hyp_early_alloc_init(void *virt, unsigned long size)
hyp_early_alloc_mm_ops.zalloc_page = hyp_early_alloc_page;
hyp_early_alloc_mm_ops.phys_to_virt = hyp_phys_to_virt;
hyp_early_alloc_mm_ops.virt_to_phys = hyp_virt_to_phys;
hyp_early_alloc_mm_ops.get_page = hyp_early_alloc_get_page;
hyp_early_alloc_mm_ops.put_page = hyp_early_alloc_put_page;
}
......@@ -147,6 +147,13 @@ static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn);
}
static void handle___pkvm_host_unshare_hyp(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(u64, pfn, host_ctxt, 1);
cpu_reg(host_ctxt, 1) = __pkvm_host_unshare_hyp(pfn);
}
static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
......@@ -184,6 +191,7 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__pkvm_prot_finalize),
HANDLE_FUNC(__pkvm_host_share_hyp),
HANDLE_FUNC(__pkvm_host_unshare_hyp),
HANDLE_FUNC(__kvm_adjust_pc),
HANDLE_FUNC(__kvm_vcpu_run),
HANDLE_FUNC(__kvm_flush_vm_context),
......
This diff is collapsed.
......@@ -166,6 +166,7 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
enum kvm_pgtable_walk_flags flag,
void * const arg)
{
struct kvm_pgtable_mm_ops *mm_ops = arg;
enum kvm_pgtable_prot prot;
enum pkvm_page_state state;
kvm_pte_t pte = *ptep;
......@@ -174,6 +175,15 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level,
if (!kvm_pte_valid(pte))
return 0;
/*
* Fix-up the refcount for the page-table pages as the early allocator
* was unable to access the hyp_vmemmap and so the buddy allocator has
* initialised the refcount to '1'.
*/
mm_ops->get_page(ptep);
if (flag != KVM_PGTABLE_WALK_LEAF)
return 0;
if (level != (KVM_PGTABLE_MAX_LEVELS - 1))
return -EINVAL;
......@@ -206,7 +216,8 @@ static int finalize_host_mappings(void)
{
struct kvm_pgtable_walker walker = {
.cb = finalize_host_mappings_walker,
.flags = KVM_PGTABLE_WALK_LEAF,
.flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
.arg = pkvm_pgtable.mm_ops,
};
int i, ret;
......@@ -241,19 +252,20 @@ void __noreturn __pkvm_init_finalise(void)
if (ret)
goto out;
ret = finalize_host_mappings();
if (ret)
goto out;
pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) {
.zalloc_page = hyp_zalloc_hyp_page,
.phys_to_virt = hyp_phys_to_virt,
.virt_to_phys = hyp_virt_to_phys,
.get_page = hpool_get_page,
.put_page = hpool_put_page,
.page_count = hyp_page_count,
};
pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops;
ret = finalize_host_mappings();
if (ret)
goto out;
out:
/*
* We tail-called to here from handle___pkvm_init() and will not return,
......
......@@ -383,21 +383,6 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
return prot;
}
static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
{
/*
* Tolerate KVM recreating the exact same mapping, or changing software
* bits if the existing mapping was valid.
*/
if (old == new)
return false;
if (!kvm_pte_valid(old))
return true;
return !WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW);
}
static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
kvm_pte_t *ptep, struct hyp_map_data *data)
{
......@@ -407,11 +392,16 @@ static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
if (!kvm_block_mapping_supported(addr, end, phys, level))
return false;
data->phys += granule;
new = kvm_init_valid_leaf_pte(phys, data->attr, level);
if (hyp_pte_needs_update(old, new))
smp_store_release(ptep, new);
if (old == new)
return true;
if (!kvm_pte_valid(old))
data->mm_ops->get_page(ptep);
else if (WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
return false;
data->phys += granule;
smp_store_release(ptep, new);
return true;
}
......@@ -433,6 +423,7 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
return -ENOMEM;
kvm_set_table_pte(ptep, childp, mm_ops);
mm_ops->get_page(ptep);
return 0;
}
......@@ -460,6 +451,69 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
return ret;
}
struct hyp_unmap_data {
u64 unmapped;
struct kvm_pgtable_mm_ops *mm_ops;
};
static int hyp_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag, void * const arg)
{
kvm_pte_t pte = *ptep, *childp = NULL;
u64 granule = kvm_granule_size(level);
struct hyp_unmap_data *data = arg;
struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
if (!kvm_pte_valid(pte))
return -EINVAL;
if (kvm_pte_table(pte, level)) {
childp = kvm_pte_follow(pte, mm_ops);
if (mm_ops->page_count(childp) != 1)
return 0;
kvm_clear_pte(ptep);
dsb(ishst);
__tlbi_level(vae2is, __TLBI_VADDR(addr, 0), level);
} else {
if (end - addr < granule)
return -EINVAL;
kvm_clear_pte(ptep);
dsb(ishst);
__tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level);
data->unmapped += granule;
}
dsb(ish);
isb();
mm_ops->put_page(ptep);
if (childp)
mm_ops->put_page(childp);
return 0;
}
u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
struct hyp_unmap_data unmap_data = {
.mm_ops = pgt->mm_ops,
};
struct kvm_pgtable_walker walker = {
.cb = hyp_unmap_walker,
.arg = &unmap_data,
.flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
};
if (!pgt->mm_ops->page_count)
return 0;
kvm_pgtable_walk(pgt, addr, size, &walker);
return unmap_data.unmapped;
}
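The page_count() check is what lets the walker collapse now-empty levels: a hypervisor stage-1 table page holds one reference from its allocation plus one per valid entry installed in it, so a count of 1 seen during the KVM_PGTABLE_WALK_TABLE_POST visit means nothing is mapped underneath and the table can be unlinked and freed. A worked example, assuming a table page with a single remaining leaf being unmapped:

/*
 * Before the walk:  table page refcount = 1 (allocation) + 1 (one valid leaf)
 * LEAF visit:       clear the leaf, put_page() on the table page -> count = 1
 * TABLE_POST visit: page_count(childp) == 1, so clear the parent entry,
 *                   put_page(ptep) on the parent table page and
 *                   put_page(childp) -> count = 0, table page freed
 */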
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
struct kvm_pgtable_mm_ops *mm_ops)
{
......@@ -482,8 +536,16 @@ static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flag, void * const arg)
{
struct kvm_pgtable_mm_ops *mm_ops = arg;
kvm_pte_t pte = *ptep;
if (!kvm_pte_valid(pte))
return 0;
mm_ops->put_page(ptep);
if (kvm_pte_table(pte, level))
mm_ops->put_page(kvm_pte_follow(pte, mm_ops));
mm_ops->put_page((void *)kvm_pte_follow(*ptep, mm_ops));
return 0;
}
......@@ -491,7 +553,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
{
struct kvm_pgtable_walker walker = {
.cb = hyp_free_walker,
.flags = KVM_PGTABLE_WALK_TABLE_POST,
.flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
.arg = pgt->mm_ops,
};
......
......@@ -284,14 +284,117 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
}
}
static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end)
struct hyp_shared_pfn {
u64 pfn;
int count;
struct rb_node node;
};
static DEFINE_MUTEX(hyp_shared_pfns_lock);
static struct rb_root hyp_shared_pfns = RB_ROOT;
static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
struct rb_node **parent)
{
phys_addr_t addr;
struct hyp_shared_pfn *this;
*node = &hyp_shared_pfns.rb_node;
*parent = NULL;
while (**node) {
this = container_of(**node, struct hyp_shared_pfn, node);
*parent = **node;
if (this->pfn < pfn)
*node = &((**node)->rb_left);
else if (this->pfn > pfn)
*node = &((**node)->rb_right);
else
return this;
}
return NULL;
}
static int share_pfn_hyp(u64 pfn)
{
struct rb_node **node, *parent;
struct hyp_shared_pfn *this;
int ret = 0;
mutex_lock(&hyp_shared_pfns_lock);
this = find_shared_pfn(pfn, &node, &parent);
if (this) {
this->count++;
goto unlock;
}
this = kzalloc(sizeof(*this), GFP_KERNEL);
if (!this) {
ret = -ENOMEM;
goto unlock;
}
this->pfn = pfn;
this->count = 1;
rb_link_node(&this->node, parent, node);
rb_insert_color(&this->node, &hyp_shared_pfns);
ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
unlock:
mutex_unlock(&hyp_shared_pfns_lock);
return ret;
}
static int unshare_pfn_hyp(u64 pfn)
{
struct rb_node **node, *parent;
struct hyp_shared_pfn *this;
int ret = 0;
mutex_lock(&hyp_shared_pfns_lock);
this = find_shared_pfn(pfn, &node, &parent);
if (WARN_ON(!this)) {
ret = -ENOENT;
goto unlock;
}
this->count--;
if (this->count)
goto unlock;
rb_erase(&this->node, &hyp_shared_pfns);
kfree(this);
ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
unlock:
mutex_unlock(&hyp_shared_pfns_lock);
return ret;
}
int kvm_share_hyp(void *from, void *to)
{
phys_addr_t start, end, cur;
u64 pfn;
int ret;
for (addr = ALIGN_DOWN(start, PAGE_SIZE); addr < end; addr += PAGE_SIZE) {
ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp,
__phys_to_pfn(addr));
if (is_kernel_in_hyp_mode())
return 0;
/*
* The share hcall maps things in the 'fixed-offset' region of the hyp
* VA space, so we can only share physically contiguous data-structures
* for now.
*/
if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
return -EINVAL;
if (kvm_host_owns_hyp_mappings())
return create_hyp_mappings(from, to, PAGE_HYP);
start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
end = PAGE_ALIGN(__pa(to));
for (cur = start; cur < end; cur += PAGE_SIZE) {
pfn = __phys_to_pfn(cur);
ret = share_pfn_hyp(pfn);
if (ret)
return ret;
}
......@@ -299,6 +402,22 @@ static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end)
return 0;
}
void kvm_unshare_hyp(void *from, void *to)
{
phys_addr_t start, end, cur;
u64 pfn;
if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
return;
start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
end = PAGE_ALIGN(__pa(to));
for (cur = start; cur < end; cur += PAGE_SIZE) {
pfn = __phys_to_pfn(cur);
WARN_ON(unshare_pfn_hyp(pfn));
}
}
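Together, the two helpers are meant to bracket the lifetime of any physically contiguous kernel object the hypervisor needs to read, exactly as the kvm_arch_init_vm(), kvm_arch_vcpu_create() and kvm_arm_vcpu_destroy() changes elsewhere in this diff do. A hedged usage sketch with a placeholder object type and functions (nothing below is part of this series):

/* Sketch only: my_hyp_visible and both helpers are placeholders. */
static int create_hyp_visible_obj(struct my_hyp_visible **out)
{
	struct my_hyp_visible *obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	int ret;

	if (!obj)
		return -ENOMEM;

	ret = kvm_share_hyp(obj, obj + 1);	/* refcounted per pfn under pKVM */
	if (ret) {
		kfree(obj);
		return ret;
	}

	*out = obj;
	return 0;
}

static void destroy_hyp_visible_obj(struct my_hyp_visible *obj)
{
	kvm_unshare_hyp(obj, obj + 1);	/* must precede freeing or reusing the pages */
	kfree(obj);
}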
/**
* create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
* @from: The virtual kernel start address of the range
......@@ -319,12 +438,8 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
if (is_kernel_in_hyp_mode())
return 0;
if (!kvm_host_owns_hyp_mappings()) {
if (WARN_ON(prot != PAGE_HYP))
return -EPERM;
return pkvm_share_hyp(kvm_kaddr_to_phys(from),
kvm_kaddr_to_phys(to));
}
if (!kvm_host_owns_hyp_mappings())
return -EPERM;
start = start & PAGE_MASK;
end = PAGE_ALIGN(end);
......
......@@ -113,7 +113,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu)
if (!buf)
return -ENOMEM;
ret = create_hyp_mappings(buf, buf + reg_sz, PAGE_HYP);
ret = kvm_share_hyp(buf, buf + reg_sz);
if (ret) {
kfree(buf);
return ret;
......@@ -150,7 +150,13 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu)
void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
kfree(vcpu->arch.sve_state);
void *sve_state = vcpu->arch.sve_state;
kvm_vcpu_unshare_task_fp(vcpu);
kvm_unshare_hyp(vcpu, vcpu + 1);
if (sve_state)
kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu));
kfree(sve_state);
}
static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu)
......