Commit f587661f authored by Oliver Upton, committed by Marc Zyngier

KVM: arm64: Don't split hugepages outside of MMU write lock

It is possible to take a stage-2 permission fault on a page larger than
PAGE_SIZE. For example, when running a guest backed by 2M HugeTLB, KVM
eagerly maps at the largest possible block size. When dirty logging is
enabled on a memslot, KVM does *not* eagerly split these 2M stage-2
mappings and instead clears the write bit on the pte.
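
For context, a simplified sketch of the write-protect path taken when dirty
logging is enabled (based on the arm64 stage-2 page-table API around the time
of this commit; names and exact signatures are from memory and may differ
across kernel versions): the memslot range is write-protected in place, so
existing 2M block mappings keep their size and only lose write permission.

  /*
   * Simplified sketch, not verbatim kernel code: enabling dirty logging
   * write-protects the existing stage-2 mappings in place.  Block (e.g. 2M)
   * mappings are left at their original size; only the write permission is
   * removed, so the first write fault afterwards has to split them.
   */
  static void sketch_wp_memslot(struct kvm_s2_mmu *mmu,
                                phys_addr_t start, phys_addr_t end)
  {
          /* Walks the stage-2 tables and clears the write permission. */
          kvm_pgtable_stage2_wrprotect(mmu->pgt, start, end - start);
  }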

Since dirty logging is always performed at PAGE_SIZE granularity, KVM
lazily splits these 2M block mappings down to PAGE_SIZE in the stage-2
fault handler. This operation must be done under the write lock. Since
commit f783ef1c ("KVM: arm64: Add fast path to handle permission
relaxation during dirty logging"), the stage-2 fault handler
conditionally takes the read lock on permission faults with dirty
logging enabled. As a result, it is possible to split a 2M block mapping
while only holding the read lock.
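
Condensed, the pre-fix logic in user_mem_abort() looked roughly like this (a
sketch reconstructed from the diff below, not verbatim kernel code): the read
lock was taken for any write permission fault under dirty logging, even when
the faulting mapping was still a 2M block and therefore needed a map/split
rather than a simple permission relaxation.

  if (logging_active) {
          force_pte = true;
          vma_shift = PAGE_SHIFT;
          /* Taken for any write permission fault, regardless of granule... */
          logging_perm_fault = (fault_status == FSC_PERM && write_fault);
  }

  if (logging_perm_fault)
          read_lock(&kvm->mmu_lock);      /* fast path */
  else
          write_lock(&kvm->mmu_lock);

  if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
          /* PAGE_SIZE permission relaxation: safe under the read lock. */
          ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
  } else {
          /*
           * ...but a permission fault on a 2M block lands here and splits
           * the block via kvm_pgtable_stage2_map() while holding only the
           * read lock.
           */
          ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
                                       __pfn_to_phys(pfn), prot, memcache);
  }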

The problem is demonstrated by running kvm_page_table_test with 2M
anonymous HugeTLB, which splats like so:

  WARNING: CPU: 5 PID: 15276 at arch/arm64/kvm/hyp/pgtable.c:153 stage2_map_walk_leaf+0x124/0x158

  [...]

  Call trace:
  stage2_map_walk_leaf+0x124/0x158
  stage2_map_walker+0x5c/0xf0
  __kvm_pgtable_walk+0x100/0x1d4
  __kvm_pgtable_walk+0x140/0x1d4
  __kvm_pgtable_walk+0x140/0x1d4
  kvm_pgtable_walk+0xa0/0xf8
  kvm_pgtable_stage2_map+0x15c/0x198
  user_mem_abort+0x56c/0x838
  kvm_handle_guest_abort+0x1fc/0x2a4
  handle_exit+0xa4/0x120
  kvm_arch_vcpu_ioctl_run+0x200/0x448
  kvm_vcpu_ioctl+0x588/0x664
  __arm64_sys_ioctl+0x9c/0xd4
  invoke_syscall+0x4c/0x144
  el0_svc_common+0xc4/0x190
  do_el0_svc+0x30/0x8c
  el0_svc+0x28/0xcc
  el0t_64_sync_handler+0x84/0xe4
  el0t_64_sync+0x1a4/0x1a8

Fix the issue by taking the read lock only if the guest faulted on a
PAGE_SIZE granule with dirty logging enabled. Add a WARN to catch locking
bugs in future changes.
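
Condensed, the post-fix logic (again a sketch of the diff below, not verbatim
kernel code): the read lock is used only when fault_granule is already
PAGE_SIZE, i.e. the block has already been split and relaxing permissions is
sufficient; reaching the map/split path under the read lock now warns.

  /* Only the PAGE_SIZE permission-relaxation case may use the read lock. */
  use_read_lock = (fault_status == FSC_PERM && write_fault &&
                   fault_granule == PAGE_SIZE);

  /* Later, on the map/split path, catch any future locking regression: */
  WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");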

Fixes: f783ef1c ("KVM: arm64: Add fast path to handle permission relaxation during dirty logging")
Cc: Jing Zhang <jingzhangos@google.com>
Signed-off-by: Oliver Upton <oupton@google.com>
Reviewed-by: Reiji Watanabe <reijiw@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20220401194652.950240-1-oupton@google.com
parent 73b725c7
...@@ -1079,7 +1079,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -1079,7 +1079,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
gfn_t gfn; gfn_t gfn;
kvm_pfn_t pfn; kvm_pfn_t pfn;
bool logging_active = memslot_is_logging(memslot); bool logging_active = memslot_is_logging(memslot);
bool logging_perm_fault = false; bool use_read_lock = false;
unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu); unsigned long fault_level = kvm_vcpu_trap_get_fault_level(vcpu);
unsigned long vma_pagesize, fault_granule; unsigned long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
...@@ -1114,7 +1114,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -1114,7 +1114,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (logging_active) { if (logging_active) {
force_pte = true; force_pte = true;
vma_shift = PAGE_SHIFT; vma_shift = PAGE_SHIFT;
logging_perm_fault = (fault_status == FSC_PERM && write_fault); use_read_lock = (fault_status == FSC_PERM && write_fault &&
fault_granule == PAGE_SIZE);
} else { } else {
vma_shift = get_vma_page_shift(vma, hva); vma_shift = get_vma_page_shift(vma, hva);
} }
...@@ -1218,7 +1219,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -1218,7 +1219,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* logging dirty logging, only acquire read lock for permission * logging dirty logging, only acquire read lock for permission
* relaxation. * relaxation.
*/ */
if (logging_perm_fault) if (use_read_lock)
read_lock(&kvm->mmu_lock); read_lock(&kvm->mmu_lock);
else else
write_lock(&kvm->mmu_lock); write_lock(&kvm->mmu_lock);
...@@ -1268,6 +1269,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -1268,6 +1269,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (fault_status == FSC_PERM && vma_pagesize == fault_granule) { if (fault_status == FSC_PERM && vma_pagesize == fault_granule) {
ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
} else { } else {
WARN_ONCE(use_read_lock, "Attempted stage-2 map outside of write lock\n");
ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
__pfn_to_phys(pfn), prot, __pfn_to_phys(pfn), prot,
memcache); memcache);
...@@ -1280,7 +1283,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, ...@@ -1280,7 +1283,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
} }
out_unlock: out_unlock:
if (logging_perm_fault) if (use_read_lock)
read_unlock(&kvm->mmu_lock); read_unlock(&kvm->mmu_lock);
else else
write_unlock(&kvm->mmu_lock); write_unlock(&kvm->mmu_lock);
......