Commit bb95dfb9 authored by Sean Christopherson, committed by Paolo Bonzini

KVM: x86/mmu: Defer TLB flush to caller when freeing TDP MMU shadow pages

Defer TLB flushes to the caller when freeing TDP MMU shadow pages instead
of immediately flushing.  Because the shadow pages are freed in an RCU
callback, so long as at least one CPU holds RCU, all CPUs are protected.
For vCPUs running in the guest, i.e. consuming TLB entries, KVM only
needs to ensure the caller services the pending TLB flush before dropping
its RCU protections.  I.e. use the caller's RCU as a proxy for all vCPUs
running in the guest.
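
A minimal sketch of that caller-side contract (illustrative only, not code
from this patch: hypothetical_zap_subtree() is a made-up stand-in for
whatever zap helper queues SPs via call_rcu(), while rcu_read_lock(),
rcu_read_unlock(), call_rcu() and kvm_flush_remote_tlbs() are the real
primitives):

        /*
         * Hypothetical zap helper: queues freed shadow pages via call_rcu()
         * and returns true if a TLB flush is now pending.
         */
        static bool hypothetical_zap_subtree(struct kvm *kvm, gfn_t start, gfn_t end);

        static void zap_and_flush_example(struct kvm *kvm, gfn_t start, gfn_t end)
        {
                bool flush;

                /*
                 * SPs zapped below are freed via call_rcu(), i.e. cannot be
                 * reused until all current RCU readers, including this
                 * caller, exit their read-side critical sections.
                 */
                rcu_read_lock();

                flush = hypothetical_zap_subtree(kvm, start, end);

                /*
                 * Service the pending flush before dropping RCU so that no
                 * vCPU can still hold stale TLB entries by the time the RCU
                 * callback is allowed to free and reuse the pages.
                 */
                if (flush)
                        kvm_flush_remote_tlbs(kvm);

                rcu_read_unlock();
        }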

Deferring the flushes allows batching flushes, e.g. when installing a
1gb hugepage and zapping a pile of SPs.  And when zapping an entire root,
deferring flushes allows skipping the flush entirely (because flushes are
not needed in that case).
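
Batched zapping then boils down to the shape below (again just a sketch;
hypothetical_zap_one_sp() is a stand-in, kvm_flush_remote_tlbs() is real),
where a single flush at the end covers every SP zapped in the loop:

        static bool hypothetical_zap_one_sp(struct kvm *kvm, struct kvm_mmu_page *sp);

        static void batched_zap_example(struct kvm *kvm,
                                        struct kvm_mmu_page **sps, int nr_sps)
        {
                bool flush = false;
                int i;

                rcu_read_lock();

                /* Accumulate the need to flush instead of flushing per SP. */
                for (i = 0; i < nr_sps; i++)
                        flush |= hypothetical_zap_one_sp(kvm, sps[i]);

                /* One flush (or none) instead of an IPI round-trip per SP. */
                if (flush)
                        kvm_flush_remote_tlbs(kvm);

                rcu_read_unlock();
        }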

Avoiding flushes when zapping an entire root is especially important as
synchronizing with other CPUs via IPI after zapping every shadow page can
cause significant performance issues for large VMs.  The issue is
exacerbated by KVM zapping entire top-level entries without dropping
RCU protection, which can lead to RCU stalls even when zapping roots
backing relatively "small" amounts of guest memory, e.g. 2tb.  Removing
the IPI bottleneck largely mitigates the RCU issues, though it's likely
still a problem for 5-level paging.  A future patch will further address
the problem by zapping roots in multiple passes to avoid holding RCU for
an extended duration.
Reviewed-by: Ben Gardon <bgardon@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220226001546.360188-20-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent bd296779
@@ -6360,6 +6360,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
         rcu_idx = srcu_read_lock(&kvm->srcu);
         write_lock(&kvm->mmu_lock);
 
+        /*
+         * Zapping TDP MMU shadow pages, including the remote TLB flush, must
+         * be done under RCU protection, because the pages are freed via RCU
+         * callback.
+         */
+        rcu_read_lock();
+
         ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
         to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
         for ( ; to_zap; --to_zap) {
@@ -6384,12 +6391,18 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
 
                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
                         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+                        rcu_read_unlock();
+
                         cond_resched_rwlock_write(&kvm->mmu_lock);
                         flush = false;
+
+                        rcu_read_lock();
                 }
         }
         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
 
+        rcu_read_unlock();
+
         write_unlock(&kvm->mmu_lock);
         srcu_read_unlock(&kvm->srcu, rcu_idx);
 }
...
@@ -9,10 +9,9 @@
 /*
  * TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
- * to be zapped while holding mmu_lock for read. Holding RCU isn't required for
- * correctness if mmu_lock is held for write, but plumbing "struct kvm" down to
- * the lower depths of the TDP MMU just to make lockdep happy is a nightmare, so
- * all accesses to SPTEs are done under RCU protection.
+ * to be zapped while holding mmu_lock for read, and to allow TLB flushes to be
+ * batched without having to collect the list of zapped SPs. Flows that can
+ * remove SPs must service pending TLB flushes prior to dropping RCU protection.
  */
 static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
 {
...
@@ -391,9 +391,6 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
                                     shared);
         }
 
-        kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
-                                           KVM_PAGES_PER_HPAGE(level + 1));
-
         call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
 }
@@ -817,19 +814,13 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
         if (WARN_ON_ONCE(!sp->ptep))
                 return false;
 
-        rcu_read_lock();
-
         old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
-        if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte))) {
-                rcu_read_unlock();
+        if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
                 return false;
-        }
 
         __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
                            sp->gfn, sp->role.level + 1, true, true);
 
-        rcu_read_unlock();
-
         return true;
 }
@@ -870,6 +861,11 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
         }
 
         rcu_read_unlock();
+
+        /*
+         * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
+         * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
+         */
         return flush;
 }
@@ -1036,6 +1032,10 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
                 ret = RET_PF_SPURIOUS;
         else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
                 return RET_PF_RETRY;
+        else if (is_shadow_present_pte(iter->old_spte) &&
+                 !is_last_spte(iter->old_spte, iter->level))
+                kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
+                                                   KVM_PAGES_PER_HPAGE(iter->level + 1));
 
         /*
          * If the page fault was caused by a write but the page is write
...