Commit 22b94c4b authored by Paolo Bonzini

KVM: x86/mmu: Zap invalidated roots via asynchronous worker

Use the system worker threads to zap the roots invalidated
by the TDP MMU's "fast zap" mechanism, implemented by
kvm_tdp_mmu_invalidate_all_roots().

At this point, apart from allowing some parallelism in the zapping of
roots, the workqueue is a glorified linked list: work items are added and
flushed entirely within a single kvm->slots_lock critical section.  However,
the workqueue fixes a latent issue where kvm_tdp_mmu_zap_invalidated_roots()
assumes that it owns a reference to all invalid roots; therefore, no
one can set the invalid bit outside kvm_mmu_zap_all_fast().  Putting the
invalidated roots on a linked list... erm, on a workqueue ensures that
tdp_mmu_zap_root_work() only puts back those extra references that
kvm_tdp_mmu_invalidate_all_roots() had gifted to it.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent bb95dfb9
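
For background, the sketch below distills the pattern the commit adopts, outside of KVM: each invalidated object embeds its own work_struct, whoever marks the object invalid gifts the worker an extra reference, and "zap all invalidated objects" reduces to flushing a dedicated workqueue. This is an illustration only, not code from the commit; the demo_* names are hypothetical, and the kfree()-based teardown stands in for the real code's RCU-deferred freeing and mmu_lock-protected zapping.

/*
 * Illustrative sketch only (not part of the commit): a per-object work item
 * that is gifted a reference when the object is invalidated, mirroring the
 * shape of tdp_mmu_schedule_zap_root()/tdp_mmu_zap_root_work().
 */
#include <linux/workqueue.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/bug.h>

struct demo_root {
        refcount_t refcount;
        bool invalid;
        struct work_struct zap_work;    /* plays the role of tdp_mmu_async_work */
};

/*
 * Plays the role of kvm->arch.tdp_mmu_zap_wq; assumed to be allocated once at
 * init, e.g. demo_zap_wq = alloc_workqueue("demo_zap", WQ_UNBOUND, 0);
 */
static struct workqueue_struct *demo_zap_wq;

static void demo_put_root(struct demo_root *root)
{
        /* The real code defers the free via RCU; kfree() keeps the sketch short. */
        if (refcount_dec_and_test(&root->refcount))
                kfree(root);
}

static void demo_zap_work_fn(struct work_struct *work)
{
        struct demo_root *root = container_of(work, struct demo_root, zap_work);

        /* ... tear down the object's state here ... */

        /* Put back only the reference gifted by demo_invalidate_root(). */
        demo_put_root(root);
}

/* Mark one root invalid and hand the worker a gifted reference. */
static void demo_invalidate_root(struct demo_root *root)
{
        if (WARN_ON_ONCE(!refcount_inc_not_zero(&root->refcount)))
                return;

        root->invalid = true;
        INIT_WORK(&root->zap_work, demo_zap_work_fn);
        queue_work(demo_zap_wq, &root->zap_work);
}

/* The "fast zap" then only has to wait for the queued work items. */
static void demo_zap_invalidated_roots(void)
{
        flush_workqueue(demo_zap_wq);
}

Because the work items are queued and flushed within the same critical section (kvm->slots_lock in the commit), the flush waits on exactly the roots invalidated in that pass, which is why no separate list of invalidated roots is needed.
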
@@ -15,6 +15,7 @@
 #include <linux/cpumask.h>
 #include <linux/irq_work.h>
 #include <linux/irq.h>
+#include <linux/workqueue.h>
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -1217,6 +1218,7 @@ struct kvm_arch {
         * the thread holds the MMU lock in write mode.
         */
        spinlock_t tdp_mmu_pages_lock;
+       struct workqueue_struct *tdp_mmu_zap_wq;
 #endif /* CONFIG_X86_64 */

        /*
...
@@ -5752,11 +5752,8 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
         * Deferring the zap until the final reference to the root is put would
         * lead to use-after-free.
         */
-       if (is_tdp_mmu_enabled(kvm)) {
-               read_lock(&kvm->mmu_lock);
+       if (is_tdp_mmu_enabled(kvm))
                kvm_tdp_mmu_zap_invalidated_roots(kvm);
-               read_unlock(&kvm->mmu_lock);
-       }
 }

 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
...
@@ -65,7 +65,13 @@ struct kvm_mmu_page {
                struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
                tdp_ptep_t ptep;
        };
-       DECLARE_BITMAP(unsync_child_bitmap, 512);
+       union {
+               DECLARE_BITMAP(unsync_child_bitmap, 512);
+               struct {
+                       struct work_struct tdp_mmu_async_work;
+                       void *tdp_mmu_async_data;
+               };
+       };

        struct list_head lpage_disallowed_link;
 #ifdef CONFIG_X86_32
...
@@ -25,6 +25,8 @@ bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
        spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
+       kvm->arch.tdp_mmu_zap_wq =
+               alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);

        return true;
 }
@@ -46,12 +48,16 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
        if (!kvm->arch.tdp_mmu_enabled)
                return;

+       flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
+       destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
+
        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

        /*
         * Ensure that all the outstanding RCU callbacks to free shadow pages
-        * can run before the VM is torn down.
+        * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
+        * can call kvm_tdp_mmu_put_root and create new callbacks.
         */
        rcu_barrier();
 }
@@ -81,6 +87,43 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
                             bool shared);

+static void tdp_mmu_zap_root_work(struct work_struct *work)
+{
+       struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
+                                                tdp_mmu_async_work);
+       struct kvm *kvm = root->tdp_mmu_async_data;
+
+       read_lock(&kvm->mmu_lock);
+
+       /*
+        * A TLB flush is not necessary as KVM performs a local TLB flush when
+        * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
+        * to a different pCPU.  Note, the local TLB flush on reuse also
+        * invalidates any paging-structure-cache entries, i.e. TLB entries for
+        * intermediate paging structures, that may be zapped, as such entries
+        * are associated with the ASID on both VMX and SVM.
+        */
+       tdp_mmu_zap_root(kvm, root, true);
+
+       /*
+        * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
+        * avoiding an infinite loop.  By design, the root is reachable while
+        * it's being asynchronously zapped, thus a different task can put its
+        * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
+        * asynchronously zapped root is unavoidable.
+        */
+       kvm_tdp_mmu_put_root(kvm, root, true);
+
+       read_unlock(&kvm->mmu_lock);
+}
+
+static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
+{
+       root->tdp_mmu_async_data = kvm;
+       INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
+       queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
+}
+
 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
                          bool shared)
 {
@@ -892,6 +935,13 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
        int i;

        /*
+        * Zap all roots, including invalid roots, as all SPTEs must be dropped
+        * before returning to the caller.  Zap directly even if the root is
+        * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
+        * all that expensive and mmu_lock is already held, which means the
+        * worker has yielded, i.e. flushing the work instead of zapping here
+        * isn't guaranteed to be any faster.
+        *
         * A TLB flush is unnecessary, KVM zaps everything if and only the VM
         * is being destroyed or the userspace VMM has exited.  In both cases,
         * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
@@ -902,96 +952,28 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
        }
 }

-static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
-                                                 struct kvm_mmu_page *prev_root)
-{
-       struct kvm_mmu_page *next_root;
-
-       if (prev_root)
-               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
-                                                 &prev_root->link,
-                                                 typeof(*prev_root), link);
-       else
-               next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
-                                                  typeof(*next_root), link);
-
-       while (next_root && !(next_root->role.invalid &&
-                             refcount_read(&next_root->tdp_mmu_root_count)))
-               next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
-                                                 &next_root->link,
-                                                 typeof(*next_root), link);
-
-       return next_root;
-}
-
 /*
  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
- * zap" completes.  Since kvm_tdp_mmu_invalidate_all_roots() has acquired a
- * reference to each invalidated root, roots will not be freed until after this
- * function drops the gifted reference, e.g. so that vCPUs don't get stuck with
- * tearing down paging structures.
+ * zap" completes.
  */
 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
 {
-       struct kvm_mmu_page *next_root;
-       struct kvm_mmu_page *root;
-
-       lockdep_assert_held_read(&kvm->mmu_lock);
-
-       rcu_read_lock();
-
-       root = next_invalidated_root(kvm, NULL);
-
-       while (root) {
-               next_root = next_invalidated_root(kvm, root);
-
-               rcu_read_unlock();
-
-               /*
-                * A TLB flush is unnecessary, invalidated roots are guaranteed
-                * to be unreachable by the guest (see kvm_tdp_mmu_put_root()
-                * for more details), and unlike the legacy MMU, no vCPU kick
-                * is needed to play nice with lockless shadow walks as the TDP
-                * MMU protects its paging structures via RCU.  Note, zapping
-                * will still flush on yield, but that's a minor performance
-                * blip and not a functional issue.
-                */
-               tdp_mmu_zap_root(kvm, root, true);
-
-               /*
-                * Put the reference acquired in
-                * kvm_tdp_mmu_invalidate_roots
-                */
-               kvm_tdp_mmu_put_root(kvm, root, true);
-
-               root = next_root;
-
-               rcu_read_lock();
-       }
-
-       rcu_read_unlock();
+       flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
 }

 /*
  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
- * is about to be zapped, e.g. in response to a memslots update.  The caller is
- * responsible for invoking kvm_tdp_mmu_zap_invalidated_roots() to do the actual
- * zapping.
- *
- * Take a reference on all roots to prevent the root from being freed before it
- * is zapped by this thread.  Freeing a root is not a correctness issue, but if
- * a vCPU drops the last reference to a root prior to the root being zapped, it
- * will get stuck with tearing down the entire paging structure.
+ * is about to be zapped, e.g. in response to a memslots update.  The actual
+ * zapping is performed asynchronously, so a reference is taken on all roots.
+ * Using a separate workqueue makes it easy to ensure that the destruction is
+ * performed before the "fast zap" completes, without keeping a separate list
+ * of invalidated roots; the list is effectively the list of work items in
+ * the workqueue.
  *
- * Get a reference even if the root is already invalid,
- * kvm_tdp_mmu_zap_invalidated_roots() assumes it was gifted a reference to all
- * invalid roots, e.g. there's no epoch to identify roots that were invalidated
- * by a previous call.  Roots stay on the list until the last reference is
- * dropped, so even though all invalid roots are zapped, a root may not go away
- * for quite some time, e.g. if a vCPU blocks across multiple memslot updates.
- *
- * Because mmu_lock is held for write, it should be impossible to observe a
- * root with zero refcount, i.e. the list of roots cannot be stale.
+ * Get a reference even if the root is already invalid, the asynchronous worker
+ * assumes it was gifted a reference to the root it processes.  Because mmu_lock
+ * is held for write, it should be impossible to observe a root with zero refcount,
+ * i.e. the list of roots cannot be stale.
  *
  * This has essentially the same effect for the TDP MMU
  * as updating mmu_valid_gen does for the shadow MMU.
@@ -1002,8 +984,10 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
        lockdep_assert_held_write(&kvm->mmu_lock);
        list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
-               if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+               if (!WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
                        root->role.invalid = true;
+                       tdp_mmu_schedule_zap_root(kvm, root);
+               }
        }
 }
...