Commit c2a2ac2b authored by Xiao Guangrong's avatar Xiao Guangrong Committed by Avi Kivity

KVM: MMU: lockless walking shadow page table

Use rcu to protect shadow pages table to be freed, so we can safely walk it,
it should run fastly and is needed by mmio page fault
Signed-off-by: default avatarXiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: default avatarAvi Kivity <avi@redhat.com>
parent 603e0651
......@@ -233,6 +233,12 @@ struct kvm_mmu_page {
unsigned int unsync_children;
unsigned long parent_ptes; /* Reverse mapping for parent_pte */
DECLARE_BITMAP(unsync_child_bitmap, 512);
#ifdef CONFIG_X86_32
int clear_spte_count;
#endif
struct rcu_head rcu;
};
struct kvm_pv_mmu_op_buffer {
......@@ -486,6 +492,8 @@ struct kvm_arch {
u64 hv_guest_os_id;
u64 hv_hypercall;
atomic_t reader_counter;
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
......
......@@ -182,6 +182,12 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
for (shadow_walk_init(&(_walker), _vcpu, _addr); \
shadow_walk_okay(&(_walker)) && \
({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
__shadow_walk_next(&(_walker), spte))
static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;
......@@ -274,6 +280,11 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
return xchg(sptep, spte);
}
static u64 __get_spte_lockless(u64 *sptep)
{
return ACCESS_ONCE(*sptep);
}
#else
union split_spte {
struct {
......@@ -283,6 +294,18 @@ union split_spte {
u64 spte;
};
static void count_spte_clear(u64 *sptep, u64 spte)
{
struct kvm_mmu_page *sp = page_header(__pa(sptep));
if (is_shadow_present_pte(spte))
return;
/* Ensure the spte is completely set before we increase the count */
smp_wmb();
sp->clear_spte_count++;
}
static void __set_spte(u64 *sptep, u64 spte)
{
union split_spte *ssptep, sspte;
......@@ -318,6 +341,7 @@ static void __update_clear_spte_fast(u64 *sptep, u64 spte)
smp_wmb();
ssptep->spte_high = sspte.spte_high;
count_spte_clear(sptep, spte);
}
static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
......@@ -330,9 +354,40 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
/* xchg acts as a barrier before the setting of the high bits */
orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
orig.spte_high = ssptep->spte_high = sspte.spte_high;
count_spte_clear(sptep, spte);
return orig.spte;
}
/*
* The idea using the light way get the spte on x86_32 guest is from
* gup_get_pte(arch/x86/mm/gup.c).
* The difference is we can not catch the spte tlb flush if we leave
* guest mode, so we emulate it by increase clear_spte_count when spte
* is cleared.
*/
static u64 __get_spte_lockless(u64 *sptep)
{
struct kvm_mmu_page *sp = page_header(__pa(sptep));
union split_spte spte, *orig = (union split_spte *)sptep;
int count;
retry:
count = sp->clear_spte_count;
smp_rmb();
spte.spte_low = orig->spte_low;
smp_rmb();
spte.spte_high = orig->spte_high;
smp_rmb();
if (unlikely(spte.spte_low != orig->spte_low ||
count != sp->clear_spte_count))
goto retry;
return spte.spte;
}
#endif
static bool spte_has_volatile_bits(u64 spte)
......@@ -435,6 +490,28 @@ static void mmu_spte_clear_no_track(u64 *sptep)
__update_clear_spte_fast(sptep, 0ull);
}
static u64 mmu_spte_get_lockless(u64 *sptep)
{
return __get_spte_lockless(sptep);
}
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
rcu_read_lock();
atomic_inc(&vcpu->kvm->arch.reader_counter);
/* Increase the counter before walking shadow page table */
smp_mb__after_atomic_inc();
}
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
/* Decrease the counter after walking shadow page table finished */
smp_mb__before_atomic_dec();
atomic_dec(&vcpu->kvm->arch.reader_counter);
rcu_read_unlock();
}
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
......@@ -1597,17 +1674,23 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
return true;
}
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
u64 spte)
{
if (is_last_spte(*iterator->sptep, iterator->level)) {
if (is_last_spte(spte, iterator->level)) {
iterator->level = 0;
return;
}
iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
--iterator->level;
}
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
return __shadow_walk_next(iterator, *iterator->sptep);
}
static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
{
u64 spte;
......@@ -1754,6 +1837,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
return ret;
}
static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
{
struct kvm_mmu_page *sp;
list_for_each_entry(sp, invalid_list, link)
kvm_mmu_isolate_page(sp);
}
static void free_pages_rcu(struct rcu_head *head)
{
struct kvm_mmu_page *next, *sp;
sp = container_of(head, struct kvm_mmu_page, rcu);
while (sp) {
if (!list_empty(&sp->link))
next = list_first_entry(&sp->link,
struct kvm_mmu_page, link);
else
next = NULL;
kvm_mmu_free_page(sp);
sp = next;
}
}
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
{
......@@ -1764,6 +1871,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
if (atomic_read(&kvm->arch.reader_counter)) {
kvm_mmu_isolate_pages(invalid_list);
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
list_del_init(invalid_list);
call_rcu(&sp->rcu, free_pages_rcu);
return;
}
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp->role.invalid || sp->root_count);
......@@ -3784,16 +3899,17 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
{
struct kvm_shadow_walk_iterator iterator;
u64 spte;
int nr_sptes = 0;
spin_lock(&vcpu->kvm->mmu_lock);
for_each_shadow_entry(vcpu, addr, iterator) {
sptes[iterator.level-1] = *iterator.sptep;
walk_shadow_page_lockless_begin(vcpu);
for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
sptes[iterator.level-1] = spte;
nr_sptes++;
if (!is_shadow_present_pte(*iterator.sptep))
if (!is_shadow_present_pte(spte))
break;
}
spin_unlock(&vcpu->kvm->mmu_lock);
walk_shadow_page_lockless_end(vcpu);
return nr_sptes;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment