Commit cb00a70b authored by David Matlack, committed by Paolo Bonzini

KVM: x86/mmu: Split huge pages mapped by the TDP MMU during KVM_CLEAR_DIRTY_LOG

When using KVM_DIRTY_LOG_INITIALLY_SET, huge pages are not
write-protected when dirty logging is enabled on the memslot. Instead
they are write-protected once userspace invokes KVM_CLEAR_DIRTY_LOG for
the first time and only for the specific sub-region being cleared.

Enhance KVM_CLEAR_DIRTY_LOG to also try to split huge pages prior to
write-protecting to avoid causing write-protection faults on vCPU
threads. This also allows userspace to smear the cost of huge page
splitting across multiple ioctls, rather than splitting the entire
memslot as is the case when initially-all-set is not used.
Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20220119230739.2234394-17-dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent a3fe5dbd
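
For context, a minimal userspace sketch of the KVM_CLEAR_DIRTY_LOG call whose cleared sub-region is now eagerly split before being write-protected. The VM fd, slot number, and range are illustrative assumptions, not part of this patch:

/*
 * Hypothetical example: ask KVM to re-protect a 64-page-aligned sub-region
 * of memslot 0's dirty log. With KVM_DIRTY_LOG_INITIALLY_SET, this ioctl is
 * the point at which KVM now eagerly splits the huge pages backing the
 * cleared range before write-protecting them.
 */
#include <linux/kvm.h>
#include <string.h>
#include <sys/ioctl.h>

static int clear_dirty_range(int vm_fd, __u64 first_page, __u32 num_pages)
{
	__u64 bitmap[num_pages / 64];		/* one bit per page in the range */
	struct kvm_clear_dirty_log clear = {
		.slot = 0,
		.first_page = first_page,	/* must be a multiple of 64 */
		.num_pages = num_pages,
		.dirty_bitmap = bitmap,
	};

	/* A set bit requests clearing (and re-protection) of that page. */
	memset(bitmap, 0xff, sizeof(bitmap));

	return ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);
}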
@@ -2356,7 +2356,9 @@
 KVM_DIRTY_LOG_INITIALLY_SET is enabled or disabled. If
 disabled, all huge pages in a memslot will be eagerly
 split when dirty logging is enabled on that memslot. If
-enabled, huge pages will not be eagerly split.
+enabled, eager page splitting will be performed during
+the KVM_CLEAR_DIRTY_LOG ioctl, and only for the pages being
+cleared.
 
 Eager page splitting currently only supports splitting
 huge pages mapped by the TDP MMU.
...
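
The behaviour documented above is gated by the eager_page_split module parameter, which the x86.c and x86.h hunks below make visible to the MMU code. A minimal sketch, assuming the parameter is provided by kvm.ko and therefore lives at the usual sysfs path, for reading the current setting from userspace:

/*
 * Hypothetical helper: print the current value of the eager_page_split
 * module parameter. Because the parameter is registered with mode 0644,
 * writing "N" or "Y" to the same file toggles eager splitting at runtime.
 */
#include <stdio.h>

int main(void)
{
	char val[4] = "";
	FILE *f = fopen("/sys/module/kvm/parameters/eager_page_split", "r");

	if (!f) {
		perror("eager_page_split");
		return 1;
	}
	if (fgets(val, sizeof(val), f))
		printf("eager_page_split = %s", val);
	fclose(f);
	return 0;
}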
@@ -1590,6 +1590,10 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
                                        const struct kvm_memory_slot *memslot,
                                        int target_level);
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+                                  const struct kvm_memory_slot *memslot,
+                                  u64 start, u64 end,
+                                  int target_level);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                    const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
...
@@ -1358,6 +1358,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
                gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);
 
+               if (READ_ONCE(eager_page_split))
+                       kvm_mmu_try_split_huge_pages(kvm, slot, start, end, PG_LEVEL_4K);
+
                kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);
 
                /* Cross two large pages? */
@@ -5830,16 +5833,32 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
                kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 
+/* Must be called with the mmu_lock held in write-mode. */
+void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
+                                 const struct kvm_memory_slot *memslot,
+                                 u64 start, u64 end,
+                                 int target_level)
+{
+       if (is_tdp_mmu_enabled(kvm))
+               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
+                                                target_level, false);
+
+       /*
+        * A TLB flush is unnecessary at this point for the same reasons as in
+        * kvm_mmu_slot_try_split_huge_pages().
+        */
+}
+
 void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
                                       const struct kvm_memory_slot *memslot,
                                       int target_level)
 {
        u64 start = memslot->base_gfn;
        u64 end = start + memslot->npages;
 
        if (is_tdp_mmu_enabled(kvm)) {
                read_lock(&kvm->mmu_lock);
-               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level);
+               kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);
                read_unlock(&kvm->mmu_lock);
        }
...
@@ -963,27 +963,33 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 }
 
 /*
- * tdp_mmu_link_sp_atomic - Atomically replace the given spte with an spte
- * pointing to the provided page table.
+ * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
+ * provided page table.
  *
  * @kvm: kvm instance
  * @iter: a tdp_iter instance currently on the SPTE that should be set
  * @sp: The new TDP page table to install.
  * @account_nx: True if this page table is being installed to split a
  *              non-executable huge page.
+ * @shared: This operation is running under the MMU lock in read mode.
  *
  * Returns: 0 if the new page table was installed. Non-0 if the page table
  *          could not be installed (e.g. the atomic compare-exchange failed).
  */
-static int tdp_mmu_link_sp_atomic(struct kvm *kvm, struct tdp_iter *iter,
-                                 struct kvm_mmu_page *sp, bool account_nx)
+static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
+                          struct kvm_mmu_page *sp, bool account_nx,
+                          bool shared)
 {
        u64 spte = make_nonleaf_spte(sp->spt, !shadow_accessed_mask);
-       int ret;
+       int ret = 0;
 
-       ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
-       if (ret)
-               return ret;
+       if (shared) {
+               ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
+               if (ret)
+                       return ret;
+       } else {
+               tdp_mmu_set_spte(kvm, iter, spte);
+       }
 
        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
@@ -1051,7 +1057,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
                sp = tdp_mmu_alloc_sp(vcpu);
                tdp_mmu_init_child_sp(sp, &iter);
 
-               if (tdp_mmu_link_sp_atomic(vcpu->kvm, &iter, sp, account_nx)) {
+               if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
                        tdp_mmu_free_sp(sp);
                        break;
                }
@@ -1277,12 +1283,11 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
 }
 
 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
-                                                      struct tdp_iter *iter)
+                                                      struct tdp_iter *iter,
+                                                      bool shared)
 {
        struct kvm_mmu_page *sp;
 
-       lockdep_assert_held_read(&kvm->mmu_lock);
-
        /*
         * Since we are allocating while under the MMU lock we have to be
         * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
@@ -1297,20 +1302,27 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
                return sp;
 
        rcu_read_unlock();
-       read_unlock(&kvm->mmu_lock);
+
+       if (shared)
+               read_unlock(&kvm->mmu_lock);
+       else
+               write_unlock(&kvm->mmu_lock);
 
        iter->yielded = true;
        sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
 
-       read_lock(&kvm->mmu_lock);
+       if (shared)
+               read_lock(&kvm->mmu_lock);
+       else
+               write_lock(&kvm->mmu_lock);
+
        rcu_read_lock();
 
        return sp;
 }
 
-static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
-                                         struct tdp_iter *iter,
-                                         struct kvm_mmu_page *sp)
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+                                  struct kvm_mmu_page *sp, bool shared)
 {
        const u64 huge_spte = iter->old_spte;
        const int level = iter->level;
@@ -1333,7 +1345,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
         * correctness standpoint since the translation will be the same either
         * way.
         */
-       ret = tdp_mmu_link_sp_atomic(kvm, iter, sp, false);
+       ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
        if (ret)
                return ret;
@@ -1350,7 +1362,7 @@ static int tdp_mmu_split_huge_page_atomic(struct kvm *kvm,
 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
                                         struct kvm_mmu_page *root,
                                         gfn_t start, gfn_t end,
-                                        int target_level)
+                                        int target_level, bool shared)
 {
        struct kvm_mmu_page *sp = NULL;
        struct tdp_iter iter;
@@ -1371,14 +1383,14 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
         */
        for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
 retry:
-               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
+               if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
                        continue;
 
                if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
                        continue;
 
                if (!sp) {
-                       sp = tdp_mmu_alloc_sp_for_split(kvm, &iter);
+                       sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
                        if (!sp) {
                                ret = -ENOMEM;
                                break;
@@ -1388,7 +1400,7 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
                        continue;
                }
 
-               if (tdp_mmu_split_huge_page_atomic(kvm, &iter, sp))
+               if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
                        goto retry;
 
                sp = NULL;
@@ -1408,23 +1420,24 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
        return ret;
 }
 
 /*
  * Try to split all huge pages mapped by the TDP MMU down to the target level.
  */
 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
                                      const struct kvm_memory_slot *slot,
                                      gfn_t start, gfn_t end,
-                                     int target_level)
+                                     int target_level, bool shared)
 {
        struct kvm_mmu_page *root;
        int r = 0;
 
-       lockdep_assert_held_read(&kvm->mmu_lock);
+       kvm_lockdep_assert_mmu_lock_held(kvm, shared);
 
-       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) {
-               r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level);
+       for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
+               r = tdp_mmu_split_huge_pages_root(kvm, root, start, end,
+                                                 target_level, shared);
                if (r) {
-                       kvm_tdp_mmu_put_root(kvm, root, true);
+                       kvm_tdp_mmu_put_root(kvm, root, shared);
                        break;
                }
        }
...
@@ -70,7 +70,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
                                      const struct kvm_memory_slot *slot,
                                      gfn_t start, gfn_t end,
-                                     int target_level);
+                                     int target_level, bool shared);
 
 static inline void kvm_tdp_mmu_walk_lockless_begin(void)
 {
...
@@ -192,7 +192,7 @@ bool __read_mostly enable_pmu = true;
 EXPORT_SYMBOL_GPL(enable_pmu);
 module_param(enable_pmu, bool, 0444);
 
-static bool __read_mostly eager_page_split = true;
+bool __read_mostly eager_page_split = true;
 module_param(eager_page_split, bool, 0644);
 
 /*
...
@@ -307,6 +307,8 @@ extern int pi_inject_timer;
 
 extern bool report_ignored_msrs;
 
+extern bool eager_page_split;
+
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 {
        return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
...