Commit 6157ce59 authored by Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:
 "x86 has lots of small bugfixes, mostly one liners. It's quite late in
  5.11-rc but none of them are related to this merge window; it's just
  bugs coming in at the wrong time.

  Of note among the others is "KVM: x86: Allow guests to see
  MSR_IA32_TSX_CTRL even if tsx=off" that fixes a live migration failure
  seen on distros that hadn't switched to tsx=off right away.

  ARM:
  - Avoid clobbering extra registers on initialisation"

[ Sean Christopherson notes that commit 943dea8a ("KVM: x86: Update
  emulator context mode if SYSENTER xfers to 64-bit mode") should have
  had authorship credited to Jonny Barker, not to him.  - Linus ]

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86: Set so called 'reserved CR3 bits in LM mask' at vCPU reset
  KVM: x86/mmu: Fix TDP MMU zap collapsible SPTEs
  KVM: x86: cleanup CR3 reserved bits checks
  KVM: SVM: Treat SVM as unsupported when running as an SEV guest
  KVM: x86: Update emulator context mode if SYSENTER xfers to 64-bit mode
  KVM: x86: Supplement __cr4_reserved_bits() with X86_FEATURE_PCID check
  KVM/x86: assign hva with the right value to vm_munmap the pages
  KVM: x86: Allow guests to see MSR_IA32_TSX_CTRL even if tsx=off
  Fix unsynchronized access to sev members through svm_register_enc_region
  KVM: Documentation: Fix documentation for nested.
  KVM: x86: fix CPUID entries returned by KVM_GET_CPUID2 ioctl
  KVM: arm64: Don't clobber x4 in __do_hyp_init
parents 97ba0c74 031b91a5
@@ -37,8 +37,10 @@ call L2.
 Running nested VMX
 ------------------
-The nested VMX feature is disabled by default. It can be enabled by giving
-the "nested=1" option to the kvm-intel module.
+The nested VMX feature is enabled by default since Linux kernel v4.20. For
+older Linux kernel, it can be enabled by giving the "nested=1" option to the
+kvm-intel module.
 No modifications are required to user space (qemu). However, qemu's default
 emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be
......
@@ -74,7 +74,7 @@ few:
 Enabling "nested" (x86)
 -----------------------
-From Linux kernel v4.19 onwards, the ``nested`` KVM parameter is enabled
+From Linux kernel v4.20 onwards, the ``nested`` KVM parameter is enabled
 by default for Intel and AMD. (Though your Linux distribution might
 override this default.)
......
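Both documentation hunks describe the same knob: the "nested" module parameter. As a quick way to check what a running host actually has, here is a minimal userspace sketch (not part of this series; it only reads the standard module-parameter sysfs files, and the helper name is made up):

    /*
     * Hypothetical check, not part of this series: read the "nested"
     * module parameter for kvm_intel and kvm_amd from sysfs.
     */
    #include <stdio.h>
    #include <string.h>

    static void report_nested(const char *module)
    {
            char path[128], val[8] = "";
            FILE *f;

            snprintf(path, sizeof(path), "/sys/module/%s/parameters/nested", module);
            f = fopen(path, "r");
            if (!f) {
                    printf("%s: parameter not readable (module not loaded?)\n", module);
                    return;
            }
            if (fgets(val, sizeof(val), f))
                    val[strcspn(val, "\n")] = '\0';
            fclose(f);

            /* Recent kernels report "Y"/"N"; older ones report "1"/"0". */
            printf("%s: nested=%s\n", module, val);
    }

    int main(void)
    {
            report_nested("kvm_intel");
            report_nested("kvm_amd");
            return 0;
    }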
@@ -47,6 +47,8 @@ __invalid:
         b       .
 
         /*
+         * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers.
+         *
          * x0: SMCCC function ID
          * x1: struct kvm_nvhe_init_params PA
          */
@@ -70,9 +72,9 @@ __do_hyp_init:
         eret
 
 1:      mov     x0, x1
-        mov     x4, lr
-        bl      ___kvm_hyp_init
-        mov     lr, x4
+        mov     x3, lr
+        bl      ___kvm_hyp_init                 // Clobbers x0..x2
+        mov     lr, x3
 
         /* Hello, World! */
         mov     x0, #SMCCC_RET_SUCCESS
@@ -82,8 +84,8 @@ SYM_CODE_END(__kvm_hyp_init)
 /*
  * Initialize the hypervisor in EL2.
  *
- * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers
- * and leave x4 for the caller.
+ * Only uses x0..x2 so as to not clobber callee-saved SMCCC registers
+ * and leave x3 for the caller.
  *
  * x0: struct kvm_nvhe_init_params PA
  */
@@ -112,9 +114,9 @@ alternative_else_nop_endif
         /*
          * Set the PS bits in TCR_EL2.
          */
-        ldr     x1, [x0, #NVHE_INIT_TCR_EL2]
-        tcr_compute_pa_size x1, #TCR_EL2_PS_SHIFT, x2, x3
-        msr     tcr_el2, x1
+        ldr     x0, [x0, #NVHE_INIT_TCR_EL2]
+        tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2
+        msr     tcr_el2, x0
 
         isb
@@ -193,7 +195,7 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
         /* Enable MMU, set vectors and stack. */
         mov     x0, x28
-        bl      ___kvm_hyp_init                 // Clobbers x0..x3
+        bl      ___kvm_hyp_init                 // Clobbers x0..x2
 
         /* Leave idmap. */
         mov     x0, x29
......
@@ -321,7 +321,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
         if (cpuid->nent < vcpu->arch.cpuid_nent)
                 goto out;
         r = -EFAULT;
-        if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+        if (copy_to_user(entries, vcpu->arch.cpuid_entries,
                          vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
                 goto out;
         return 0;
......
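The one-character fix above matters because vcpu->arch.cpuid_entries is nowadays a pointer to a dynamically allocated array, so "&vcpu->arch.cpuid_entries" is the address of the pointer itself rather than of the entries. A stand-alone C sketch of the difference (illustrative names only, not KVM code):

    /*
     * Illustrative only: with a pointer, "&entries" copies the pointer's
     * own bytes (and whatever follows it), not the array it points to.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct entry { unsigned int function, index; };

    int main(void)
    {
            struct entry *entries = calloc(4, sizeof(*entries));
            struct entry out[4] = { 0 };

            entries[0].function = 0x8000001f;

            /* Correct: copy the array the pointer refers to. */
            memcpy(out, entries, 4 * sizeof(*entries));
            printf("function[0] = %#x\n", out[0].function);

            /*
             * Buggy variant, analogous to the old "&...cpuid_entries":
             * this copies the bytes of the pointer variable itself, so
             * the caller never sees the real entries.
             */
            memcpy(out, &entries, sizeof(entries));
            printf("after buggy copy: function[0] = %#x\n", out[0].function);

            free(entries);
            return 0;
    }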
@@ -2879,6 +2879,8 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
         ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
         *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data :
                                                               (u32)msr_data;
+        if (efer & EFER_LMA)
+                ctxt->mode = X86EMUL_MODE_PROT64;
 
         return X86EMUL_CONTINUE;
 }
......
@@ -1049,8 +1049,8 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
 }
 
 /*
- * Clear non-leaf entries (and free associated page tables) which could
- * be replaced by large mappings, for GFNs within the slot.
+ * Clear leaf entries which could be replaced by large mappings, for
+ * GFNs within the slot.
  */
 static void zap_collapsible_spte_range(struct kvm *kvm,
                                        struct kvm_mmu_page *root,
@@ -1062,7 +1062,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
         tdp_root_for_each_pte(iter, root, start, end) {
                 if (!is_shadow_present_pte(iter.old_spte) ||
-                    is_last_spte(iter.old_spte, iter.level))
+                    !is_last_spte(iter.old_spte, iter.level))
                         continue;
 
                 pfn = spte_to_pfn(iter.old_spte);
......
@@ -231,6 +231,7 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control)
 static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
 {
+        struct kvm_vcpu *vcpu = &svm->vcpu;
         bool vmcb12_lma;
 
         if ((vmcb12->save.efer & EFER_SVME) == 0)
@@ -244,18 +245,10 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
         vmcb12_lma = (vmcb12->save.efer & EFER_LME) && (vmcb12->save.cr0 & X86_CR0_PG);
 
-        if (!vmcb12_lma) {
-                if (vmcb12->save.cr4 & X86_CR4_PAE) {
-                        if (vmcb12->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK)
-                                return false;
-                } else {
-                        if (vmcb12->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK)
-                                return false;
-                }
-        } else {
+        if (vmcb12_lma) {
                 if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
                     !(vmcb12->save.cr0 & X86_CR0_PE) ||
-                    (vmcb12->save.cr3 & MSR_CR3_LONG_MBZ_MASK))
+                    (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits))
                         return false;
         }
         if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
......
@@ -342,6 +342,8 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
         unsigned long first, last;
         int ret;
 
+        lockdep_assert_held(&kvm->lock);
+
         if (ulen == 0 || uaddr + ulen < uaddr)
                 return ERR_PTR(-EINVAL);
@@ -1119,12 +1121,20 @@ int svm_register_enc_region(struct kvm *kvm,
         if (!region)
                 return -ENOMEM;
 
+        mutex_lock(&kvm->lock);
         region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
         if (IS_ERR(region->pages)) {
                 ret = PTR_ERR(region->pages);
+                mutex_unlock(&kvm->lock);
                 goto e_free;
         }
 
+        region->uaddr = range->addr;
+        region->size = range->size;
+        list_add_tail(&region->list, &sev->regions_list);
+        mutex_unlock(&kvm->lock);
+
         /*
          * The guest may change the memory encryption attribute from C=0 -> C=1
          * or vice versa for this memory range. Lets make sure caches are
@@ -1133,13 +1143,6 @@ int svm_register_enc_region(struct kvm *kvm,
          */
         sev_clflush_pages(region->pages, region->npages);
 
-        region->uaddr = range->addr;
-        region->size = range->size;
-
-        mutex_lock(&kvm->lock);
-        list_add_tail(&region->list, &sev->regions_list);
-        mutex_unlock(&kvm->lock);
-
         return ret;
 
 e_free:
......
@@ -454,6 +454,11 @@ static int has_svm(void)
                 return 0;
         }
 
+        if (sev_active()) {
+                pr_info("KVM is unsupported when running as an SEV guest\n");
+                return 0;
+        }
+
         return 1;
 }
......
@@ -403,9 +403,6 @@ static inline bool gif_set(struct vcpu_svm *svm)
 }
 
 /* svm.c */
-#define MSR_CR3_LEGACY_RESERVED_MASK            0xfe7U
-#define MSR_CR3_LEGACY_PAE_RESERVED_MASK        0x7U
-#define MSR_CR3_LONG_MBZ_MASK                   0xfff0000000000000U
 #define MSR_INVALID                             0xffffffffU
 
 extern int sev;
......
@@ -6860,11 +6860,20 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
                 switch (index) {
                 case MSR_IA32_TSX_CTRL:
                         /*
-                         * No need to pass TSX_CTRL_CPUID_CLEAR through, so
-                         * let's avoid changing CPUID bits under the host
-                         * kernel's feet.
+                         * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
+                         * interception. Keep the host value unchanged to avoid
+                         * changing CPUID bits under the host kernel's feet.
+                         *
+                         * hle=0, rtm=0, tsx_ctrl=1 can be found with some
+                         * combinations of new kernel and old userspace. If
+                         * those guests run on a tsx=off host, do allow guests
+                         * to use TSX_CTRL, but do not change the value on the
+                         * host so that TSX remains always disabled.
                          */
-                        vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+                        if (boot_cpu_has(X86_FEATURE_RTM))
+                                vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+                        else
+                                vmx->guest_uret_msrs[j].mask = 0;
                         break;
                 default:
                         vmx->guest_uret_msrs[j].mask = -1ull;
......
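The guest_uret_msrs mask decides which bits of a guest MSR write can actually reach the value loaded on the host; with a mask of 0, a guest on a tsx=off host can read and "write" TSX_CTRL without ever changing the host setting. A small stand-alone sketch of that masking idea (plain C, not the kernel's user-return-MSR code; TSX_CTRL bit 0 is RTM_DISABLE and bit 1 is CPUID_CLEAR):

    /* Illustrative only: merge a guest MSR write under a per-MSR write mask. */
    #include <stdint.h>
    #include <stdio.h>

    #define TSX_CTRL_RTM_DISABLE    (1ULL << 0)
    #define TSX_CTRL_CPUID_CLEAR    (1ULL << 1)

    static uint64_t apply_guest_write(uint64_t host_val, uint64_t guest_val,
                                      uint64_t mask)
    {
            /* Bits outside the mask keep the host value. */
            return (host_val & ~mask) | (guest_val & mask);
    }

    int main(void)
    {
            /* Host with RTM: guest may flip RTM_DISABLE but not CPUID_CLEAR. */
            uint64_t rtm_mask = ~(uint64_t)TSX_CTRL_CPUID_CLEAR;
            /* tsx=off host: mask 0 keeps TSX disabled whatever the guest writes. */
            uint64_t off_mask = 0;
            uint64_t tsx_off_host = TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR;

            printf("rtm host : %#llx\n", (unsigned long long)
                   apply_guest_write(0, TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR, rtm_mask));
            printf("tsx=off  : %#llx\n", (unsigned long long)
                   apply_guest_write(tsx_off_host, 0, off_mask));
            return 0;
    }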
@@ -1394,16 +1394,24 @@ static u64 kvm_get_arch_capabilities(void)
         if (!boot_cpu_has_bug(X86_BUG_MDS))
                 data |= ARCH_CAP_MDS_NO;
 
-        /*
-         * On TAA affected systems:
-         *      - nothing to do if TSX is disabled on the host.
-         *      - we emulate TSX_CTRL if present on the host.
-         *        This lets the guest use VERW to clear CPU buffers.
-         */
-        if (!boot_cpu_has(X86_FEATURE_RTM))
-                data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR);
-        else if (!boot_cpu_has_bug(X86_BUG_TAA))
+        if (!boot_cpu_has(X86_FEATURE_RTM)) {
+                /*
+                 * If RTM=0 because the kernel has disabled TSX, the host might
+                 * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
+                 * and therefore knows that there cannot be TAA) but keep
+                 * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
+                 * and we want to allow migrating those guests to tsx=off hosts.
+                 */
+                data &= ~ARCH_CAP_TAA_NO;
+        } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
                 data |= ARCH_CAP_TAA_NO;
+        } else {
+                /*
+                 * Nothing to do here; we emulate TSX_CTRL if present on the
+                 * host so the guest can choose between disabling TSX or
+                 * using VERW to clear CPU buffers.
+                 */
+        }
 
         return data;
 }
@@ -9616,6 +9624,8 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
                  */
                 if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
                         return false;
+                if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
+                        return false;
         } else {
                 /*
                  * Not in 64-bit mode: EFER.LMA is clear and the code
@@ -9993,6 +10003,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
         fx_init(vcpu);
 
         vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+        vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
 
         vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
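cr3_lm_rsvd_bits marks every CR3 bit at or above the vCPU's MAXPHYADDR as reserved, and the SREGS and SVM nested checks above reject any CR3 that sets one of them. A quick worked example (stand-alone C; the mask helper is illustrative, not KVM's rsvd_bits()):

    /* Illustrative only: the reserved CR3 mask for a given guest MAXPHYADDR. */
    #include <stdint.h>
    #include <stdio.h>

    /* Mask with bits s..e (inclusive) set, assuming 0 <= s <= e <= 63. */
    static uint64_t bit_range_mask(unsigned int s, unsigned int e)
    {
            return ((e - s) == 63) ? ~0ULL : (((1ULL << (e - s + 1)) - 1) << s);
    }

    int main(void)
    {
            unsigned int maxphyaddr = 48;   /* a typical value from CPUID */
            uint64_t cr3_rsvd = bit_range_mask(maxphyaddr, 63);

            /* For MAXPHYADDR=48 this prints 0xffff000000000000. */
            printf("CR3 long-mode reserved bits: %#llx\n",
                   (unsigned long long)cr3_rsvd);

            /* A CR3 with any of those bits set would fail the new checks. */
            uint64_t bogus_cr3 = 1ULL << 52;
            printf("bogus CR3 rejected: %s\n",
                   (bogus_cr3 & cr3_rsvd) ? "yes" : "no");
            return 0;
    }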
@@ -10494,7 +10505,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
                         return 0;
 
                 old_npages = slot->npages;
-                hva = 0;
+                hva = slot->userspace_addr;
         }
 
         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
......
@@ -425,6 +425,8 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
                 __reserved_bits |= X86_CR4_UMIP;        \
         if (!__cpu_has(__c, X86_FEATURE_VMX))           \
                 __reserved_bits |= X86_CR4_VMXE;        \
+        if (!__cpu_has(__c, X86_FEATURE_PCID))          \
+                __reserved_bits |= X86_CR4_PCIDE;       \
         __reserved_bits;                                \
 })
......
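The __cr4_reserved_bits pattern is the same idea applied to CR4: each control bit whose backing CPUID feature the guest lacks becomes reserved, and the new lines extend that to CR4.PCIDE when the guest has no PCID. A hedged stand-alone sketch of the pattern (the feature struct is made up; the CR4 bit positions are the architectural ones):

    /*
     * Illustrative only: accumulate CR4 bits that must be treated as
     * reserved for a guest missing the corresponding CPUID feature.
     */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define X86_CR4_VMXE    (1ULL << 13)
    #define X86_CR4_PCIDE   (1ULL << 17)

    struct guest_features {
            bool vmx;
            bool pcid;
    };

    static uint64_t cr4_reserved_bits(const struct guest_features *f)
    {
            uint64_t reserved = 0;

            if (!f->vmx)
                    reserved |= X86_CR4_VMXE;
            if (!f->pcid)           /* the check this patch adds */
                    reserved |= X86_CR4_PCIDE;
            return reserved;
    }

    int main(void)
    {
            struct guest_features f = { .vmx = true, .pcid = false };

            /* A guest without PCID must not be allowed to set CR4.PCIDE. */
            printf("reserved CR4 bits: %#llx\n",
                   (unsigned long long)cr4_reserved_bits(&f));
            return 0;
    }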
@@ -382,6 +382,7 @@ bool sev_active(void)
 {
         return sev_status & MSR_AMD64_SEV_ENABLED;
 }
+EXPORT_SYMBOL_GPL(sev_active);
 
 /* Needs to be called from non-instrumentable code */
 bool noinstr sev_es_active(void)
......