Merge commit 'upstream-x86-entry' into WIP.x86/mm

Pull in a minimal set of v4.15 entry code changes, for a base for the MM isolation patches. Signed-off-by: Ingo Molnar <mingo@kernel.org>

Merge commit 'upstream-x86-entry' into WIP.x86/mm
Pull in a minimal set of v4.15 entry code changes, for a base for the MM isolation patches. Signed-off-by: Ingo Molnar <mingo@kernel.org>
0fd2e9c5 · Ingo Molnar · 1784f914 · 1e4c4f61 · 0fd2e9c5 · 0fd2e9c5
Commit 0fd2e9c5 authored Dec 01, 2017 by Ingo Molnar
55 changed files
--- a/Documentation/x86/orc-unwinder.txt
+++ b/Documentation/x86/orc-unwinder.txt
@@ -4,7 +4,7 @@ ORC unwinder
 Overview
 --------

-The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is
+The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is
 similar in concept to a DWARF unwinder.  The difference is that the
 format of the ORC data is much simpler than DWARF, which in turn allows
 the ORC unwinder to be much simpler and faster.

--- a/Makefile
+++ b/Makefile
@@ -934,8 +934,8 @@ ifdef CONFIG_STACK_VALIDATION
  ifeq ($(has_libelf),1)
    objtool_target := tools/objtool FORCE
  else
-    ifdef CONFIG_ORC_UNWINDER
-      $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
+    ifdef CONFIG_UNWINDER_ORC
+      $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
    else
      $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
    endif

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -171,7 +171,7 @@ config X86
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE
 	select HAVE_REGS_AND_STACK_ACCESS_API
-	select HAVE_RELIABLE_STACKTRACE		if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
+	select HAVE_RELIABLE_STACKTRACE		if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION
 	select HAVE_STACK_VALIDATION		if X86_64
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNSTABLE_SCHED_CLOCK

--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -359,28 +359,14 @@ config PUNIT_ATOM_DEBUG

 choice
 	prompt "Choose kernel unwinder"
-	default FRAME_POINTER_UNWINDER
+	default UNWINDER_ORC if X86_64
+	default UNWINDER_FRAME_POINTER if X86_32
 	---help---
 	  This determines which method will be used for unwinding kernel stack
 	  traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
 	  livepatch, lockdep, and more.

-config FRAME_POINTER_UNWINDER
-	bool "Frame pointer unwinder"
-	select FRAME_POINTER
-	---help---
-	  This option enables the frame pointer unwinder for unwinding kernel
-	  stack traces.
-
-	  The unwinder itself is fast and it uses less RAM than the ORC
-	  unwinder, but the kernel text size will grow by ~3% and the kernel's
-	  overall performance will degrade by roughly 5-10%.
-
-	  This option is recommended if you want to use the livepatch
-	  consistency model, as this is currently the only way to get a
-	  reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
-
-config ORC_UNWINDER
+config UNWINDER_ORC
 	bool "ORC unwinder"
 	depends on X86_64
 	select STACK_VALIDATION
@@ -396,7 +382,22 @@ config ORC_UNWINDER
 	  Enabling this option will increase the kernel's runtime memory usage
 	  by roughly 2-4MB, depending on your kernel config.

-config GUESS_UNWINDER
+config UNWINDER_FRAME_POINTER
+	bool "Frame pointer unwinder"
+	select FRAME_POINTER
+	---help---
+	  This option enables the frame pointer unwinder for unwinding kernel
+	  stack traces.
+
+	  The unwinder itself is fast and it uses less RAM than the ORC
+	  unwinder, but the kernel text size will grow by ~3% and the kernel's
+	  overall performance will degrade by roughly 5-10%.
+
+	  This option is recommended if you want to use the livepatch
+	  consistency model, as this is currently the only way to get a
+	  reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
+
+config UNWINDER_GUESS
 	bool "Guess unwinder"
 	depends on EXPERT
 	---help---
@@ -411,7 +412,7 @@ config GUESS_UNWINDER
 endchoice

 config FRAME_POINTER
-	depends on !ORC_UNWINDER && !GUESS_UNWINDER
+	depends on !UNWINDER_ORC && !UNWINDER_GUESS
 	bool

 endmenu
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
 CONFIG_NOHIGHMEM=y
 # CONFIG_HIGHMEM4G is not set
 # CONFIG_HIGHMEM64G is not set
-CONFIG_GUESS_UNWINDER=y
-# CONFIG_FRAME_POINTER_UNWINDER is not set
+CONFIG_UNWINDER_GUESS=y
+# CONFIG_UNWINDER_FRAME_POINTER is not set
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_DEBUG_BOOT_PARAMS=y
 CONFIG_OPTIMIZE_INLINING=y
+CONFIG_UNWINDER_ORC=y
 CONFIG_SECURITY=y
 CONFIG_SECURITY_NETWORK=y
 CONFIG_SECURITY_SELINUX=y

--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -142,56 +142,25 @@ For 32-bit we have the following conventions - kernel is built with
 	UNWIND_HINT_REGS offset=\offset
 	.endm

-	.macro RESTORE_EXTRA_REGS offset=0
-	movq 0*8+\offset(%rsp), %r15
-	movq 1*8+\offset(%rsp), %r14
-	movq 2*8+\offset(%rsp), %r13
-	movq 3*8+\offset(%rsp), %r12
-	movq 4*8+\offset(%rsp), %rbp
-	movq 5*8+\offset(%rsp), %rbx
-	UNWIND_HINT_REGS offset=\offset extra=0
-	.endm
-
-	.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
-	.if \rstor_r11
-	movq 6*8(%rsp), %r11
-	.endif
-	.if \rstor_r8910
-	movq 7*8(%rsp), %r10
-	movq 8*8(%rsp), %r9
-	movq 9*8(%rsp), %r8
-	.endif
-	.if \rstor_rax
-	movq 10*8(%rsp), %rax
-	.endif
-	.if \rstor_rcx
-	movq 11*8(%rsp), %rcx
-	.endif
-	.if \rstor_rdx
-	movq 12*8(%rsp), %rdx
-	.endif
-	movq 13*8(%rsp), %rsi
-	movq 14*8(%rsp), %rdi
-	UNWIND_HINT_IRET_REGS offset=16*8
-	.endm
-	.macro RESTORE_C_REGS
-	RESTORE_C_REGS_HELPER 1,1,1,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_RAX
-	RESTORE_C_REGS_HELPER 0,1,1,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_RCX
-	RESTORE_C_REGS_HELPER 1,0,1,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_R11
-	RESTORE_C_REGS_HELPER 1,1,0,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_RCX_R11
-	RESTORE_C_REGS_HELPER 1,0,0,1,1
-	.endm
-
-	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
-	subq $-(15*8+\addskip), %rsp
+	.macro POP_EXTRA_REGS
+	popq %r15
+	popq %r14
+	popq %r13
+	popq %r12
+	popq %rbp
+	popq %rbx
+	.endm
+
+	.macro POP_C_REGS
+	popq %r11
+	popq %r10
+	popq %r9
+	popq %r8
+	popq %rax
+	popq %rcx
+	popq %rdx
+	popq %rsi
+	popq %rdi
 	.endm

 	.macro icebp

--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -221,10 +221,9 @@ entry_SYSCALL_64_fastpath:
 	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
 	movq	RIP(%rsp), %rcx
 	movq	EFLAGS(%rsp), %r11
-	RESTORE_C_REGS_EXCEPT_RCX_R11
-	movq	RSP(%rsp), %rsp
+	addq	$6*8, %rsp	/* skip extra regs -- they were preserved */
 	UNWIND_HINT_EMPTY
-	USERGS_SYSRET64
+	jmp	.Lpop_c_regs_except_rcx_r11_and_sysret

 1:
 	/*
@@ -246,17 +245,18 @@ entry_SYSCALL64_slow_path:
 	call	do_syscall_64		/* returns with IRQs disabled */

 return_from_SYSCALL_64:
-	RESTORE_EXTRA_REGS
 	TRACE_IRQS_IRETQ		/* we're about to change IF */

 	/*
 	 * Try to use SYSRET instead of IRET if we're returning to
-	 * a completely clean 64-bit userspace context.
+	 * a completely clean 64-bit userspace context.  If we're not,
+	 * go to the slow exit path.
 	 */
 	movq	RCX(%rsp), %rcx
 	movq	RIP(%rsp), %r11
-	cmpq	%rcx, %r11			/* RCX == RIP */
-	jne	opportunistic_sysret_failed
+
+	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
+	jne	swapgs_restore_regs_and_return_to_usermode

 	/*
 	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
@@ -274,14 +274,14 @@ return_from_SYSCALL_64:

 	/* If this changed %rcx, it was not canonical */
 	cmpq	%rcx, %r11
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode

 	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode

 	movq	R11(%rsp), %r11
 	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode

 	/*
 	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
@@ -302,12 +302,12 @@ return_from_SYSCALL_64:
 	 * would never get past 'stuck_here'.
 	 */
 	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-	jnz	opportunistic_sysret_failed
+	jnz	swapgs_restore_regs_and_return_to_usermode

 	/* nothing to check for RSP */

 	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode

 	/*
 	 * We win! This label is here just for ease of understanding
@@ -315,14 +315,20 @@ return_from_SYSCALL_64:
 	 */
 syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
-	RESTORE_C_REGS_EXCEPT_RCX_R11
-	movq	RSP(%rsp), %rsp
 	UNWIND_HINT_EMPTY
+	POP_EXTRA_REGS
+.Lpop_c_regs_except_rcx_r11_and_sysret:
+	popq	%rsi	/* skip r11 */
+	popq	%r10
+	popq	%r9
+	popq	%r8
+	popq	%rax
+	popq	%rsi	/* skip rcx */
+	popq	%rdx
+	popq	%rsi
+	popq	%rdi
+	movq	RSP-ORIG_RAX(%rsp), %rsp
 	USERGS_SYSRET64
-
-opportunistic_sysret_failed:
-	SWAPGS
-	jmp	restore_c_regs_and_iret
 END(entry_SYSCALL_64)

 ENTRY(stub_ptregs_64)
@@ -423,8 +429,7 @@ ENTRY(ret_from_fork)
 	movq	%rsp, %rdi
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
-	SWAPGS
-	jmp	restore_regs_and_iret
+	jmp	swapgs_restore_regs_and_return_to_usermode

 1:
 	/* kernel thread */
@@ -612,8 +617,21 @@ GLOBAL(retint_user)
 	mov	%rsp,%rdi
 	call	prepare_exit_to_usermode
 	TRACE_IRQS_IRETQ
+
+GLOBAL(swapgs_restore_regs_and_return_to_usermode)
+#ifdef CONFIG_DEBUG_ENTRY
+	/* Assert that pt_regs indicates user mode. */
+	testb	$3, CS(%rsp)
+	jnz	1f
+	ud2
+1:
+#endif
 	SWAPGS
-	jmp	restore_regs_and_iret
+	POP_EXTRA_REGS
+	POP_C_REGS
+	addq	$8, %rsp	/* skip regs->orig_ax */
+	INTERRUPT_RETURN
+

 /* Returning to kernel space */
 retint_kernel:
@@ -633,15 +651,17 @@ retint_kernel:
 	 */
 	TRACE_IRQS_IRETQ

-/*
- * At this label, code paths which return to kernel and to user,
- * which come from interrupts/exception and from syscalls, merge.
- */
-GLOBAL(restore_regs_and_iret)
-	RESTORE_EXTRA_REGS
-restore_c_regs_and_iret:
-	RESTORE_C_REGS
-	REMOVE_PT_GPREGS_FROM_STACK 8
+GLOBAL(restore_regs_and_return_to_kernel)
+#ifdef CONFIG_DEBUG_ENTRY
+	/* Assert that pt_regs indicates kernel mode. */
+	testb	$3, CS(%rsp)
+	jz	1f
+	ud2
+1:
+#endif
+	POP_EXTRA_REGS
+	POP_C_REGS
+	addq	$8, %rsp	/* skip regs->orig_ax */
 	INTERRUPT_RETURN

 ENTRY(native_iret)
@@ -818,7 +838,7 @@ ENTRY(\sym)

 	ASM_CLAC

-	.ifeq \has_error_code
+	.if \has_error_code == 0
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	.endif

@@ -1059,6 +1079,7 @@ idtentry int3			do_int3			has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
 idtentry stack_segment		do_stack_segment	has_error_code=1

 #ifdef CONFIG_XEN
+idtentry xennmi			do_nmi			has_error_code=0
 idtentry xendebug		do_debug		has_error_code=0
 idtentry xenint3		do_int3			has_error_code=0
 #endif
@@ -1112,17 +1133,14 @@ ENTRY(paranoid_exit)
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF_DEBUG
 	testl	%ebx, %ebx			/* swapgs needed? */
-	jnz	paranoid_exit_no_swapgs
+	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
 	SWAPGS_UNSAFE_STACK
-	jmp	paranoid_exit_restore
-paranoid_exit_no_swapgs:
+	jmp	.Lparanoid_exit_restore
+.Lparanoid_exit_no_swapgs:
 	TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
-	RESTORE_EXTRA_REGS
-	RESTORE_C_REGS
-	REMOVE_PT_GPREGS_FROM_STACK 8
-	INTERRUPT_RETURN
+.Lparanoid_exit_restore:
+	jmp restore_regs_and_return_to_kernel
 END(paranoid_exit)

 /*
@@ -1223,10 +1241,13 @@ ENTRY(error_exit)
 	jmp	retint_user
 END(error_exit)

-/* Runs on exception stack */
-/* XXX: broken on Xen PV */
+/*
+ * Runs on exception stack.  Xen PV does not go through this path at all,
+ * so we can use real assembly here.
+ */
 ENTRY(nmi)
 	UNWIND_HINT_IRET_REGS
+
 	/*
 	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
 	 * the iretq it performs will take us out of NMI context.
@@ -1284,7 +1305,7 @@ ENTRY(nmi)
 	 * stacks lest we corrupt the "NMI executing" variable.
 	 */

-	SWAPGS_UNSAFE_STACK
+	swapgs
 	cld
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1328,8 +1349,7 @@ ENTRY(nmi)
 	 * Return back to user mode.  We must *not* do the normal exit
 	 * work, because we don't want to enable interrupts.
 	 */
-	SWAPGS
-	jmp	restore_regs_and_iret
+	jmp	swapgs_restore_regs_and_return_to_usermode

 .Lnmi_from_kernel:
 	/*
@@ -1450,7 +1470,7 @@ nested_nmi_out:
 	popq	%rdx

 	/* We are returning to kernel mode, so this cannot result in a fault. */
-	INTERRUPT_RETURN
+	iretq

 first_nmi:
 	/* Restore rdx. */
@@ -1481,7 +1501,7 @@ first_nmi:
 	pushfq			/* RFLAGS */
 	pushq	$__KERNEL_CS	/* CS */
 	pushq	$1f		/* RIP */
-	INTERRUPT_RETURN	/* continues at repeat_nmi below */
+	iretq			/* continues at repeat_nmi below */
 	UNWIND_HINT_IRET_REGS
 1:
 #endif
@@ -1544,29 +1564,34 @@ end_repeat_nmi:
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
-	RESTORE_EXTRA_REGS
-	RESTORE_C_REGS
+	POP_EXTRA_REGS
+	POP_C_REGS

-	/* Point RSP at the "iret" frame. */
-	REMOVE_PT_GPREGS_FROM_STACK 6*8
+	/*
+	 * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
+	 * at the "iret" frame.
+	 */
+	addq	$6*8, %rsp

 	/*
 	 * Clear "NMI executing".  Set DF first so that we can easily
 	 * distinguish the remaining code between here and IRET from
-	 * the SYSCALL entry and exit paths.  On a native kernel, we
-	 * could just inspect RIP, but, on paravirt kernels,
-	 * INTERRUPT_RETURN can translate into a jump into a
-	 * hypercall page.
+	 * the SYSCALL entry and exit paths.
+	 *
+	 * We arguably should just inspect RIP instead, but I (Andy) wrote
+	 * this code when I had the misapprehension that Xen PV supported
+	 * NMIs, and Xen PV would break that approach.
 	 */
 	std
 	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */

 	/*
-	 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
-	 * stack in a single instruction.  We are returning to kernel
-	 * mode, so this cannot result in a fault.
+	 * iretq reads the "iret" frame and exits the NMI stack in a
+	 * single instruction.  We are returning to kernel mode, so this
+	 * cannot result in a fault.  Similarly, we don't need to worry
+	 * about espfix64 on the way back to kernel mode.
 	 */
-	INTERRUPT_RETURN
+	iretq
 END(nmi)

 ENTRY(ignore_sysret)

--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -337,8 +337,7 @@ ENTRY(entry_INT80_compat)

 	/* Go back to user mode. */
 	TRACE_IRQS_ON
-	SWAPGS
-	jmp	restore_regs_and_iret
+	jmp	swapgs_restore_regs_and_return_to_usermode
 END(entry_INT80_compat)

 ENTRY(stub32_clone)

--- a/arch/x86/include/asm/archrandom.h
+++ b/arch/x86/include/asm/archrandom.h
@@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v)
 	bool ok;
 	unsigned int retry = RDRAND_RETRY_LOOPS;
 	do {
-		asm volatile(RDRAND_LONG "\n\t"
+		asm volatile(RDRAND_LONG
 			     CC_SET(c)
 			     : CC_OUT(c) (ok), "=a" (*v));
 		if (ok)
@@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v)
 	bool ok;
 	unsigned int retry = RDRAND_RETRY_LOOPS;
 	do {
-		asm volatile(RDRAND_INT "\n\t"
+		asm volatile(RDRAND_INT
 			     CC_SET(c)
 			     : CC_OUT(c) (ok), "=a" (*v));
 		if (ok)
@@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v)
 static inline bool rdseed_long(unsigned long *v)
 {
 	bool ok;
-	asm volatile(RDSEED_LONG "\n\t"
+	asm volatile(RDSEED_LONG
 		     CC_SET(c)
 		     : CC_OUT(c) (ok), "=a" (*v));
 	return ok;
@@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v)
 static inline bool rdseed_int(unsigned int *v)
 {
 	bool ok;
-	asm volatile(RDSEED_INT "\n\t"
+	asm volatile(RDSEED_INT
 		     CC_SET(c)
 		     : CC_OUT(c) (ok), "=a" (*v));
 	return ok;

--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -143,7 +143,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
 static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
 {
 	bool negative;
-	asm volatile(LOCK_PREFIX "andb %2,%1\n\t"
+	asm volatile(LOCK_PREFIX "andb %2,%1"
 		CC_SET(s)
 		: CC_OUT(s) (negative), ADDR
 		: "ir" ((char) ~(1 << nr)) : "memory");
@@ -246,7 +246,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
 {
 	bool oldbit;

-	asm("bts %2,%1\n\t"
+	asm("bts %2,%1"
 	    CC_SET(c)
 	    : CC_OUT(c) (oldbit), ADDR
 	    : "Ir" (nr));
@@ -286,7 +286,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long
 {
 	bool oldbit;

-	asm volatile("btr %2,%1\n\t"
+	asm volatile("btr %2,%1"
 		     CC_SET(c)
 		     : CC_OUT(c) (oldbit), ADDR
 		     : "Ir" (nr));
@@ -298,7 +298,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
 {
 	bool oldbit;

-	asm volatile("btc %2,%1\n\t"
+	asm volatile("btc %2,%1"
 		     CC_SET(c)
 		     : CC_OUT(c) (oldbit), ADDR
 		     : "Ir" (nr) : "memory");
@@ -329,7 +329,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l
 {
 	bool oldbit;

-	asm volatile("bt %2,%1\n\t"
+	asm volatile("bt %2,%1"
 		     CC_SET(c)
 		     : CC_OUT(c) (oldbit)
 		     : "m" (*(unsigned long *)addr), "Ir" (nr));

--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -7,6 +7,7 @@
 */
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <asm/processor.h>
 #include <asm/user32.h>
 #include <asm/unistd.h>

--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -126,11 +126,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 #define boot_cpu_has(bit)	cpu_has(&boot_cpu_data, bit)

 #define set_cpu_cap(c, bit)	set_bit(bit, (unsigned long *)((c)->x86_capability))
-#define clear_cpu_cap(c, bit)	clear_bit(bit, (unsigned long *)((c)->x86_capability))
-#define setup_clear_cpu_cap(bit) do { \
-	clear_cpu_cap(&boot_cpu_data, bit);	\
-	set_bit(bit, (unsigned long *)cpu_caps_cleared); \
-} while (0)
+
+extern void setup_clear_cpu_cap(unsigned int bit);
+extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
+
 #define setup_force_cpu_cap(bit) do { \
 	set_cpu_cap(&boot_cpu_data, bit);	\
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\

--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -22,6 +22,11 @@
 * this feature bit is not displayed in /proc/cpuinfo at all.
 */

+/*
+ * When adding new features here that depend on other features,
+ * please update the table in kernel/cpu/cpuid-deps.c
+ */
+
 /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
 #define X86_FEATURE_FPU		( 0*32+ 0) /* Onboard FPU */
 #define X86_FEATURE_VME		( 0*32+ 1) /* Virtual Mode Extensions */
@@ -295,6 +300,12 @@
 #define X86_FEATURE_AVX512VBMI  (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
 #define X86_FEATURE_PKU		(16*32+ 3) /* Protection Keys for Userspace */
 #define X86_FEATURE_OSPKE	(16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_GFNI	(16*32+ 8) /* Galois Field New Instructions */
+#define X86_FEATURE_VAES	(16*32+ 9) /* Vector AES */
+#define X86_FEATURE_VPCLMULQDQ	(16*32+ 10) /* Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */
 #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
 #define X86_FEATURE_LA57	(16*32+16) /* 5-level page tables */
 #define X86_FEATURE_RDPID	(16*32+22) /* RDPID instruction */

--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -6,7 +6,7 @@
 #include <asm/orc_types.h>

 struct mod_arch_specific {
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
 	unsigned int num_orcs;
 	int *orc_unwind_ip;
 	struct orc_entry *orc_unwind;

--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -16,10 +16,9 @@
 #include <linux/cpumask.h>
 #include <asm/frame.h>

-static inline void load_sp0(struct tss_struct *tss,
-			     struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
 {
-	PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
+	PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0);
 }

 /* The paravirtualized CPUID instruction. */

--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -134,7 +134,7 @@ struct pv_cpu_ops {
 	void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
 	void (*free_ldt)(struct desc_struct *ldt, unsigned entries);

-	void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
+	void (*load_sp0)(unsigned long sp0);

 	void (*set_iopl_mask)(unsigned mask);


--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -526,7 +526,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
 {
 	bool oldbit;

-	asm volatile("bt "__percpu_arg(2)",%1\n\t"
+	asm volatile("bt "__percpu_arg(2)",%1"
 			CC_SET(c)
 			: CC_OUT(c) (oldbit)
 			: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));

--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -431,7 +431,9 @@ typedef struct {
 struct thread_struct {
 	/* Cached TLS descriptors: */
 	struct desc_struct	tls_array[GDT_ENTRY_TLS_ENTRIES];
+#ifdef CONFIG_X86_32
 	unsigned long		sp0;
+#endif
 	unsigned long		sp;
 #ifdef CONFIG_X86_32
 	unsigned long		sysenter_cs;
@@ -518,16 +520,9 @@ static inline void native_set_iopl_mask(unsigned mask)
 }

 static inline void
-native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+native_load_sp0(unsigned long sp0)
 {
-	tss->x86_tss.sp0 = thread->sp0;
-#ifdef CONFIG_X86_32
-	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
-	if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
-		tss->x86_tss.ss1 = thread->sysenter_cs;
-		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-	}
-#endif
+	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
 }

 static inline void native_swapgs(void)
@@ -547,15 +542,20 @@ static inline unsigned long current_top_of_stack(void)
 #endif
 }

+static inline bool on_thread_stack(void)
+{
+	return (unsigned long)(current_top_of_stack() -
+			       current_stack_pointer) < THREAD_SIZE;
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
 #define __cpuid			native_cpuid

-static inline void load_sp0(struct tss_struct *tss,
-			    struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
 {
-	native_load_sp0(tss, thread);
+	native_load_sp0(sp0);
 }

 #define set_iopl_mask native_set_iopl_mask
@@ -804,6 +804,15 @@ static inline void spin_lock_prefetch(const void *x)
 #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
 			   TOP_OF_KERNEL_STACK_PADDING)

+#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
+
+#define task_pt_regs(task) \
+({									\
+	unsigned long __ptr = (unsigned long)task_stack_page(task);	\
+	__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;		\
+	((struct pt_regs *)__ptr) - 1;					\
+})
+
 #ifdef CONFIG_X86_32
 /*
 * User space process size: 3GB (default).
@@ -823,23 +832,6 @@ static inline void spin_lock_prefetch(const void *x)
 	.addr_limit		= KERNEL_DS,				  \
 }

-/*
- * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
- * This is necessary to guarantee that the entire "struct pt_regs"
- * is accessible even if the CPU haven't stored the SS/ESP registers
- * on the stack (interrupt gate does not save these registers
- * when switching to the same priv ring).
- * Therefore beware: accessing the ss/esp fields of the
- * "struct pt_regs" is possible, but they may contain the
- * completely wrong values.
- */
-#define task_pt_regs(task) \
-({									\
-	unsigned long __ptr = (unsigned long)task_stack_page(task);	\
-	__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;		\
-	((struct pt_regs *)__ptr) - 1;					\
-})
-
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)

 #else
@@ -873,11 +865,9 @@ static inline void spin_lock_prefetch(const void *x)
 #define STACK_TOP_MAX		TASK_SIZE_MAX

 #define INIT_THREAD  {						\
-	.sp0			= TOP_OF_INIT_STACK,		\
 	.addr_limit		= KERNEL_DS,			\
 }

-#define task_pt_regs(tsk)	((struct pt_regs *)(tsk)->thread.sp0 - 1)
 extern unsigned long KSTK_ESP(struct task_struct *task);

 #endif /* CONFIG_X86_64 */

--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -136,9 +136,9 @@ static inline int v8086_mode(struct pt_regs *regs)
 #endif
 }

-#ifdef CONFIG_X86_64
 static inline bool user_64bit_mode(struct pt_regs *regs)
 {
+#ifdef CONFIG_X86_64
 #ifndef CONFIG_PARAVIRT
 	/*
 	 * On non-paravirt systems, this is the only long mode CPL 3
@@ -149,8 +149,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
 	/* Headers are too twisted for this to go in paravirt.h. */
 	return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
 #endif
+#else /* !CONFIG_X86_64 */
+	return false;
+#endif
 }

+#ifdef CONFIG_X86_64
 #define current_user_stack_pointer()	current_pt_regs()->sp
 #define compat_user_stack_pointer()	current_pt_regs()->sp
 #endif

--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -29,7 +29,7 @@ cc_label:								\
 #define __GEN_RMWcc(fullop, var, cc, clobbers, ...)			\
 do {									\
 	bool c;								\
-	asm volatile (fullop ";" CC_SET(cc)				\
+	asm volatile (fullop CC_SET(cc)					\
 			: [counter] "+m" (var), CC_OUT(cc) (c)		\
 			: __VA_ARGS__ : clobbers);			\
 	return c;							\

--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -2,6 +2,8 @@
 #ifndef _ASM_X86_SWITCH_TO_H
 #define _ASM_X86_SWITCH_TO_H

+#include <linux/sched/task_stack.h>
+
 struct task_struct; /* one of the stranger aspects of C forward declarations */

 struct task_struct *__switch_to_asm(struct task_struct *prev,
@@ -73,4 +75,26 @@ do {									\
 	((last) = __switch_to_asm((prev), (next)));			\
 } while (0)

+#ifdef CONFIG_X86_32
+static inline void refresh_sysenter_cs(struct thread_struct *thread)
+{
+	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
+	if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+		return;
+
+	this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+	wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+}
+#endif
+
+/* This is used when switching tasks or entering/exiting vm86 mode. */
+static inline void update_sp0(struct task_struct *task)
+{
+#ifdef CONFIG_X86_32
+	load_sp0(task->thread.sp0);
+#else
+	load_sp0(task_top_of_stack(task));
+#endif
+}
+
 #endif /* _ASM_X86_SWITCH_TO_H */
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -21,7 +21,7 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
 asmlinkage long sys_iopl(unsigned int);

 /* kernel/ldt.c */
-asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
+asmlinkage long sys_modify_ldt(int, void __user *, unsigned long);

 /* kernel/signal.c */
 asmlinkage long sys_rt_sigreturn(void);

--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -34,11 +34,6 @@ DECLARE_EVENT_CLASS(x86_fpu,
 	)
 );

-DEFINE_EVENT(x86_fpu, x86_fpu_state,
-	TP_PROTO(struct fpu *fpu),
-	TP_ARGS(fpu)
-);
-
 DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
 	TP_PROTO(struct fpu *fpu),
 	TP_ARGS(fpu)
@@ -74,11 +69,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state,
 	TP_ARGS(fpu)
 );

-DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state,
-	TP_PROTO(struct fpu *fpu),
-	TP_ARGS(fpu)
-);
-
 DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
 	TP_PROTO(struct fpu *fpu),
 	TP_ARGS(fpu)

--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -38,9 +38,9 @@ asmlinkage void simd_coprocessor_error(void);

 #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
 asmlinkage void xen_divide_error(void);
+asmlinkage void xen_xennmi(void);
 asmlinkage void xen_xendebug(void);
 asmlinkage void xen_xenint3(void);
-asmlinkage void xen_nmi(void);
 asmlinkage void xen_overflow(void);
 asmlinkage void xen_bounds(void);
 asmlinkage void xen_invalid_op(void);
@@ -145,4 +145,22 @@ enum {
 	X86_TRAP_IRET = 32,	/* 32, IRET Exception */
 };

+/*
+ * Page fault error code bits:
+ *
+ *   bit 0 ==	 0: no page found	1: protection fault
+ *   bit 1 ==	 0: read access		1: write access
+ *   bit 2 ==	 0: kernel-mode access	1: user-mode access
+ *   bit 3 ==				1: use of reserved bit detected
+ *   bit 4 ==				1: fault was an instruction fetch
+ *   bit 5 ==				1: protection keys block access
+ */
+enum x86_pf_error_code {
+	X86_PF_PROT	=		1 << 0,
+	X86_PF_WRITE	=		1 << 1,
+	X86_PF_USER	=		1 << 2,
+	X86_PF_RSVD	=		1 << 3,
+	X86_PF_INSTR	=		1 << 4,
+	X86_PF_PK	=		1 << 5,
+};
 #endif /* _ASM_X86_TRAPS_H */
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -13,11 +13,11 @@ struct unwind_state {
 	struct task_struct *task;
 	int graph_idx;
 	bool error;
-#if defined(CONFIG_ORC_UNWINDER)
+#if defined(CONFIG_UNWINDER_ORC)
 	bool signal, full_regs;
 	unsigned long sp, bp, ip;
 	struct pt_regs *regs;
-#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
+#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
 	bool got_irq;
 	unsigned long *bp, *orig_sp, ip;
 	struct pt_regs *regs;
@@ -51,7 +51,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
 	__unwind_start(state, task, regs, first_frame);
 }

-#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
 static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
 	if (unwind_done(state))
@@ -66,7 +66,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 }
 #endif

-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
 void unwind_init(void);
 void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
 			void *orc, size_t orc_size);

--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -152,5 +152,8 @@
 #define CX86_ARR_BASE	0xc4
 #define CX86_RCR_BASE	0xdc

+#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+			 X86_CR0_PG)

 #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -27,7 +27,6 @@ KASAN_SANITIZE_dumpstack.o				:= n
 KASAN_SANITIZE_dumpstack_$(BITS).o			:= n
 KASAN_SANITIZE_stacktrace.o := n

-OBJECT_FILES_NON_STANDARD_head_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o	:= y
 OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_test_nx.o			:= y
@@ -128,9 +127,9 @@ obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
 obj-$(CONFIG_SCHED_MC_PRIO)		+= itmt.o

-obj-$(CONFIG_ORC_UNWINDER)		+= unwind_orc.o
-obj-$(CONFIG_FRAME_POINTER_UNWINDER)	+= unwind_frame.o
-obj-$(CONFIG_GUESS_UNWINDER)		+= unwind_guess.o
+obj-$(CONFIG_UNWINDER_ORC)		+= unwind_orc.o
+obj-$(CONFIG_UNWINDER_FRAME_POINTER)	+= unwind_frame.o
+obj-$(CONFIG_UNWINDER_GUESS)		+= unwind_guess.o

 ###
 # 64 bit specific files

--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -23,6 +23,7 @@ obj-y			+= rdrand.o
 obj-y			+= match.o
 obj-y			+= bugs.o
 obj-$(CONFIG_CPU_FREQ)	+= aperfmperf.o
+obj-y			+= cpuid-deps.o

 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o

--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1301,18 +1301,16 @@ void print_cpu_info(struct cpuinfo_x86 *c)
 		pr_cont(")\n");
 }

-static __init int setup_disablecpuid(char *arg)
+/*
+ * clearcpuid= was already parsed in fpu__init_parse_early_param.
+ * But we need to keep a dummy __setup around otherwise it would
+ * show up as an environment variable for init.
+ */
+static __init int setup_clearcpuid(char *arg)
 {
-	int bit;
-
-	if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32)
-		setup_clear_cpu_cap(bit);
-	else
-		return 0;
-
 	return 1;
 }
-__setup("clearcpuid=", setup_disablecpuid);
+__setup("clearcpuid=", setup_clearcpuid);

 #ifdef CONFIG_X86_64
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
@@ -1572,9 +1570,13 @@ void cpu_init(void)
 	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, me);

-	load_sp0(t, &current->thread);
+	/*
+	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
+	 * task never enters user mode.
+	 */
 	set_tss_desc(cpu, t);
 	load_TR_desc();
+
 	load_mm_ldt(&init_mm);

 	clear_all_debug_regs();
@@ -1596,7 +1598,6 @@ void cpu_init(void)
 	int cpu = smp_processor_id();
 	struct task_struct *curr = current;
 	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
-	struct thread_struct *thread = &curr->thread;

 	wait_for_master_cpu(cpu);

@@ -1627,9 +1628,13 @@ void cpu_init(void)
 	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, curr);

-	load_sp0(t, thread);
+	/*
+	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
+	 * task never enters user mode.
+	 */
 	set_tss_desc(cpu, t);
 	load_TR_desc();
+
 	load_mm_ldt(&init_mm);

 	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);

--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+	unsigned int	feature;
+	unsigned int	depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+const static struct cpuid_dep cpuid_deps[] = {
+	{ X86_FEATURE_XSAVEOPT,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_XSAVEC,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_XSAVES,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_AVX,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_PKU,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_MPX,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_XGETBV1,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_FXSR_OPT,		X86_FEATURE_FXSR      },
+	{ X86_FEATURE_XMM,		X86_FEATURE_FXSR      },
+	{ X86_FEATURE_XMM2,		X86_FEATURE_XMM       },
+	{ X86_FEATURE_XMM3,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_XMM4_1,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_XMM4_2,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_XMM3,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_PCLMULQDQ,	X86_FEATURE_XMM2      },
+	{ X86_FEATURE_SSSE3,		X86_FEATURE_XMM2,     },
+	{ X86_FEATURE_F16C,		X86_FEATURE_XMM2,     },
+	{ X86_FEATURE_AES,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_SHA_NI,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_FMA,		X86_FEATURE_AVX       },
+	{ X86_FEATURE_AVX2,		X86_FEATURE_AVX,      },
+	{ X86_FEATURE_AVX512F,		X86_FEATURE_AVX,      },
+	{ X86_FEATURE_AVX512IFMA,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512PF,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512ER,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512CD,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512DQ,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512BW,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512VL,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512VBMI,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512_VBMI2,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_GFNI,		X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_VAES,		X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_VPCLMULQDQ,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_AVX512_VNNI,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_AVX512_BITALG,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_AVX512_4VNNIW,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512_4FMAPS,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F   },
+	{}
+};
+
+static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit)
+{
+	clear_bit32(bit, c->x86_capability);
+}
+
+static inline void __setup_clear_cpu_cap(unsigned int bit)
+{
+	clear_cpu_cap(&boot_cpu_data, bit);
+	set_bit32(bit, cpu_caps_cleared);
+}
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	if (!c)
+		__setup_clear_cpu_cap(feature);
+	else
+		__clear_cpu_cap(c, feature);
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+	const struct cpuid_dep *d;
+	bool changed;
+
+	if (WARN_ON(feature >= MAX_FEATURE_BITS))
+		return;
+
+	clear_feature(c, feature);
+
+	/* Collect all features to disable, handling dependencies */
+	memset(disable, 0, sizeof(disable));
+	__set_bit(feature, disable);
+
+	/* Loop until we get a stable state. */
+	do {
+		changed = false;
+		for (d = cpuid_deps; d->feature; d++) {
+			if (!test_bit(d->depends, disable))
+				continue;
+			if (__test_and_set_bit(d->feature, disable))
+				continue;
+
+			changed = true;
+			clear_feature(c, d->feature);
+		}
+	} while (changed);
+}
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+	do_clear_cpu_cap(NULL, feature);
+}
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void)
 */
 static void __init fpu__init_parse_early_param(void)
 {
+	char arg[32];
+	char *argptr = arg;
+	int bit;
+
 	if (cmdline_find_option_bool(boot_command_line, "no387"))
 		setup_clear_cpu_cap(X86_FEATURE_FPU);

@@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void)

 	if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
 		setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+	if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
+				sizeof(arg)) &&
+	    get_option(&argptr, &bit) &&
+	    bit >= 0 &&
+	    bit < NCAPINTS * 32)
+		setup_clear_cpu_cap(bit);
 }

 /*

--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -15,6 +15,7 @@
 #include <asm/fpu/xstate.h>

 #include <asm/tlbflush.h>
+#include <asm/cpufeature.h>

 /*
 * Although we spell it out in here, the Processor Trace
@@ -36,6 +37,19 @@ static const char *xfeature_names[] =
 	"unknown xstate feature"	,
 };

+static short xsave_cpuid_features[] __initdata = {
+	X86_FEATURE_FPU,
+	X86_FEATURE_XMM,
+	X86_FEATURE_AVX,
+	X86_FEATURE_MPX,
+	X86_FEATURE_MPX,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_INTEL_PT,
+	X86_FEATURE_PKU,
+};
+
 /*
 * Mask of xstate features supported by the CPU and the kernel:
 */
@@ -59,26 +73,6 @@ unsigned int fpu_user_xstate_size;
 void fpu__xstate_clear_all_cpu_caps(void)
 {
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-	setup_clear_cpu_cap(X86_FEATURE_AVX);
-	setup_clear_cpu_cap(X86_FEATURE_AVX2);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
-	setup_clear_cpu_cap(X86_FEATURE_MPX);
-	setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
-	setup_clear_cpu_cap(X86_FEATURE_PKU);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ);
 }

 /*
@@ -726,6 +720,7 @@ void __init fpu__init_system_xstate(void)
 	unsigned int eax, ebx, ecx, edx;
 	static int on_boot_cpu __initdata = 1;
 	int err;
+	int i;

 	WARN_ON_FPU(!on_boot_cpu);
 	on_boot_cpu = 0;
@@ -759,6 +754,14 @@ void __init fpu__init_system_xstate(void)
 		goto out_disable;
 	}

+	/*
+	 * Clear XSAVE features that are disabled in the normal CPUID.
+	 */
+	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+		if (!boot_cpu_has(xsave_cpuid_features[i]))
+			xfeatures_mask &= ~BIT(i);
+	}
+
 	xfeatures_mask &= fpu__get_supported_xfeatures_mask();

 	/* Enable xstate instructions to be able to continue with initialization: */

--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -212,9 +212,6 @@ ENTRY(startup_32_smp)
 #endif

 .Ldefault_entry:
-#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
-			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
-			 X86_CR0_PG)
 	movl $(CR0_STATE & ~X86_CR0_PG),%eax
 	movl %eax,%cr0

@@ -402,7 +399,7 @@ ENTRY(early_idt_handler_array)
 	# 24(%rsp) error code
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
-	.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
+	.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
 	pushl $0		# Dummy error code, to make stack frame uniform
 	.endif
 	pushl $i		# 20(%esp) Vector number

--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -50,6 +50,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
 	.code64
 	.globl startup_64
 startup_64:
+	UNWIND_HINT_EMPTY
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
 	 * and someone has loaded an identity mapped page table
@@ -89,6 +90,7 @@ startup_64:
 	addq	$(early_top_pgt - __START_KERNEL_map), %rax
 	jmp 1f
 ENTRY(secondary_startup_64)
+	UNWIND_HINT_EMPTY
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
 	 * and someone has loaded a mapped page table.
@@ -133,6 +135,7 @@ ENTRY(secondary_startup_64)
 	movq	$1f, %rax
 	jmp	*%rax
 1:
+	UNWIND_HINT_EMPTY

 	/* Check if nx is implemented */
 	movl	$0x80000001, %eax
@@ -150,9 +153,6 @@ ENTRY(secondary_startup_64)
 1:	wrmsr				/* Make changes effective */

 	/* Setup cr0 */
-#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
-			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
-			 X86_CR0_PG)
 	movl	$CR0_STATE, %eax
 	/* Make changes effective */
 	movq	%rax, %cr0
@@ -235,7 +235,7 @@ ENTRY(secondary_startup_64)
 	pushq	%rax		# target address in negative space
 	lretq
 .Lafter_lret:
-ENDPROC(secondary_startup_64)
+END(secondary_startup_64)

 #include "verify_cpu.S"

@@ -247,6 +247,7 @@ ENDPROC(secondary_startup_64)
 */
 ENTRY(start_cpu0)
 	movq	initial_stack(%rip), %rsp
+	UNWIND_HINT_EMPTY
 	jmp	.Ljump_to_C_code
 ENDPROC(start_cpu0)
 #endif
@@ -266,26 +267,24 @@ ENDPROC(start_cpu0)
 	.quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
 	__FINITDATA

-bad_address:
-	jmp bad_address
-
 	__INIT
 ENTRY(early_idt_handler_array)
-	# 104(%rsp) %rflags
-	#  96(%rsp) %cs
-	#  88(%rsp) %rip
-	#  80(%rsp) error code
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
-	.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
-	pushq $0		# Dummy error code, to make stack frame uniform
+	.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+		UNWIND_HINT_IRET_REGS
+		pushq $0	# Dummy error code, to make stack frame uniform
+	.else
+		UNWIND_HINT_IRET_REGS offset=8
 	.endif
 	pushq $i		# 72(%rsp) Vector number
 	jmp early_idt_handler_common
+	UNWIND_HINT_IRET_REGS
 	i = i + 1
 	.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
 	.endr
-ENDPROC(early_idt_handler_array)
+	UNWIND_HINT_IRET_REGS offset=16
+END(early_idt_handler_array)

 early_idt_handler_common:
 	/*
@@ -313,6 +312,7 @@ early_idt_handler_common:
 	pushq %r13				/* pt_regs->r13 */
 	pushq %r14				/* pt_regs->r14 */
 	pushq %r15				/* pt_regs->r15 */
+	UNWIND_HINT_REGS

 	cmpq $14,%rsi		/* Page fault? */
 	jnz 10f
@@ -327,8 +327,8 @@ early_idt_handler_common:

 20:
 	decl early_recursion_flag(%rip)
-	jmp restore_regs_and_iret
-ENDPROC(early_idt_handler_common)
+	jmp restore_regs_and_return_to_kernel
+END(early_idt_handler_common)

 	__INITDATA

@@ -435,7 +435,7 @@ ENTRY(phys_base)
 EXPORT_SYMBOL(phys_base)

 #include "../../x86/xen/xen-head.S"
-	
+
 	__PAGE_ALIGNED_BSS
 NEXT_PAGE(empty_zero_page)
 	.skip PAGE_SIZE

--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -13,6 +13,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
+#include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
@@ -295,8 +296,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	return error;
 }

-asmlinkage int sys_modify_ldt(int func, void __user *ptr,
-			      unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+		unsigned long , bytecount)
 {
 	int ret = -ENOSYS;

@@ -314,5 +315,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr,
 		ret = write_ldt(ptr, bytecount, 0);
 		break;
 	}
-	return ret;
+	/*
+	 * The SYSCALL_DEFINE() macros give us an 'unsigned long'
+	 * return type, but tht ABI for sys_modify_ldt() expects
+	 * 'int'.  This cast gives us an int-sized value in %rax
+	 * for the return code.  The 'unsigned' is necessary so
+	 * the compiler does not try to sign-extend the negative
+	 * return codes into the high half of the register when
+	 * taking the value from int->long.
+	 */
+	return (unsigned int)ret;
 }
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -49,7 +49,13 @@
 */
 __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 	.x86_tss = {
-		.sp0 = TOP_OF_INIT_STACK,
+		/*
+		 * .sp0 is only used when entering ring 0 from a lower
+		 * privilege level.  Since the init task never runs anything
+		 * but ring 0 code, there is no need for a valid value here.
+		 * Poison it.
+		 */
+		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 #ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,

--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)

 	/*
 	 * Reload esp0 and cpu_current_top_of_stack.  This changes
-	 * current_thread_info().
+	 * current_thread_info().  Refresh the SYSENTER configuration in
+	 * case prev or next is vm86.
 	 */
-	load_sp0(tss, next);
+	update_sp0(next_p);
+	refresh_sysenter_cs(next);
 	this_cpu_write(cpu_current_top_of_stack,
 		       (unsigned long)task_stack_page(next_p) +
 		       THREAD_SIZE);

--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 	struct inactive_task_frame *frame;
 	struct task_struct *me = current;

-	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
 	childregs = task_pt_regs(p);
 	fork_frame = container_of(childregs, struct fork_frame, regs);
 	frame = &fork_frame->frame;
@@ -464,8 +463,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	this_cpu_write(current_task, next_p);

-	/* Reload esp0 and ss1.  This changes current_thread_info(). */
-	load_sp0(tss, next);
+	/* Reload sp0. */
+	update_sp0(next_p);

 	/*
 	 * Now maybe reload the debug registers and handle I/O bitmaps

--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -962,8 +962,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
 #ifdef CONFIG_X86_32
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	irq_ctx_init(cpu);
-	per_cpu(cpu_current_top_of_stack, cpu) =
-		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
+	per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
 #else
 	initial_gs = per_cpu_offset(cpu);
 #endif

--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -141,8 +141,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
 	 * will catch asm bugs and any attempt to use ist_preempt_enable
 	 * from double_fault.
 	 */
-	BUG_ON((unsigned long)(current_top_of_stack() -
-			       current_stack_pointer) >= THREAD_SIZE);
+	BUG_ON(!on_thread_stack());

 	preempt_enable_no_resched();
 }

--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -33,7 +33,7 @@
 #include <asm/cpufeatures.h>
 #include <asm/msr-index.h>

-verify_cpu:
+ENTRY(verify_cpu)
 	pushf				# Save caller passed flags
 	push	$0			# Kill any dangerous flags
 	popf
@@ -139,3 +139,4 @@ verify_cpu:
 	popf				# Restore caller passed flags
 	xorl %eax, %eax
 	ret
+ENDPROC(verify_cpu)
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -55,6 +55,7 @@
 #include <asm/irq.h>
 #include <asm/traps.h>
 #include <asm/vm86.h>
+#include <asm/switch_to.h>

 /*
 * Known problems:
@@ -94,7 +95,6 @@

 void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 {
-	struct tss_struct *tss;
 	struct task_struct *tsk = current;
 	struct vm86plus_struct __user *user;
 	struct vm86 *vm86 = current->thread.vm86;
@@ -146,12 +146,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 		do_exit(SIGSEGV);
 	}

-	tss = &per_cpu(cpu_tss, get_cpu());
+	preempt_disable();
 	tsk->thread.sp0 = vm86->saved_sp0;
 	tsk->thread.sysenter_cs = __KERNEL_CS;
-	load_sp0(tss, &tsk->thread);
+	update_sp0(tsk);
+	refresh_sysenter_cs(&tsk->thread);
 	vm86->saved_sp0 = 0;
-	put_cpu();
+	preempt_enable();

 	memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));

@@ -237,7 +238,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)

 static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 {
-	struct tss_struct *tss;
 	struct task_struct *tsk = current;
 	struct vm86 *vm86 = tsk->thread.vm86;
 	struct kernel_vm86_regs vm86regs;
@@ -365,15 +365,17 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 	vm86->saved_sp0 = tsk->thread.sp0;
 	lazy_save_gs(vm86->regs32.gs);

-	tss = &per_cpu(cpu_tss, get_cpu());
 	/* make room for real-mode segments */
+	preempt_disable();
 	tsk->thread.sp0 += 16;

-	if (static_cpu_has(X86_FEATURE_SEP))
+	if (static_cpu_has(X86_FEATURE_SEP)) {
 		tsk->thread.sysenter_cs = 0;
+		refresh_sysenter_cs(&tsk->thread);
+	}

-	load_sp0(tss, &tsk->thread);
-	put_cpu();
+	update_sp0(tsk);
+	preempt_enable();

 	if (vm86->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);

--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -29,26 +29,6 @@
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>

-/*
- * Page fault error code bits:
- *
- *   bit 0 ==	 0: no page found	1: protection fault
- *   bit 1 ==	 0: read access		1: write access
- *   bit 2 ==	 0: kernel-mode access	1: user-mode access
- *   bit 3 ==				1: use of reserved bit detected
- *   bit 4 ==				1: fault was an instruction fetch
- *   bit 5 ==				1: protection keys block access
- */
-enum x86_pf_error_code {
-
-	PF_PROT		=		1 << 0,
-	PF_WRITE	=		1 << 1,
-	PF_USER		=		1 << 2,
-	PF_RSVD		=		1 << 3,
-	PF_INSTR	=		1 << 4,
-	PF_PK		=		1 << 5,
-};
-
 /*
 * Returns 0 if mmiotrace is disabled, or if the fault is not
 * handled by mmiotrace:
@@ -150,7 +130,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 	 * If it was a exec (instruction fetch) fault on NX page, then
 	 * do not ignore the fault:
 	 */
-	if (error_code & PF_INSTR)
+	if (error_code & X86_PF_INSTR)
 		return 0;

 	instr = (void *)convert_ip_to_linear(current, regs);
@@ -180,7 +160,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 * siginfo so userspace can discover which protection key was set
 * on the PTE.
 *
- * If we get here, we know that the hardware signaled a PF_PK
+ * If we get here, we know that the hardware signaled a X86_PF_PK
 * fault and that there was a VMA once we got in the fault
 * handler.  It does *not* guarantee that the VMA we find here
 * was the one that we faulted on.
@@ -205,7 +185,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
 	/*
 	 * force_sig_info_fault() is called from a number of
 	 * contexts, some of which have a VMA and some of which
-	 * do not.  The PF_PK handing happens after we have a
+	 * do not.  The X86_PF_PK handing happens after we have a
 	 * valid VMA, so we should never reach this without a
 	 * valid VMA.
 	 */
@@ -698,7 +678,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 	if (!oops_may_print())
 		return;

-	if (error_code & PF_INSTR) {
+	if (error_code & X86_PF_INSTR) {
 		unsigned int level;
 		pgd_t *pgd;
 		pte_t *pte;
@@ -780,7 +760,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 		 */
 		if (current->thread.sig_on_uaccess_err && signal) {
 			tsk->thread.trap_nr = X86_TRAP_PF;
-			tsk->thread.error_code = error_code | PF_USER;
+			tsk->thread.error_code = error_code | X86_PF_USER;
 			tsk->thread.cr2 = address;

 			/* XXX: hwpoison faults will set the wrong code. */
@@ -898,7 +878,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	struct task_struct *tsk = current;

 	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & PF_USER) {
+	if (error_code & X86_PF_USER) {
 		/*
 		 * It's possible to have interrupts off here:
 		 */
@@ -919,7 +899,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		 * Instruction fetch faults in the vsyscall page might need
 		 * emulation.
 		 */
-		if (unlikely((error_code & PF_INSTR) &&
+		if (unlikely((error_code & X86_PF_INSTR) &&
 			     ((address & ~0xfff) == VSYSCALL_ADDR))) {
 			if (emulate_vsyscall(regs, address))
 				return;
@@ -932,7 +912,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		 * are always protection faults.
 		 */
 		if (address >= TASK_SIZE_MAX)
-			error_code |= PF_PROT;
+			error_code |= X86_PF_PROT;

 		if (likely(show_unhandled_signals))
 			show_signal_msg(regs, error_code, address, tsk);
@@ -993,11 +973,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,

 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
 		return false;
-	if (error_code & PF_PK)
+	if (error_code & X86_PF_PK)
 		return true;
 	/* this checks permission keys on the VMA: */
-	if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
-				(error_code & PF_INSTR), foreign))
+	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+				       (error_code & X86_PF_INSTR), foreign))
 		return true;
 	return false;
 }
@@ -1025,7 +1005,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	int code = BUS_ADRERR;

 	/* Kernel mode? Handle exceptions or die: */
-	if (!(error_code & PF_USER)) {
+	if (!(error_code & X86_PF_USER)) {
 		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
 		return;
 	}
@@ -1053,14 +1033,14 @@ static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	       unsigned long address, u32 *pkey, unsigned int fault)
 {
-	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+	if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
 		no_context(regs, error_code, address, 0, 0);
 		return;
 	}

 	if (fault & VM_FAULT_OOM) {
 		/* Kernel mode? Handle exceptions or die: */
-		if (!(error_code & PF_USER)) {
+		if (!(error_code & X86_PF_USER)) {
 			no_context(regs, error_code, address,
 				   SIGSEGV, SEGV_MAPERR);
 			return;
@@ -1085,16 +1065,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,

 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 {
-	if ((error_code & PF_WRITE) && !pte_write(*pte))
+	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
 		return 0;

-	if ((error_code & PF_INSTR) && !pte_exec(*pte))
+	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
 		return 0;
 	/*
 	 * Note: We do not do lazy flushing on protection key
-	 * changes, so no spurious fault will ever set PF_PK.
+	 * changes, so no spurious fault will ever set X86_PF_PK.
 	 */
-	if ((error_code & PF_PK))
+	if ((error_code & X86_PF_PK))
 		return 1;

 	return 1;
@@ -1140,8 +1120,8 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	 * change, so user accesses are not expected to cause spurious
 	 * faults.
 	 */
-	if (error_code != (PF_WRITE | PF_PROT)
-	    && error_code != (PF_INSTR | PF_PROT))
+	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
+	    error_code != (X86_PF_INSTR | X86_PF_PROT))
 		return 0;

 	pgd = init_mm.pgd + pgd_index(address);
@@ -1201,19 +1181,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	 * always an unconditional error and can never result in
 	 * a follow-up action to resolve the fault, like a COW.
 	 */
-	if (error_code & PF_PK)
+	if (error_code & X86_PF_PK)
 		return 1;

 	/*
 	 * Make sure to check the VMA so that we do not perform
-	 * faults just to hit a PF_PK as soon as we fill in a
+	 * faults just to hit a X86_PF_PK as soon as we fill in a
 	 * page.
 	 */
-	if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
-				(error_code & PF_INSTR), foreign))
+	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+				       (error_code & X86_PF_INSTR), foreign))
 		return 1;

-	if (error_code & PF_WRITE) {
+	if (error_code & X86_PF_WRITE) {
 		/* write, present and write, not present: */
 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
 			return 1;
@@ -1221,7 +1201,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	}

 	/* read, present: */
-	if (unlikely(error_code & PF_PROT))
+	if (unlikely(error_code & X86_PF_PROT))
 		return 1;

 	/* read, not present: */
@@ -1244,7 +1224,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
 	if (!static_cpu_has(X86_FEATURE_SMAP))
 		return false;

-	if (error_code & PF_USER)
+	if (error_code & X86_PF_USER)
 		return false;

 	if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
@@ -1297,7 +1277,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * protection error (error_code & 9) == 0.
 	 */
 	if (unlikely(fault_in_kernel_space(address))) {
-		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
 			if (vmalloc_fault(address) >= 0)
 				return;

@@ -1325,7 +1305,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	if (unlikely(kprobes_fault(regs)))
 		return;

-	if (unlikely(error_code & PF_RSVD))
+	if (unlikely(error_code & X86_PF_RSVD))
 		pgtable_bad(regs, error_code, address);

 	if (unlikely(smap_violation(error_code, regs))) {
@@ -1351,7 +1331,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 */
 	if (user_mode(regs)) {
 		local_irq_enable();
-		error_code |= PF_USER;
+		error_code |= X86_PF_USER;
 		flags |= FAULT_FLAG_USER;
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
@@ -1360,9 +1340,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,

 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

-	if (error_code & PF_WRITE)
+	if (error_code & X86_PF_WRITE)
 		flags |= FAULT_FLAG_WRITE;
-	if (error_code & PF_INSTR)
+	if (error_code & X86_PF_INSTR)
 		flags |= FAULT_FLAG_INSTRUCTION;

 	/*
@@ -1382,7 +1362,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * space check, thus avoiding the deadlock:
 	 */
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
-		if ((error_code & PF_USER) == 0 &&
+		if (!(error_code & X86_PF_USER) &&
 		    !search_exception_tables(regs->ip)) {
 			bad_area_nosemaphore(regs, error_code, address, NULL);
 			return;
@@ -1409,7 +1389,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 		bad_area(regs, error_code, address);
 		return;
 	}
-	if (error_code & PF_USER) {
+	if (error_code & X86_PF_USER) {
 		/*
 		 * Accessing the stack below %sp is always a bug.
 		 * The large cushion allows instructions like enter

--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -6,6 +6,7 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/syscalls.h>
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
 #include <os.h>
@@ -369,7 +370,9 @@ void free_ldt(struct mm_context *mm)
 	mm->arch.ldt.entry_count = 0;
 }

-int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+		unsigned long , bytecount)
 {
-	return do_modify_ldt_skas(func, ptr, bytecount);
+	/* See non-um modify_ldt() for why we do this cast */
+	return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount);
 }
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -601,7 +601,7 @@ static struct trap_array_entry trap_array[] = {
 #ifdef CONFIG_X86_MCE
 	{ machine_check,               xen_machine_check,               true },
 #endif
-	{ nmi,                         xen_nmi,                         true },
+	{ nmi,                         xen_xennmi,                      true },
 	{ overflow,                    xen_overflow,                    false },
 #ifdef CONFIG_IA32_EMULATION
 	{ entry_INT80_compat,          xen_entry_INT80_compat,          false },
@@ -811,15 +811,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
 	}
 }

-static void xen_load_sp0(struct tss_struct *tss,
-			 struct thread_struct *thread)
+static void xen_load_sp0(unsigned long sp0)
 {
 	struct multicall_space mcs;

 	mcs = xen_mc_entry(0);
-	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
+	MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
-	tss->x86_tss.sp0 = thread->sp0;
+	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
 }

 void xen_set_iopl_mask(unsigned mask)

--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -14,6 +14,7 @@
 * single-threaded.
 */
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
@@ -294,12 +295,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 #endif
 	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));

+	/*
+	 * Bring up the CPU in cpu_bringup_and_idle() with the stack
+	 * pointing just below where pt_regs would be if it were a normal
+	 * kernel entry.
+	 */
 	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 	ctxt->flags = VGCF_IN_KERNEL;
 	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 	ctxt->user_regs.ds = __USER_DS;
 	ctxt->user_regs.es = __USER_DS;
 	ctxt->user_regs.ss = __KERNEL_DS;
+	ctxt->user_regs.cs = __KERNEL_CS;
+	ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle);

 	xen_copy_trap_info(ctxt->trap_ctxt);

@@ -314,8 +322,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->gdt_frames[0] = gdt_mfn;
 	ctxt->gdt_ents      = GDT_ENTRIES;

+	/*
+	 * Set SS:SP that Xen will use when entering guest kernel mode
+	 * from guest user mode.  Subsequent calls to load_sp0() can
+	 * change this value.
+	 */
 	ctxt->kernel_ss = __KERNEL_DS;
-	ctxt->kernel_sp = idle->thread.sp0;
+	ctxt->kernel_sp = task_top_of_stack(idle);

 #ifdef CONFIG_X86_32
 	ctxt->event_callback_cs     = __KERNEL_CS;
@@ -327,10 +340,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 		(unsigned long)xen_hypervisor_callback;
 	ctxt->failsafe_callback_eip =
 		(unsigned long)xen_failsafe_callback;
-	ctxt->user_regs.cs = __KERNEL_CS;
 	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);

-	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
 	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
 	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
 		BUG();

--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -30,7 +30,7 @@ xen_pv_trap debug
 xen_pv_trap xendebug
 xen_pv_trap int3
 xen_pv_trap xenint3
-xen_pv_trap nmi
+xen_pv_trap xennmi
 xen_pv_trap overflow
 xen_pv_trap bounds
 xen_pv_trap invalid_op

--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -10,6 +10,7 @@
 #include <asm/boot.h>
 #include <asm/asm.h>
 #include <asm/page_types.h>
+#include <asm/unwind_hints.h>

 #include <xen/interface/elfnote.h>
 #include <xen/interface/features.h>
@@ -20,6 +21,7 @@
 #ifdef CONFIG_XEN_PV
 	__INIT
 ENTRY(startup_xen)
+	UNWIND_HINT_EMPTY
 	cld

 	/* Clear .bss */
@@ -34,21 +36,24 @@ ENTRY(startup_xen)
 	mov $init_thread_union+THREAD_SIZE, %_ASM_SP

 	jmp xen_start_kernel
-
+END(startup_xen)
 	__FINIT
 #endif

 .pushsection .text
 	.balign PAGE_SIZE
 ENTRY(hypercall_page)
-	.skip PAGE_SIZE
+	.rept (PAGE_SIZE / 32)
+		UNWIND_HINT_EMPTY
+		.skip 32
+	.endr

 #define HYPERCALL(n) \
 	.equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
 	.type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
 #include <asm/xen-hypercalls.h>
 #undef HYPERCALL
-
+END(hypercall_page)
 .popsection

 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")

--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -687,7 +687,7 @@
 #define BUG_TABLE
 #endif

-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
 #define ORC_UNWIND_TABLE						\
 	. = ALIGN(4);							\
 	.orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) {	\

--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -228,6 +228,32 @@ static inline unsigned long __ffs64(u64 word)
 	return __ffs((unsigned long)word);
 }

+/*
+ * clear_bit32 - Clear a bit in memory for u32 array
+ * @nr: Bit to clear
+ * @addr: u32 * address of bitmap
+ *
+ * Same as clear_bit, but avoids needing casts for u32 arrays.
+ */
+
+static __always_inline void clear_bit32(long nr, volatile u32 *addr)
+{
+	clear_bit(nr, (volatile unsigned long *)addr);
+}
+
+/*
+ * set_bit32 - Set a bit in memory for u32 array
+ * @nr: Bit to clear
+ * @addr: u32 * address of bitmap
+ *
+ * Same as set_bit, but avoids needing casts for u32 arrays.
+ */
+
+static __always_inline void set_bit32(long nr, volatile u32 *addr)
+{
+	set_bit(nr, (volatile unsigned long *)addr);
+}
+
 #ifdef __KERNEL__

 #ifndef set_mask_bits

--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -376,7 +376,7 @@ config STACK_VALIDATION
 	  that runtime stack traces are more reliable.

 	  This is also a prerequisite for generation of ORC unwind data, which
-	  is needed for CONFIG_ORC_UNWINDER.
+	  is needed for CONFIG_UNWINDER_ORC.

 	  For more information, see
 	  tools/objtool/Documentation/stack-validation.txt.

--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -259,7 +259,7 @@ ifneq ($(SKIP_STACK_VALIDATION),1)

 __objtool_obj := $(objtree)/tools/objtool/objtool

-objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check)
+objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check)

 ifndef CONFIG_FRAME_POINTER
 objtool_args += --no-fp

--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1757,11 +1757,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 		if (insn->dead_end)
 			return 0;

-		insn = next_insn;
-		if (!insn) {
+		if (!next_insn) {
+			if (state.cfa.base == CFI_UNDEFINED)
+				return 0;
 			WARN("%s: unexpected end of section", sec->name);
 			return 1;
 		}
+
+		insn = next_insn;
 	}

 	return 0;

--- a/tools/objtool/objtool.c
+++ b/tools/objtool/objtool.c
@@ -70,7 +70,7 @@ static void cmd_usage(void)

 	printf("\n");

-	exit(1);
+	exit(129);
 }

 static void handle_options(int *argc, const char ***argv)
@@ -86,9 +86,7 @@ static void handle_options(int *argc, const char ***argv)
 			break;
 		} else {
 			fprintf(stderr, "Unknown option: %s\n", cmd);
-			fprintf(stderr, "\n Usage: %s\n",
-				objtool_usage_string);
-			exit(1);
+			cmd_usage();
 		}

 		(*argv)++;