/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/arch/x86/entry_64.rst
 *
 * A note on terminology:
 * - iret frame:	Architecture defined interrupt frame from SS to RIP
 *			at the top of the kernel process stack.
 *
 * Some macro usage:
 * - SYM_FUNC_START/END:Define functions in the symbol table.
 * - idtentry:		Define exception entry points.
 */
#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include <asm/fsgsbase.h>
#include <linux/err.h>

#include "calling.h"

.code64
.section .entry.text, "ax"

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls.  The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries.  There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When the user can change pt_regs->foo, always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

SYM_CODE_START(entry_SYSCALL_64)
	UNWIND_HINT_ENTRY
	ENDBR

	swapgs
	/* tss.sp2 is scratch space. */
	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
	movq	PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp

SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS				/* pt_regs->ss */
	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
	pushq	%r11					/* pt_regs->flags */
	pushq	$__USER_CS				/* pt_regs->cs */
	pushq	%rcx					/* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
	pushq	%rax					/* pt_regs->orig_ax */

	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	/* IRQs are off. */
	movq	%rsp, %rdi
	/* Sign extend the lower 32bit as syscall numbers are treated as int */
	movslq	%eax, %rsi

	/* clobbers %rax, make sure it is after saving the syscall nr */
	IBRS_ENTER
	UNTRAIN_RET

	call	do_syscall_64		/* returns with IRQs disabled */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.  If we're not,
	 * go to the slow exit path.
	 * In the Xen PV case we must use iret anyway.
	 */

	ALTERNATIVE "", "jmp	swapgs_restore_regs_and_return_to_usermode", \
		X86_FEATURE_XENPV

	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11

	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space.  This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * If width of "canonical tail" ever becomes variable, this will need
	 * to be updated to remain correct on both old and new CPUs.
	 *
	 * Change top bits to match most significant bit (47th or 56th bit
	 * depending on paging mode) in the address.
	 */
#ifdef CONFIG_X86_5LEVEL
	ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
		"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
#else
	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
#endif

	/* If this changed %rcx, it was not canonical */
	cmpq	%rcx, %r11
	jne	swapgs_restore_regs_and_return_to_usermode

	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
	jne	swapgs_restore_regs_and_return_to_usermode

	movq	R11(%rsp), %r11
	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
	 * restore RF properly. If the slowpath sets it for whatever reason, we
	 * need to restore it correctly.
	 *
	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
	 * trap from userspace immediately after SYSRET.  This would cause an
	 * infinite loop whenever #DB happens with register state that satisfies
	 * the opportunistic SYSRET conditions.  For example, single-stepping
	 * this user code:
	 *
	 *           movq	$stuck_here, %rcx
	 *           pushfq
	 *           popq %r11
	 *   stuck_here:
	 *
	 * would never get past 'stuck_here'.
	 */
	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
	jnz	swapgs_restore_regs_and_return_to_usermode

	/* nothing to check for RSP */

	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
	jne	swapgs_restore_regs_and_return_to_usermode

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	IBRS_EXIT
	POP_REGS pop_rdi=0

	/*
	 * Now all regs are restored except RSP and RDI.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_END_OF_STACK

	pushq	RSP-RDI(%rdi)	/* RSP */
	pushq	(%rdi)		/* RDI */

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	popq	%rdi
	popq	%rsp
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	swapgs
	sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	int3
SYM_CODE_END(entry_SYSCALL_64)

/*
 * %rdi: prev task
 * %rsi: next task
 */
.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
	/*
	 * Save callee-saved registers
	 * This must match the order in inactive_task_frame
	 */
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* switch stack */
	movq	%rsp, TASK_threadsp(%rdi)
	movq	TASK_threadsp(%rsi), %rsp

#ifdef CONFIG_STACKPROTECTOR
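	/*
	 * Copy the next task's stack canary into the per-CPU location
	 * read by the compiler's stack-protector code.
	 */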
	movq	TASK_stack_canary(%rsi), %rbx
	movq	%rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary
#endif

	/*
	 * When switching from a shallower to a deeper call stack
	 * the RSB may either underflow or use entries populated
	 * with userspace addresses. On CPUs where those concerns
	 * exist, overwrite the RSB with entries which capture
	 * speculative execution to prevent attack.
	 */
	FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW

	/* restore callee-saved registers */
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp

	jmp	__switch_to
SYM_FUNC_END(__switch_to_asm)
.popsection

/*
 * A newly forked process directly context switches into this address.
 *
 * rax: prev task we switched from
 * rbx: kernel thread func (NULL for user thread)
 * r12: kernel thread arg
 */
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork_asm)
	/*
	 * This is the start of the kernel stack; even though there's a
	 * register set at the top, the regset isn't necessarily coherent
	 * (consider kthreads) and one cannot unwind further.
	 *
	 * This ensures stack unwinds of kernel threads terminate in a known
	 * good state.
	 */
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR // copy_thread
	CALL_DEPTH_ACCOUNT

	movq	%rax, %rdi		/* prev */
	movq	%rsp, %rsi		/* regs */
	movq	%rbx, %rdx		/* fn */
	movq	%r12, %rcx		/* fn_arg */
	call	ret_from_fork

	/*
	 * Set the stack state to what is expected for the target function
	 * -- at this point the register set should be a valid user set
	 * and unwind should work normally.
	 */
	UNWIND_HINT_REGS
	jmp	swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(ret_from_fork_asm)
.popsection

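/*
 * With CONFIG_DEBUG_ENTRY, verify that interrupts are disabled by reading
 * RFLAGS and hitting ud2 if X86_EFLAGS_IF is still set; expands to nothing
 * otherwise.
 */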
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
	pushq %rax
	SAVE_FLAGS
	testl $X86_EFLAGS_IF, %eax
	jz .Lokay_\@
	ud2
.Lokay_\@:
	popq %rax
#endif
.endm

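/*
 * Xen PV replacement for error_entry: it only saves registers and sets up
 * the frame pointer.  No GSBASE or CR3 switching is needed here; see the
 * comment in idtentry_body below.
 */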
SYM_CODE_START(xen_error_entry)
	ANNOTATE_NOENDBR
	UNWIND_HINT_FUNC
	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8
	UNTRAIN_RET_FROM_CALL
	RET
SYM_CODE_END(xen_error_entry)

/**
 * idtentry_body - Macro to emit code calling the C function
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 */
.macro idtentry_body cfunc has_error_code:req

	/*
	 * Call error_entry() and switch to the task stack if from userspace.
	 *
	 * When in XENPV, it is already in the task stack, and it can't fault
	 * for native_iret() nor native_load_gs_index() since XENPV uses its
	 * own pvops for IRET and load_gs_index().  And it doesn't need to
	 * switch the CR3.  So it can skip invoking error_entry().
	 */
	ALTERNATIVE "call error_entry; movq %rax, %rsp", \
		    "call xen_error_entry", X86_FEATURE_XENPV

	ENCODE_FRAME_POINTER
	UNWIND_HINT_REGS

	movq	%rsp, %rdi			/* pt_regs pointer into 1st argument*/

	.if \has_error_code == 1
		movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
		movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
	.endif

	call	\cfunc

	/* For some configurations \cfunc ends up being a noreturn. */
	REACHABLE

	jmp	error_return
.endm

/**
 * idtentry - Macro to generate entry stubs for simple IDT entries
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 * @has_error_code:	Hardware pushed error code on stack
 *
 * The macro emits code to set up the kernel context for straightforward
 * and simple IDT entries. No IST stack, no paranoid entry checks.
 */
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)

	.if \vector == X86_TRAP_BP
		/* #BP advances %rip to the next instruction */
		UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0
	.else
		UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8
	.endif

	ENDBR
	ASM_CLAC
	cld

	.if \has_error_code == 0
		pushq	$-1			/* ORIG_RAX: no syscall to restart */
	.endif

	.if \vector == X86_TRAP_BP
		/*
		 * If coming from kernel space, create a 6-word gap to allow the
		 * int3 handler to emulate a call instruction.
		 */
		testb	$3, CS-ORIG_RAX(%rsp)
		jnz	.Lfrom_usermode_no_gap_\@
		.rept	6
		pushq	5*8(%rsp)
		.endr
		UNWIND_HINT_IRET_REGS offset=8
.Lfrom_usermode_no_gap_\@:
	.endif

	idtentry_body \cfunc \has_error_code

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Interrupt entry/exit.
 *
 * The interrupt stubs push (vector) onto the stack, which is the error_code
 * position of idtentry exceptions, and jump to one of the two idtentry points
 * (common/spurious).
 *
 * common_interrupt is a hotpath, align it to a cache line
 */
.macro idtentry_irq vector cfunc
	.p2align CONFIG_X86_L1_CACHE_SHIFT
	idtentry \vector asm_\cfunc \cfunc has_error_code=1
.endm

/*
 * System vectors which invoke their handlers directly and are not
 * going through the regular common device interrupt handling code.
 */
.macro idtentry_sysvec vector cfunc
	idtentry \vector asm_\cfunc \cfunc has_error_code=0
.endm

/**
 * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 *
 * The macro emits code to set up the kernel context for #MC and #DB
 *
 * If the entry comes from user space it uses the normal entry path
 * including the return to user space work and preemption checks on
 * exit.
 *
 * If hits in kernel mode then it needs to go through the paranoid
 * entry as the exception can hit any random state. No preemption
 * check on exit to keep the paranoid path simple.
 */
.macro idtentry_mce_db vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_ENTRY
	ENDBR
	ASM_CLAC
	cld

	pushq	$-1			/* ORIG_RAX: no syscall to restart */

	/*
	 * If the entry is from userspace, switch stacks and treat it as
	 * a normal entry.
	 */
	testb	$3, CS-ORIG_RAX(%rsp)
	jnz	.Lfrom_usermode_switch_stack_\@

	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
	call	paranoid_entry

	UNWIND_HINT_REGS

	movq	%rsp, %rdi		/* pt_regs pointer */

	call	\cfunc

	jmp	paranoid_exit

	/* Switch to the regular task stack and use the noist entry point */
.Lfrom_usermode_switch_stack_\@:
	idtentry_body noist_\cfunc, has_error_code=0

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

#ifdef CONFIG_AMD_MEM_ENCRYPT
/**
 * idtentry_vc - Macro to generate entry stub for #VC
 * @vector:		Vector number
 * @asmsym:		ASM symbol for the entry point
 * @cfunc:		C function to be called
 *
 * The macro emits code to set up the kernel context for #VC. The #VC handler
 * runs on an IST stack and needs to be able to cause nested #VC exceptions.
 *
 * To make this work the #VC entry code tries its best to pretend it doesn't use
 * an IST stack by switching to the task stack if coming from user-space (which
 * includes early SYSCALL entry path) or back to the stack in the IRET frame if
 * entered from kernel-mode.
 *
 * If entered from kernel-mode the return stack is validated first, and if it is
 * not safe to use (e.g. because it points to the entry stack) the #VC handler
 * will switch to a fall-back stack (VC2) and call a special handler function.
 *
 * The macro is only used for one vector, but it is planned to be extended in
 * the future for the #HV exception.
 */
.macro idtentry_vc vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_ENTRY
	ENDBR
	ASM_CLAC
	cld

	/*
	 * If the entry is from userspace, switch stacks and treat it as
	 * a normal entry.
	 */
	testb	$3, CS-ORIG_RAX(%rsp)
	jnz	.Lfrom_usermode_switch_stack_\@

	/*
	 * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
	 * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
	 */
	call	paranoid_entry

	UNWIND_HINT_REGS

	/*
	 * Switch off the IST stack to make it free for nested exceptions. The
	 * vc_switch_off_ist() function will switch back to the interrupted
	 * stack if it is safe to do so. If not it switches to the VC fall-back
	 * stack.
	 */
	movq	%rsp, %rdi		/* pt_regs pointer */
	call	vc_switch_off_ist
	movq	%rax, %rsp		/* Switch to new stack */

	ENCODE_FRAME_POINTER
	UNWIND_HINT_REGS

	/* Update pt_regs */
	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */

	movq	%rsp, %rdi		/* pt_regs pointer */

	call	kernel_\cfunc

	/*
	 * No need to switch back to the IST stack. The current stack is either
	 * identical to the stack in the IRET frame or the VC fall-back stack,
	 * so it is definitely mapped even with PTI enabled.
	 */
	jmp	paranoid_exit

	/* Switch to the regular task stack */
.Lfrom_usermode_switch_stack_\@:
	idtentry_body user_\cfunc, has_error_code=1

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm
#endif

/*
 * Double fault entry. Straight paranoid. No checks from which context
 * this comes because for the espfix induced #DF this would do the wrong
 * thing.
 */
.macro idtentry_df vector asmsym cfunc
SYM_CODE_START(\asmsym)
	UNWIND_HINT_IRET_ENTRY offset=8
	ENDBR
	ASM_CLAC
	cld

	/* paranoid_entry returns GS information for paranoid_exit in EBX. */
	call	paranoid_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi		/* pt_regs pointer into first argument */
	movq	ORIG_RAX(%rsp), %rsi	/* get error code into 2nd argument*/
	movq	$-1, ORIG_RAX(%rsp)	/* no syscall to restart */
	call	\cfunc

	/* For some configurations \cfunc ends up being a noreturn. */
	REACHABLE

	jmp	paranoid_exit

_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm

/*
 * Include the defines which emit the idt entries which are shared
 * between 32 and 64 bit and emit the __irqentry_text_* markers
 * so the stacktrace boundary checks work.
 */
	__ALIGN
	.globl __irqentry_text_start
__irqentry_text_start:

#include <asm/idtentry.h>

	__ALIGN
	.globl __irqentry_text_end
__irqentry_text_end:
	ANNOTATE_NOENDBR

SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
	IBRS_EXIT
#ifdef CONFIG_DEBUG_ENTRY
	/* Assert that pt_regs indicates user mode. */
	testb	$3, CS(%rsp)
	jnz	1f
	ud2
1:
#endif
#ifdef CONFIG_XEN_PV
	ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif

	POP_REGS pop_rdi=0

	/*
	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_END_OF_STACK

	/* Copy the IRET frame to the trampoline stack. */
	pushq	6*8(%rdi)	/* SS */
	pushq	5*8(%rdi)	/* RSP */
	pushq	4*8(%rdi)	/* EFLAGS */
	pushq	3*8(%rdi)	/* CS */
	pushq	2*8(%rdi)	/* RIP */

	/* Push user RDI on the trampoline stack. */
	pushq	(%rdi)

	/*
	 * We are on the trampoline stack.  All regs except RDI are live.
	 * We can do future final exit work right here.
	 */
	STACKLEAK_ERASE_NOCLOBBER

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	/* Restore RDI. */
	popq	%rdi
	swapgs
	jmp	.Lnative_iret


SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
	/* Assert that pt_regs indicates kernel mode. */
	testb	$3, CS(%rsp)
	jz	1f
	ud2
1:
#endif
	POP_REGS
	addq	$8, %rsp	/* skip regs->orig_ax */
	/*
	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
	 * when returning from IPI handler.
	 */
#ifdef CONFIG_XEN_PV
SYM_INNER_LABEL(early_xen_iret_patch, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	.byte 0xe9
	.long .Lnative_iret - (. + 4)
#endif

.Lnative_iret:
	UNWIND_HINT_IRET_REGS
	/*
	 * Are we returning to a stack segment from the LDT?  Note: in
	 * 64-bit mode SS:RSP on the exception stack is always valid.
	 */
#ifdef CONFIG_X86_ESPFIX64
	testb	$4, (SS-RIP)(%rsp)
	jnz	native_irq_return_ldt
#endif

SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR // exc_double_fault
	/*
	 * This may fault.  Non-paranoid faults on return to userspace are
	 * handled by fixup_bad_iret.  These include #SS, #GP, and #NP.
	 * Double-faults due to espfix64 are handled in exc_double_fault.
	 * Other faults here are fatal.
	 */
	iretq

#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
	/*
	 * We are running with user GSBASE.  All GPRs contain their user
	 * values.  We have a percpu ESPFIX stack that is eight slots
	 * long (see ESPFIX_STACK_SIZE).  espfix_waddr points to the bottom
	 * of the ESPFIX stack.
	 *
	 * We clobber RAX and RDI in this code.  We stash RDI on the
	 * normal stack and RAX on the ESPFIX stack.
	 *
	 * The ESPFIX stack layout we set up looks like this:
	 *
	 * --- top of ESPFIX stack ---
	 * SS
	 * RSP
	 * RFLAGS
	 * CS
	 * RIP  <-- RSP points here when we're done
	 * RAX  <-- espfix_waddr points here
	 * --- bottom of ESPFIX stack ---
	 */

	pushq	%rdi				/* Stash user RDI */
	swapgs					/* to kernel GS */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */

	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* user RAX */
	movq	(1*8)(%rsp), %rax		/* user RIP */
	movq	%rax, (1*8)(%rdi)
	movq	(2*8)(%rsp), %rax		/* user CS */
	movq	%rax, (2*8)(%rdi)
	movq	(3*8)(%rsp), %rax		/* user RFLAGS */
	movq	%rax, (3*8)(%rdi)
	movq	(5*8)(%rsp), %rax		/* user SS */
	movq	%rax, (5*8)(%rdi)
	movq	(4*8)(%rsp), %rax		/* user RSP */
	movq	%rax, (4*8)(%rdi)
	/* Now RAX == RSP. */

	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */

	/*
	 * espfix_stack[31:16] == 0.  The page tables are set up such that
	 * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
	 * espfix_waddr for any X.  That is, there are 65536 RO aliases of
	 * the same page.  Set up RSP so that RSP[31:16] contains the
	 * respective 16 bits of the /userspace/ RSP and RSP nonetheless
	 * still points to an RO alias of the ESPFIX stack.
	 */
	orq	PER_CPU_VAR(espfix_stack), %rax

	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
	swapgs					/* to user GS */
	popq	%rdi				/* Restore user RDI */

	movq	%rax, %rsp
	UNWIND_HINT_IRET_REGS offset=8

	/*
	 * At this point, we cannot write to the stack any more, but we can
	 * still read.
	 */
	popq	%rax				/* Restore user RAX */

	/*
	 * RSP now points to an ordinary IRET frame, except that the page
	 * is read-only and RSP[31:16] are preloaded with the userspace
	 * values.  We can now IRET back to userspace.
	 */
	jmp	native_irq_return_iret
#endif
SYM_CODE_END(common_interrupt_return)
_ASM_NOKPROBE(common_interrupt_return)

/*
 * Reload gs selector with exception handling
 *  di:  new selector
 *
 * Is in entry.text as it shouldn't be instrumented.
 */
SYM_FUNC_START(asm_load_gs_index)
	FRAME_BEGIN
	swapgs
.Lgs_change:
	ANNOTATE_NOENDBR // error_entry
	movl	%edi, %gs
2:	ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
	swapgs
	FRAME_END
	RET

	/* running with kernelgs */
.Lbad_gs:
	swapgs					/* switch back to user gs */
.macro ZAP_GS
	/* This can't be a string because the preprocessor needs to see it. */
	movl $__USER_DS, %eax
	movl %eax, %gs
.endm
	ALTERNATIVE "", "ZAP_GS", X86_BUG_NULL_SEG
	xorl	%eax, %eax
	movl	%eax, %gs
	jmp	2b

	_ASM_EXTABLE(.Lgs_change, .Lbad_gs)

SYM_FUNC_END(asm_load_gs_index)
EXPORT_SYMBOL(asm_load_gs_index)

#ifdef CONFIG_XEN_PV
/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 *
 * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
 */
	__FUNC_ALIGN
SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback)

/*
 * Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
 * see the correct pointer to the pt_regs
 */
	UNWIND_HINT_FUNC
	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
	UNWIND_HINT_REGS

	call	xen_pv_evtchn_do_upcall

	jmp	error_return
SYM_CODE_END(exc_xen_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we're in category 1.
 */
	__FUNC_ALIGN
SYM_CODE_START_NOALIGN(xen_failsafe_callback)
	UNWIND_HINT_UNDEFINED
	ENDBR
	movl	%ds, %ecx
	cmpw	%cx, 0x10(%rsp)
	jne	1f
	movl	%es, %ecx
	cmpw	%cx, 0x18(%rsp)
	jne	1f
	movl	%fs, %ecx
	cmpw	%cx, 0x20(%rsp)
	jne	1f
	movl	%gs, %ecx
	cmpw	%cx, 0x28(%rsp)
	jne	1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	pushq	$0				/* RIP */
	UNWIND_HINT_IRET_REGS offset=8
	jmp	asm_exc_general_protection
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq	(%rsp), %rcx
	movq	8(%rsp), %r11
	addq	$0x30, %rsp
	UNWIND_HINT_IRET_REGS
	pushq	$-1 /* orig_ax = -1 => not a system call */
	PUSH_AND_CLEAR_REGS
	ENCODE_FRAME_POINTER
	jmp	error_return
SYM_CODE_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */

/*
 * Save all registers in pt_regs. Return GSBASE related information
 * in EBX depending on the availability of the FSGSBASE instructions:
 *
 * FSGSBASE	R/EBX
 *     N        0 -> SWAPGS on exit
 *              1 -> no SWAPGS on exit
 *
 *     Y        GSBASE value at entry, must be restored in paranoid_exit
 *
 * R14 - old CR3
 * R15 - old SPEC_CTRL
 */
SYM_CODE_START(paranoid_entry)
	ANNOTATE_NOENDBR
	UNWIND_HINT_FUNC
	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8

	/*
	 * Always stash CR3 in %r14.  This value will be restored,
	 * verbatim, at exit.  Needed if paranoid_entry interrupted
	 * another entry that already switched to the user CR3 value
	 * but has not yet returned to userspace.
	 *
	 * This is also why CS (stashed in the "iret frame" by the
	 * hardware at entry) can not be used: this may be a return
	 * to kernel code, but with a user CR3 value.
	 *
	 * Switching CR3 does not depend on kernel GSBASE so it can
	 * be done before switching to the kernel GSBASE. This is
	 * required for FSGSBASE because the kernel GSBASE has to
	 * be retrieved from a kernel internal table.
	 */
	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14

	/*
	 * Handling GSBASE depends on the availability of FSGSBASE.
	 *
	 * Without FSGSBASE the kernel enforces that negative GSBASE
	 * values indicate kernel GSBASE. With FSGSBASE no assumptions
	 * can be made about the GSBASE value when entering from user
	 * space.
	 */
	ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE

	/*
	 * Read the current GSBASE and store it in %rbx unconditionally,
	 * retrieve and set the current CPUs kernel GSBASE. The stored value
	 * has to be restored in paranoid_exit unconditionally.
	 *
	 * The unconditional write to GS base below ensures that no subsequent
	 * loads based on a mispredicted GS base can happen, therefore no LFENCE
	 * is needed here.
	 */
	SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
	jmp .Lparanoid_gsbase_done

.Lparanoid_entry_checkgs:
	/* EBX = 1 -> kernel GSBASE active, no restore required */
	movl	$1, %ebx

	/*
	 * The kernel-enforced convention is that a negative GSBASE indicates
	 * a kernel value. No SWAPGS needed on entry and exit.
	 */
	movl	$MSR_GS_BASE, %ecx
	rdmsr
	testl	%edx, %edx
	js	.Lparanoid_kernel_gsbase

	/* EBX = 0 -> SWAPGS required on exit */
	xorl	%ebx, %ebx
	swapgs
.Lparanoid_kernel_gsbase:
	FENCE_SWAPGS_KERNEL_ENTRY
.Lparanoid_gsbase_done:

	/*
	 * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like
	 * CR3 above, keep the old value in a callee saved register.
	 */
	IBRS_ENTER save_reg=%r15
	UNTRAIN_RET_FROM_CALL

	RET
SYM_CODE_END(paranoid_entry)

/*
 * "Paranoid" exit path from exception stack.  This is invoked
 * only on return from non-NMI IST interrupts that came
 * from kernel space.
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
 * be complicated.  Fortunately, there's no good reason to try
 * to handle preemption here.
 *
 * R/EBX contains the GSBASE related information depending on the
 * availability of the FSGSBASE instructions:
 *
 * FSGSBASE	R/EBX
 *     N        0 -> SWAPGS on exit
 *              1 -> no SWAPGS on exit
 *
 *     Y        User space GSBASE, must be restored unconditionally
 *
 * R14 - old CR3
 * R15 - old SPEC_CTRL
 */
SYM_CODE_START_LOCAL(paranoid_exit)
	UNWIND_HINT_REGS

	/*
	 * Must restore IBRS state before both CR3 and %GS since we need access
	 * to the per-CPU x86_spec_ctrl_shadow variable.
	 */
	IBRS_EXIT save_reg=%r15

	/*
	 * The order of operations is important. RESTORE_CR3 requires
	 * kernel GSBASE.
	 *
	 * NB to anyone to try to optimize this code: this code does
	 * not execute at all for exceptions from user mode. Those
	 * exceptions go through error_return instead.
	 */
	RESTORE_CR3	scratch_reg=%rax save_reg=%r14

	/* Handle the three GSBASE cases */
	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE

	/* With FSGSBASE enabled, unconditionally restore GSBASE */
	wrgsbase	%rbx
	jmp		restore_regs_and_return_to_kernel

.Lparanoid_exit_checkgs:
	/* On non-FSGSBASE systems, conditionally do SWAPGS */
	testl		%ebx, %ebx
	jnz		restore_regs_and_return_to_kernel

	/* We are returning to a context with user GSBASE */
	swapgs
	jmp		restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)

/*
 * Switch GS and CR3 if needed.
 */
SYM_CODE_START(error_entry)
	ANNOTATE_NOENDBR
	UNWIND_HINT_FUNC

	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8

	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace

	/*
	 * We entered from user mode or we're pretending to have entered
	 * from user mode due to an IRET fault.
	 */
	swapgs
	FENCE_SWAPGS_USER_ENTRY
	/* We have user CR3.  Change to kernel CR3. */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
	IBRS_ENTER
	UNTRAIN_RET_FROM_CALL

	leaq	8(%rsp), %rdi			/* arg0 = pt_regs pointer */
	/* Put us onto the real thread stack. */
	jmp	sync_regs

	/*
	 * There are two places in the kernel that can potentially fault with
	 * usergs. Handle them here.  B stepping K8s sometimes report a
	 * truncated RIP for IRET exceptions returning to compat mode. Check
	 * for these here too.
	 */
.Lerror_kernelspace:
	leaq	native_irq_return_iret(%rip), %rcx
	cmpq	%rcx, RIP+8(%rsp)
	je	.Lerror_bad_iret
	movl	%ecx, %eax			/* zero extend */
	cmpq	%rax, RIP+8(%rsp)
	je	.Lbstep_iret
	cmpq	$.Lgs_change, RIP+8(%rsp)
	jne	.Lerror_entry_done_lfence

	/*
	 * hack: .Lgs_change can fail with user gsbase.  If this happens, fix up
	 * gsbase and proceed.  We'll fix up the exception and land in
	 * .Lgs_change's error handler with kernel gsbase.
	 */
	swapgs

	/*
	 * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
	 * kernel or user gsbase.
	 */
.Lerror_entry_done_lfence:
	FENCE_SWAPGS_KERNEL_ENTRY
	CALL_DEPTH_ACCOUNT
	leaq	8(%rsp), %rax			/* return pt_regs pointer */
	VALIDATE_UNRET_END
	RET

.Lbstep_iret:
	/* Fix truncated RIP */
	movq	%rcx, RIP+8(%rsp)
	/* fall through */

.Lerror_bad_iret:
	/*
	 * We came from an IRET to user mode, so we have user
	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
	 */
	swapgs
	FENCE_SWAPGS_USER_ENTRY
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
	IBRS_ENTER
	UNTRAIN_RET_FROM_CALL

	/*
	 * Pretend that the exception came from user mode: set up pt_regs
	 * as if we faulted immediately after IRET.
	 */
	leaq	8(%rsp), %rdi			/* arg0 = pt_regs pointer */
	call	fixup_bad_iret
	mov	%rax, %rdi
	jmp	sync_regs
SYM_CODE_END(error_entry)

SYM_CODE_START_LOCAL(error_return)
	UNWIND_HINT_REGS
	DEBUG_ENTRY_ASSERT_IRQS_OFF
	testb	$3, CS(%rsp)
	jz	restore_regs_and_return_to_kernel
	jmp	swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(error_return)

/*
 * Runs on exception stack.  Xen PV does not go through this path at all,
 * so we can use real assembly here.
 *
 * Registers:
 *	%r14: Used to save/restore the CR3 of the interrupted context
 *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
 */
SYM_CODE_START(asm_exc_nmi)
	UNWIND_HINT_IRET_ENTRY
	ENDBR

	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *  Check the a special location on the stack that contains
	 *  a variable that is set when NMIs are executing.
	 *  The interrupted task's stack is also checked to see if it
	 *  is an NMI stack.
	 *  If the variable is not set and the stack is not the NMI
	 *  stack then:
	 *    o Set the special variable on the stack
	 *    o Copy the interrupt frame into an "outermost" location on the
	 *      stack
	 *    o Copy the interrupt frame into an "iret" location on the stack
	 *    o Continue processing the NMI
	 *  If the variable is set or the previous stack is the NMI stack:
	 *    o Modify the "iret" location to jump to the repeat_nmi
	 *    o return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 *
	 * However, espfix prevents us from directly returning to userspace
	 * with a single IRET instruction.  Similarly, IRET to user mode
	 * can fault.  We therefore handle NMIs from user space like
	 * other IST entries.
	 */

	ASM_CLAC
	cld

	/* Use %rdx as our temp variable throughout */
	pushq	%rdx

	testb	$3, CS-RIP+8(%rsp)
	jz	.Lnmi_from_kernel

	/*
	 * NMI from user mode.  We need to run on the thread stack, but we
	 * can't go through the normal entry paths: NMIs are masked, and
	 * we don't want to enable interrupts, because then we'll end
	 * up in an awkward situation in which IRQs are on but NMIs
	 * are off.
	 *
	 * We also must not push anything to the stack before switching
	 * stacks lest we corrupt the "NMI executing" variable.
	 */

	swapgs
	FENCE_SWAPGS_USER_ENTRY
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
	UNWIND_HINT_IRET_REGS base=%rdx offset=8
	pushq	5*8(%rdx)	/* pt_regs->ss */
	pushq	4*8(%rdx)	/* pt_regs->rsp */
	pushq	3*8(%rdx)	/* pt_regs->flags */
	pushq	2*8(%rdx)	/* pt_regs->cs */
	pushq	1*8(%rdx)	/* pt_regs->rip */
	UNWIND_HINT_IRET_REGS
	pushq   $-1		/* pt_regs->orig_ax */
	PUSH_AND_CLEAR_REGS rdx=(%rdx)
	ENCODE_FRAME_POINTER

	IBRS_ENTER
	UNTRAIN_RET

	/*
	 * At this point we no longer need to worry about stack damage
	 * due to nesting -- we're on the normal thread stack and we're
	 * done with the NMI stack.
	 */

	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	exc_nmi

	/*
	 * Return back to user mode.  We must *not* do the normal exit
	 * work, because we don't want to enable interrupts.
	 */
	jmp	swapgs_restore_regs_and_return_to_usermode

.Lnmi_from_kernel:
	/*
	 * Here's what our stack frame will look like:
	 * +---------------------------------------------------------+
	 * | original SS                                             |
	 * | original Return RSP                                     |
	 * | original RFLAGS                                         |
	 * | original CS                                             |
	 * | original RIP                                            |
	 * +---------------------------------------------------------+
	 * | temp storage for rdx                                    |
	 * +---------------------------------------------------------+
	 * | "NMI executing" variable                                |
	 * +---------------------------------------------------------+
	 * | iret SS          } Copied from "outermost" frame        |
	 * | iret Return RSP  } on each loop iteration; overwritten  |
	 * | iret RFLAGS      } by a nested NMI to force another     |
	 * | iret CS          } iteration if needed.                 |
	 * | iret RIP         }                                      |
	 * +---------------------------------------------------------+
	 * | outermost SS          } initialized in first_nmi;       |
	 * | outermost Return RSP  } will not be changed before      |
	 * | outermost RFLAGS      } NMI processing is done.         |
	 * | outermost CS          } Copied to "iret" frame on each  |
	 * | outermost RIP         } iteration.                      |
	 * +---------------------------------------------------------+
	 * | pt_regs                                                 |
	 * +---------------------------------------------------------+
	 *
	 * The "original" frame is used by hardware.  Before re-enabling
	 * NMIs, we need to be done with it, and we need to leave enough
	 * space for the asm code here.
	 *
	 * We return by executing IRET while RSP points to the "iret" frame.
	 * That will either return for real or it will loop back into NMI
	 * processing.
	 *
	 * The "outermost" frame is copied to the "iret" frame on each
	 * iteration of the loop, so each iteration starts with the "iret"
	 * frame pointing to the final return target.
	 */

	/*
	 * Determine whether we're a nested NMI.
	 *
	 * If we interrupted kernel code between repeat_nmi and
	 * end_repeat_nmi, then we are a nested NMI.  We must not
	 * modify the "iret" frame because it's being written by
	 * the outer NMI.  That's okay; the outer NMI handler is
	 * about to call exc_nmi() anyway, so we can just
	 * resume the outer NMI.
	 */

	movq	$repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	1f
	movq	$end_repeat_nmi, %rdx
	cmpq	8(%rsp), %rdx
	ja	nested_nmi_out
1:

	/*
	 * Now check "NMI executing".  If it's set, then we're nested.
	 * This will not detect if we interrupted an outer NMI just
	 * before IRET.
	 */
	cmpl	$1, -8(%rsp)
	je	nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack.  This covers
	 * the case where we interrupt an outer NMI after it clears
	 * "NMI executing" but before IRET.  We need to be careful, though:
	 * there is one case in which RSP could point to the NMI stack
	 * despite there being no NMI active: naughty userspace controls
	 * RSP at the very beginning of the SYSCALL targets.  We can
	 * pull a fast one on naughty userspace, though: we program
	 * SYSCALL to mask DF, so userspace cannot cause DF to be set
	 * if it controls the kernel's RSP.  We set DF before we clear
	 * "NMI executing".
	 */
	lea	6*8(%rsp), %rdx
	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
	cmpq	%rdx, 4*8(%rsp)
	/* If the stack pointer is above the NMI stack, this is a normal NMI */
	ja	first_nmi

	subq	$EXCEPTION_STKSZ, %rdx
	cmpq	%rdx, 4*8(%rsp)
	/* If it is below the NMI stack, it is a normal NMI */
	jb	first_nmi

	/* Ah, it is within the NMI stack. */

	testb	$(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
	jz	first_nmi	/* RSP was user controlled. */

	/* This is a nested NMI. */

nested_nmi:
	/*
	 * Modify the "iret" frame to point to repeat_nmi, forcing another
	 * iteration of NMI handling.
	 */
	subq	$8, %rsp
	leaq	-10*8(%rsp), %rdx
	pushq	$__KERNEL_DS
	pushq	%rdx
	pushfq
	pushq	$__KERNEL_CS
	pushq	$repeat_nmi

	/* Put stack back */
	addq	$(6*8), %rsp

nested_nmi_out:
	popq	%rdx

	/* We are returning to kernel mode, so this cannot result in a fault. */
	iretq

first_nmi:
	/* Restore rdx. */
	movq	(%rsp), %rdx

	/* Make room for "NMI executing". */
	pushq	$0

	/* Leave room for the "iret" frame */
	subq	$(5*8), %rsp

	/* Copy the "original" frame to the "outermost" frame */
	.rept 5
	pushq	11*8(%rsp)
	.endr
	UNWIND_HINT_IRET_REGS

	/* Everything up to here is safe from nested NMIs */

#ifdef CONFIG_DEBUG_ENTRY
	/*
	 * For ease of testing, unmask NMIs right away.  Disabled by
	 * default because IRET is very expensive.
	 */
	pushq	$0		/* SS */
	pushq	%rsp		/* RSP (minus 8 because of the previous push) */
	addq	$8, (%rsp)	/* Fix up RSP */
	pushfq			/* RFLAGS */
	pushq	$__KERNEL_CS	/* CS */
	pushq	$1f		/* RIP */
	iretq			/* continues at repeat_nmi below */
	UNWIND_HINT_IRET_REGS
1:
#endif

repeat_nmi:
	ANNOTATE_NOENDBR // this code
	/*
	 * If there was a nested NMI, the first NMI's iret will return
	 * here. But NMIs are still enabled and we can take another
	 * nested NMI. The nested NMI checks the interrupted RIP to see
	 * if it is between repeat_nmi and end_repeat_nmi, and if so
	 * it will just return, as we are about to repeat an NMI anyway.
	 * This makes it safe to copy to the stack frame that a nested
	 * NMI will update.
1414 1415 1416 1417
	 *
	 * RSP is pointing to "outermost RIP".  gsbase is unknown, but, if
	 * we're repeating an NMI, gsbase has the same value that it had on
	 * the first iteration.  paranoid_entry will load the kernel
1418
	 * gsbase if needed before we call exc_nmi().  "NMI executing"
1419
	 * is zero.
1420
	 */
1421
	movq	$1, 10*8(%rsp)		/* Set "NMI executing". */
1422

1423
	/*
1424 1425 1426
	 * Copy the "outermost" frame to the "iret" frame.  NMIs that nest
	 * here must not modify the "iret" frame while we're writing to
	 * it or it will end up containing garbage.
1427
	 */
1428
	addq	$(10*8), %rsp
1429
	.rept 5
1430
	pushq	-6*8(%rsp)
1431
	.endr
1432
	subq	$(5*8), %rsp
1433
end_repeat_nmi:
1434
	ANNOTATE_NOENDBR // this code
1435 1436

	/*
1437 1438 1439
	 * Everything below this point can be preempted by a nested NMI.
	 * If this happens, then the inner NMI will change the "iret"
	 * frame to point back to repeat_nmi.
1440
	 */
1441
	pushq	$-1				/* ORIG_RAX: no syscall to restart */
1442

1443
	/*
1444
	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
1445 1446 1447 1448 1449
	 * as we should not be calling schedule in NMI context.
	 * Even with normal interrupts enabled. An NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 */
	call	paranoid_entry
	UNWIND_HINT_REGS

	movq	%rsp, %rdi
	movq	$-1, %rsi
	call	exc_nmi

	/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
	IBRS_EXIT save_reg=%r15

	/* Always restore stashed CR3 value (see paranoid_entry) */
	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14

	/*
	 * The above invocation of paranoid_entry stored the GSBASE
	 * related information in R/EBX depending on the availability
	 * of FSGSBASE.
	 *
	 * If FSGSBASE is enabled, restore the saved GSBASE value
	 * unconditionally, otherwise take the conditional SWAPGS path.
	 */
	ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE

	wrgsbase	%rbx
	jmp	nmi_restore

nmi_no_fsgsbase:
	/* EBX == 0 -> invoke SWAPGS */
	testl	%ebx, %ebx
	jnz	nmi_restore

nmi_swapgs:
	swapgs

nmi_restore:
	POP_REGS

	/*
	 * Skip orig_ax and the "outermost" frame to point RSP at the
	 * "iret" frame.
	 */
	addq	$6*8, %rsp

	/*
	 * Clear "NMI executing".  Set DF first so that we can easily
	 * distinguish the remaining code between here and IRET from
	 * the SYSCALL entry and exit paths.
	 *
	 * We arguably should just inspect RIP instead, but I (Andy) wrote
	 * this code when I had the misapprehension that Xen PV supported
	 * NMIs, and Xen PV would break that approach.
	 */
	std
	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */

	/*
	 * iretq reads the "iret" frame and exits the NMI stack in a
	 * single instruction.  We are returning to kernel mode, so this
	 * cannot result in a fault.  Similarly, we don't need to worry
	 * about espfix64 on the way back to kernel mode.
	 */
	iretq
SYM_CODE_END(asm_exc_nmi)

#ifndef CONFIG_IA32_EMULATION
/*
 * This handles SYSCALL from 32-bit code.  There is no way to program
 * MSRs to fully disable 32-bit SYSCALL.
 */
SYM_CODE_START(ignore_sysret)
	UNWIND_HINT_END_OF_STACK
	ENDBR
	mov	$-ENOSYS, %eax
	sysretl
SYM_CODE_END(ignore_sysret)
#endif

.pushsection .text, "ax"
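/*
 * Reset RSP to the top of the current task's stack and call
 * make_task_dead(); this does not return.
 */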
	__FUNC_ALIGN
SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
	UNWIND_HINT_FUNC
	/* Prevent any naive code from trying to unwind to our caller. */
	xorl	%ebp, %ebp

	movq	PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax
	leaq	-PTREGS_SIZE(%rax), %rsp
	UNWIND_HINT_REGS

	call	make_task_dead
SYM_CODE_END(rewind_stack_and_make_dead)
.popsection