Commit f79aba67 authored by Andy Lutomirski, committed by Luis Henriques

x86/nmi/64: Improve nested NMI comments

commit 0b22930e upstream.

I found the nested NMI documentation to be difficult to follow.
Improve the comments.
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
[bwh: Backported to 4.0: adjust filename, context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
[ luis: backported to 3.16: Used Ben's backport to 4.0 ]
Signed-off-by: Luis Henriques <luis.henriques@canonical.com>
parent 67b91ab3
...@@ -1444,11 +1444,12 @@ ENTRY(nmi) ...@@ -1444,11 +1444,12 @@ ENTRY(nmi)
* If the variable is not set and the stack is not the NMI * If the variable is not set and the stack is not the NMI
* stack then: * stack then:
* o Set the special variable on the stack * o Set the special variable on the stack
* o Copy the interrupt frame into a "saved" location on the stack * o Copy the interrupt frame into an "outermost" location on the
* o Copy the interrupt frame into a "copy" location on the stack * stack
* o Copy the interrupt frame into an "iret" location on the stack
* o Continue processing the NMI * o Continue processing the NMI
* If the variable is set or the previous stack is the NMI stack: * If the variable is set or the previous stack is the NMI stack:
* o Modify the "copy" location to jump to the repeate_nmi * o Modify the "iret" location to jump to the repeat_nmi
* o return back to the first NMI * o return back to the first NMI
* *
* Now on exit of the first NMI, we first clear the stack variable * Now on exit of the first NMI, we first clear the stack variable
...@@ -1542,18 +1543,60 @@ ENTRY(nmi) ...@@ -1542,18 +1543,60 @@ ENTRY(nmi)
.Lnmi_from_kernel: .Lnmi_from_kernel:
/* /*
* Check the special variable on the stack to see if NMIs are * Here's what our stack frame will look like:
* executing. * +---------------------------------------------------------+
* | original SS |
* | original Return RSP |
* | original RFLAGS |
* | original CS |
* | original RIP |
* +---------------------------------------------------------+
* | temp storage for rdx |
* +---------------------------------------------------------+
* | "NMI executing" variable |
* +---------------------------------------------------------+
* | iret SS } Copied from "outermost" frame |
* | iret Return RSP } on each loop iteration; overwritten |
* | iret RFLAGS } by a nested NMI to force another |
* | iret CS } iteration if needed. |
* | iret RIP } |
* +---------------------------------------------------------+
* | outermost SS } initialized in first_nmi; |
* | outermost Return RSP } will not be changed before |
* | outermost RFLAGS } NMI processing is done. |
* | outermost CS } Copied to "iret" frame on each |
* | outermost RIP } iteration. |
* +---------------------------------------------------------+
* | pt_regs |
* +---------------------------------------------------------+
*
* The "original" frame is used by hardware. Before re-enabling
* NMIs, we need to be done with it, and we need to leave enough
* space for the asm code here.
*
* We return by executing IRET while RSP points to the "iret" frame.
* That will either return for real or it will loop back into NMI
* processing.
*
* The "outermost" frame is copied to the "iret" frame on each
* iteration of the loop, so each iteration starts with the "iret"
* frame pointing to the final return target.
*/
/*
* Determine whether we're a nested NMI.
*
* First check "NMI executing". If it's set, then we're nested.
* This will not detect if we interrupted an outer NMI just
* before IRET.
*/ */
cmpl $1, -8(%rsp) cmpl $1, -8(%rsp)
je nested_nmi je nested_nmi
/* /*
* Now test if the previous stack was an NMI stack. * Now test if the previous stack was an NMI stack. This covers
* We need the double check. We check the NMI stack to satisfy the * the case where we interrupt an outer NMI after it clears
* race when the first NMI clears the variable before returning. * "NMI executing" but before IRET.
* We check the variable because the first NMI could be in a
* breakpoint routine using a breakpoint stack.
*/ */
lea 6*8(%rsp), %rdx lea 6*8(%rsp), %rdx
/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
...@@ -1570,9 +1613,11 @@ ENTRY(nmi) ...@@ -1570,9 +1613,11 @@ ENTRY(nmi)
nested_nmi: nested_nmi:
/* /*
* Do nothing if we interrupted the fixup in repeat_nmi. * If we interrupted an NMI that is between repeat_nmi and
* It's about to repeat the NMI handler, so we are fine * end_repeat_nmi, then we must not modify the "iret" frame
* with ignoring this one. * because it's being written by the outer NMI. That's okay:
* the outer NMI handler is about to call do_nmi anyway,
* so we can just resume the outer NMI.
*/ */
movq $repeat_nmi, %rdx movq $repeat_nmi, %rdx
cmpq 8(%rsp), %rdx cmpq 8(%rsp), %rdx
...@@ -1582,7 +1627,10 @@ nested_nmi: ...@@ -1582,7 +1627,10 @@ nested_nmi:
ja nested_nmi_out ja nested_nmi_out
1: 1:
/* Set up the interrupted NMIs stack to jump to repeat_nmi */ /*
* Modify the "iret" frame to point to repeat_nmi, forcing another
* iteration of NMI handling.
*/
leaq -1*8(%rsp), %rdx leaq -1*8(%rsp), %rdx
movq %rdx, %rsp movq %rdx, %rsp
CFI_ADJUST_CFA_OFFSET 1*8 CFI_ADJUST_CFA_OFFSET 1*8
...@@ -1601,60 +1649,23 @@ nested_nmi_out: ...@@ -1601,60 +1649,23 @@ nested_nmi_out:
popq_cfi %rdx popq_cfi %rdx
CFI_RESTORE rdx CFI_RESTORE rdx
/* No need to check faults here */ /* We are returning to kernel mode, so this cannot result in a fault. */
INTERRUPT_RETURN INTERRUPT_RETURN
CFI_RESTORE_STATE CFI_RESTORE_STATE
first_nmi: first_nmi:
/* /* Restore rdx. */
* Because nested NMIs will use the pushed location that we
* stored in rdx, we must keep that space available.
* Here's what our stack frame will look like:
* +-------------------------+
* | original SS |
* | original Return RSP |
* | original RFLAGS |
* | original CS |
* | original RIP |
* +-------------------------+
* | temp storage for rdx |
* +-------------------------+
* | NMI executing variable |
* +-------------------------+
* | copied SS |
* | copied Return RSP |
* | copied RFLAGS |
* | copied CS |
* | copied RIP |
* +-------------------------+
* | Saved SS |
* | Saved Return RSP |
* | Saved RFLAGS |
* | Saved CS |
* | Saved RIP |
* +-------------------------+
* | pt_regs |
* +-------------------------+
*
* The saved stack frame is used to fix up the copied stack frame
* that a nested NMI may change to make the interrupted NMI iret jump
* to the repeat_nmi. The original stack frame and the temp storage
* is also used by nested NMIs and can not be trusted on exit.
*/
/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
movq (%rsp), %rdx movq (%rsp), %rdx
CFI_RESTORE rdx CFI_RESTORE rdx
/* Set the NMI executing variable on the stack. */ /* Set "NMI executing" on the stack. */
pushq_cfi $1 pushq_cfi $1
/* /* Leave room for the "iret" frame */
* Leave room for the "copied" frame
*/
subq $(5*8), %rsp subq $(5*8), %rsp
CFI_ADJUST_CFA_OFFSET 5*8 CFI_ADJUST_CFA_OFFSET 5*8
/* Copy the stack frame to the Saved frame */ /* Copy the "original" frame to the "outermost" frame */
.rept 5 .rept 5
pushq_cfi 11*8(%rsp) pushq_cfi 11*8(%rsp)
.endr .endr
...@@ -1662,6 +1673,7 @@ first_nmi: ...@@ -1662,6 +1673,7 @@ first_nmi:
/* Everything up to here is safe from nested NMIs */ /* Everything up to here is safe from nested NMIs */
repeat_nmi:
/* /*
* If there was a nested NMI, the first NMI's iret will return * If there was a nested NMI, the first NMI's iret will return
* here. But NMIs are still enabled and we can take another * here. But NMIs are still enabled and we can take another
...@@ -1670,16 +1682,21 @@ first_nmi: ...@@ -1670,16 +1682,21 @@ first_nmi:
* it will just return, as we are about to repeat an NMI anyway. * it will just return, as we are about to repeat an NMI anyway.
* This makes it safe to copy to the stack frame that a nested * This makes it safe to copy to the stack frame that a nested
* NMI will update. * NMI will update.
*/ *
repeat_nmi: * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
/* * we're repeating an NMI, gsbase has the same value that it had on
* Update the stack variable to say we are still in NMI (the update * the first iteration. paranoid_entry will load the kernel
* is benign for the non-repeat case, where 1 was pushed just above * gsbase if needed before we call do_nmi.
* to this very stack slot). *
* Set "NMI executing" in case we came back here via IRET.
*/ */
movq $1, 10*8(%rsp) movq $1, 10*8(%rsp)
/* Make another copy, this one may be modified by nested NMIs */ /*
* Copy the "outermost" frame to the "iret" frame. NMIs that nest
* here must not modify the "iret" frame while we're writing to
* it or it will end up containing garbage.
*/
addq $(10*8), %rsp addq $(10*8), %rsp
CFI_ADJUST_CFA_OFFSET -10*8 CFI_ADJUST_CFA_OFFSET -10*8
.rept 5 .rept 5
...@@ -1690,9 +1707,9 @@ repeat_nmi: ...@@ -1690,9 +1707,9 @@ repeat_nmi:
end_repeat_nmi: end_repeat_nmi:
/* /*
* Everything below this point can be preempted by a nested * Everything below this point can be preempted by a nested NMI.
* NMI if the first NMI took an exception and reset our iret stack * If this happens, then the inner NMI will change the "iret"
* so that we repeat another NMI. * frame to point back to repeat_nmi.
*/ */
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
subq $ORIG_RAX-R15, %rsp subq $ORIG_RAX-R15, %rsp
...@@ -1717,11 +1734,17 @@ end_repeat_nmi: ...@@ -1717,11 +1734,17 @@ end_repeat_nmi:
nmi_swapgs: nmi_swapgs:
SWAPGS_UNSAFE_STACK SWAPGS_UNSAFE_STACK
nmi_restore: nmi_restore:
/* Pop the extra iret frame at once */
RESTORE_ALL 6*8 RESTORE_ALL 6*8
/* Clear the NMI executing stack variable */ /* Clear "NMI executing". */
movq $0, 5*8(%rsp) movq $0, 5*8(%rsp)
/*
* INTERRUPT_RETURN reads the "iret" frame and exits the NMI
* stack in a single instruction. We are returning to kernel
* mode, so this cannot result in a fault.
*/
jmp irq_return jmp irq_return
CFI_ENDPROC CFI_ENDPROC
END(nmi) END(nmi)
......
...@@ -408,8 +408,8 @@ static void default_do_nmi(struct pt_regs *regs) ...@@ -408,8 +408,8 @@ static void default_do_nmi(struct pt_regs *regs)
NOKPROBE_SYMBOL(default_do_nmi); NOKPROBE_SYMBOL(default_do_nmi);
/* /*
* NMIs can hit breakpoints which will cause it to lose its NMI context * NMIs can page fault or hit breakpoints which will cause it to lose
* with the CPU when the breakpoint or page fault does an IRET. * its NMI context with the CPU when the breakpoint or page fault does an IRET.
* *
* As a result, NMIs can nest if NMIs get unmasked due an IRET during * As a result, NMIs can nest if NMIs get unmasked due an IRET during
* NMI processing. On x86_64, the asm glue protects us from nested NMIs * NMI processing. On x86_64, the asm glue protects us from nested NMIs
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment