Commit 2a594d4c authored by Thomas Gleixner, committed by Borislav Petkov

x86/exceptions: Split debug IST stack

The debug IST stack is actually two separate debug stacks to handle #DB
recursion. This is required because the CPU starts always at top of stack
on exception entry, which means on #DB recursion the second #DB would
overwrite the stack of the first.

The low level entry code therefore adjusts the top of stack on entry so a
secondary #DB starts from a different stack page. But the stack pages are
adjacent without a guard page between them.

Split the debug stack into 3 stacks which are separated by guard pages. The
3rd stack is never mapped into the cpu_entry_area and is only there to
catch triple #DB nesting:

      --- top of DB_stack	<- Initial stack
      --- end of DB_stack
      	  guard page

      --- top of DB1_stack	<- Top of stack after entering first #DB
      --- end of DB1_stack
      	  guard page

      --- top of DB2_stack	<- Top of stack after entering second #DB
      --- end of DB2_stack
      	  guard page

If DB2 would not act as the final guard hole, a second #DB would point the
top of #DB stack to the stack below #DB1 which would be valid and not catch
the not so desired triple nesting.

The backing store does not allocate any memory for DB2 and its guard page
as it is not going to be mapped into the cpu_entry_area.

 - Adjust the low level entry code so it adjusts top of #DB with the offset
   between the stacks instead of exception stack size.

 - Make the dumpstack code aware of the new stacks.

 - Adjust the in_debug_stack() implementation and move it into the NMI code
   where it belongs. As this is NMI hotpath code, it just checks the full
   area between top of DB_stack and bottom of DB1_stack without checking
   for the guard page. That's correct because the NMI cannot hit a
   stackpointer pointing to the guard page between DB and DB1 stack.  Even
   if it would, then the NMI operation still is unaffected, but the resume
   of the debug exception on the topmost DB stack will crash by touching
   the guard page.

  [ bp: Make exception_stack_names static const char * const ]
Suggested-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Chang S. Bae" <chang.seok.bae@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: linux-doc@vger.kernel.org
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: x86-ml <x86@kernel.org>
Link: https://lkml.kernel.org/r/20190414160145.439944544@linutronix.de
parent 1bdb67e5
...@@ -76,7 +76,7 @@ The currently assigned IST stacks are :- ...@@ -76,7 +76,7 @@ The currently assigned IST stacks are :-
middle of switching stacks. Using IST for NMI events avoids making middle of switching stacks. Using IST for NMI events avoids making
assumptions about the previous state of the kernel stack. assumptions about the previous state of the kernel stack.
* ESTACK_DB. DEBUG_STKSZ * ESTACK_DB. EXCEPTION_STKSZ (PAGE_SIZE).
Used for hardware debug interrupts (interrupt 1) and for software Used for hardware debug interrupts (interrupt 1) and for software
debug interrupts (INT3). debug interrupts (INT3).
...@@ -86,6 +86,11 @@ The currently assigned IST stacks are :- ...@@ -86,6 +86,11 @@ The currently assigned IST stacks are :-
avoids making assumptions about the previous state of the kernel avoids making assumptions about the previous state of the kernel
stack. stack.
To handle nested #DB correctly there exist two instances of DB stacks. On
#DB entry the IST stackpointer for #DB is switched to the second instance
so a nested #DB starts from a clean stack. The nested #DB switches
the IST stackpointer to a guard hole to catch triple nesting.
* ESTACK_MCE. EXCEPTION_STKSZ (PAGE_SIZE). * ESTACK_MCE. EXCEPTION_STKSZ (PAGE_SIZE).
Used for interrupt 18 - Machine Check Exception (#MC). Used for interrupt 18 - Machine Check Exception (#MC).
......
...@@ -879,7 +879,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt ...@@ -879,7 +879,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
* @paranoid == 2 is special: the stub will never switch stacks. This is for * @paranoid == 2 is special: the stub will never switch stacks. This is for
* #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
*/ */
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0
ENTRY(\sym) ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8 UNWIND_HINT_IRET_REGS offset=\has_error_code*8
...@@ -925,13 +925,13 @@ ENTRY(\sym) ...@@ -925,13 +925,13 @@ ENTRY(\sym)
.endif .endif
.if \shift_ist != -1 .if \shift_ist != -1
subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) subq $\ist_offset, CPU_TSS_IST(\shift_ist)
.endif .endif
call \do_sym call \do_sym
.if \shift_ist != -1 .if \shift_ist != -1
addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) addq $\ist_offset, CPU_TSS_IST(\shift_ist)
.endif .endif
/* these procedures expect "no swapgs" flag in ebx */ /* these procedures expect "no swapgs" flag in ebx */
...@@ -1129,7 +1129,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ ...@@ -1129,7 +1129,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \
hv_stimer0_callback_vector hv_stimer0_vector_handler hv_stimer0_callback_vector hv_stimer0_vector_handler
#endif /* CONFIG_HYPERV */ #endif /* CONFIG_HYPERV */
idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
idtentry int3 do_int3 has_error_code=0 idtentry int3 do_int3 has_error_code=0
idtentry stack_segment do_stack_segment has_error_code=1 idtentry stack_segment do_stack_segment has_error_code=1
......
...@@ -10,25 +10,29 @@ ...@@ -10,25 +10,29 @@
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
/* Macro to enforce the same ordering and stack sizes */ /* Macro to enforce the same ordering and stack sizes */
#define ESTACKS_MEMBERS(guardsize) \ #define ESTACKS_MEMBERS(guardsize, db2_holesize)\
char DF_stack_guard[guardsize]; \ char DF_stack_guard[guardsize]; \
char DF_stack[EXCEPTION_STKSZ]; \ char DF_stack[EXCEPTION_STKSZ]; \
char NMI_stack_guard[guardsize]; \ char NMI_stack_guard[guardsize]; \
char NMI_stack[EXCEPTION_STKSZ]; \ char NMI_stack[EXCEPTION_STKSZ]; \
char DB2_stack_guard[guardsize]; \
char DB2_stack[db2_holesize]; \
char DB1_stack_guard[guardsize]; \
char DB1_stack[EXCEPTION_STKSZ]; \
char DB_stack_guard[guardsize]; \ char DB_stack_guard[guardsize]; \
char DB_stack[DEBUG_STKSZ]; \ char DB_stack[EXCEPTION_STKSZ]; \
char MCE_stack_guard[guardsize]; \ char MCE_stack_guard[guardsize]; \
char MCE_stack[EXCEPTION_STKSZ]; \ char MCE_stack[EXCEPTION_STKSZ]; \
char IST_top_guard[guardsize]; \ char IST_top_guard[guardsize]; \
/* The exception stacks' physical storage. No guard pages required */ /* The exception stacks' physical storage. No guard pages required */
struct exception_stacks { struct exception_stacks {
ESTACKS_MEMBERS(0) ESTACKS_MEMBERS(0, 0)
}; };
/* The effective cpu entry area mapping with guard pages. */ /* The effective cpu entry area mapping with guard pages. */
struct cea_exception_stacks { struct cea_exception_stacks {
ESTACKS_MEMBERS(PAGE_SIZE) ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ)
}; };
/* /*
...@@ -37,6 +41,8 @@ struct cea_exception_stacks { ...@@ -37,6 +41,8 @@ struct cea_exception_stacks {
enum exception_stack_ordering { enum exception_stack_ordering {
ESTACK_DF, ESTACK_DF,
ESTACK_NMI, ESTACK_NMI,
ESTACK_DB2,
ESTACK_DB1,
ESTACK_DB, ESTACK_DB,
ESTACK_MCE, ESTACK_MCE,
N_EXCEPTION_STACKS N_EXCEPTION_STACKS
......
...@@ -104,11 +104,9 @@ static inline void debug_stack_usage_dec(void) ...@@ -104,11 +104,9 @@ static inline void debug_stack_usage_dec(void)
{ {
__this_cpu_dec(debug_stack_usage); __this_cpu_dec(debug_stack_usage);
} }
int is_debug_stack(unsigned long addr);
void debug_stack_set_zero(void); void debug_stack_set_zero(void);
void debug_stack_reset(void); void debug_stack_reset(void);
#else /* !X86_64 */ #else /* !X86_64 */
static inline int is_debug_stack(unsigned long addr) { return 0; }
static inline void debug_stack_set_zero(void) { } static inline void debug_stack_set_zero(void) { }
static inline void debug_stack_reset(void) { } static inline void debug_stack_reset(void) { }
static inline void debug_stack_usage_inc(void) { } static inline void debug_stack_usage_inc(void) { }
......
...@@ -18,9 +18,6 @@ ...@@ -18,9 +18,6 @@
#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
......
...@@ -68,6 +68,8 @@ int main(void) ...@@ -68,6 +68,8 @@ int main(void)
#undef ENTRY #undef ENTRY
OFFSET(TSS_ist, tss_struct, x86_tss.ist); OFFSET(TSS_ist, tss_struct, x86_tss.ist);
DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) -
offsetof(struct cea_exception_stacks, DB1_stack));
BLANK(); BLANK();
#ifdef CONFIG_STACKPROTECTOR #ifdef CONFIG_STACKPROTECTOR
......
...@@ -1549,17 +1549,7 @@ void syscall_init(void) ...@@ -1549,17 +1549,7 @@ void syscall_init(void)
X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
} }
static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
DEFINE_PER_CPU(int, debug_stack_usage); DEFINE_PER_CPU(int, debug_stack_usage);
int is_debug_stack(unsigned long addr)
{
return __this_cpu_read(debug_stack_usage) ||
(addr <= __this_cpu_read(debug_stack_addr) &&
addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ));
}
NOKPROBE_SYMBOL(is_debug_stack);
DEFINE_PER_CPU(u32, debug_idt_ctr); DEFINE_PER_CPU(u32, debug_idt_ctr);
void debug_stack_set_zero(void) void debug_stack_set_zero(void)
...@@ -1735,7 +1725,6 @@ void cpu_init(void) ...@@ -1735,7 +1725,6 @@ void cpu_init(void)
t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[IST_INDEX_DB];
} }
t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
......
...@@ -19,16 +19,18 @@ ...@@ -19,16 +19,18 @@
#include <asm/cpu_entry_area.h> #include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h> #include <asm/stacktrace.h>
static const char *exception_stack_names[N_EXCEPTION_STACKS] = { static const char * const exception_stack_names[] = {
[ ESTACK_DF ] = "#DF", [ ESTACK_DF ] = "#DF",
[ ESTACK_NMI ] = "NMI", [ ESTACK_NMI ] = "NMI",
[ ESTACK_DB2 ] = "#DB2",
[ ESTACK_DB1 ] = "#DB1",
[ ESTACK_DB ] = "#DB", [ ESTACK_DB ] = "#DB",
[ ESTACK_MCE ] = "#MC", [ ESTACK_MCE ] = "#MC",
}; };
const char *stack_type_name(enum stack_type type) const char *stack_type_name(enum stack_type type)
{ {
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
if (type == STACK_TYPE_IRQ) if (type == STACK_TYPE_IRQ)
return "IRQ"; return "IRQ";
...@@ -58,9 +60,11 @@ struct estack_layout { ...@@ -58,9 +60,11 @@ struct estack_layout {
.end = offsetof(struct cea_exception_stacks, x## _stack_guard) \ .end = offsetof(struct cea_exception_stacks, x## _stack_guard) \
} }
static const struct estack_layout layout[N_EXCEPTION_STACKS] = { static const struct estack_layout layout[] = {
[ ESTACK_DF ] = ESTACK_ENTRY(DF), [ ESTACK_DF ] = ESTACK_ENTRY(DF),
[ ESTACK_NMI ] = ESTACK_ENTRY(NMI), [ ESTACK_NMI ] = ESTACK_ENTRY(NMI),
[ ESTACK_DB2 ] = { .begin = 0, .end = 0},
[ ESTACK_DB1 ] = ESTACK_ENTRY(DB1),
[ ESTACK_DB ] = ESTACK_ENTRY(DB), [ ESTACK_DB ] = ESTACK_ENTRY(DB),
[ ESTACK_MCE ] = ESTACK_ENTRY(MCE), [ ESTACK_MCE ] = ESTACK_ENTRY(MCE),
}; };
...@@ -71,7 +75,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) ...@@ -71,7 +75,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
struct pt_regs *regs; struct pt_regs *regs;
unsigned int k; unsigned int k;
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
estacks = (unsigned long)__this_cpu_read(cea_exception_stacks); estacks = (unsigned long)__this_cpu_read(cea_exception_stacks);
......
...@@ -21,13 +21,14 @@ ...@@ -21,13 +21,14 @@
#include <linux/ratelimit.h> #include <linux/ratelimit.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/export.h> #include <linux/export.h>
#include <linux/atomic.h>
#include <linux/sched/clock.h> #include <linux/sched/clock.h>
#if defined(CONFIG_EDAC) #if defined(CONFIG_EDAC)
#include <linux/edac.h> #include <linux/edac.h>
#endif #endif
#include <linux/atomic.h> #include <asm/cpu_entry_area.h>
#include <asm/traps.h> #include <asm/traps.h>
#include <asm/mach_traps.h> #include <asm/mach_traps.h>
#include <asm/nmi.h> #include <asm/nmi.h>
...@@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2); ...@@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2);
* switch back to the original IDT. * switch back to the original IDT.
*/ */
static DEFINE_PER_CPU(int, update_debug_stack); static DEFINE_PER_CPU(int, update_debug_stack);
static bool notrace is_debug_stack(unsigned long addr)
{
struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks);
unsigned long top = CEA_ESTACK_TOP(cs, DB);
unsigned long bot = CEA_ESTACK_BOT(cs, DB1);
if (__this_cpu_read(debug_stack_usage))
return true;
/*
* Note, this covers the guard page between DB and DB1 as well to
* avoid two checks. But by all means @addr can never point into
* the guard page.
*/
return addr >= bot && addr < top;
}
NOKPROBE_SYMBOL(is_debug_stack);
#endif #endif
dotraplinkage notrace void dotraplinkage notrace void
......
...@@ -98,10 +98,12 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) ...@@ -98,10 +98,12 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
/* /*
* The exceptions stack mappings in the per cpu area are protected * The exceptions stack mappings in the per cpu area are protected
* by guard pages so each stack must be mapped separately. * by guard pages so each stack must be mapped separately. DB2 is
* not mapped; it just exists to catch triple nesting of #DB.
*/ */
cea_map_stack(DF); cea_map_stack(DF);
cea_map_stack(NMI); cea_map_stack(NMI);
cea_map_stack(DB1);
cea_map_stack(DB); cea_map_stack(DB);
cea_map_stack(MCE); cea_map_stack(MCE);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment