Commit 7d8d8cfd authored by Andy Lutomirski's avatar Andy Lutomirski Committed by Ingo Molnar

x86/doublefault/32: Rewrite the x86_32 #DF handler and unify with 64-bit

The old x86_32 doublefault_fn() was old and crufty, and it did not
even try to recover.  do_double_fault() is much nicer.  Rewrite the
32-bit double fault code to sanitize CPU state and call
do_double_fault().  This is mostly an exercise i386 archaeology.

With this patch applied, 32-bit double faults get a real stack trace,
just like 64-bit double faults.

[ mingo: merged the patch to a later kernel base. ]
Signed-off-by: default avatarAndy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent dc4e0021
...@@ -1537,6 +1537,48 @@ SYM_CODE_START(debug) ...@@ -1537,6 +1537,48 @@ SYM_CODE_START(debug)
jmp common_exception jmp common_exception
SYM_CODE_END(debug) SYM_CODE_END(debug)
#ifdef CONFIG_DOUBLEFAULT
SYM_CODE_START(double_fault)
1:
/*
* This is a task gate handler, not an interrupt gate handler.
* The error code is on the stack, but the stack is otherwise
* empty. Interrupts are off. Our state is sane with the following
* exceptions:
*
* - CR0.TS is set. "TS" literally means "task switched".
* - EFLAGS.NT is set because we're a "nested task".
* - The doublefault TSS has back_link set and has been marked busy.
* - TR points to the doublefault TSS and the normal TSS is busy.
* - CR3 is the normal kernel PGD. This would be delightful, except
* that the CPU didn't bother to save the old CR3 anywhere. This
* would make it very awkward to return back to the context we came
* from.
*
* The rest of EFLAGS is sanitized for us, so we don't need to
* worry about AC or DF.
*
* Don't even bother popping the error code. It's always zero,
* and ignoring it makes us a bit more robust against buggy
* hypervisor task gate implementations.
*
* We will manually undo the task switch instead of doing a
* task-switching IRET.
*/
clts /* clear CR0.TS */
pushl $X86_EFLAGS_FIXED
popfl /* clear EFLAGS.NT */
call doublefault_shim
/* We don't support returning, so we have no IRET here. */
1:
hlt
jmp 1b
SYM_CODE_END(double_fault)
#endif
/* /*
* NMI is doubly nasty. It can happen on the first instruction of * NMI is doubly nasty. It can happen on the first instruction of
* entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
......
...@@ -69,6 +69,9 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code); ...@@ -69,6 +69,9 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code);
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code); dotraplinkage void do_bounds(struct pt_regs *regs, long error_code);
dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code);
dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code); dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code);
#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2);
#endif
dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code); dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code);
dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code);
dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
......
...@@ -9,42 +9,83 @@ ...@@ -9,42 +9,83 @@
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/desc.h> #include <asm/desc.h>
#include <asm/traps.h>
extern void double_fault(void);
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
static void doublefault_fn(void) #define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
{
struct desc_ptr gdt_desc = {0, 0};
unsigned long gdt, tss;
BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE); static void set_df_gdt_entry(unsigned int cpu);
native_store_gdt(&gdt_desc); /*
gdt = gdt_desc.address; * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
* we're running the doublefault task. Cannot return.
*/
asmlinkage notrace void __noreturn doublefault_shim(void)
{
unsigned long cr2;
struct pt_regs regs;
printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
if (ptr_ok(gdt)) { cr2 = native_read_cr2();
gdt += GDT_ENTRY_TSS << 3;
tss = get_desc_base((struct desc_struct *)gdt);
printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
if (ptr_ok(tss)) { /* Reset back to the normal kernel task. */
struct x86_hw_tss *t = (struct x86_hw_tss *)tss; force_reload_TR();
set_df_gdt_entry(smp_processor_id());
printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", trace_hardirqs_off();
t->ip, t->sp);
printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", /*
t->ax, t->bx, t->cx, t->dx); * Fill in pt_regs. A downside of doing this in C is that the unwinder
printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
t->si, t->di); * won't successfully unwind to the source of the double fault.
} * The main dump from do_double_fault() is fine, though, since it
} * uses these regs directly.
*
* If anyone ever cares, this could be moved to asm.
*/
regs.ss = TSS(ss);
regs.__ssh = 0;
regs.sp = TSS(sp);
regs.flags = TSS(flags);
regs.cs = TSS(cs);
/* We won't go through the entry asm, so we can leave __csh as 0. */
regs.__csh = 0;
regs.ip = TSS(ip);
regs.orig_ax = 0;
regs.gs = TSS(gs);
regs.__gsh = 0;
regs.fs = TSS(fs);
regs.__fsh = 0;
regs.es = TSS(es);
regs.__esh = 0;
regs.ds = TSS(ds);
regs.__dsh = 0;
regs.ax = TSS(ax);
regs.bp = TSS(bp);
regs.di = TSS(di);
regs.si = TSS(si);
regs.dx = TSS(dx);
regs.cx = TSS(cx);
regs.bx = TSS(bx);
do_double_fault(&regs, 0, cr2);
for (;;) /*
cpu_relax(); * x86_32 does not save the original CR3 anywhere on a task switch.
* This means that, even if we wanted to return, we would need to find
* some way to reconstruct CR3. We could make a credible guess based
* on cpu_tlbstate, but that would be racy and would not account for
* PTI.
*
* Instead, don't bother. We can return through
* rewind_stack_do_exit() instead.
*/
panic("cannot return from double fault\n");
} }
NOKPROBE_SYMBOL(doublefault_shim);
DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.tss = { .tss = {
...@@ -55,9 +96,8 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { ...@@ -55,9 +96,8 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.ldt = 0, .ldt = 0,
.io_bitmap_base = IO_BITMAP_OFFSET_INVALID, .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
.ip = (unsigned long) doublefault_fn, .ip = (unsigned long) double_fault,
/* 0x2 bit is always set */ .flags = X86_EFLAGS_FIXED,
.flags = X86_EFLAGS_SF | 0x2,
.es = __USER_DS, .es = __USER_DS,
.cs = __KERNEL_CS, .cs = __KERNEL_CS,
.ss = __KERNEL_DS, .ss = __KERNEL_DS,
...@@ -71,6 +111,14 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { ...@@ -71,6 +111,14 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
}, },
}; };
static void set_df_gdt_entry(unsigned int cpu)
{
/* Set up doublefault TSS pointer in the GDT */
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
&get_cpu_entry_area(cpu)->doublefault_stack.tss);
}
void doublefault_init_cpu_tss(void) void doublefault_init_cpu_tss(void)
{ {
unsigned int cpu = smp_processor_id(); unsigned int cpu = smp_processor_id();
...@@ -84,8 +132,5 @@ void doublefault_init_cpu_tss(void) ...@@ -84,8 +132,5 @@ void doublefault_init_cpu_tss(void)
(unsigned long)&cea->doublefault_stack.stack + (unsigned long)&cea->doublefault_stack.stack +
sizeof(doublefault_stack.stack)); sizeof(doublefault_stack.stack));
/* Set up doublefault TSS pointer in the GDT */ set_df_gdt_entry(cpu);
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
&get_cpu_entry_area(cpu)->doublefault_stack.tss);
} }
...@@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message, ...@@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message,
} }
#endif #endif
#ifdef CONFIG_X86_64 #if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
/* Runs on IST stack */ /*
* Runs on an IST stack for x86_64 and on a special task stack for x86_32.
*
* On x86_64, this is more or less a normal kernel entry. Notwithstanding the
* SDM's warnings about double faults being unrecoverable, returning works as
* expected. Presumably what the SDM actually means is that the CPU may get
* the register state wrong on entry, so returning could be a bad idea.
*
* Various CPU engineers have promised that double faults due to an IRET fault
* while the stack is read-only are, in fact, recoverable.
*
* On x86_32, this is entered through a task gate, and regs are synthesized
* from the TSS. Returning is, in principle, okay, but changes to regs will
* be lost. If, for some reason, we need to return to a context with modified
* regs, the shim code could be adjusted to synchronize the registers.
*/
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2) dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
{ {
static const char str[] = "double fault"; static const char str[] = "double fault";
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment