Commit e5b3fc12 authored by Linus Torvalds

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
 "Various fixes:

   - Fix the PAT performance regression that downgraded write-combining
     device memory regions to uncached.

   - There's been a number of bugs in 32-bit double fault handling -
     hopefully all fixed now.

   - Fix an LDT crash

   - Fix an FPU over-optimization that broke with GCC9 code
     optimizations.

   - Misc cleanups"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm/pat: Fix off-by-one bugs in interval tree search
  x86/ioperm: Save an indentation level in tss_update_io_bitmap()
  x86/fpu: Don't cache access to fpu_fpregs_owner_ctx
  x86/entry/32: Remove unused 'restore_all_notrace' local label
  x86/ptrace: Document FSBASE and GSBASE ABI oddities
  x86/ptrace: Remove set_segment_reg() implementations for current
  x86/traps: die() instead of panicking on a double fault
  x86/doublefault/32: Rewrite the x86_32 #DF handler and unify with 64-bit
  x86/doublefault/32: Move #DF stack and TSS to cpu_entry_area
  x86/doublefault/32: Rename doublefault.c to doublefault_32.c
  x86/traps: Disentangle the 32-bit and 64-bit doublefault code
  lkdtm: Add a DOUBLE_FAULT crash type on x86
  selftests/x86/single_step_syscall: Check SYSENTER directly
  x86/mm/32: Sync only to VMALLOC_END in vmalloc_sync_all()
parents b7fcf31f 91298f1a
...@@ -117,7 +117,7 @@ config DEBUG_WX ...@@ -117,7 +117,7 @@ config DEBUG_WX
config DOUBLEFAULT config DOUBLEFAULT
default y default y
bool "Enable doublefault exception handler" if EXPERT bool "Enable doublefault exception handler" if EXPERT && X86_32
---help--- ---help---
This option allows trapping of rare doublefault exceptions that This option allows trapping of rare doublefault exceptions that
would otherwise cause a system to silently reboot. Disabling this would otherwise cause a system to silently reboot. Disabling this
......
...@@ -1090,7 +1090,6 @@ SYM_FUNC_START(entry_INT80_32) ...@@ -1090,7 +1090,6 @@ SYM_FUNC_START(entry_INT80_32)
restore_all: restore_all:
TRACE_IRQS_IRET TRACE_IRQS_IRET
SWITCH_TO_ENTRY_STACK SWITCH_TO_ENTRY_STACK
.Lrestore_all_notrace:
CHECK_AND_APPLY_ESPFIX CHECK_AND_APPLY_ESPFIX
.Lrestore_nocheck: .Lrestore_nocheck:
/* Switch back to user CR3 */ /* Switch back to user CR3 */
...@@ -1537,6 +1536,48 @@ SYM_CODE_START(debug) ...@@ -1537,6 +1536,48 @@ SYM_CODE_START(debug)
jmp common_exception jmp common_exception
SYM_CODE_END(debug) SYM_CODE_END(debug)
#ifdef CONFIG_DOUBLEFAULT
SYM_CODE_START(double_fault)
1:
/*
* This is a task gate handler, not an interrupt gate handler.
* The error code is on the stack, but the stack is otherwise
* empty. Interrupts are off. Our state is sane with the following
* exceptions:
*
* - CR0.TS is set. "TS" literally means "task switched".
* - EFLAGS.NT is set because we're a "nested task".
* - The doublefault TSS has back_link set and has been marked busy.
* - TR points to the doublefault TSS and the normal TSS is busy.
* - CR3 is the normal kernel PGD. This would be delightful, except
* that the CPU didn't bother to save the old CR3 anywhere. This
* would make it very awkward to return back to the context we came
* from.
*
* The rest of EFLAGS is sanitized for us, so we don't need to
* worry about AC or DF.
*
* Don't even bother popping the error code. It's always zero,
* and ignoring it makes us a bit more robust against buggy
* hypervisor task gate implementations.
*
* We will manually undo the task switch instead of doing a
* task-switching IRET.
*/
clts /* clear CR0.TS */
/* Load EFLAGS with only the fixed bit set, which drops NT. */
pushl $X86_EFLAGS_FIXED
popfl /* clear EFLAGS.NT */
/* Hand off to the C shim, which builds pt_regs from the normal TSS. */
call doublefault_shim
/* We don't support returning, so we have no IRET here. */
/* If the shim ever comes back anyway, park this CPU in a halt loop. */
1:
hlt
jmp 1b
SYM_CODE_END(double_fault)
#endif
/* /*
* NMI is doubly nasty. It can happen on the first instruction of * NMI is doubly nasty. It can happen on the first instruction of
* entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning
......
...@@ -65,6 +65,13 @@ enum exception_stack_ordering { ...@@ -65,6 +65,13 @@ enum exception_stack_ordering {
#endif #endif
#ifdef CONFIG_X86_32
/*
 * Page-aligned region holding the x86_32 double-fault stack followed by the
 * hardware TSS used for the #DF task. The stack array is sized to fill the
 * part of the page that precedes the TSS.
 */
struct doublefault_stack {
unsigned long stack[(PAGE_SIZE - sizeof(struct x86_hw_tss)) / sizeof(unsigned long)];
struct x86_hw_tss tss;
} __aligned(PAGE_SIZE);
#endif
/* /*
* cpu_entry_area is a percpu region that contains things needed by the CPU * cpu_entry_area is a percpu region that contains things needed by the CPU
* and early entry/exit code. Real types aren't used for all fields here * and early entry/exit code. Real types aren't used for all fields here
...@@ -86,6 +93,11 @@ struct cpu_entry_area { ...@@ -86,6 +93,11 @@ struct cpu_entry_area {
#endif #endif
struct entry_stack_page entry_stack_page; struct entry_stack_page entry_stack_page;
#ifdef CONFIG_X86_32
char guard_doublefault_stack[PAGE_SIZE];
struct doublefault_stack doublefault_stack;
#endif
/* /*
* On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
* we need task switches to work, and task switches write to the TSS. * we need task switches to work, and task switches write to the TSS.
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DOUBLEFAULT_H
#define _ASM_X86_DOUBLEFAULT_H
#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT)
extern void doublefault_init_cpu_tss(void);
#else
/* Nothing to set up unless both CONFIG_X86_32 and CONFIG_DOUBLEFAULT are enabled. */
static inline void doublefault_init_cpu_tss(void) { }
#endif
#endif /* _ASM_X86_DOUBLEFAULT_H */
...@@ -509,7 +509,7 @@ static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) ...@@ -509,7 +509,7 @@ static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{ {
return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
} }
/* /*
......
...@@ -41,10 +41,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ ...@@ -41,10 +41,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
#endif #endif
/* /*
* Define this here and validate with BUILD_BUG_ON() in pgtable_32.c * This is an upper bound on sizeof(struct cpu_entry_area) / PAGE_SIZE.
* to avoid include recursion hell * Define this here and validate with BUILD_BUG_ON() in cpu_entry_area.c
* to avoid include recursion hell.
*/ */
#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 41) #define CPU_ENTRY_AREA_PAGES (NR_CPUS * 43)
/* The +1 is for the readonly IDT page: */ /* The +1 is for the readonly IDT page: */
#define CPU_ENTRY_AREA_BASE \ #define CPU_ENTRY_AREA_BASE \
......
...@@ -166,7 +166,6 @@ enum cpuid_regs_idx { ...@@ -166,7 +166,6 @@ enum cpuid_regs_idx {
extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data; extern struct cpuinfo_x86 new_cpu_data;
extern struct x86_hw_tss doublefault_tss;
extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
...@@ -997,7 +996,6 @@ bool xen_set_default_idle(void); ...@@ -997,7 +996,6 @@ bool xen_set_default_idle(void);
#endif #endif
void stop_this_cpu(void *dummy); void stop_this_cpu(void *dummy);
void df_debug(struct pt_regs *regs, long error_code);
void microcode_check(void); void microcode_check(void);
enum l1tf_mitigations { enum l1tf_mitigations {
......
...@@ -69,6 +69,9 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code); ...@@ -69,6 +69,9 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code);
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code); dotraplinkage void do_bounds(struct pt_regs *regs, long error_code);
dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code);
dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code); dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code);
#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2);
#endif
dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code); dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code);
dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code);
dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
......
...@@ -100,7 +100,9 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o ...@@ -100,7 +100,9 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/ obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_DOUBLEFAULT) += doublefault.o ifeq ($(CONFIG_X86_32),y)
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
endif
obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <asm/stackprotector.h> #include <asm/stackprotector.h>
#include <asm/perf_event.h> #include <asm/perf_event.h>
#include <asm/mmu_context.h> #include <asm/mmu_context.h>
#include <asm/doublefault.h>
#include <asm/archrandom.h> #include <asm/archrandom.h>
#include <asm/hypervisor.h> #include <asm/hypervisor.h>
#include <asm/processor.h> #include <asm/processor.h>
...@@ -1814,8 +1815,6 @@ static inline void tss_setup_ist(struct tss_struct *tss) ...@@ -1814,8 +1815,6 @@ static inline void tss_setup_ist(struct tss_struct *tss)
tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
} }
static inline void gdt_setup_doublefault_tss(int cpu) { }
#else /* CONFIG_X86_64 */ #else /* CONFIG_X86_64 */
static inline void setup_getcpu(int cpu) { } static inline void setup_getcpu(int cpu) { }
...@@ -1827,13 +1826,6 @@ static inline void ucode_cpu_init(int cpu) ...@@ -1827,13 +1826,6 @@ static inline void ucode_cpu_init(int cpu)
static inline void tss_setup_ist(struct tss_struct *tss) { } static inline void tss_setup_ist(struct tss_struct *tss) { }
static inline void gdt_setup_doublefault_tss(int cpu)
{
#ifdef CONFIG_DOUBLEFAULT
/* Set up the doublefault TSS pointer in the GDT */
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
#endif
}
#endif /* !CONFIG_X86_64 */ #endif /* !CONFIG_X86_64 */
static inline void tss_setup_io_bitmap(struct tss_struct *tss) static inline void tss_setup_io_bitmap(struct tss_struct *tss)
...@@ -1923,7 +1915,7 @@ void cpu_init(void) ...@@ -1923,7 +1915,7 @@ void cpu_init(void)
clear_all_debug_regs(); clear_all_debug_regs();
dbg_restore_debug_regs(); dbg_restore_debug_regs();
gdt_setup_doublefault_tss(cpu); doublefault_init_cpu_tss();
fpu__init_cpu(); fpu__init_cpu();
......
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>
#ifdef CONFIG_X86_32
#define DOUBLEFAULT_STACKSIZE (1024)
static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
/*
 * Entry point of the old x86_32 doublefault task (doublefault_tss.ip points
 * here). Prints the GDT location and, when the pointers pass ptr_ok(), the
 * TSS base found in the GDT_ENTRY_TSS descriptor and the ip/sp and
 * general-purpose register values stored in that TSS. Never returns: it
 * spins in cpu_relax() forever.
 */
static void doublefault_fn(void)
{
struct desc_ptr gdt_desc = {0, 0};
unsigned long gdt, tss;
/* Fetch the GDT base and size. */
native_store_gdt(&gdt_desc);
gdt = gdt_desc.address;
printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
/* Only dereference the GDT if its address lies in the expected kernel range. */
if (ptr_ok(gdt)) {
/* Each descriptor is 8 bytes, hence the shift by 3. */
gdt += GDT_ENTRY_TSS << 3;
tss = get_desc_base((struct desc_struct *)gdt);
printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
/* Same sanity check before reading the TSS contents. */
if (ptr_ok(tss)) {
struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
t->ip, t->sp);
printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
t->ax, t->bx, t->cx, t->dx);
printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
t->si, t->di);
}
}
/* There is no way back from here; idle this CPU forever. */
for (;;)
cpu_relax();
}
struct x86_hw_tss doublefault_tss __cacheline_aligned = {
.sp0 = STACK_START,
.ss0 = __KERNEL_DS,
.ldt = 0,
.io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
.ip = (unsigned long) doublefault_fn,
/* 0x2 bit is always set */
.flags = X86_EFLAGS_SF | 0x2,
.sp = STACK_START,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
#ifndef CONFIG_X86_32_LAZY_GS
.gs = __KERNEL_STACK_CANARY,
#endif
.__cr3 = __pa_nodebug(swapper_pg_dir),
};
/* dummy for do_double_fault() call */
void df_debug(struct pt_regs *regs, long error_code) {}
#else /* !CONFIG_X86_32 */
/*
 * Non-32-bit double-fault reporter: logs the error code, dumps the register
 * state in @regs, then panics.
 */
void df_debug(struct pt_regs *regs, long error_code)
{
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
show_regs(regs);
panic("Machine halted.");
}
#endif
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/init_task.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/traps.h>
extern void double_fault(void);
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
static void set_df_gdt_entry(unsigned int cpu);
/*
 * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
 * we're running the doublefault task. Cannot return.
 *
 * Resets back to the normal kernel task, synthesizes a pt_regs from the
 * register values held in this CPU's normal TSS (cpu_tss_rw) and passes it,
 * together with CR2 and an error code of 0, to do_double_fault(). Panics if
 * do_double_fault() returns.
 */
asmlinkage notrace void __noreturn doublefault_shim(void)
{
unsigned long cr2;
struct pt_regs regs;
/* Compile-time check that stack + TSS occupy exactly one page. */
BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
/* Read CR2 now; it is handed to do_double_fault() below. */
cr2 = native_read_cr2();
/* Reset back to the normal kernel task. */
force_reload_TR();
/*
 * Rewrite the #DF TSS descriptor in the GDT.
 * NOTE(review): presumably this clears the busy state left by the task
 * switch so a later #DF can be delivered -- confirm.
 */
set_df_gdt_entry(smp_processor_id());
trace_hardirqs_off();
/*
* Fill in pt_regs. A downside of doing this in C is that the unwinder
* won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
* won't successfully unwind to the source of the double fault.
* The main dump from do_double_fault() is fine, though, since it
* uses these regs directly.
*
* If anyone ever cares, this could be moved to asm.
*/
regs.ss = TSS(ss);
regs.__ssh = 0;
regs.sp = TSS(sp);
regs.flags = TSS(flags);
regs.cs = TSS(cs);
/* We won't go through the entry asm, so we can leave __csh as 0. */
regs.__csh = 0;
regs.ip = TSS(ip);
regs.orig_ax = 0;
regs.gs = TSS(gs);
regs.__gsh = 0;
regs.fs = TSS(fs);
regs.__fsh = 0;
regs.es = TSS(es);
regs.__esh = 0;
regs.ds = TSS(ds);
regs.__dsh = 0;
regs.ax = TSS(ax);
regs.bp = TSS(bp);
regs.di = TSS(di);
regs.si = TSS(si);
regs.dx = TSS(dx);
regs.cx = TSS(cx);
regs.bx = TSS(bx);
/* The error code is passed as a constant 0 rather than read off the stack. */
do_double_fault(&regs, 0, cr2);
/*
* x86_32 does not save the original CR3 anywhere on a task switch.
* This means that, even if we wanted to return, we would need to find
* some way to reconstruct CR3. We could make a credible guess based
* on cpu_tlbstate, but that would be racy and would not account for
* PTI.
*
* So don't bother: the only supported way out of a double fault is
* through rewind_stack_do_exit(). Getting here is a fatal error.
*/
panic("cannot return from double fault\n");
}
NOKPROBE_SYMBOL(doublefault_shim);
/*
 * Per-CPU, page-aligned storage for the #DF stack and its TSS. Only the TSS
 * is statically initialized here: execution starts at double_fault with
 * kernel code/stack segments and swapper_pg_dir as CR3. The stack pointer is
 * filled in at runtime by doublefault_init_cpu_tss().
 */
DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.tss = {
/*
* No sp0 or ss0 -- we never run CPL != 0 with this TSS
* active. sp is filled in later.
*/
.ldt = 0,
.io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
.ip = (unsigned long) double_fault,
.flags = X86_EFLAGS_FIXED,
.es = __USER_DS,
.cs = __KERNEL_CS,
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
#ifndef CONFIG_X86_32_LAZY_GS
.gs = __KERNEL_STACK_CANARY,
#endif
.__cr3 = __pa_nodebug(swapper_pg_dir),
},
};
static void set_df_gdt_entry(unsigned int cpu)
{
/* Set up doublefault TSS pointer in the GDT */
__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
&get_cpu_entry_area(cpu)->doublefault_stack.tss);
}
/*
 * Finish setting up the current CPU's doublefault TSS: store the top of the
 * #DF stack (as mapped in the cpu_entry_area) into tss.sp and install the
 * TSS descriptor in the GDT.
 */
void doublefault_init_cpu_tss(void)
{
	unsigned int cpu = smp_processor_id();
	struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
	unsigned long stack_top;

	/*
	 * This has to happen at runtime: the linker cannot initialize a
	 * percpu variable so that it points to another place in percpu
	 * space.
	 */
	stack_top = (unsigned long)&cea->doublefault_stack.stack +
		    sizeof(cea->doublefault_stack.stack);
	this_cpu_write(doublefault_stack.tss.sp, stack_top);

	set_df_gdt_entry(cpu);
}
...@@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type) ...@@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_ENTRY) if (type == STACK_TYPE_ENTRY)
return "ENTRY_TRAMPOLINE"; return "ENTRY_TRAMPOLINE";
if (type == STACK_TYPE_EXCEPTION)
return "#DF";
return NULL; return NULL;
} }
...@@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) ...@@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
return true; return true;
} }
/*
 * Report whether @stack lies inside the current CPU's doublefault stack.
 * On a hit, @info is filled with the stack bounds, the type
 * STACK_TYPE_EXCEPTION, and next_sp taken from the sp field of the normal
 * TSS (cpu_tss_rw). Always returns false without CONFIG_DOUBLEFAULT.
 */
static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info)
{
#ifdef CONFIG_DOUBLEFAULT
	struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id());
	struct doublefault_stack *df = &cea->doublefault_stack;
	void *lo = df->stack;
	void *hi = lo + sizeof(df->stack);
	void *addr = stack;

	if (addr >= lo && addr < hi) {
		info->type = STACK_TYPE_EXCEPTION;
		info->begin = lo;
		info->end = hi;
		info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp);
		return true;
	}
#endif
	return false;
}
int get_stack_info(unsigned long *stack, struct task_struct *task, int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask) struct stack_info *info, unsigned long *visit_mask)
{ {
...@@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, ...@@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_softirq_stack(stack, info)) if (in_softirq_stack(stack, info))
goto recursion_check; goto recursion_check;
if (in_doublefault_stack(stack, info))
goto recursion_check;
goto unknown; goto unknown;
recursion_check: recursion_check:
......
...@@ -377,37 +377,37 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) ...@@ -377,37 +377,37 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
void tss_update_io_bitmap(void) void tss_update_io_bitmap(void)
{ {
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
struct thread_struct *t = &current->thread;
u16 *base = &tss->x86_tss.io_bitmap_base; u16 *base = &tss->x86_tss.io_bitmap_base;
if (test_thread_flag(TIF_IO_BITMAP)) { if (!test_thread_flag(TIF_IO_BITMAP)) {
struct thread_struct *t = &current->thread; tss_invalidate_io_bitmap(tss);
return;
if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { }
*base = IO_BITMAP_OFFSET_VALID_ALL;
} else { if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
struct io_bitmap *iobm = t->io_bitmap; *base = IO_BITMAP_OFFSET_VALID_ALL;
/* } else {
* Only copy bitmap data when the sequence number struct io_bitmap *iobm = t->io_bitmap;
* differs. The update time is accounted to the
* incoming task.
*/
if (tss->io_bitmap.prev_sequence != iobm->sequence)
tss_copy_io_bitmap(tss, iobm);
/* Enable the bitmap */
*base = IO_BITMAP_OFFSET_VALID_MAP;
}
/* /*
* Make sure that the TSS limit is covering the io bitmap. * Only copy bitmap data when the sequence number differs. The
* It might have been cut down by a VMEXIT to 0x67 which * update time is accounted to the incoming task.
* would cause a subsequent I/O access from user space to
* trigger a #GP because tbe bitmap is outside the TSS
* limit.
*/ */
refresh_tss_limit(); if (tss->io_bitmap.prev_sequence != iobm->sequence)
} else { tss_copy_io_bitmap(tss, iobm);
tss_invalidate_io_bitmap(tss);
/* Enable the bitmap */
*base = IO_BITMAP_OFFSET_VALID_MAP;
} }
/*
* Make sure that the TSS limit is covering the IO bitmap. It might have
* been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
* access from user space to trigger a #GP because the bitmap is outside
* the TSS limit.
*/
refresh_tss_limit();
} }
#else /* CONFIG_X86_IOPL_IOPERM */ #else /* CONFIG_X86_IOPL_IOPERM */
static inline void switch_to_bitmap(unsigned long tifp) { } static inline void switch_to_bitmap(unsigned long tifp) { }
......
...@@ -182,6 +182,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) ...@@ -182,6 +182,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task, static int set_segment_reg(struct task_struct *task,
unsigned long offset, u16 value) unsigned long offset, u16 value)
{ {
if (WARN_ON_ONCE(task == current))
return -EIO;
/* /*
* The value argument was already truncated to 16 bits. * The value argument was already truncated to 16 bits.
*/ */
...@@ -209,10 +212,7 @@ static int set_segment_reg(struct task_struct *task, ...@@ -209,10 +212,7 @@ static int set_segment_reg(struct task_struct *task,
break; break;
case offsetof(struct user_regs_struct, gs): case offsetof(struct user_regs_struct, gs):
if (task == current) task_user_gs(task) = value;
set_user_gs(task_pt_regs(task), value);
else
task_user_gs(task) = value;
} }
return 0; return 0;
...@@ -272,32 +272,41 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) ...@@ -272,32 +272,41 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task, static int set_segment_reg(struct task_struct *task,
unsigned long offset, u16 value) unsigned long offset, u16 value)
{ {
if (WARN_ON_ONCE(task == current))
return -EIO;
/* /*
* The value argument was already truncated to 16 bits. * The value argument was already truncated to 16 bits.
*/ */
if (invalid_selector(value)) if (invalid_selector(value))
return -EIO; return -EIO;
/*
* This function has some ABI oddities.
*
* A 32-bit ptracer probably expects that writing FS or GS will change
* FSBASE or GSBASE respectively. In the absence of FSGSBASE support,
* this code indeed has that effect. When FSGSBASE is added, this
* will require a special case.
*
* For existing 64-bit ptracers, writing FS or GS *also* currently
* changes the base if the selector is nonzero the next time the task
* is run. This behavior may not be needed, and trying to preserve it
* when FSGSBASE is added would be complicated at best.
*/
switch (offset) { switch (offset) {
case offsetof(struct user_regs_struct,fs): case offsetof(struct user_regs_struct,fs):
task->thread.fsindex = value; task->thread.fsindex = value;
if (task == current)
loadsegment(fs, task->thread.fsindex);
break; break;
case offsetof(struct user_regs_struct,gs): case offsetof(struct user_regs_struct,gs):
task->thread.gsindex = value; task->thread.gsindex = value;
if (task == current)
load_gs_index(task->thread.gsindex);
break; break;
case offsetof(struct user_regs_struct,ds): case offsetof(struct user_regs_struct,ds):
task->thread.ds = value; task->thread.ds = value;
if (task == current)
loadsegment(ds, task->thread.ds);
break; break;
case offsetof(struct user_regs_struct,es): case offsetof(struct user_regs_struct,es):
task->thread.es = value; task->thread.es = value;
if (task == current)
loadsegment(es, task->thread.es);
break; break;
/* /*
...@@ -375,6 +384,9 @@ static int putreg(struct task_struct *child, ...@@ -375,6 +384,9 @@ static int putreg(struct task_struct *child,
* When changing the FS base, use do_arch_prctl_64() * When changing the FS base, use do_arch_prctl_64()
* to set the index to zero and to set the base * to set the index to zero and to set the base
* as requested. * as requested.
*
* NB: This behavior is nonsensical and likely needs to
* change when FSGSBASE support is added.
*/ */
if (child->thread.fsbase != value) if (child->thread.fsbase != value)
return do_arch_prctl_64(child, ARCH_SET_FS, value); return do_arch_prctl_64(child, ARCH_SET_FS, value);
......
...@@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message, ...@@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message,
} }
#endif #endif
#ifdef CONFIG_X86_64 #if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
/* Runs on IST stack */ /*
* Runs on an IST stack for x86_64 and on a special task stack for x86_32.
*
* On x86_64, this is more or less a normal kernel entry. Notwithstanding the
* SDM's warnings about double faults being unrecoverable, returning works as
* expected. Presumably what the SDM actually means is that the CPU may get
* the register state wrong on entry, so returning could be a bad idea.
*
* Various CPU engineers have promised that double faults due to an IRET fault
* while the stack is read-only are, in fact, recoverable.
*
* On x86_32, this is entered through a task gate, and regs are synthesized
* from the TSS. Returning is, in principle, okay, but changes to regs will
* be lost. If, for some reason, we need to return to a context with modified
* regs, the shim code could be adjusted to synchronize the registers.
*/
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2) dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
{ {
static const char str[] = "double fault"; static const char str[] = "double fault";
...@@ -411,15 +426,9 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign ...@@ -411,15 +426,9 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
#endif #endif
#ifdef CONFIG_DOUBLEFAULT pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
df_debug(regs, error_code); die("double fault", regs, error_code);
#endif panic("Machine halted.");
/*
* This is always a kernel trap and never fixable (and thus must
* never return).
*/
for (;;)
die(str, regs, error_code);
} }
#endif #endif
......
...@@ -17,6 +17,10 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); ...@@ -17,6 +17,10 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
#endif #endif
#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT)
DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
#endif
struct cpu_entry_area *get_cpu_entry_area(int cpu) struct cpu_entry_area *get_cpu_entry_area(int cpu)
{ {
unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
...@@ -108,7 +112,15 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) ...@@ -108,7 +112,15 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
cea_map_stack(MCE); cea_map_stack(MCE);
} }
#else #else
static inline void percpu_setup_exception_stacks(unsigned int cpu) {} static inline void percpu_setup_exception_stacks(unsigned int cpu)
{
#ifdef CONFIG_DOUBLEFAULT
struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
cea_map_percpu_pages(&cea->doublefault_stack,
&per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL);
#endif
}
#endif #endif
/* Setup the fixmap mappings only once per-processor */ /* Setup the fixmap mappings only once per-processor */
......
...@@ -197,7 +197,7 @@ void vmalloc_sync_all(void) ...@@ -197,7 +197,7 @@ void vmalloc_sync_all(void)
return; return;
for (address = VMALLOC_START & PMD_MASK; for (address = VMALLOC_START & PMD_MASK;
address >= TASK_SIZE_MAX && address < FIXADDR_TOP; address >= TASK_SIZE_MAX && address < VMALLOC_END;
address += PMD_SIZE) { address += PMD_SIZE) {
struct page *page; struct page *page;
......
...@@ -56,7 +56,7 @@ static struct memtype *memtype_match(u64 start, u64 end, int match_type) ...@@ -56,7 +56,7 @@ static struct memtype *memtype_match(u64 start, u64 end, int match_type)
{ {
struct memtype *match; struct memtype *match;
match = memtype_interval_iter_first(&memtype_rbroot, start, end); match = memtype_interval_iter_first(&memtype_rbroot, start, end-1);
while (match != NULL && match->start < end) { while (match != NULL && match->start < end) {
if ((match_type == MEMTYPE_EXACT_MATCH) && if ((match_type == MEMTYPE_EXACT_MATCH) &&
(match->start == start) && (match->end == end)) (match->start == start) && (match->end == end))
...@@ -66,7 +66,7 @@ static struct memtype *memtype_match(u64 start, u64 end, int match_type) ...@@ -66,7 +66,7 @@ static struct memtype *memtype_match(u64 start, u64 end, int match_type)
(match->start < start) && (match->end == end)) (match->start < start) && (match->end == end))
return match; return match;
match = memtype_interval_iter_next(match, start, end); match = memtype_interval_iter_next(match, start, end-1);
} }
return NULL; /* Returns NULL if there is no match */ return NULL; /* Returns NULL if there is no match */
...@@ -79,7 +79,7 @@ static int memtype_check_conflict(u64 start, u64 end, ...@@ -79,7 +79,7 @@ static int memtype_check_conflict(u64 start, u64 end,
struct memtype *match; struct memtype *match;
enum page_cache_mode found_type = reqtype; enum page_cache_mode found_type = reqtype;
match = memtype_interval_iter_first(&memtype_rbroot, start, end); match = memtype_interval_iter_first(&memtype_rbroot, start, end-1);
if (match == NULL) if (match == NULL)
goto success; goto success;
...@@ -89,12 +89,12 @@ static int memtype_check_conflict(u64 start, u64 end, ...@@ -89,12 +89,12 @@ static int memtype_check_conflict(u64 start, u64 end,
dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
found_type = match->type; found_type = match->type;
match = memtype_interval_iter_next(match, start, end); match = memtype_interval_iter_next(match, start, end-1);
while (match) { while (match) {
if (match->type != found_type) if (match->type != found_type)
goto failure; goto failure;
match = memtype_interval_iter_next(match, start, end); match = memtype_interval_iter_next(match, start, end-1);
} }
success: success:
if (newtype) if (newtype)
...@@ -160,7 +160,7 @@ struct memtype *memtype_erase(u64 start, u64 end) ...@@ -160,7 +160,7 @@ struct memtype *memtype_erase(u64 start, u64 end)
struct memtype *memtype_lookup(u64 addr) struct memtype *memtype_lookup(u64 addr)
{ {
return memtype_interval_iter_first(&memtype_rbroot, addr, return memtype_interval_iter_first(&memtype_rbroot, addr,
addr + PAGE_SIZE); addr + PAGE_SIZE-1);
} }
#if defined(CONFIG_DEBUG_FS) #if defined(CONFIG_DEBUG_FS)
......
...@@ -12,6 +12,10 @@ ...@@ -12,6 +12,10 @@
#include <linux/sched/task_stack.h> #include <linux/sched/task_stack.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#ifdef CONFIG_X86_32
#include <asm/desc.h>
#endif
struct lkdtm_list { struct lkdtm_list {
struct list_head node; struct list_head node;
}; };
...@@ -337,3 +341,38 @@ void lkdtm_UNSET_SMEP(void) ...@@ -337,3 +341,38 @@ void lkdtm_UNSET_SMEP(void)
pr_err("FAIL: this test is x86_64-only\n"); pr_err("FAIL: this test is x86_64-only\n");
#endif #endif
} }
#ifdef CONFIG_X86_32
/*
 * LKDTM crash type (x86_32 only): deliberately provoke a double fault by
 * loading SS with a zero-limit data segment and then touching the stack.
 * Panics if execution continues past the faulting access.
 */
void lkdtm_DOUBLE_FAULT(void)
{
/*
* Trigger #DF by setting the stack limit to zero. This clobbers
* a GDT TLS slot, which is okay because the current task will die
* anyway due to the double fault.
*/
struct desc_struct d = {
.type = 3, /* expand-up, writable, accessed data */
.p = 1, /* present */
.d = 1, /* 32-bit */
.g = 0, /* limit in bytes */
.s = 1, /* not system */
};
local_irq_disable();
/* Install the zero-limit descriptor in the first TLS slot of this CPU's GDT. */
write_gdt_entry(get_cpu_gdt_rw(smp_processor_id()),
GDT_ENTRY_TLS_MIN, &d, DESCTYPE_S);
/*
* Put our zero-limit segment in SS and then trigger a fault. The
* 4-byte access to (%esp) will fault with #SS, and the attempt to
* deliver the fault will recursively cause #SS and result in #DF.
* This whole process happens while NMIs and MCEs are blocked by the
* MOV SS window. This is nice because an NMI with an invalid SS
* would also double-fault, resulting in the NMI or MCE being lost.
*/
asm volatile ("movw %0, %%ss; addl $0, (%%esp)" ::
"r" ((unsigned short)(GDT_ENTRY_TLS_MIN << 3)));
/* Reaching this line means the double fault did not happen. */
panic("tried to double fault but didn't die\n");
}
#endif
...@@ -171,6 +171,9 @@ static const struct crashtype crashtypes[] = { ...@@ -171,6 +171,9 @@ static const struct crashtype crashtypes[] = {
CRASHTYPE(USERCOPY_KERNEL_DS), CRASHTYPE(USERCOPY_KERNEL_DS),
CRASHTYPE(STACKLEAK_ERASING), CRASHTYPE(STACKLEAK_ERASING),
CRASHTYPE(CFI_FORWARD_PROTO), CRASHTYPE(CFI_FORWARD_PROTO),
#ifdef CONFIG_X86_32
CRASHTYPE(DOUBLE_FAULT),
#endif
}; };
......
...@@ -28,6 +28,9 @@ void lkdtm_CORRUPT_USER_DS(void); ...@@ -28,6 +28,9 @@ void lkdtm_CORRUPT_USER_DS(void);
void lkdtm_STACK_GUARD_PAGE_LEADING(void); void lkdtm_STACK_GUARD_PAGE_LEADING(void);
void lkdtm_STACK_GUARD_PAGE_TRAILING(void); void lkdtm_STACK_GUARD_PAGE_TRAILING(void);
void lkdtm_UNSET_SMEP(void); void lkdtm_UNSET_SMEP(void);
#ifdef CONFIG_X86_32
void lkdtm_DOUBLE_FAULT(void);
#endif
/* lkdtm_heap.c */ /* lkdtm_heap.c */
void __init lkdtm_heap_init(void); void __init lkdtm_heap_init(void);
......
...@@ -43,7 +43,19 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), ...@@ -43,7 +43,19 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
err(1, "sigaction"); err(1, "sigaction");
} }
static volatile sig_atomic_t sig_traps; static void clearhandler(int sig)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = SIG_DFL;
sigemptyset(&sa.sa_mask);
if (sigaction(sig, &sa, 0))
err(1, "sigaction");
}
static volatile sig_atomic_t sig_traps, sig_eflags;
sigjmp_buf jmpbuf;
static unsigned char altstack_data[SIGSTKSZ];
#ifdef __x86_64__ #ifdef __x86_64__
# define REG_IP REG_RIP # define REG_IP REG_RIP
...@@ -90,6 +102,25 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) ...@@ -90,6 +102,25 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
} }
} }
/*
 * Signal number -> printable name, used when reporting a caught signal.
 * Only the signals listed here have an entry; any other index inside the
 * array is a NULL pointer.
 */
static char const * const signames[] = {
	[SIGSEGV] = "SIGSEGV",
	[SIGBUS] = "SIGBUS",
	[SIGTRAP] = "SIGTRAP",
	[SIGILL] = "SIGILL",
};
/*
 * Signal handler used by the forced-SYSENTER test.
 *
 * Reports which signal arrived together with the interrupted instruction
 * pointer and TF bit, stores the interrupted context's flags word in
 * sig_eflags, and then siglongjmp()s to jmpbuf rather than returning to
 * the interrupted code.
 *
 * @sig:      signal number; used as the index into signames[]
 * @si:       unused
 * @ctx_void: the ucontext_t for the interrupted context
 */
static void print_and_longjmp(int sig, siginfo_t *si, void *ctx_void)
{
	ucontext_t *ctx = ctx_void;
	/* Read the saved flags once; it is both printed and recorded. */
	unsigned long flags = (unsigned long)ctx->uc_mcontext.gregs[REG_EFL];

	/* Every numeric argument is unsigned long, hence %lx and %lu. */
	printf("\tGot %s with RIP=%lx, TF=%lu\n", signames[sig],
	       (unsigned long)ctx->uc_mcontext.gregs[REG_IP],
	       flags & X86_EFLAGS_TF);

	sig_eflags = flags;
	siglongjmp(jmpbuf, 1);
}
static void check_result(void) static void check_result(void)
{ {
unsigned long new_eflags = get_eflags(); unsigned long new_eflags = get_eflags();
...@@ -109,6 +140,22 @@ static void check_result(void) ...@@ -109,6 +140,22 @@ static void check_result(void)
sig_traps = 0; sig_traps = 0;
} }
/*
 * Issue one ordinary syscall with TF clear and verify that nothing
 * single-step related happens: TF must still be clear afterwards and no
 * SIGTRAP may have been counted.  Exits with status 1 on either failure.
 */
static void fast_syscall_no_tf(void)
{
	const char *failure = NULL;

	sig_traps = 0;

	printf("[RUN]\tFast syscall with TF cleared\n");
	fflush(stdout);  /* Force a syscall */

	/* TF is checked first; the trap counter only matters if TF is clear. */
	if (get_eflags() & X86_EFLAGS_TF)
		failure = "[FAIL]\tTF is now set\n";
	else if (sig_traps)
		failure = "[FAIL]\tGot SIGTRAP\n";

	if (failure != NULL) {
		printf("%s", failure);
		exit(1);
	}

	printf("[OK]\tNothing unexpected happened\n");
}
int main() int main()
{ {
#ifdef CAN_BUILD_32 #ifdef CAN_BUILD_32
...@@ -163,17 +210,46 @@ int main() ...@@ -163,17 +210,46 @@ int main()
check_result(); check_result();
/* Now make sure that another fast syscall doesn't set TF again. */ /* Now make sure that another fast syscall doesn't set TF again. */
printf("[RUN]\tFast syscall with TF cleared\n"); fast_syscall_no_tf();
fflush(stdout); /* Force a syscall */
if (get_eflags() & X86_EFLAGS_TF) { /*
printf("[FAIL]\tTF is now set\n"); * And do a forced SYSENTER to make sure that this works even if
exit(1); * fast syscalls don't use SYSENTER.
*
* Invoking SYSENTER directly breaks all the rules. Just handle
* the SIGSEGV.
*/
if (sigsetjmp(jmpbuf, 1) == 0) {
unsigned long nr = SYS_getpid;
printf("[RUN]\tSet TF and check SYSENTER\n");
stack_t stack = {
.ss_sp = altstack_data,
.ss_size = SIGSTKSZ,
};
if (sigaltstack(&stack, NULL) != 0)
err(1, "sigaltstack");
sethandler(SIGSEGV, print_and_longjmp,
SA_RESETHAND | SA_ONSTACK);
sethandler(SIGILL, print_and_longjmp, SA_RESETHAND);
set_eflags(get_eflags() | X86_EFLAGS_TF);
/* Clear EBP first to make sure we segfault cleanly. */
asm volatile ("xorl %%ebp, %%ebp; SYSENTER" : "+a" (nr) :: "flags", "rcx"
#ifdef __x86_64__
, "r11"
#endif
);
/* We're unreachable here. SYSENTER forgets RIP. */
} }
if (sig_traps) { clearhandler(SIGSEGV);
printf("[FAIL]\tGot SIGTRAP\n"); clearhandler(SIGILL);
if (!(sig_eflags & X86_EFLAGS_TF)) {
printf("[FAIL]\tTF was cleared\n");
exit(1); exit(1);
} }
printf("[OK]\tNothing unexpected happened\n");
/* Now make sure that another fast syscall doesn't set TF again. */
fast_syscall_no_tf();
return 0; return 0;
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment