Commit 7453311d authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 asm changes from Ingo Molnar:
 "The main changes in this cycle were the x86/entry and sysret
  enhancements from Andy Lutomirski, see merge commits 772a9aca and
  b57c0b51 for details"

[ Exectutive summary: IST exceptions that interrupt user space will run
  on the regular kernel stack instead of the IST stack.  Which
  simplifies things particularly on return to user space.

  The sysret cleanup ends up simplifying the logic on when we can use
  sysret vs when we have to use iret.                - Linus ]

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86_64, entry: Remove the syscall exit audit and schedule optimizations
  x86_64, entry: Use sysret to return to userspace when possible
  x86, traps: Fix ist_enter from userspace
  x86, vdso: teach 'make clean' remove vdso64 binaries
  x86_64 entry: Fix RCX for ptraced syscalls
  x86: entry_64.S: fold SAVE_ARGS_IRQ macro into its sole user
  x86: ia32entry.S: fix wrong symbolic constant usage: R11->ARGOFFSET
  x86: entry_64.S: delete unused code
  x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks
  x86, traps: Add ist_begin_non_atomic and ist_end_non_atomic
  x86: Clean up current_stack_pointer
  x86, traps: Track entry into and exit from IST context
  x86, entry: Switch stacks on a paranoid entry from userspace
parents 9d43bade b57c0b51
......@@ -78,9 +78,6 @@ The expensive (paranoid) way is to read back the MSR_GS_BASE value
xorl %ebx,%ebx
1: ret
and the whole paranoid non-paranoid macro complexity is about whether
to suffer that RDMSR cost.
If we are at an interrupt or user-trap/gate-alike boundary then we can
use the faster check: the stack will be a reliable indicator of
whether SWAPGS was already done: if we see that we are a secondary
......@@ -93,6 +90,15 @@ which might have triggered right after a normal entry wrote CS to the
stack but before we executed SWAPGS, then the only safe way to check
for GS is the slower method: the RDMSR.
So we try only to mark those entry methods 'paranoid' that absolutely
need the more expensive check for the GS base - and we generate all
'normal' entry points with the regular (faster) entry macros.
Therefore, super-atomic entries (except NMI, which is handled separately)
must use idtentry with paranoid=1 to handle gsbase correctly. This
triggers three main behavior changes:
- Interrupt entry will use the slower gsbase check.
- Interrupt entry from user mode will switch off the IST stack.
- Interrupt exit to kernel mode will not attempt to reschedule.
We try to only use IST entries and the paranoid entry code for vectors
that absolutely need the more expensive check for the GS base - and we
generate all 'normal' entry points with the regular (faster) paranoid=0
variant.
......@@ -40,9 +40,11 @@ An IST is selected by a non-zero value in the IST field of an
interrupt-gate descriptor. When an interrupt occurs and the hardware
loads such a descriptor, the hardware automatically sets the new stack
pointer based on the IST value, then invokes the interrupt handler. If
software wants to allow nested IST interrupts then the handler must
adjust the IST values on entry to and exit from the interrupt handler.
(This is occasionally done, e.g. for debug exceptions.)
the interrupt came from user mode, then the interrupt handler prologue
will switch back to the per-thread stack. If software wants to allow
nested IST interrupts then the handler must adjust the IST values on
entry to and exit from the interrupt handler. (This is occasionally
done, e.g. for debug exceptions.)
Events with different IST codes (i.e. with different stacks) can be
nested. For example, a debug interrupt can safely be interrupted by an
......
......@@ -179,8 +179,8 @@ sysenter_dispatch:
sysexit_from_sys_call:
andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
/* clear IF, that popfq doesn't enable interrupts early */
andl $~0x200,EFLAGS-R11(%rsp)
movl RIP-R11(%rsp),%edx /* User %eip */
andl $~0x200,EFLAGS-ARGOFFSET(%rsp)
movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx
RESTORE_ARGS 0,24,0,0,0,0
xorq %r8,%r8
......
......@@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with
#define SS 160
#define ARGOFFSET R11
#define SWFRAME ORIG_RAX
.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
subq $9*8+\addskip, %rsp
......
......@@ -190,7 +190,6 @@ enum mcp_flags {
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
int mce_notify_irq(void);
void mce_notify_process(void);
DECLARE_PER_CPU(struct mce, injectm);
......
......@@ -75,7 +75,6 @@ struct thread_info {
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
......@@ -100,7 +99,6 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_NOTSC (1 << TIF_NOTSC)
......@@ -140,7 +138,7 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \
_TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
/* flags to check in __switch_to() */
......@@ -170,6 +168,17 @@ static inline struct thread_info *current_thread_info(void)
return ti;
}
static inline unsigned long current_stack_pointer(void)
{
unsigned long sp;
#ifdef CONFIG_X86_64
asm("mov %%rsp,%0" : "=g" (sp));
#else
asm("mov %%esp,%0" : "=g" (sp));
#endif
return sp;
}
#else /* !__ASSEMBLY__ */
/* how to get the thread information struct from ASM */
......
#ifndef _ASM_X86_TRAPS_H
#define _ASM_X86_TRAPS_H
#include <linux/context_tracking_state.h>
#include <linux/kprobes.h>
#include <asm/debugreg.h>
......@@ -110,6 +111,11 @@ asmlinkage void smp_thermal_interrupt(void);
asmlinkage void mce_threshold_interrupt(void);
#endif
extern enum ctx_state ist_enter(struct pt_regs *regs);
extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
extern void ist_begin_non_atomic(struct pt_regs *regs);
extern void ist_end_non_atomic(void);
/* Interrupts/Exceptions */
enum {
X86_TRAP_DE = 0, /* 0, Divide-by-zero */
......
......@@ -43,6 +43,7 @@
#include <linux/export.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/mce.h>
#include <asm/msr.h>
......@@ -1002,51 +1003,6 @@ static void mce_clear_state(unsigned long *toclear)
}
}
/*
* Need to save faulting physical address associated with a process
* in the machine check handler some place where we can grab it back
* later in mce_notify_process()
*/
#define MCE_INFO_MAX 16
struct mce_info {
atomic_t inuse;
struct task_struct *t;
__u64 paddr;
int restartable;
} mce_info[MCE_INFO_MAX];
static void mce_save_info(__u64 addr, int c)
{
struct mce_info *mi;
for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
mi->t = current;
mi->paddr = addr;
mi->restartable = c;
return;
}
}
mce_panic("Too many concurrent recoverable errors", NULL, NULL);
}
static struct mce_info *mce_find_info(void)
{
struct mce_info *mi;
for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
if (atomic_read(&mi->inuse) && mi->t == current)
return mi;
return NULL;
}
static void mce_clear_info(struct mce_info *mi)
{
atomic_set(&mi->inuse, 0);
}
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
......@@ -1063,6 +1019,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
{
struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
enum ctx_state prev_state;
int i;
int worst = 0;
int severity;
......@@ -1084,6 +1041,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
u64 recover_paddr = ~0ull;
int flags = MF_ACTION_REQUIRED;
prev_state = ist_enter(regs);
this_cpu_inc(mce_exception_count);
......@@ -1203,9 +1164,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (no_way_out)
mce_panic("Fatal machine check on current CPU", &m, msg);
if (worst == MCE_AR_SEVERITY) {
/* schedule action before return to userland */
mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
set_thread_flag(TIF_MCE_NOTIFY);
recover_paddr = m.addr;
if (!(m.mcgstatus & MCG_STATUS_RIPV))
flags |= MF_MUST_KILL;
} else if (kill_it) {
force_sig(SIGBUS, current);
}
......@@ -1216,6 +1177,27 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
sync_core();
if (recover_paddr == ~0ull)
goto done;
pr_err("Uncorrected hardware memory error in user-access at %llx",
recover_paddr);
/*
* We must call memory_failure() here even if the current process is
* doomed. We still need to mark the page as poisoned and alert any
* other users of the page.
*/
ist_begin_non_atomic(regs);
local_irq_enable();
if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
pr_err("Memory error not recovered");
force_sig(SIGBUS, current);
}
local_irq_disable();
ist_end_non_atomic();
done:
ist_exit(regs, prev_state);
}
EXPORT_SYMBOL_GPL(do_machine_check);
......@@ -1232,42 +1214,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
}
#endif
/*
* Called in process context that interrupted by MCE and marked with
* TIF_MCE_NOTIFY, just before returning to erroneous userland.
* This code is allowed to sleep.
* Attempt possible recovery such as calling the high level VM handler to
* process any corrupted pages, and kill/signal current process if required.
* Action required errors are handled here.
*/
void mce_notify_process(void)
{
unsigned long pfn;
struct mce_info *mi = mce_find_info();
int flags = MF_ACTION_REQUIRED;
if (!mi)
mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
pfn = mi->paddr >> PAGE_SHIFT;
clear_thread_flag(TIF_MCE_NOTIFY);
pr_err("Uncorrected hardware memory error in user-access at %llx",
mi->paddr);
/*
* We must call memory_failure() here even if the current process is
* doomed. We still need to mark the page as poisoned and alert any
* other users of the page.
*/
if (!mi->restartable)
flags |= MF_MUST_KILL;
if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
pr_err("Memory error not recovered");
force_sig(SIGBUS, current);
}
mce_clear_info(mi);
}
/*
* Action optional processing happens here (picking up
* from the list of faulting pages that do_machine_check()
......
......@@ -8,6 +8,7 @@
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/mce.h>
#include <asm/msr.h>
......@@ -17,8 +18,11 @@ int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
u32 loaddr, hi, lotype;
prev_state = ist_enter(regs);
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
......@@ -33,6 +37,8 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
}
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
ist_exit(regs, prev_state);
}
/* Set up machine check reporting for processors with Intel style MCE: */
......
......@@ -7,14 +7,19 @@
#include <linux/types.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/mce.h>
#include <asm/msr.h>
/* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state = ist_enter(regs);
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
ist_exit(regs, prev_state);
}
/* Set up machine check reporting on the Winchip C6 series */
......
This diff is collapsed.
......@@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack)
: "memory", "cc", "edx", "ecx", "eax");
}
/* how to get the current stack pointer from C */
#define current_stack_pointer ({ \
unsigned long sp; \
asm("mov %%esp,%0" : "=g" (sp)); \
sp; \
})
static inline void *current_stack(void)
{
return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
}
static inline int
......@@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
/* Save the next esp at the bottom of the stack */
prev_esp = (u32 *)irqstk;
*prev_esp = current_stack_pointer;
*prev_esp = current_stack_pointer();
if (unlikely(overflow))
call_on_stack(print_stack_overflow, isp);
......@@ -156,7 +149,7 @@ void do_softirq_own_stack(void)
/* Push the previous esp onto the stack */
prev_esp = (u32 *)irqstk;
*prev_esp = current_stack_pointer;
*prev_esp = current_stack_pointer();
call_on_stack(__do_softirq, isp);
}
......
......@@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
user_exit();
#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
mce_notify_process();
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
if (thread_info_flags & _TIF_UPROBE)
uprobe_notify_resume(regs);
......
......@@ -108,6 +108,88 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
preempt_count_dec();
}
enum ctx_state ist_enter(struct pt_regs *regs)
{
enum ctx_state prev_state;
if (user_mode_vm(regs)) {
/* Other than that, we're just an exception. */
prev_state = exception_enter();
} else {
/*
* We might have interrupted pretty much anything. In
* fact, if we're a machine check, we can even interrupt
* NMI processing. We don't want in_nmi() to return true,
* but we need to notify RCU.
*/
rcu_nmi_enter();
prev_state = IN_KERNEL; /* the value is irrelevant. */
}
/*
* We are atomic because we're on the IST stack (or we're on x86_32,
* in which case we still shouldn't schedule).
*
* This must be after exception_enter(), because exception_enter()
* won't do anything if in_interrupt() returns true.
*/
preempt_count_add(HARDIRQ_OFFSET);
/* This code is a bit fragile. Test it. */
rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
return prev_state;
}
void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
{
/* Must be before exception_exit. */
preempt_count_sub(HARDIRQ_OFFSET);
if (user_mode_vm(regs))
return exception_exit(prev_state);
else
rcu_nmi_exit();
}
/**
* ist_begin_non_atomic() - begin a non-atomic section in an IST exception
* @regs: regs passed to the IST exception handler
*
* IST exception handlers normally cannot schedule. As a special
* exception, if the exception interrupted userspace code (i.e.
* user_mode_vm(regs) would return true) and the exception was not
* a double fault, it can be safe to schedule. ist_begin_non_atomic()
* begins a non-atomic section within an ist_enter()/ist_exit() region.
* Callers are responsible for enabling interrupts themselves inside
* the non-atomic section, and callers must call is_end_non_atomic()
* before ist_exit().
*/
void ist_begin_non_atomic(struct pt_regs *regs)
{
BUG_ON(!user_mode_vm(regs));
/*
* Sanity check: we need to be on the normal thread stack. This
* will catch asm bugs and any attempt to use ist_preempt_enable
* from double_fault.
*/
BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
& ~(THREAD_SIZE - 1)) != 0);
preempt_count_sub(HARDIRQ_OFFSET);
}
/**
* ist_end_non_atomic() - begin a non-atomic section in an IST exception
*
* Ends a non-atomic section started with ist_begin_non_atomic().
*/
void ist_end_non_atomic(void)
{
preempt_count_add(HARDIRQ_OFFSET);
}
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
struct pt_regs *regs, long error_code)
......@@ -251,6 +333,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
* end up promoting it to a doublefault. In that case, modify
* the stack to make it look like we just entered the #GP
* handler from user space, similar to bad_iret.
*
* No need for ist_enter here because we don't use RCU.
*/
if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
regs->cs == __KERNEL_CS &&
......@@ -263,12 +347,12 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
regs->ip = (unsigned long)general_protection;
regs->sp = (unsigned long)&normal_regs->orig_ax;
return;
}
#endif
exception_enter();
/* Return not checked because double check cannot be ignored */
ist_enter(regs); /* Discard prev_state because we won't return. */
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code;
......@@ -434,7 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
if (poke_int3_handler(regs))
return;
prev_state = exception_enter();
prev_state = ist_enter(regs);
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
......@@ -460,33 +544,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
debug_stack_usage_dec();
exit:
exception_exit(prev_state);
ist_exit(regs, prev_state);
}
NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
* Help handler running on IST stack to switch back to user stack
* for scheduling or signal handling. The actual stack switch is done in
* entry.S
* Help handler running on IST stack to switch off the IST stack if the
* interrupted code was in user mode. The actual stack switch is done in
* entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
/* Did already sync */
if (eregs == (struct pt_regs *)eregs->sp)
;
/* Exception from user space */
else if (user_mode(eregs))
regs = task_pt_regs(current);
/*
* Exception from kernel and interrupts are enabled. Move to
* kernel process stack.
*/
else if (eregs->flags & X86_EFLAGS_IF)
regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
if (eregs != regs)
*regs = *eregs;
struct pt_regs *regs = task_pt_regs(current);
*regs = *eregs;
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
......@@ -554,7 +625,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
unsigned long dr6;
int si_code;
prev_state = exception_enter();
prev_state = ist_enter(regs);
get_debugreg(dr6, 6);
......@@ -629,7 +700,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
exception_exit(prev_state);
ist_exit(regs, prev_state);
}
NOKPROBE_SYMBOL(do_debug);
......
......@@ -205,4 +205,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
PHONY += vdso_install $(vdso_img_insttargets)
vdso_install: $(vdso_img_insttargets) FORCE
clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80*
clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64*
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment