Commit a75a3f6f authored by Linus Torvalds

Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 asm changes from Ingo Molnar:
 "The main change in this cycle is another step in the big x86 system
  call interface rework by Andy Lutomirski, which moves most of the low
  level x86 entry code from assembly to C, for all syscall entries
  except native 64-bit system calls:

    arch/x86/entry/entry_32.S        | 182 ++++------
    arch/x86/entry/entry_64_compat.S | 547 ++++++++-----------------------
    2 files changed, 194 insertions(+), 535 deletions(-)

  ... our hope is that the final remaining step (converting native
  64-bit system calls) will be less painful than all the previous steps,
  given that most of the legacies and quirks are concentrated around
  native 32-bit and compat environments"
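
To make the shape of the rework concrete, here is a minimal sketch of the
pattern the series converges on -- the asm stub only builds a pt_regs frame,
and a C handler does the dispatch and all entry/exit bookkeeping. This is an
illustrative reduction, not the kernel's literal code:

    /* Hedged sketch: mirrors do_syscall_32_irqs_on() in the diff below,
     * heavily simplified. */
    static void handle_syscall_32(struct pt_regs *regs)
    {
            unsigned int nr = (unsigned int)regs->orig_ax;

            if (nr < IA32_NR_syscalls)
                    regs->ax = ia32_sys_call_table[nr](
                            regs->bx, regs->cx, regs->dx,
                            regs->si, regs->di, regs->bp);
            else
                    regs->ax = -ENOSYS;     /* unknown syscall number */
    }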

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (47 commits)
  x86/entry/32: Fix FS and GS restore in opportunistic SYSEXIT
  x86/entry/32: Fix entry_INT80_32() to expect interrupts to be on
  um/x86: Fix build after x86 syscall changes
  x86/asm: Remove the xyz_cfi macros from dwarf2.h
  selftests/x86: Style fixes for the 'unwind_vdso' test
  x86/entry/64/compat: Document sysenter_fix_flags's reason for existence
  x86/entry: Split and inline syscall_return_slowpath()
  x86/entry: Split and inline prepare_exit_to_usermode()
  x86/entry: Use pt_regs_to_thread_info() in syscall entry tracing
  x86/entry: Hide two syscall entry assertions behind CONFIG_DEBUG_ENTRY
  x86/entry: Micro-optimize compat fast syscall arg fetch
  x86/entry: Force inlining of 32-bit syscall code
  x86/entry: Make irqs_disabled checks in exit code depend on lockdep
  x86/entry: Remove unnecessary IRQ twiddling in fast 32-bit syscalls
  x86/asm: Remove thread_info.sysenter_return
  x86/entry/32: Re-implement SYSENTER using the new C path
  x86/entry/32: Switch INT80 to the new C syscall path
  x86/entry/32: Open-code return tracking from fork and kthreads
  x86/entry/compat: Implement opportunistic SYSRETL for compat syscalls
  x86/vdso/compat: Wire up SYSENTER and SYSCALL for compat userspace
  ...
parents d2bea739 3bd29515
@@ -2027,6 +2027,55 @@ config COMPAT_VDSO
 	  If unsure, say N: if you are compiling your own kernel, you
 	  are unlikely to be using a buggy version of glibc.
 
+choice
+	prompt "vsyscall table for legacy applications"
+	depends on X86_64
+	default LEGACY_VSYSCALL_EMULATE
+	help
+	  Legacy user code that does not know how to find the vDSO expects
+	  to be able to issue three syscalls by calling fixed addresses in
+	  kernel space. Since this location is not randomized with ASLR,
+	  it can be used to assist security vulnerability exploitation.
+
+	  This setting can be changed at boot time via the kernel command
+	  line parameter vsyscall=[native|emulate|none].
+
+	  On a system with recent enough glibc (2.14 or newer) and no
+	  static binaries, you can say None without a performance penalty
+	  to improve security.
+
+	  If unsure, select "Emulate".
+
+	config LEGACY_VSYSCALL_NATIVE
+		bool "Native"
+		help
+		  Actual executable code is located in the fixed vsyscall
+		  address mapping, implementing time() efficiently. Since
+		  this makes the mapping executable, it can be used during
+		  security vulnerability exploitation (traditionally as
+		  ROP gadgets). This configuration is not recommended.
+
+	config LEGACY_VSYSCALL_EMULATE
+		bool "Emulate"
+		help
+		  The kernel traps and emulates calls into the fixed
+		  vsyscall address mapping. This makes the mapping
+		  non-executable, but it still contains known contents,
+		  which could be used in certain rare security vulnerability
+		  exploits. This configuration is recommended when userspace
+		  still uses the vsyscall area.
+
+	config LEGACY_VSYSCALL_NONE
+		bool "None"
+		help
+		  There will be no vsyscall mapping at all. This will
+		  eliminate any risk of ASLR bypass due to the vsyscall
+		  fixed address mapping. Attempts to use the vsyscalls
+		  will be reported to dmesg, so that either old or
+		  malicious userspace programs can be identified.
+
+endchoice
+
 config CMDLINE_BOOL
 	bool "Built-in kernel command line"
 	---help---
@@ -159,6 +159,12 @@ endif
 sp-$(CONFIG_X86_32) := esp
 sp-$(CONFIG_X86_64) := rsp
 
+# do binutils support CFI?
+cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
+# is .cfi_signal_frame supported too?
+cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
+cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
+
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
 asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
@@ -166,8 +172,8 @@ asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
 
-KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
-KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
@@ -24,10 +24,19 @@
 #include <asm/desc.h>
 #include <asm/traps.h>
+#include <asm/vdso.h>
+#include <asm/uaccess.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
 
+static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
+{
+        unsigned long top_of_stack =
+                (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
+        return (struct thread_info *)(top_of_stack - THREAD_SIZE);
+}
+
 #ifdef CONFIG_CONTEXT_TRACKING
 /* Called on entry from user mode with IRQs off. */
 __visible void enter_from_user_mode(void)
@@ -66,13 +75,14 @@ static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
  */
 unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
 {
+        struct thread_info *ti = pt_regs_to_thread_info(regs);
         unsigned long ret = 0;
         u32 work;
 
-        BUG_ON(regs != task_pt_regs(current));
+        if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+                BUG_ON(regs != task_pt_regs(current));
 
-        work = ACCESS_ONCE(current_thread_info()->flags) &
-                _TIF_WORK_SYSCALL_ENTRY;
+        work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
 
 #ifdef CONFIG_CONTEXT_TRACKING
         /*
@@ -154,10 +164,11 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
 long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
                                 unsigned long phase1_result)
 {
+        struct thread_info *ti = pt_regs_to_thread_info(regs);
         long ret = 0;
-        u32 work = ACCESS_ONCE(current_thread_info()->flags) &
-                _TIF_WORK_SYSCALL_ENTRY;
+        u32 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
 
-        BUG_ON(regs != task_pt_regs(current));
+        if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+                BUG_ON(regs != task_pt_regs(current));
 
         /*
@@ -207,19 +218,12 @@ long syscall_trace_enter(struct pt_regs *regs)
         return syscall_trace_enter_phase2(regs, arch, phase1_result);
 }
 
-static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
-{
-        unsigned long top_of_stack =
-                (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
-        return (struct thread_info *)(top_of_stack - THREAD_SIZE);
-}
+#define EXIT_TO_USERMODE_LOOP_FLAGS                             \
+        (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
+         _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
 
-/* Called with IRQs disabled. */
-__visible void prepare_exit_to_usermode(struct pt_regs *regs)
+static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
 {
-        if (WARN_ON(!irqs_disabled()))
-                local_irq_disable();
-
         /*
          * In order to return to user mode, we need to have IRQs off with
          * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
@@ -229,14 +233,6 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
          * work to clear some of the flags can sleep.
          */
         while (true) {
-                u32 cached_flags =
-                        READ_ONCE(pt_regs_to_thread_info(regs)->flags);
-
-                if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
-                                      _TIF_UPROBE | _TIF_NEED_RESCHED |
-                                      _TIF_USER_RETURN_NOTIFY)))
-                        break;
-
                 /* We have work to do. */
                 local_irq_enable();
@@ -260,33 +256,42 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
 
                 /* Disable IRQs and retry */
                 local_irq_disable();
+
+                cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+
+                if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
+                        break;
         }
+}
+
+/* Called with IRQs disabled. */
+__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+        u32 cached_flags;
+
+        if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
+                local_irq_disable();
+
+        lockdep_sys_exit();
+
+        cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+
+        if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
+                exit_to_usermode_loop(regs, cached_flags);
 
         user_enter();
 }
 
-/*
- * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
- * state such that we can immediately switch to user mode.
- */
-__visible void syscall_return_slowpath(struct pt_regs *regs)
+#define SYSCALL_EXIT_WORK_FLAGS                         \
+        (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |      \
+         _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
+
+static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
 {
-        struct thread_info *ti = pt_regs_to_thread_info(regs);
-        u32 cached_flags = READ_ONCE(ti->flags);
         bool step;
 
-        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
-
-        if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
-                 regs->orig_ax))
-                local_irq_enable();
-
-        /*
-         * First do one-time work.  If these work items are enabled, we
-         * want to run them exactly once per syscall exit with IRQs on.
-         */
-        if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |
-                            _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) {
-                audit_syscall_exit(regs);
+        audit_syscall_exit(regs);
 
-                if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
+        if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
@@ -303,7 +308,29 @@ __visible void syscall_return_slowpath(struct pt_regs *regs)
                         == _TIF_SINGLESTEP);
         if (step || cached_flags & _TIF_SYSCALL_TRACE)
                 tracehook_report_syscall_exit(regs, step);
 }
 
+/*
+ * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
+ * state such that we can immediately switch to user mode.
+ */
+__visible inline void syscall_return_slowpath(struct pt_regs *regs)
+{
+        struct thread_info *ti = pt_regs_to_thread_info(regs);
+        u32 cached_flags = READ_ONCE(ti->flags);
+
+        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
+        if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+            WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
+                local_irq_enable();
+
+        /*
+         * First do one-time work.  If these work items are enabled, we
+         * want to run them exactly once per syscall exit with IRQs on.
+         */
+        if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
+                syscall_slow_exit_work(regs, cached_flags);
+
 #ifdef CONFIG_COMPAT
         /*
@@ -316,3 +343,144 @@ __visible void syscall_return_slowpath(struct pt_regs *regs)
         local_irq_disable();
         prepare_exit_to_usermode(regs);
 }
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+/*
+ * Does a 32-bit syscall.  Called with IRQs on and does all entry and
+ * exit work and returns with IRQs off.  This function is extremely hot
+ * in workloads that use it, and it's usually called from
+ * do_fast_syscall_32, so forcibly inline it to improve performance.
+ */
+#ifdef CONFIG_X86_32
+/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
+__visible
+#else
+/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
+static
+#endif
+__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+{
+        struct thread_info *ti = pt_regs_to_thread_info(regs);
+        unsigned int nr = (unsigned int)regs->orig_ax;
+
+#ifdef CONFIG_IA32_EMULATION
+        ti->status |= TS_COMPAT;
+#endif
+
+        if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
+                /*
+                 * Subtlety here: if ptrace pokes something larger than
+                 * 2^32-1 into orig_ax, this truncates it.  This may or
+                 * may not be necessary, but it matches the old asm
+                 * behavior.
+                 */
+                nr = syscall_trace_enter(regs);
+        }
+
+        if (likely(nr < IA32_NR_syscalls)) {
+                /*
+                 * It's possible that a 32-bit syscall implementation
+                 * takes a 64-bit parameter but nonetheless assumes that
+                 * the high bits are zero.  Make sure we zero-extend all
+                 * of the args.
+                 */
+                regs->ax = ia32_sys_call_table[nr](
+                        (unsigned int)regs->bx, (unsigned int)regs->cx,
+                        (unsigned int)regs->dx, (unsigned int)regs->si,
+                        (unsigned int)regs->di, (unsigned int)regs->bp);
+        }
+
+        syscall_return_slowpath(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* Handles INT80 on 64-bit kernels */
+__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
+{
+        local_irq_enable();
+        do_syscall_32_irqs_on(regs);
+}
+#endif
+
+/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
+__visible long do_fast_syscall_32(struct pt_regs *regs)
+{
+        /*
+         * Called using the internal vDSO SYSENTER/SYSCALL32 calling
+         * convention.  Adjust regs so it looks like we entered using int80.
+         */
+        unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
+                vdso_image_32.sym_int80_landing_pad;
+
+        /*
+         * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
+         * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
+         * Fix it up.
+         */
+        regs->ip = landing_pad;
+
+        /*
+         * Fetch ECX from where the vDSO stashed it.
+         *
+         * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
+         */
+        local_irq_enable();
+        if (
+#ifdef CONFIG_X86_64
+                /*
+                 * Micro-optimization: the pointer we're following is explicitly
+                 * 32 bits, so it can't be out of range.
+                 */
+                __get_user(*(u32 *)&regs->cx,
+                           (u32 __user __force *)(unsigned long)(u32)regs->sp)
+#else
+                get_user(*(u32 *)&regs->cx,
+                         (u32 __user __force *)(unsigned long)(u32)regs->sp)
+#endif
+                ) {
+
+                /* User code screwed up. */
+                local_irq_disable();
+                regs->ax = -EFAULT;
+#ifdef CONFIG_CONTEXT_TRACKING
+                enter_from_user_mode();
+#endif
+                prepare_exit_to_usermode(regs);
+                return 0;       /* Keep it simple: use IRET. */
+        }
+
+        /* Now this is just like a normal syscall. */
+        do_syscall_32_irqs_on(regs);
+
+#ifdef CONFIG_X86_64
+        /*
+         * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
+         * SYSRETL is available on all 64-bit CPUs, so we don't need to
+         * bother with SYSEXIT.
+         *
+         * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
+         * because the ECX fixup above will ensure that this is essentially
+         * never the case.
+         */
+        return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
+                regs->ip == landing_pad &&
+                (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
+#else
+        /*
+         * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
+         *
+         * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
+         * because the ECX fixup above will ensure that this is essentially
+         * never the case.
+         *
+         * We don't allow syscalls at all from VM86 mode, but we still
+         * need to check VM, because we might be returning from sys_vm86.
+         */
+        return static_cpu_has(X86_FEATURE_SEP) &&
+                regs->cs == __USER_CS && regs->ss == __USER_DS &&
+                regs->ip == landing_pad &&
+                (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
+#endif
+}
+#endif
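
For reference, the register convention that do_syscall_32_irqs_on() consumes
is the classic i386 one: eax carries the syscall number, and ebx, ecx, edx,
esi, edi, ebp carry up to six arguments. A hypothetical 32-bit userspace
snippet (not part of this patch) entering through the legacy gate:

    /* Hypothetical i386 userspace code; 4 is __NR_write on i386. */
    static long int80_write(int fd, const void *buf, unsigned int len)
    {
            long ret;

            asm volatile ("int $0x80"
                          : "=a" (ret)
                          : "0" (4), "b" (fd), "c" (buf), "d" (len)
                          : "memory");
            return ret;
    }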
@@ -3,7 +3,7 @@
  *
  * entry_32.S contains the system-call and low-level fault and trap handling routines.
  *
- * Stack layout in 'syscall_exit':
+ * Stack layout while running C code:
  *	ptrace needs to have all registers on the stack.
  *	If the order here is changed, it needs to be
  *	updated in fork.c:copy_process(), signal.c:do_signal(),
@@ -153,13 +153,13 @@
 #endif /* CONFIG_X86_32_LAZY_GS */
 
-.macro SAVE_ALL
+.macro SAVE_ALL pt_regs_ax=%eax
         cld
         PUSH_GS
         pushl   %fs
         pushl   %es
         pushl   %ds
-        pushl   %eax
+        pushl   \pt_regs_ax
         pushl   %ebp
         pushl   %edi
         pushl   %esi
@@ -211,7 +211,11 @@ ENTRY(ret_from_fork)
         popl    %eax
         pushl   $0x0202                 # Reset kernel eflags
         popfl
-        jmp     syscall_exit
+
+        /* When we fork, we trace the syscall return in the child, too. */
+        movl    %esp, %eax
+        call    syscall_return_slowpath
+        jmp     restore_all
 END(ret_from_fork)
 
 ENTRY(ret_from_kernel_thread)
@@ -224,7 +228,15 @@ ENTRY(ret_from_kernel_thread)
         movl    PT_EBP(%esp), %eax
         call    *PT_EBX(%esp)
         movl    $0, PT_EAX(%esp)
-        jmp     syscall_exit
+
+        /*
+         * Kernel threads return to userspace as if returning from a syscall.
+         * We should check whether anything actually uses this path and, if so,
+         * consider switching it over to ret_from_fork.
+         */
+        movl    %esp, %eax
+        call    syscall_return_slowpath
+        jmp     restore_all
 ENDPROC(ret_from_kernel_thread)
 
 /*
@@ -255,7 +267,6 @@ ret_from_intr:
         jb      resume_kernel           # not returning to v8086 or userspace
 
 ENTRY(resume_userspace)
-        LOCKDEP_SYS_EXIT
         DISABLE_INTERRUPTS(CLBR_ANY)
         TRACE_IRQS_OFF
         movl    %esp, %eax
@@ -276,76 +287,47 @@ need_resched:
 END(resume_kernel)
 #endif
 
-/*
- * SYSENTER_RETURN points to after the SYSENTER instruction
- * in the vsyscall page.  See vsyscall-sysentry.S, which defines
- * the symbol.
- */
-
         # SYSENTER call handler stub
 ENTRY(entry_SYSENTER_32)
         movl    TSS_sysenter_sp0(%esp), %esp
 sysenter_past_esp:
-        /*
-         * Interrupts are disabled here, but we can't trace it until
-         * enough kernel state to call TRACE_IRQS_OFF can be called - but
-         * we immediately enable interrupts at that point anyway.
-         */
-        pushl   $__USER_DS
-        pushl   %ebp
-        pushfl
-        orl     $X86_EFLAGS_IF, (%esp)
-        pushl   $__USER_CS
-        /*
-         * Push current_thread_info()->sysenter_return to the stack.
-         * A tiny bit of offset fixup is necessary: TI_sysenter_return
-         * is relative to thread_info, which is at the bottom of the
-         * kernel stack page.  4*4 means the 4 words pushed above;
-         * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
-         * and THREAD_SIZE takes us to the bottom.
-         */
-        pushl   ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
-
-        pushl   %eax
-        SAVE_ALL
-        ENABLE_INTERRUPTS(CLBR_NONE)
-
-        /*
-         * Load the potential sixth argument from user stack.
-         * Careful about security.
-         */
-        cmpl    $__PAGE_OFFSET-3, %ebp
-        jae     syscall_fault
-        ASM_STAC
-1:      movl    (%ebp), %ebp
-        ASM_CLAC
-        movl    %ebp, PT_EBP(%esp)
-        _ASM_EXTABLE(1b, syscall_fault)
-
-        GET_THREAD_INFO(%ebp)
-
-        testl   $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
-        jnz     syscall_trace_entry
-sysenter_do_call:
-        cmpl    $(NR_syscalls), %eax
-        jae     sysenter_badsys
-        call    *sys_call_table(, %eax, 4)
-sysenter_after_call:
-        movl    %eax, PT_EAX(%esp)
-        LOCKDEP_SYS_EXIT
-        DISABLE_INTERRUPTS(CLBR_ANY)
-        TRACE_IRQS_OFF
-        movl    TI_flags(%ebp), %ecx
-        testl   $_TIF_ALLWORK_MASK, %ecx
-        jnz     syscall_exit_work_irqs_off
-sysenter_exit:
-/* if something modifies registers it must also disable sysexit */
-        movl    PT_EIP(%esp), %edx
-        movl    PT_OLDESP(%esp), %ecx
-        xorl    %ebp, %ebp
-        TRACE_IRQS_ON
+        pushl   $__USER_DS              /* pt_regs->ss */
+        pushl   %ecx                    /* pt_regs->cx */
+        pushfl                          /* pt_regs->flags (except IF = 0) */
+        orl     $X86_EFLAGS_IF, (%esp)  /* Fix IF */
+        pushl   $__USER_CS              /* pt_regs->cs */
+        pushl   $0                      /* pt_regs->ip = 0 (placeholder) */
+        pushl   %eax                    /* pt_regs->orig_ax */
+        SAVE_ALL pt_regs_ax=$-ENOSYS    /* save rest */
+
+        /*
+         * User mode is traced as though IRQs are on, and SYSENTER
+         * turned them off.
+         */
+        TRACE_IRQS_OFF
+
+        movl    %esp, %eax
+        call    do_fast_syscall_32
+        testl   %eax, %eax
+        jz      .Lsyscall_32_done
+
+/* Opportunistic SYSEXIT */
+        TRACE_IRQS_ON                   /* User mode traces as IRQs on. */
+        movl    PT_EIP(%esp), %edx      /* pt_regs->ip */
+        movl    PT_OLDESP(%esp), %ecx   /* pt_regs->sp */
1:      mov     PT_FS(%esp), %fs
        PTGS_TO_GS
+        popl    %ebx                    /* pt_regs->bx */
+        addl    $2*4, %esp              /* skip pt_regs->cx and pt_regs->dx */
+        popl    %esi                    /* pt_regs->si */
+        popl    %edi                    /* pt_regs->di */
+        popl    %ebp                    /* pt_regs->bp */
+        popl    %eax                    /* pt_regs->ax */
+
+        /*
+         * Return back to the vDSO, which will pop ecx and edx.
+         * Don't bother with DS and ES (they already contain __USER_DS).
+         */
        ENABLE_INTERRUPTS_SYSEXIT
 
 .pushsection .fixup, "ax"
@@ -359,21 +341,18 @@ ENDPROC(entry_SYSENTER_32)
         # system call handler stub
 ENTRY(entry_INT80_32)
         ASM_CLAC
-        pushl   %eax                    # save orig_eax
-        SAVE_ALL
-        GET_THREAD_INFO(%ebp)
-                                        # system call tracing in operation / emulation
-        testl   $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
-        jnz     syscall_trace_entry
-        cmpl    $(NR_syscalls), %eax
-        jae     syscall_badsys
-syscall_call:
-        call    *sys_call_table(, %eax, 4)
-syscall_after_call:
-        movl    %eax, PT_EAX(%esp)      # store the return value
-syscall_exit:
-        LOCKDEP_SYS_EXIT
-        jmp     syscall_exit_work
+        pushl   %eax                    /* pt_regs->orig_ax */
+        SAVE_ALL pt_regs_ax=$-ENOSYS    /* save rest */
+
+        /*
+         * User mode is traced as though IRQs are on.  Unlike the 64-bit
+         * case, INT80 is a trap gate on 32-bit kernels, so interrupts
+         * are already on (unless user code is messing around with iopl).
+         */
+
+        movl    %esp, %eax
+        call    do_syscall_32_irqs_on
+.Lsyscall_32_done:
 
 restore_all:
         TRACE_IRQS_IRET
@@ -450,47 +429,6 @@ ldt_ss:
 #endif
 ENDPROC(entry_INT80_32)
 
-        # perform syscall entry tracing
-        ALIGN
-syscall_trace_entry:
-        movl    $-ENOSYS, PT_EAX(%esp)
-        movl    %esp, %eax
-        call    syscall_trace_enter
-        /* What it returned is what we'll actually use. */
-        cmpl    $(NR_syscalls), %eax
-        jnae    syscall_call
-        jmp     syscall_exit
-END(syscall_trace_entry)
-
-        # perform syscall exit tracing
-        ALIGN
-syscall_exit_work_irqs_off:
-        TRACE_IRQS_ON
-        ENABLE_INTERRUPTS(CLBR_ANY)
-
-syscall_exit_work:
-        movl    %esp, %eax
-        call    syscall_return_slowpath
-        jmp     restore_all
-END(syscall_exit_work)
-
-syscall_fault:
-        ASM_CLAC
-        GET_THREAD_INFO(%ebp)
-        movl    $-EFAULT, PT_EAX(%esp)
-        jmp     resume_userspace
-END(syscall_fault)
-
-syscall_badsys:
-        movl    $-ENOSYS, %eax
-        jmp     syscall_after_call
-END(syscall_badsys)
-
-sysenter_badsys:
-        movl    $-ENOSYS, %eax
-        jmp     sysenter_after_call
-END(sysenter_badsys)
-
 .macro FIXUP_ESPFIX_STACK
 /*
  * Switch back for ESPFIX stack to the normal zerobased stack
@@ -391,20 +391,16 @@ GLOBAL(stub_execveat)
         jmp     return_from_execve
 END(stub_execveat)
 
-#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
+#if defined(CONFIG_X86_X32_ABI)
         .align  8
 GLOBAL(stub_x32_execve)
-GLOBAL(stub32_execve)
         call    compat_sys_execve
         jmp     return_from_execve
-END(stub32_execve)
 END(stub_x32_execve)
         .align  8
 GLOBAL(stub_x32_execveat)
-GLOBAL(stub32_execveat)
         call    compat_sys_execveat
         jmp     return_from_execve
-END(stub32_execveat)
 END(stub_x32_execveat)
 #endif
 
@@ -557,7 +553,6 @@ ret_from_intr:
         jz      retint_kernel
 
         /* Interrupt came from user space */
-        LOCKDEP_SYS_EXIT_IRQ
 GLOBAL(retint_user)
         mov     %rsp,%rdi
         call    prepare_exit_to_usermode
@@ -587,7 +582,7 @@ retint_kernel:
  * At this label, code paths which return to kernel and to user,
  * which come from interrupts/exception and from syscalls, merge.
  */
-restore_regs_and_iret:
+GLOBAL(restore_regs_and_iret)
         RESTORE_EXTRA_REGS
 restore_c_regs_and_iret:
         RESTORE_C_REGS

This diff is collapsed.
@@ -4,24 +4,21 @@
 #include <linux/sys.h>
 #include <linux/cache.h>
 #include <asm/asm-offsets.h>
+#include <asm/syscall.h>
 
 #ifdef CONFIG_IA32_EMULATION
 #define SYM(sym, compat) compat
 #else
 #define SYM(sym, compat) sym
-#define ia32_sys_call_table sys_call_table
-#define __NR_syscall_compat_max __NR_syscall_max
 #endif
 
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
 #include <asm/syscalls_32.h>
 #undef __SYSCALL_I386
 
 #define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
 
-typedef asmlinkage void (*sys_call_ptr_t)(void);
-
-extern asmlinkage void sys_ni_syscall(void);
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
 __visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = {
         /*
@@ -14,13 +14,13 @@
 # define __SYSCALL_X32(nr, sym, compat) /* nothing */
 #endif
 
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
 #include <asm/syscalls_64.h>
 #undef __SYSCALL_64
 
 #define __SYSCALL_64(nr, sym, compat) [nr] = sym,
 
-extern void sys_ni_syscall(void);
+extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
 
 asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
         /*
@@ -8,7 +8,7 @@
 #
 0    i386    restart_syscall    sys_restart_syscall
 1    i386    exit               sys_exit
-2    i386    fork               sys_fork            stub32_fork
+2    i386    fork               sys_fork            sys_fork
 3    i386    read               sys_read
 4    i386    write              sys_write
 5    i386    open               sys_open            compat_sys_open
@@ -17,7 +17,7 @@
 8    i386    creat              sys_creat
 9    i386    link               sys_link
 10   i386    unlink             sys_unlink
-11   i386    execve             sys_execve          stub32_execve
+11   i386    execve             sys_execve          compat_sys_execve
 12   i386    chdir              sys_chdir
 13   i386    time               sys_time            compat_sys_time
 14   i386    mknod              sys_mknod
@@ -125,7 +125,7 @@
 116  i386    sysinfo            sys_sysinfo         compat_sys_sysinfo
 117  i386    ipc                sys_ipc             compat_sys_ipc
 118  i386    fsync              sys_fsync
-119  i386    sigreturn          sys_sigreturn       stub32_sigreturn
+119  i386    sigreturn          sys_sigreturn       sys32_sigreturn
 120  i386    clone              sys_clone           stub32_clone
 121  i386    setdomainname      sys_setdomainname
 122  i386    uname              sys_newuname
@@ -179,7 +179,7 @@
 170  i386    setresgid          sys_setresgid16
 171  i386    getresgid          sys_getresgid16
 172  i386    prctl              sys_prctl
-173  i386    rt_sigreturn       sys_rt_sigreturn    stub32_rt_sigreturn
+173  i386    rt_sigreturn       sys_rt_sigreturn    sys32_rt_sigreturn
 174  i386    rt_sigaction       sys_rt_sigaction    compat_sys_rt_sigaction
 175  i386    rt_sigprocmask     sys_rt_sigprocmask
 176  i386    rt_sigpending      sys_rt_sigpending   compat_sys_rt_sigpending
@@ -196,7 +196,7 @@
 187  i386    sendfile           sys_sendfile        compat_sys_sendfile
 188  i386    getpmsg
 189  i386    putpmsg
-190  i386    vfork              sys_vfork           stub32_vfork
+190  i386    vfork              sys_vfork           sys_vfork
 191  i386    ugetrlimit         sys_getrlimit       compat_sys_getrlimit
 192  i386    mmap2              sys_mmap_pgoff
 193  i386    truncate64         sys_truncate64      sys32_truncate64
@@ -364,7 +364,7 @@
 355  i386    getrandom          sys_getrandom
 356  i386    memfd_create       sys_memfd_create
 357  i386    bpf                sys_bpf
-358  i386    execveat           sys_execveat        stub32_execveat
+358  i386    execveat           sys_execveat        compat_sys_execveat
 359  i386    socket             sys_socket
 360  i386    socketpair         sys_socketpair
 361  i386    bind               sys_bind
@@ -19,9 +19,7 @@ obj-y += vma.o
 # vDSO images to build
 vdso_img-$(VDSO64-y)            += 64
 vdso_img-$(VDSOX32-y)           += x32
-vdso_img-$(VDSO32-y)            += 32-int80
-vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall
-vdso_img-$(VDSO32-y)            += 32-sysenter
+vdso_img-$(VDSO32-y)            += 32
 
 obj-$(VDSO32-y)                 += vdso32-setup.o
@@ -69,7 +67,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
 CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
        $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
        -fno-omit-frame-pointer -foptimize-sibling-calls \
-       -DDISABLE_BRANCH_PROFILING
+       -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
 
 $(vobjs): KBUILD_CFLAGS += $(CFL)
@@ -122,15 +120,6 @@ $(obj)/%.so: $(obj)/%.so.dbg
 $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
         $(call if_changed,vdso)
 
-#
-# Build multiple 32-bit vDSO images to choose from at boot time.
-#
-vdso32.so-$(VDSO32-y)           += int80
-vdso32.so-$(CONFIG_IA32_EMULATION) += syscall
-vdso32.so-$(VDSO32-y)           += sysenter
-
-vdso32-images                   = $(vdso32.so-y:%=vdso32-%.so)
-
 CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
 VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
@@ -139,14 +128,12 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
 override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
 
 targets += vdso32/vdso32.lds
-targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o)
+targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o
 targets += vdso32/vclock_gettime.o
 
-$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%)
-
-KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
-$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
-$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
+$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32
 
 KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
 KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
@@ -157,13 +144,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
 KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
 KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
 KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
-$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
 
-$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
+$(obj)/vdso32.so.dbg: FORCE \
                                  $(obj)/vdso32/vdso32.lds \
                                  $(obj)/vdso32/vclock_gettime.o \
                                  $(obj)/vdso32/note.o \
-                                 $(obj)/vdso32/%.o
+                                 $(obj)/vdso32/system_call.o
         $(call if_changed,vdso)
 
 #
@@ -206,4 +193,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
 PHONY += vdso_install $(vdso_img_insttargets)
 vdso_install: $(vdso_img_insttargets) FORCE
 
-clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so*
+clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so*
@@ -98,10 +98,10 @@ struct vdso_sym required_syms[] = {
                 "VDSO_FAKE_SECTION_TABLE_END", false
         },
         {"VDSO32_NOTE_MASK", true},
-        {"VDSO32_SYSENTER_RETURN", true},
         {"__kernel_vsyscall", true},
         {"__kernel_sigreturn", true},
         {"__kernel_rt_sigreturn", true},
+        {"int80_landing_pad", true},
 };
 
 __attribute__((format(printf, 1, 2))) __attribute__((noreturn))
@@ -48,35 +48,9 @@ __setup("vdso32=", vdso32_setup);
 __setup_param("vdso=", vdso_setup, vdso32_setup, 0);
 #endif
 
-#ifdef CONFIG_X86_64
-
-#define vdso32_sysenter()       (boot_cpu_has(X86_FEATURE_SYSENTER32))
-#define vdso32_syscall()        (boot_cpu_has(X86_FEATURE_SYSCALL32))
-
-#else /* CONFIG_X86_32 */
-
-#define vdso32_sysenter()       (boot_cpu_has(X86_FEATURE_SEP))
-#define vdso32_syscall()        (0)
-
-#endif /* CONFIG_X86_64 */
-
-#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
-const struct vdso_image *selected_vdso32;
-#endif
-
 int __init sysenter_setup(void)
 {
-#ifdef CONFIG_COMPAT
-        if (vdso32_syscall())
-                selected_vdso32 = &vdso_image_32_syscall;
-        else
-#endif
-        if (vdso32_sysenter())
-                selected_vdso32 = &vdso_image_32_sysenter;
-        else
-                selected_vdso32 = &vdso_image_32_int80;
-
-        init_vdso_image(selected_vdso32);
+        init_vdso_image(&vdso_image_32);
 
         return 0;
 }
/*
* Code for the vDSO. This version uses the old int $0x80 method.
*
* First get the common code for the sigreturn entry points.
* This must come first.
*/
#include "sigreturn.S"
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
ALIGN
__kernel_vsyscall:
.LSTART_vsyscall:
int $0x80
ret
.LEND_vsyscall:
.size __kernel_vsyscall,.-.LSTART_vsyscall
.previous
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI:
.long .LENDCIEDLSI-.LSTARTCIEDLSI
.LSTARTCIEDLSI:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0x0c /* DW_CFA_def_cfa */
.uleb128 4
.uleb128 4
.byte 0x88 /* DW_CFA_offset, column 0x8 */
.uleb128 1
.align 4
.LENDCIEDLSI:
.long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
.LSTARTFDEDLSI:
.long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
.long .LSTART_vsyscall-. /* PC-relative start address */
.long .LEND_vsyscall-.LSTART_vsyscall
.uleb128 0
.align 4
.LENDFDEDLSI:
.previous
/*
* Pad out the segment to match the size of the sysenter.S version.
*/
VDSO32_vsyscall_eh_frame_size = 0x40
.section .data,"aw",@progbits
.space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0
.previous
/*
* Code for the vDSO. This version uses the syscall instruction.
*
* First get the common code for the sigreturn entry points.
* This must come first.
*/
#define SYSCALL_ENTER_KERNEL syscall
#include "sigreturn.S"
#include <asm/segment.h>
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
ALIGN
__kernel_vsyscall:
.LSTART_vsyscall:
push %ebp
.Lpush_ebp:
movl %ecx, %ebp
syscall
movl %ebp, %ecx
popl %ebp
.Lpop_ebp:
ret
.LEND_vsyscall:
.size __kernel_vsyscall,.-.LSTART_vsyscall
.section .eh_frame,"a",@progbits
.LSTARTFRAME:
.long .LENDCIE-.LSTARTCIE
.LSTARTCIE:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0x0c /* DW_CFA_def_cfa */
.uleb128 4
.uleb128 4
.byte 0x88 /* DW_CFA_offset, column 0x8 */
.uleb128 1
.align 4
.LENDCIE:
.long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
.LSTARTFDE1:
.long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
.long .LSTART_vsyscall-. /* PC-relative start address */
.long .LEND_vsyscall-.LSTART_vsyscall
.uleb128 0 /* Augmentation length */
/* What follows are the instructions for the table generation.
We have to record all changes of the stack pointer. */
.byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.uleb128 8
.byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */
.byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */
.byte 0xc5 /* DW_CFA_restore %ebp */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.uleb128 4
.align 4
.LENDFDE1:
.previous
/*
* Pad out the segment to match the size of the sysenter.S version.
*/
VDSO32_vsyscall_eh_frame_size = 0x40
.section .data,"aw",@progbits
.space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0
.previous
/*
* Code for the vDSO. This version uses the sysenter instruction.
*
* First get the common code for the sigreturn entry points.
* This must come first.
*/
#include "sigreturn.S"
/*
* The caller puts arg2 in %ecx, which gets pushed. The kernel will use
* %ecx itself for arg2. The pushing is because the sysexit instruction
* (found in entry.S) requires that we clobber %ecx with the desired %esp.
* User code might expect that %ecx is unclobbered though, as it would be
* for returning via the iret instruction, so we must push and pop.
*
* The caller puts arg3 in %edx, which the sysexit instruction requires
* for %eip. Thus, exactly as for arg2, we must push and pop.
*
* Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
* instruction clobbers %esp, the user's %esp won't even survive entry
* into the kernel. We store %esp in %ebp. Code in entry.S must fetch
* arg6 from the stack.
*
* You can not use this vsyscall for the clone() syscall because the
* three words on the parent stack do not get copied to the child.
*/
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
ALIGN
__kernel_vsyscall:
.LSTART_vsyscall:
push %ecx
.Lpush_ecx:
push %edx
.Lpush_edx:
push %ebp
.Lenter_kernel:
movl %esp,%ebp
sysenter
/* 7: align return point with nop's to make disassembly easier */
.space 7,0x90
/* 14: System call restart point is here! (SYSENTER_RETURN-2) */
int $0x80
/* 16: System call normal return point is here! */
VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */
pop %ebp
.Lpop_ebp:
pop %edx
.Lpop_edx:
pop %ecx
.Lpop_ecx:
ret
.LEND_vsyscall:
.size __kernel_vsyscall,.-.LSTART_vsyscall
.previous
.section .eh_frame,"a",@progbits
.LSTARTFRAMEDLSI:
.long .LENDCIEDLSI-.LSTARTCIEDLSI
.LSTARTCIEDLSI:
.long 0 /* CIE ID */
.byte 1 /* Version number */
.string "zR" /* NUL-terminated augmentation string */
.uleb128 1 /* Code alignment factor */
.sleb128 -4 /* Data alignment factor */
.byte 8 /* Return address register column */
.uleb128 1 /* Augmentation value length */
.byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
.byte 0x0c /* DW_CFA_def_cfa */
.uleb128 4
.uleb128 4
.byte 0x88 /* DW_CFA_offset, column 0x8 */
.uleb128 1
.align 4
.LENDCIEDLSI:
.long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
.LSTARTFDEDLSI:
.long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
.long .LSTART_vsyscall-. /* PC-relative start address */
.long .LEND_vsyscall-.LSTART_vsyscall
.uleb128 0
/* What follows are the instructions for the table generation.
We have to record all changes of the stack pointer. */
.byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x08 /* RA at offset 8 now */
.byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x0c /* RA at offset 12 now */
.byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x10 /* RA at offset 16 now */
.byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
/* Finally the epilogue. */
.byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x0c /* RA at offset 12 now */
.byte 0xc5 /* DW_CFA_restore %ebp */
.byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x08 /* RA at offset 8 now */
.byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */
.byte 0x0e /* DW_CFA_def_cfa_offset */
.byte 0x04 /* RA at offset 4 now */
.align 4
.LENDFDEDLSI:
.previous
/*
* Emit a symbol with the size of this .eh_frame data,
* to verify it matches the other versions.
*/
VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI)
/*
* Code for the vDSO. This version uses the old int $0x80 method.
*/
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
/*
* First get the common code for the sigreturn entry points.
* This must come first.
*/
#include "sigreturn.S"
.text
.globl __kernel_vsyscall
.type __kernel_vsyscall,@function
ALIGN
__kernel_vsyscall:
CFI_STARTPROC
/*
* Reshuffle regs so that all of any of the entry instructions
* will preserve enough state.
*/
pushl %edx
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET edx, 0
pushl %ecx
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ecx, 0
movl %esp, %ecx
#ifdef CONFIG_X86_64
/* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \
"syscall", X86_FEATURE_SYSCALL32
#else
ALTERNATIVE "", "sysenter", X86_FEATURE_SEP
#endif
/* Enter using int $0x80 */
movl (%esp), %ecx
int $0x80
GLOBAL(int80_landing_pad)
/* Restore ECX and EDX in case they were clobbered. */
popl %ecx
CFI_RESTORE ecx
CFI_ADJUST_CFA_OFFSET -4
popl %edx
CFI_RESTORE edx
CFI_ADJUST_CFA_OFFSET -4
ret
CFI_ENDPROC
.size __kernel_vsyscall,.-__kernel_vsyscall
.previous
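
A hypothetical userspace sketch (not from this series) of how the single
image above is consumed: 32-bit libc locates __kernel_vsyscall through the
AT_SYSINFO auxv entry and calls it with the same register convention as
int $0x80, letting the ALTERNATIVE in the stub pick the fastest entry
instruction for the CPU:

    /* Illustrative only; 20 is __NR_getpid on i386. */
    #include <sys/auxv.h>

    static long vdso_getpid(void)
    {
            unsigned long vsyscall = getauxval(AT_SYSINFO);
            long ret;

            asm volatile ("call *%1"
                          : "=a" (ret)
                          : "r" (vsyscall), "0" (20)
                          : "memory", "cc");
            return ret;
    }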
@@ -180,21 +180,10 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 static int load_vdso32(void)
 {
-        int ret;
-
         if (vdso32_enabled != 1)  /* Other values all mean "disabled" */
                 return 0;
 
-        ret = map_vdso(selected_vdso32, false);
-        if (ret)
-                return ret;
-
-        if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN)
-                current_thread_info()->sysenter_return =
-                        current->mm->context.vdso +
-                        selected_vdso32->sym_VDSO32_SYSENTER_RETURN;
-
-        return 0;
+        return map_vdso(&vdso_image_32, false);
 }
 #endif
@@ -38,7 +38,14 @@
 #define CREATE_TRACE_POINTS
 #include "vsyscall_trace.h"
 
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode =
+#if defined(CONFIG_LEGACY_VSYSCALL_NATIVE)
+        NATIVE;
+#elif defined(CONFIG_LEGACY_VSYSCALL_NONE)
+        NONE;
+#else
+        EMULATE;
+#endif
 
 static int __init vsyscall_setup(char *str)
 {
@@ -289,7 +289,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
                 /* Return stub is in 32bit vsyscall page */
                 if (current->mm->context.vdso)
                         restorer = current->mm->context.vdso +
-                                selected_vdso32->sym___kernel_sigreturn;
+                                vdso_image_32.sym___kernel_sigreturn;
                 else
                         restorer = &frame->retcode;
         }
@@ -368,7 +368,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
                         restorer = ksig->ka.sa.sa_restorer;
                 else
                         restorer = current->mm->context.vdso +
-                                selected_vdso32->sym___kernel_rt_sigreturn;
+                                vdso_image_32.sym___kernel_rt_sigreturn;
                 put_user_ex(ptr_to_compat(restorer), &frame->pretcode);
 
                 /*
#ifndef _ASM_X86_DWARF2_H
#define _ASM_X86_DWARF2_H
#ifndef __ASSEMBLY__
#warning "asm/dwarf2.h should be only included in pure assembly files"
#endif
/*
* Macros for dwarf2 CFI unwind table entries.
* See "as.info" for details on these pseudo ops. Unfortunately
* they are only supported in very new binutils, so define them
* away for older version.
*/
#ifdef CONFIG_AS_CFI
#define CFI_STARTPROC .cfi_startproc
#define CFI_ENDPROC .cfi_endproc
#define CFI_DEF_CFA .cfi_def_cfa
#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
#define CFI_OFFSET .cfi_offset
#define CFI_REL_OFFSET .cfi_rel_offset
#define CFI_REGISTER .cfi_register
#define CFI_RESTORE .cfi_restore
#define CFI_REMEMBER_STATE .cfi_remember_state
#define CFI_RESTORE_STATE .cfi_restore_state
#define CFI_UNDEFINED .cfi_undefined
#define CFI_ESCAPE .cfi_escape
#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
#define CFI_SIGNAL_FRAME .cfi_signal_frame
#else
#define CFI_SIGNAL_FRAME
#endif
#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
#ifndef BUILD_VDSO
/*
* Emit CFI data in .debug_frame sections, not .eh_frame sections.
* The latter we currently just discard since we don't do DWARF
* unwinding at runtime. So only the offline DWARF information is
* useful to anyone. Note we should not use this directive if
* vmlinux.lds.S gets changed so it doesn't discard .eh_frame.
*/
.cfi_sections .debug_frame
#else
/*
* For the vDSO, emit both runtime unwind information and debug
* symbols for the .dbg file.
*/
.cfi_sections .eh_frame, .debug_frame
#endif
#endif
#else
/*
* Due to the structure of pre-existing code, don't use assembler line
* comment character # to ignore the arguments. Instead, use a dummy macro.
*/
.macro cfi_ignore a=0, b=0, c=0, d=0
.endm
#define CFI_STARTPROC cfi_ignore
#define CFI_ENDPROC cfi_ignore
#define CFI_DEF_CFA cfi_ignore
#define CFI_DEF_CFA_REGISTER cfi_ignore
#define CFI_DEF_CFA_OFFSET cfi_ignore
#define CFI_ADJUST_CFA_OFFSET cfi_ignore
#define CFI_OFFSET cfi_ignore
#define CFI_REL_OFFSET cfi_ignore
#define CFI_REGISTER cfi_ignore
#define CFI_RESTORE cfi_ignore
#define CFI_REMEMBER_STATE cfi_ignore
#define CFI_RESTORE_STATE cfi_ignore
#define CFI_UNDEFINED cfi_ignore
#define CFI_ESCAPE cfi_ignore
#define CFI_SIGNAL_FRAME cfi_ignore
#endif
#endif /* _ASM_X86_DWARF2_H */
@@ -171,11 +171,11 @@ do { \
 static inline void elf_common_init(struct thread_struct *t,
                                    struct pt_regs *regs, const u16 ds)
 {
-        /* Commented-out registers are cleared in stub_execve */
-        /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0;
-        regs->si = regs->di /*= regs->bp*/ = 0;
+        /* ax gets execve's return value. */
+        /*regs->ax = */ regs->bx = regs->cx = regs->dx = 0;
+        regs->si = regs->di = regs->bp = 0;
         regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
-        /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/
+        regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
         t->fs = t->gs = 0;
         t->fsindex = t->gsindex = 0;
         t->ds = t->es = ds;
@@ -328,7 +328,7 @@ else \
 
 #define VDSO_ENTRY \
         ((unsigned long)current->mm->context.vdso + \
-         selected_vdso32->sym___kernel_vsyscall)
+         vdso_image_32.sym___kernel_vsyscall)
 
 struct linux_binprm;
@@ -556,12 +556,12 @@ static inline unsigned int cpuid_edx(unsigned int op)
 }
 
 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
+static __always_inline void rep_nop(void)
 {
         asm volatile("rep; nop" ::: "memory");
 }
 
-static inline void cpu_relax(void)
+static __always_inline void cpu_relax(void)
 {
         rep_nop();
 }
@@ -79,12 +79,12 @@ do { \
 #else /* CONFIG_X86_32 */
 
 /* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
+#define SAVE_CONTEXT    "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t"
 
 #define __EXTRA_CLOBBER \
         , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
-          "r12", "r13", "r14", "r15"
+          "r12", "r13", "r14", "r15", "flags"
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 #define __switch_canary \
@@ -100,7 +100,11 @@ do { \
 #define __switch_canary_iparam
 #endif /* CC_STACKPROTECTOR */
 
-/* Save restore flags to clear handle leaking NT */
+/*
+ * There is no need to save or restore flags, because flags are always
+ * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL
+ * has no effect.
+ */
 #define switch_to(prev, next, last) \
         asm volatile(SAVE_CONTEXT \
              "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
@@ -20,9 +20,21 @@
 #include <asm/thread_info.h>    /* for TS_COMPAT */
 #include <asm/unistd.h>
 
-typedef void (*sys_call_ptr_t)(void);
+typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
+                                          unsigned long, unsigned long,
+                                          unsigned long, unsigned long);
 extern const sys_call_ptr_t sys_call_table[];
 
+#if defined(CONFIG_X86_32)
+#define ia32_sys_call_table sys_call_table
+#define __NR_syscall_compat_max __NR_syscall_max
+#define IA32_NR_syscalls NR_syscalls
+#endif
+
+#if defined(CONFIG_IA32_EMULATION)
+extern const sys_call_ptr_t ia32_sys_call_table[];
+#endif
+
 /*
  * Only the low 32 bits of orig_ax are meaningful, so we return int.
  * This importantly ignores the high bits on 64-bit, so comparisons
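
The uniform six-argument prototype is what lets common.c pass all six
argument registers unconditionally; handlers that take fewer arguments
simply ignore the extras, which is harmless under the C calling convention
(the kernel relies on the ABI here, not on strict C type rules). An
illustrative reduction, not kernel code:

    typedef long (*sys_call_ptr_t)(unsigned long, unsigned long,
                                   unsigned long, unsigned long,
                                   unsigned long, unsigned long);

    /* A handler that needs no arguments just ignores all six slots. */
    static long demo_sys_getpid(unsigned long a, unsigned long b,
                                unsigned long c, unsigned long d,
                                unsigned long e, unsigned long f)
    {
            return 42;      /* stand-in for the real pid lookup */
    }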
@@ -58,7 +58,6 @@ struct thread_info {
         __u32                   status;         /* thread synchronous flags */
         __u32                   cpu;            /* current CPU */
         mm_segment_t            addr_limit;
-        void __user             *sysenter_return;
         unsigned int            sig_on_uaccess_error:1;
         unsigned int            uaccess_err:1;  /* uaccess failed */
 };
@@ -51,13 +51,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
* limit, not add it to the address).
*/
if (__builtin_constant_p(size))
-return addr > limit - size;
+return unlikely(addr > limit - size);
/* Arbitrary sizes? Be careful about overflow */
addr += size;
-if (addr < size)
+if (unlikely(addr < size))
return true;
-return addr > limit;
+return unlikely(addr > limit);
}
#define __range_not_ok(addr, size, limit) \
@@ -182,7 +182,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
: "=a" (__ret_gu), "=r" (__val_gu) \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
(x) = (__force __typeof__(*(ptr))) __val_gu; \
-__ret_gu; \
+__builtin_expect(__ret_gu, 0); \
})
#define __put_user_x(size, x, ptr, __ret_pu) \
@@ -278,7 +278,7 @@ extern void __put_user_8(void);
__put_user_x(X, __pu_val, ptr, __ret_pu); \
break; \
} \
-__ret_pu; \
+__builtin_expect(__ret_pu, 0); \
})
#define __put_user_size(x, ptr, size, retval, errret) \
@@ -401,7 +401,7 @@ do { \
({ \
int __pu_err; \
__put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
-__pu_err; \
+__builtin_expect(__pu_err, 0); \
})
#define __get_user_nocheck(x, ptr, size) \
@@ -410,7 +410,7 @@ do { \
unsigned long __gu_val; \
__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
-__gu_err; \
+__builtin_expect(__gu_err, 0); \
})
/* FIXME: this hack is definitely wrong -AK */
......
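The uaccess return paths are now wrapped in __builtin_expect(..., 0), which tells the compiler the error value is almost always zero and keeps the success path as straight-line code. A small sketch of the pattern, assuming only GCC builtins (the function and values are illustrative):

#include <stdio.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

/* Mirrors the patched __range_not_ok() shape: the failure branches are
 * annotated as cold so the fast path stays fall-through. */
static int range_not_ok(unsigned long addr, unsigned long size,
			unsigned long limit)
{
	addr += size;
	if (unlikely(addr < size))	/* overflow wrapped around */
		return 1;
	return unlikely(addr > limit);
}

int main(void)
{
	printf("%d\n", range_not_ok(0x1000, 16, 0x10000));	/* 0: in range */
	printf("%d\n", range_not_ok(~0UL, 16, 0x10000));	/* 1: wrapped  */
	return 0;
}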
@@ -26,7 +26,7 @@ struct vdso_image {
long sym___kernel_sigreturn;
long sym___kernel_rt_sigreturn;
long sym___kernel_vsyscall;
-long sym_VDSO32_SYSENTER_RETURN;
+long sym_int80_landing_pad;
};
#ifdef CONFIG_X86_64
@@ -38,13 +38,7 @@ extern const struct vdso_image vdso_image_x32;
#endif
#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
-extern const struct vdso_image vdso_image_32_int80;
-#ifdef CONFIG_COMPAT
-extern const struct vdso_image vdso_image_32_syscall;
-#endif
-extern const struct vdso_image vdso_image_32_sysenter;
-extern const struct vdso_image *selected_vdso32;
+extern const struct vdso_image vdso_image_32;
#endif
extern void __init init_vdso_image(const struct vdso_image *image);
......
@@ -53,9 +53,6 @@ void common(void) {
OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp);
OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip);
-BLANK();
-OFFSET(TI_sysenter_return, thread_info, sysenter_return);
BLANK();
OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
#endif
......
@@ -299,7 +299,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
if (current->mm->context.vdso)
restorer = current->mm->context.vdso +
-selected_vdso32->sym___kernel_sigreturn;
+vdso_image_32.sym___kernel_sigreturn;
else
restorer = &frame->retcode;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
@@ -363,7 +363,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
/* Set up to return from userspace. */
restorer = current->mm->context.vdso +
-selected_vdso32->sym___kernel_rt_sigreturn;
+vdso_image_32.sym___kernel_rt_sigreturn;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
put_user_ex(restorer, &frame->pretcode);
......
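The sym___kernel_* fields are offsets from the vDSO image base, added to the per-process mapping address at runtime, which is why the restorer computation above is plain pointer arithmetic. A standalone sketch of that base-plus-offset convention (the struct and values here are illustrative):

#include <stdio.h>

/* An illustrative stand-in for struct vdso_image: symbols are recorded
 * as offsets from the image base, not as absolute addresses. */
struct fake_vdso_image {
	void *data;		/* base of the (per-process) mapping */
	long sym_sigreturn;	/* offset of the symbol from that base */
};

int main(void)
{
	static char mapping[64];
	struct fake_vdso_image img = { mapping, 16 };

	/* Runtime address = mapping base + recorded offset. */
	void *restorer = (char *)img.data + img.sym_sigreturn;

	printf("base=%p restorer=%p\n", (void *)mapping, restorer);
	return 0;
}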
@@ -3,6 +3,10 @@
#include <uapi/linux/audit.h>
+typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
+                                          unsigned long, unsigned long,
+                                          unsigned long, unsigned long);
static inline int syscall_get_arch(void)
{
#ifdef CONFIG_X86_32
......
@@ -7,6 +7,7 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <generated/user_constants.h>
+#include <asm/syscall.h>
#define __NO_STUBS
@@ -24,15 +25,13 @@
#define old_mmap sys_old_mmap
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym,
-typedef asmlinkage void (*sys_call_ptr_t)(void);
-extern asmlinkage void sys_ni_syscall(void);
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
/*
......
@@ -7,6 +7,7 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <generated/user_constants.h>
+#include <asm/syscall.h>
#define __NO_STUBS
@@ -37,15 +38,13 @@
#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
#define __SYSCALL_X32(nr, sym, compat) /* Not supported */
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym,
-typedef void (*sys_call_ptr_t)(void);
-extern void sys_ni_syscall(void);
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
/*
......
@@ -965,17 +965,8 @@ char * __init xen_auto_xlated_memory_setup(void)
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
-/*
- * This could be called before selected_vdso32 is initialized, so
- * just fiddle with both possible images. vdso_image_32_syscall
- * can't be selected, since it only exists on 64-bit systems.
- */
-u32 *mask;
-mask = vdso_image_32_int80.data +
-vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
-*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
-mask = vdso_image_32_sysenter.data +
-vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
+u32 *mask = vdso_image_32.data +
+vdso_image_32.sym_VDSO32_NOTE_MASK;
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
......
@@ -4,8 +4,8 @@ include ../lib.mk
.PHONY: all all_32 all_64 warn_32bit_failure clean
-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt
-TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall
+TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso
TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
@@ -60,3 +60,5 @@ endif
# Some tests have additional dependencies.
sysret_ss_attrs_64: thunks.S
+ptrace_syscall_32: raw_syscall_helper_32.S
+test_syscall_vdso_32: thunks_32.S
#define _GNU_SOURCE
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <sys/user.h>
#include <unistd.h>
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <stdint.h>
#include <signal.h>
#include <err.h>
#include <string.h>
#include <asm/ptrace-abi.h>
#include <sys/auxv.h>
/* Bitness-agnostic defines for user_regs_struct fields. */
#ifdef __x86_64__
# define user_syscall_nr orig_rax
# define user_arg0 rdi
# define user_arg1 rsi
# define user_arg2 rdx
# define user_arg3 r10
# define user_arg4 r8
# define user_arg5 r9
# define user_ip rip
# define user_ax rax
#else
# define user_syscall_nr orig_eax
# define user_arg0 ebx
# define user_arg1 ecx
# define user_arg2 edx
# define user_arg3 esi
# define user_arg4 edi
# define user_arg5 ebp
# define user_ip eip
# define user_ax eax
#endif
static int nerrs = 0;
struct syscall_args32 {
uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5;
};
#ifdef __i386__
extern void sys32_helper(struct syscall_args32 *, void *);
extern void int80_and_ret(void);
#endif
/*
* Helper to invoke int80 with controlled regs and capture the final regs.
*/
static void do_full_int80(struct syscall_args32 *args)
{
#ifdef __x86_64__
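/*
 * There is no asm constraint letter for %rbp, so pin arg5 to it with an
 * explicit register variable instead.
 */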
register unsigned long bp asm("bp") = args->arg5;
asm volatile ("int $0x80"
: "+a" (args->nr),
"+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2),
"+S" (args->arg3), "+D" (args->arg4), "+r" (bp));
args->arg5 = bp;
#else
sys32_helper(args, int80_and_ret);
#endif
}
#ifdef __i386__
static void (*vsyscall32)(void);
/*
* Nasty helper to invoke AT_SYSINFO (i.e. __kernel_vsyscall) with
* controlled regs and capture the final regs. This is so nasty that it
* crashes my copy of gdb :)
*/
static void do_full_vsyscall32(struct syscall_args32 *args)
{
sys32_helper(args, vsyscall32);
}
#endif
static siginfo_t wait_trap(pid_t chld)
{
siginfo_t si;
if (waitid(P_PID, chld, &si, WEXITED|WSTOPPED) != 0)
err(1, "waitid");
if (si.si_pid != chld)
errx(1, "got unexpected pid in event\n");
if (si.si_code != CLD_TRAPPED)
errx(1, "got unexpected event type %d\n", si.si_code);
return si;
}
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
int flags)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = handler;
sa.sa_flags = SA_SIGINFO | flags;
sigemptyset(&sa.sa_mask);
if (sigaction(sig, &sa, 0))
err(1, "sigaction");
}
static void clearhandler(int sig)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = SIG_DFL;
sigemptyset(&sa.sa_mask);
if (sigaction(sig, &sa, 0))
err(1, "sigaction");
}
#ifdef __x86_64__
# define REG_BP REG_RBP
#else
# define REG_BP REG_EBP
#endif
static void empty_handler(int sig, siginfo_t *si, void *ctx_void)
{
}
static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *))
{
struct syscall_args32 args = {
.nr = 224, /* gettid */
.arg0 = 10, .arg1 = 11, .arg2 = 12,
.arg3 = 13, .arg4 = 14, .arg5 = 15,
};
do_syscall(&args);
if (args.nr != getpid() ||
args.arg0 != 10 || args.arg1 != 11 || args.arg2 != 12 ||
args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) {
printf("[FAIL]\tgetpid() failed to preseve regs\n");
nerrs++;
} else {
printf("[OK]\tgetpid() preserves regs\n");
}
sethandler(SIGUSR1, empty_handler, 0);
args.nr = 37; /* kill */
args.arg0 = getpid();
args.arg1 = SIGUSR1;
do_syscall(&args);
if (args.nr != 0 ||
args.arg0 != getpid() || args.arg1 != SIGUSR1 || args.arg2 != 12 ||
args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) {
printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preseve regs\n");
nerrs++;
} else {
printf("[OK]\tkill(getpid(), SIGUSR1) preserves regs\n");
}
clearhandler(SIGUSR1);
}
static void test_ptrace_syscall_restart(void)
{
printf("[RUN]\tptrace-induced syscall restart\n");
pid_t chld = fork();
if (chld < 0)
err(1, "fork");
if (chld == 0) {
if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
err(1, "PTRACE_TRACEME");
printf("\tChild will make one syscall\n");
raise(SIGSTOP);
syscall(SYS_gettid, 10, 11, 12, 13, 14, 15);
_exit(0);
}
int status;
/* Wait for SIGSTOP. */
if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
err(1, "waitpid");
struct user_regs_struct regs;
printf("[RUN]\tSYSEMU\n");
if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
err(1, "PTRACE_SYSCALL");
wait_trap(chld);
if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
err(1, "PTRACE_GETREGS");
if (regs.user_syscall_nr != SYS_gettid ||
regs.user_arg0 != 10 || regs.user_arg1 != 11 ||
regs.user_arg2 != 12 || regs.user_arg3 != 13 ||
regs.user_arg4 != 14 || regs.user_arg5 != 15) {
printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
nerrs++;
} else {
printf("[OK]\tInitial nr and args are correct\n");
}
printf("[RUN]\tRestart the syscall (ip = 0x%lx)\n",
(unsigned long)regs.user_ip);
/*
* This does exactly what it appears to do if syscall is int80 or
* SYSCALL64. For SYSCALL32 or SYSENTER, though, this is highly
* magical. It needs to work so that ptrace and syscall restart
* work as expected.
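* (INT80, SYSCALL and SYSENTER are all two-byte opcodes, which is why
* backing the IP up by 2 re-executes the syscall instruction.)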
*/
regs.user_ax = regs.user_syscall_nr;
regs.user_ip -= 2;
if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
err(1, "PTRACE_SETREGS");
if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
err(1, "PTRACE_SYSCALL");
wait_trap(chld);
if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
err(1, "PTRACE_GETREGS");
if (regs.user_syscall_nr != SYS_gettid ||
regs.user_arg0 != 10 || regs.user_arg1 != 11 ||
regs.user_arg2 != 12 || regs.user_arg3 != 13 ||
regs.user_arg4 != 14 || regs.user_arg5 != 15) {
printf("[FAIL]\tRestart nr or args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
nerrs++;
} else {
printf("[OK]\tRestarted nr and args are correct\n");
}
printf("[RUN]\tChange nr and args and restart the syscall (ip = 0x%lx)\n",
(unsigned long)regs.user_ip);
regs.user_ax = SYS_getpid;
regs.user_arg0 = 20;
regs.user_arg1 = 21;
regs.user_arg2 = 22;
regs.user_arg3 = 23;
regs.user_arg4 = 24;
regs.user_arg5 = 25;
regs.user_ip -= 2;
if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
err(1, "PTRACE_SETREGS");
if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
err(1, "PTRACE_SYSCALL");
wait_trap(chld);
if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
err(1, "PTRACE_GETREGS");
if (regs.user_syscall_nr != SYS_getpid ||
regs.user_arg0 != 20 || regs.user_arg1 != 21 || regs.user_arg2 != 22 ||
regs.user_arg3 != 23 || regs.user_arg4 != 24 || regs.user_arg5 != 25) {
printf("[FAIL]\tRestart nr or args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
nerrs++;
} else {
printf("[OK]\tReplacement nr and args are correct\n");
}
if (ptrace(PTRACE_CONT, chld, 0, 0) != 0)
err(1, "PTRACE_CONT");
if (waitpid(chld, &status, 0) != chld)
err(1, "waitpid");
if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
printf("[FAIL]\tChild failed\n");
nerrs++;
} else {
printf("[OK]\tChild exited cleanly\n");
}
}
int main()
{
printf("[RUN]\tCheck int80 return regs\n");
test_sys32_regs(do_full_int80);
#if defined(__i386__) && (!defined(__GLIBC__) || __GLIBC__ > 2 || __GLIBC_MINOR__ >= 16)
vsyscall32 = (void *)getauxval(AT_SYSINFO);
printf("[RUN]\tCheck AT_SYSINFO return regs\n");
test_sys32_regs(do_full_vsyscall32);
#endif
test_ptrace_syscall_restart();
return 0;
}
.global sys32_helper
sys32_helper:
/* Args: syscall_args_32*, function pointer */
pushl %ebp
pushl %ebx
pushl %esi
pushl %edi
movl 5*4(%esp), %eax /* pointer to args struct */
movl 1*4(%eax), %ebx
movl 2*4(%eax), %ecx
movl 3*4(%eax), %edx
movl 4*4(%eax), %esi
movl 5*4(%eax), %edi
movl 6*4(%eax), %ebp
movl 0*4(%eax), %eax
call *(6*4)(%esp) /* Do the syscall */
/* Now we need to recover without losing any reg values */
pushl %eax
movl 6*4(%esp), %eax
popl 0*4(%eax)
movl %ebx, 1*4(%eax)
movl %ecx, 2*4(%eax)
movl %edx, 3*4(%eax)
movl %esi, 4*4(%eax)
movl %edi, 5*4(%eax)
movl %ebp, 6*4(%eax)
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
.type sys32_helper, @function
.size sys32_helper, .-sys32_helper
.global int80_and_ret
int80_and_ret:
int $0x80
ret
.type int80_and_ret, @function
.size int80_and_ret, .-int80_and_ret
/*
* 32-bit syscall ABI conformance test.
*
* Copyright (c) 2015 Denys Vlasenko
*
* This program is free software; you can redistribute it and/or modify
* it under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
/*
* Can be built statically:
* gcc -Os -Wall -static -m32 test_syscall_vdso.c thunks_32.S
*/
#undef _GNU_SOURCE
#define _GNU_SOURCE 1
#undef __USE_GNU
#define __USE_GNU 1
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/select.h>
#include <sys/time.h>
#include <elf.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#if !defined(__i386__)
int main(int argc, char **argv, char **envp)
{
printf("[SKIP]\tNot a 32-bit x86 userspace\n");
return 0;
}
#else
long syscall_addr;
long get_syscall(char **envp)
{
Elf32_auxv_t *auxv;
while (*envp++ != NULL)
continue;
for (auxv = (void *)envp; auxv->a_type != AT_NULL; auxv++)
if (auxv->a_type == AT_SYSINFO)
return auxv->a_un.a_val;
printf("[WARN]\tAT_SYSINFO not supplied\n");
return 0;
}
asm (
" .pushsection .text\n"
" .global int80\n"
"int80:\n"
" int $0x80\n"
" ret\n"
" .popsection\n"
);
extern char int80;
struct regs64 {
uint64_t rax, rbx, rcx, rdx;
uint64_t rsi, rdi, rbp, rsp;
uint64_t r8, r9, r10, r11;
uint64_t r12, r13, r14, r15;
};
struct regs64 regs64;
int kernel_is_64bit;
asm (
" .pushsection .text\n"
" .code64\n"
"get_regs64:\n"
" push %rax\n"
" mov $regs64, %eax\n"
" pop 0*8(%rax)\n"
" movq %rbx, 1*8(%rax)\n"
" movq %rcx, 2*8(%rax)\n"
" movq %rdx, 3*8(%rax)\n"
" movq %rsi, 4*8(%rax)\n"
" movq %rdi, 5*8(%rax)\n"
" movq %rbp, 6*8(%rax)\n"
" movq %rsp, 7*8(%rax)\n"
" movq %r8, 8*8(%rax)\n"
" movq %r9, 9*8(%rax)\n"
" movq %r10, 10*8(%rax)\n"
" movq %r11, 11*8(%rax)\n"
" movq %r12, 12*8(%rax)\n"
" movq %r13, 13*8(%rax)\n"
" movq %r14, 14*8(%rax)\n"
" movq %r15, 15*8(%rax)\n"
" ret\n"
"poison_regs64:\n"
" movq $0x7f7f7f7f, %r8\n"
" shl $32, %r8\n"
" orq $0x7f7f7f7f, %r8\n"
" movq %r8, %r9\n"
" movq %r8, %r10\n"
" movq %r8, %r11\n"
" movq %r8, %r12\n"
" movq %r8, %r13\n"
" movq %r8, %r14\n"
" movq %r8, %r15\n"
" ret\n"
" .code32\n"
" .popsection\n"
);
extern void get_regs64(void);
extern void poison_regs64(void);
extern unsigned long call64_from_32(void (*function)(void));
void print_regs64(void)
{
if (!kernel_is_64bit)
return;
printf("ax:%016llx bx:%016llx cx:%016llx dx:%016llx\n", regs64.rax, regs64.rbx, regs64.rcx, regs64.rdx);
printf("si:%016llx di:%016llx bp:%016llx sp:%016llx\n", regs64.rsi, regs64.rdi, regs64.rbp, regs64.rsp);
printf(" 8:%016llx 9:%016llx 10:%016llx 11:%016llx\n", regs64.r8 , regs64.r9 , regs64.r10, regs64.r11);
printf("12:%016llx 13:%016llx 14:%016llx 15:%016llx\n", regs64.r12, regs64.r13, regs64.r14, regs64.r15);
}
int check_regs64(void)
{
int err = 0;
int num = 8;
uint64_t *r64 = &regs64.r8;
if (!kernel_is_64bit)
return 0;
do {
if (*r64 == 0x7f7f7f7f7f7f7f7fULL)
continue; /* register did not change */
if (syscall_addr != (long)&int80) {
/*
* Non-INT80 syscall entrypoints are allowed to clobber R8+ regs:
* either clear them to 0, or for R11, load EFLAGS.
*/
if (*r64 == 0)
continue;
if (num == 11) {
printf("[NOTE]\tR11 has changed:%016llx - assuming clobbered by SYSRET insn\n", *r64);
continue;
}
} else {
/* INT80 syscall entrypoint can be used by
* 64-bit programs too, unlike SYSCALL/SYSENTER.
* Therefore it must preserve R12+
* (they are callee-saved registers in 64-bit C ABI).
*
* This was probably historically not intended,
* but R8..11 are clobbered (cleared to 0).
* IOW: they are the only registers which aren't
* preserved across INT80 syscall.
*/
if (*r64 == 0 && num <= 11)
continue;
}
printf("[FAIL]\tR%d has changed:%016llx\n", num, *r64);
err++;
} while (r64++, ++num < 16);
if (!err)
printf("[OK]\tR8..R15 did not leak kernel data\n");
return err;
}
int nfds;
fd_set rfds;
fd_set wfds;
fd_set efds;
struct timespec timeout;
sigset_t sigmask;
struct {
sigset_t *sp;
int sz;
} sigmask_desc;
void prep_args()
{
nfds = 42;
FD_ZERO(&rfds);
FD_ZERO(&wfds);
FD_ZERO(&efds);
FD_SET(0, &rfds);
FD_SET(1, &wfds);
FD_SET(2, &efds);
timeout.tv_sec = 0;
timeout.tv_nsec = 123;
sigemptyset(&sigmask);
sigaddset(&sigmask, SIGINT);
sigaddset(&sigmask, SIGUSR2);
sigaddset(&sigmask, SIGRTMAX);
sigmask_desc.sp = &sigmask;
sigmask_desc.sz = 8; /* bytes */
}
static void print_flags(const char *name, unsigned long r)
{
static const char *bitarray[] = {
"\n" ,"c\n" ,/* Carry Flag */
"0 " ,"1 " ,/* Bit 1 - always on */
"" ,"p " ,/* Parity Flag */
"0 " ,"3? " ,
"" ,"a " ,/* Auxiliary carry Flag */
"0 " ,"5? " ,
"" ,"z " ,/* Zero Flag */
"" ,"s " ,/* Sign Flag */
"" ,"t " ,/* Trap Flag */
"" ,"i " ,/* Interrupt Flag */
"" ,"d " ,/* Direction Flag */
"" ,"o " ,/* Overflow Flag */
"0 " ,"1 " ,/* I/O Privilege Level (2 bits) */
"0" ,"1" ,/* I/O Privilege Level (2 bits) */
"" ,"n " ,/* Nested Task */
"0 " ,"15? ",
"" ,"r " ,/* Resume Flag */
"" ,"v " ,/* Virtual Mode */
"" ,"ac " ,/* Alignment Check/Access Control */
"" ,"vif ",/* Virtual Interrupt Flag */
"" ,"vip ",/* Virtual Interrupt Pending */
"" ,"id " ,/* CPUID detection */
NULL
};
const char **bitstr;
int bit;
printf("%s=%016lx ", name, r);
bitstr = bitarray + 42;
bit = 21;
if ((r >> 22) != 0)
printf("(extra bits are set) ");
do {
if (bitstr[(r >> bit) & 1][0])
fputs(bitstr[(r >> bit) & 1], stdout);
bitstr -= 2;
bit--;
} while (bit >= 0);
}
int run_syscall(void)
{
long flags, bad_arg;
prep_args();
if (kernel_is_64bit)
call64_from_32(poison_regs64);
/*print_regs64();*/
asm("\n"
/* Try 6-arg syscall: pselect. It should return quickly */
" push %%ebp\n"
" mov $308, %%eax\n" /* PSELECT */
" mov nfds, %%ebx\n" /* ebx arg1 */
" mov $rfds, %%ecx\n" /* ecx arg2 */
" mov $wfds, %%edx\n" /* edx arg3 */
" mov $efds, %%esi\n" /* esi arg4 */
" mov $timeout, %%edi\n" /* edi arg5 */
" mov $sigmask_desc, %%ebp\n" /* %ebp arg6 */
" push $0x200ed7\n" /* set almost all flags */
" popf\n" /* except TF, IOPL, NT, RF, VM, AC, VIF, VIP */
" call *syscall_addr\n"
/* Check that registers are not clobbered */
" pushf\n"
" pop %%eax\n"
" cld\n"
" cmp nfds, %%ebx\n" /* ebx arg1 */
" mov $1, %%ebx\n"
" jne 1f\n"
" cmp $rfds, %%ecx\n" /* ecx arg2 */
" mov $2, %%ebx\n"
" jne 1f\n"
" cmp $wfds, %%edx\n" /* edx arg3 */
" mov $3, %%ebx\n"
" jne 1f\n"
" cmp $efds, %%esi\n" /* esi arg4 */
" mov $4, %%ebx\n"
" jne 1f\n"
" cmp $timeout, %%edi\n" /* edi arg5 */
" mov $5, %%ebx\n"
" jne 1f\n"
" cmpl $sigmask_desc, %%ebp\n" /* %ebp arg6 */
" mov $6, %%ebx\n"
" jne 1f\n"
" mov $0, %%ebx\n"
"1:\n"
" pop %%ebp\n"
: "=a" (flags), "=b" (bad_arg)
:
: "cx", "dx", "si", "di"
);
if (kernel_is_64bit) {
memset(&regs64, 0x77, sizeof(regs64));
call64_from_32(get_regs64);
/*print_regs64();*/
}
/*
* On paravirt kernels, flags are not preserved across syscalls.
* Thus, we do not consider it a bug if some are changed.
* We just show ones which do.
*/
if ((0x200ed7 ^ flags) != 0) {
print_flags("[WARN]\tFlags before", 0x200ed7);
print_flags("[WARN]\tFlags after", flags);
print_flags("[WARN]\tFlags change", (0x200ed7 ^ flags));
}
if (bad_arg) {
printf("[FAIL]\targ#%ld clobbered\n", bad_arg);
return 1;
}
printf("[OK]\tArguments are preserved across syscall\n");
return check_regs64();
}
int run_syscall_twice()
{
int exitcode = 0;
long sv;
if (syscall_addr) {
printf("[RUN]\tExecuting 6-argument 32-bit syscall via VDSO\n");
exitcode = run_syscall();
}
sv = syscall_addr;
syscall_addr = (long)&int80;
printf("[RUN]\tExecuting 6-argument 32-bit syscall via INT 80\n");
exitcode += run_syscall();
syscall_addr = sv;
return exitcode;
}
void ptrace_me()
{
pid_t pid;
fflush(NULL);
pid = fork();
if (pid < 0)
exit(1);
if (pid == 0) {
/* child */
if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) != 0)
exit(0);
raise(SIGSTOP);
return;
}
/* parent */
printf("[RUN]\tRunning tests under ptrace\n");
while (1) {
int status;
pid = waitpid(-1, &status, __WALL);
if (WIFEXITED(status))
exit(WEXITSTATUS(status));
if (WIFSIGNALED(status))
exit(WTERMSIG(status));
if (pid <= 0 || !WIFSTOPPED(status)) /* paranoia */
exit(255);
/*
* Note: we do not inject sig = WSTOPSIG(status).
* We probably should, but careful: do not inject SIGTRAP
* generated by syscall entry/exit stops.
* That kills the child.
*/
ptrace(PTRACE_SYSCALL, pid, 0L, 0L /*sig*/);
}
}
int main(int argc, char **argv, char **envp)
{
int exitcode = 0;
int cs;
asm("\n"
" movl %%cs, %%eax\n"
: "=a" (cs)
);
kernel_is_64bit = (cs == 0x23);
if (!kernel_is_64bit)
printf("[NOTE]\tNot a 64-bit kernel, won't test R8..R15 leaks\n");
/* This only works for non-static builds:
* syscall_addr = dlsym(dlopen("linux-gate.so.1", RTLD_NOW), "__kernel_vsyscall");
*/
syscall_addr = get_syscall(envp);
exitcode += run_syscall_twice();
ptrace_me();
exitcode += run_syscall_twice();
return exitcode;
}
#endif
/*
* thunks_32.S - assembly helpers for mixed-bitness code
* Copyright (c) 2015 Denys Vlasenko
*
* This program is free software; you can redistribute it and/or modify
* it under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* These are little helpers that make it easier to switch bitness on
* the fly.
*/
.text
.code32
.global call64_from_32
.type call64_from_32, @function
// 4(%esp): function to call
call64_from_32:
// Fetch function address
mov 4(%esp), %eax
// Save registers which are callee-clobbered by 64-bit ABI
push %ecx
push %edx
push %esi
push %edi
// Switch to long mode
jmp $0x33,$1f
1: .code64
// Call the function
call *%rax
// Switch to compatibility mode
push $0x23 /* USER32_CS */
.code32; push $1f; .code64 /* hack: can't have X86_64_32S relocation in 32-bit ELF */
lretq
1: .code32
pop %edi
pop %esi
pop %edx
pop %ecx
ret
.size call64_from_32, .-call64_from_32
/*
* unwind_vdso.c - tests unwind info for AT_SYSINFO in the vDSO
* Copyright (c) 2014-2015 Andrew Lutomirski
*
* This program is free software; you can redistribute it and/or modify
* it under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* This tests __kernel_vsyscall's unwind info.
*/
#define _GNU_SOURCE
#include <features.h>
#include <stdio.h>
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ < 16
int main()
{
/* We need getauxval(). */
printf("[SKIP]\tGLIBC before 2.16 cannot compile this test\n");
return 0;
}
#else
#include <sys/time.h>
#include <stdlib.h>
#include <syscall.h>
#include <unistd.h>
#include <string.h>
#include <inttypes.h>
#include <sys/mman.h>
#include <signal.h>
#include <sys/ucontext.h>
#include <err.h>
#include <stddef.h>
#include <stdbool.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <link.h>
#include <sys/auxv.h>
#include <dlfcn.h>
#include <unwind.h>
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
int flags)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = handler;
sa.sa_flags = SA_SIGINFO | flags;
sigemptyset(&sa.sa_mask);
if (sigaction(sig, &sa, 0))
err(1, "sigaction");
}
#ifdef __x86_64__
# define WIDTH "q"
#else
# define WIDTH "l"
#endif
static unsigned long get_eflags(void)
{
unsigned long eflags;
asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
return eflags;
}
static void set_eflags(unsigned long eflags)
{
asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
: : "rm" (eflags) : "flags");
}
#define X86_EFLAGS_TF (1UL << 8)
static volatile sig_atomic_t nerrs;
static unsigned long sysinfo;
static bool got_sysinfo = false;
static unsigned long return_address;
struct unwind_state {
unsigned long ip; /* trap source */
int depth; /* -1 until we hit the trap source */
};
_Unwind_Reason_Code trace_fn(struct _Unwind_Context *ctx, void *opaque)
{
struct unwind_state *state = opaque;
unsigned long ip = _Unwind_GetIP(ctx);
if (state->depth == -1) {
if (ip == state->ip)
state->depth = 0;
else
return _URC_NO_REASON; /* Not there yet */
}
printf("\t 0x%lx\n", ip);
if (ip == return_address) {
/* Here we are. */
unsigned long eax = _Unwind_GetGR(ctx, 0);
unsigned long ecx = _Unwind_GetGR(ctx, 1);
unsigned long edx = _Unwind_GetGR(ctx, 2);
unsigned long ebx = _Unwind_GetGR(ctx, 3);
unsigned long ebp = _Unwind_GetGR(ctx, 5);
unsigned long esi = _Unwind_GetGR(ctx, 6);
unsigned long edi = _Unwind_GetGR(ctx, 7);
bool ok = (eax == SYS_getpid || eax == getpid()) &&
ebx == 1 && ecx == 2 && edx == 3 &&
esi == 4 && edi == 5 && ebp == 6;
if (!ok)
nerrs++;
printf("[%s]\t NR = %ld, args = %ld, %ld, %ld, %ld, %ld, %ld\n",
(ok ? "OK" : "FAIL"),
eax, ebx, ecx, edx, esi, edi, ebp);
return _URC_NORMAL_STOP;
} else {
state->depth++;
return _URC_NO_REASON;
}
}
static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
{
ucontext_t *ctx = (ucontext_t *)ctx_void;
struct unwind_state state;
unsigned long ip = ctx->uc_mcontext.gregs[REG_EIP];
if (!got_sysinfo && ip == sysinfo) {
got_sysinfo = true;
/* Find the return address. */
return_address = *(unsigned long *)(unsigned long)ctx->uc_mcontext.gregs[REG_ESP];
printf("\tIn vsyscall at 0x%lx, returning to 0x%lx\n",
ip, return_address);
}
if (!got_sysinfo)
return; /* Not there yet */
if (ip == return_address) {
ctx->uc_mcontext.gregs[REG_EFL] &= ~X86_EFLAGS_TF;
printf("\tVsyscall is done\n");
return;
}
printf("\tSIGTRAP at 0x%lx\n", ip);
state.ip = ip;
state.depth = -1;
_Unwind_Backtrace(trace_fn, &state);
}
int main()
{
sysinfo = getauxval(AT_SYSINFO);
printf("\tAT_SYSINFO is 0x%lx\n", sysinfo);
Dl_info info;
if (!dladdr((void *)sysinfo, &info)) {
printf("[WARN]\tdladdr failed on AT_SYSINFO\n");
} else {
printf("[OK]\tAT_SYSINFO maps to %s, loaded at 0x%p\n",
info.dli_fname, info.dli_fbase);
}
sethandler(SIGTRAP, sigtrap, 0);
syscall(SYS_getpid); /* Force symbol binding without TF set. */
printf("[RUN]\tSet TF and check a fast syscall\n");
set_eflags(get_eflags() | X86_EFLAGS_TF);
syscall(SYS_getpid, 1, 2, 3, 4, 5, 6);
if (!got_sysinfo) {
set_eflags(get_eflags() & ~X86_EFLAGS_TF);
/*
* The most likely cause of this is that you're on Debian or
* a Debian-based distro, you're missing libc6-i686, and you're
* affected by libc/19006 (https://sourceware.org/PR19006).
*/
printf("[WARN]\tsyscall(2) didn't enter AT_SYSINFO\n");
}
if (get_eflags() & X86_EFLAGS_TF) {
printf("[FAIL]\tTF is still set\n");
nerrs++;
}
if (nerrs) {
printf("[FAIL]\tThere were errors\n");
return 1;
} else {
printf("[OK]\tAll is well\n");
return 0;
}
}
#endif /* New enough libc */
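The TF-based tracing used by unwind_vdso.c can be reproduced in isolation: setting EFLAGS.TF delivers SIGTRAP after each subsequent instruction until the flag is cleared again. A minimal, self-contained sketch of that single-step mechanism (handler and counter names are illustrative):

#include <stdio.h>
#include <signal.h>

static volatile sig_atomic_t traps;

static void on_trap(int sig)
{
	(void)sig;
	traps++;	/* one SIGTRAP per single-stepped instruction */
}

int main(void)
{
	unsigned long flags;

	signal(SIGTRAP, on_trap);

	/* Read EFLAGS, set the Trap Flag (bit 8), and write it back. */
	asm volatile ("pushf\n\tpop %0" : "=r" (flags));
	flags |= 1UL << 8;
	asm volatile ("push %0\n\tpopf\n\tnop\n\tnop" : : "r" (flags) : "cc");

	/* Every instruction since the popf trapped; now clear TF again. */
	flags &= ~(1UL << 8);
	asm volatile ("push %0\n\tpopf" : : "r" (flags) : "cc");

	printf("single-step traps observed: %d\n", (int)traps);
	return 0;
}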