Commit 5c4a1c09 authored by Linus Torvalds

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
 "These are the fixes left over from the v5.4 cycle:

   - Various low level 32-bit entry code fixes and improvements by Andy
     Lutomirski, Peter Zijlstra and Thomas Gleixner.

   - Fix 32-bit Xen PV breakage, by Jan Beulich"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/entry/32: Fix FIXUP_ESPFIX_STACK with user CR3
  x86/pti/32: Calculate the various PTI cpu_entry_area sizes correctly, make the CPU_ENTRY_AREA_PAGES assert precise
  selftests/x86/sigreturn/32: Invalidate DS and ES when abusing the kernel
  selftests/x86/mov_ss_trap: Fix the SYSENTER test
  x86/entry/32: Fix NMI vs ESPFIX
  x86/entry/32: Unwind the ESPFIX stack earlier on exception entry
  x86/entry/32: Move FIXUP_FRAME after pushing %fs in SAVE_ALL
  x86/entry/32: Use %ss segment where required
  x86/entry/32: Fix IRET exception
  x86/cpu_entry_area: Add guard page for entry stack on 32bit
  x86/pti/32: Size initial_page_table correctly
  x86/doublefault/32: Fix stack canaries in the double fault handler
  x86/xen/32: Simplify ring check in xen_iret_crit_fixup()
  x86/xen/32: Make xen_iret_crit_fixup() independent of frame layout
  x86/stackframe/32: Repair 32-bit Xen PV
parents 53a07a14 4a13b0e3
...@@ -172,7 +172,7 @@ ...@@ -172,7 +172,7 @@
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
.if \no_user_check == 0 .if \no_user_check == 0
/* coming from usermode? */ /* coming from usermode? */
testl $SEGMENT_RPL_MASK, PT_CS(%esp) testl $USER_SEGMENT_RPL_MASK, PT_CS(%esp)
jz .Lend_\@ jz .Lend_\@
.endif .endif
/* On user-cr3? */ /* On user-cr3? */
...@@ -205,64 +205,76 @@ ...@@ -205,64 +205,76 @@
#define CS_FROM_ENTRY_STACK (1 << 31) #define CS_FROM_ENTRY_STACK (1 << 31)
#define CS_FROM_USER_CR3 (1 << 30) #define CS_FROM_USER_CR3 (1 << 30)
#define CS_FROM_KERNEL (1 << 29) #define CS_FROM_KERNEL (1 << 29)
#define CS_FROM_ESPFIX (1 << 28)
.macro FIXUP_FRAME .macro FIXUP_FRAME
/* /*
* The high bits of the CS dword (__csh) are used for CS_FROM_*. * The high bits of the CS dword (__csh) are used for CS_FROM_*.
* Clear them in case hardware didn't do this for us. * Clear them in case hardware didn't do this for us.
*/ */
andl $0x0000ffff, 3*4(%esp) andl $0x0000ffff, 4*4(%esp)
#ifdef CONFIG_VM86 #ifdef CONFIG_VM86
testl $X86_EFLAGS_VM, 4*4(%esp) testl $X86_EFLAGS_VM, 5*4(%esp)
jnz .Lfrom_usermode_no_fixup_\@ jnz .Lfrom_usermode_no_fixup_\@
#endif #endif
testl $SEGMENT_RPL_MASK, 3*4(%esp) testl $USER_SEGMENT_RPL_MASK, 4*4(%esp)
jnz .Lfrom_usermode_no_fixup_\@ jnz .Lfrom_usermode_no_fixup_\@
orl $CS_FROM_KERNEL, 3*4(%esp) orl $CS_FROM_KERNEL, 4*4(%esp)
/* /*
* When we're here from kernel mode; the (exception) stack looks like: * When we're here from kernel mode; the (exception) stack looks like:
* *
* 5*4(%esp) - <previous context> * 6*4(%esp) - <previous context>
* 4*4(%esp) - flags * 5*4(%esp) - flags
* 3*4(%esp) - cs * 4*4(%esp) - cs
* 2*4(%esp) - ip * 3*4(%esp) - ip
* 1*4(%esp) - orig_eax * 2*4(%esp) - orig_eax
* 0*4(%esp) - gs / function * 1*4(%esp) - gs / function
* 0*4(%esp) - fs
* *
* Let's build a 5 entry IRET frame after that, such that struct pt_regs * Let's build a 5 entry IRET frame after that, such that struct pt_regs
* is complete and in particular regs->sp is correct. This gives us * is complete and in particular regs->sp is correct. This gives us
* the original 5 entries as gap: * the original 6 entries as gap:
* *
* 12*4(%esp) - <previous context> * 14*4(%esp) - <previous context>
* 11*4(%esp) - gap / flags * 13*4(%esp) - gap / flags
* 10*4(%esp) - gap / cs * 12*4(%esp) - gap / cs
* 9*4(%esp) - gap / ip * 11*4(%esp) - gap / ip
* 8*4(%esp) - gap / orig_eax * 10*4(%esp) - gap / orig_eax
* 7*4(%esp) - gap / gs / function * 9*4(%esp) - gap / gs / function
* 6*4(%esp) - ss * 8*4(%esp) - gap / fs
* 5*4(%esp) - sp * 7*4(%esp) - ss
* 4*4(%esp) - flags * 6*4(%esp) - sp
* 3*4(%esp) - cs * 5*4(%esp) - flags
* 2*4(%esp) - ip * 4*4(%esp) - cs
* 1*4(%esp) - orig_eax * 3*4(%esp) - ip
* 0*4(%esp) - gs / function * 2*4(%esp) - orig_eax
* 1*4(%esp) - gs / function
* 0*4(%esp) - fs
*/ */
pushl %ss # ss pushl %ss # ss
pushl %esp # sp (points at ss) pushl %esp # sp (points at ss)
addl $6*4, (%esp) # point sp back at the previous context addl $7*4, (%esp) # point sp back at the previous context
pushl 6*4(%esp) # flags pushl 7*4(%esp) # flags
pushl 6*4(%esp) # cs pushl 7*4(%esp) # cs
pushl 6*4(%esp) # ip pushl 7*4(%esp) # ip
pushl 6*4(%esp) # orig_eax pushl 7*4(%esp) # orig_eax
pushl 6*4(%esp) # gs / function pushl 7*4(%esp) # gs / function
pushl 7*4(%esp) # fs
.Lfrom_usermode_no_fixup_\@: .Lfrom_usermode_no_fixup_\@:
.endm .endm
.macro IRET_FRAME .macro IRET_FRAME
/*
* We're called with %ds, %es, %fs, and %gs from the interrupted
* frame, so we shouldn't use them. Also, we may be in ESPFIX
* mode and therefore have a nonzero SS base and an offset ESP,
* so any attempt to access the stack needs to use SS. (except for
* accesses through %esp, which automatically use SS.)
*/
testl $CS_FROM_KERNEL, 1*4(%esp) testl $CS_FROM_KERNEL, 1*4(%esp)
jz .Lfinished_frame_\@ jz .Lfinished_frame_\@
...@@ -276,31 +288,40 @@ ...@@ -276,31 +288,40 @@
movl 5*4(%esp), %eax # (modified) regs->sp movl 5*4(%esp), %eax # (modified) regs->sp
movl 4*4(%esp), %ecx # flags movl 4*4(%esp), %ecx # flags
movl %ecx, -4(%eax) movl %ecx, %ss:-1*4(%eax)
movl 3*4(%esp), %ecx # cs movl 3*4(%esp), %ecx # cs
andl $0x0000ffff, %ecx andl $0x0000ffff, %ecx
movl %ecx, -8(%eax) movl %ecx, %ss:-2*4(%eax)
movl 2*4(%esp), %ecx # ip movl 2*4(%esp), %ecx # ip
movl %ecx, -12(%eax) movl %ecx, %ss:-3*4(%eax)
movl 1*4(%esp), %ecx # eax movl 1*4(%esp), %ecx # eax
movl %ecx, -16(%eax) movl %ecx, %ss:-4*4(%eax)
popl %ecx popl %ecx
lea -16(%eax), %esp lea -4*4(%eax), %esp
popl %eax popl %eax
.Lfinished_frame_\@: .Lfinished_frame_\@:
.endm .endm
.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 .macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
cld cld
.if \skip_gs == 0 .if \skip_gs == 0
PUSH_GS PUSH_GS
.endif .endif
FIXUP_FRAME
pushl %fs pushl %fs
pushl %eax
movl $(__KERNEL_PERCPU), %eax
movl %eax, %fs
.if \unwind_espfix > 0
UNWIND_ESPFIX_STACK
.endif
popl %eax
FIXUP_FRAME
pushl %es pushl %es
pushl %ds pushl %ds
pushl \pt_regs_ax pushl \pt_regs_ax
...@@ -313,8 +334,6 @@ ...@@ -313,8 +334,6 @@
movl $(__USER_DS), %edx movl $(__USER_DS), %edx
movl %edx, %ds movl %edx, %ds
movl %edx, %es movl %edx, %es
movl $(__KERNEL_PERCPU), %edx
movl %edx, %fs
.if \skip_gs == 0 .if \skip_gs == 0
SET_KERNEL_GS %edx SET_KERNEL_GS %edx
.endif .endif
...@@ -324,8 +343,8 @@ ...@@ -324,8 +343,8 @@
.endif .endif
.endm .endm
.macro SAVE_ALL_NMI cr3_reg:req .macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0
SAVE_ALL SAVE_ALL unwind_espfix=\unwind_espfix
BUG_IF_WRONG_CR3 BUG_IF_WRONG_CR3
...@@ -357,6 +376,7 @@ ...@@ -357,6 +376,7 @@
2: popl %es 2: popl %es
3: popl %fs 3: popl %fs
POP_GS \pop POP_GS \pop
IRET_FRAME
.pushsection .fixup, "ax" .pushsection .fixup, "ax"
4: movl $0, (%esp) 4: movl $0, (%esp)
jmp 1b jmp 1b
...@@ -395,7 +415,8 @@ ...@@ -395,7 +415,8 @@
.macro CHECK_AND_APPLY_ESPFIX .macro CHECK_AND_APPLY_ESPFIX
#ifdef CONFIG_X86_ESPFIX32 #ifdef CONFIG_X86_ESPFIX32
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) #define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8)
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET
ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX
...@@ -1075,7 +1096,6 @@ restore_all: ...@@ -1075,7 +1096,6 @@ restore_all:
/* Restore user state */ /* Restore user state */
RESTORE_REGS pop=4 # skip orig_eax/error_code RESTORE_REGS pop=4 # skip orig_eax/error_code
.Lirq_return: .Lirq_return:
IRET_FRAME
/* /*
* ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
* when returning from IPI handler and when returning from * when returning from IPI handler and when returning from
...@@ -1128,30 +1148,43 @@ ENDPROC(entry_INT80_32) ...@@ -1128,30 +1148,43 @@ ENDPROC(entry_INT80_32)
* We can't call C functions using the ESPFIX stack. This code reads * We can't call C functions using the ESPFIX stack. This code reads
* the high word of the segment base from the GDT and switches to the * the high word of the segment base from the GDT and switches to the
* normal stack and adjusts ESP with the matching offset. * normal stack and adjusts ESP with the matching offset.
*
* We might be on user CR3 here, so percpu data is not mapped and we can't
* access the GDT through the percpu segment. Instead, use SGDT to find
* the cpu_entry_area alias of the GDT.
*/ */
#ifdef CONFIG_X86_ESPFIX32 #ifdef CONFIG_X86_ESPFIX32
/* fixup the stack */ /* fixup the stack */
mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ pushl %ecx
mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ subl $2*4, %esp
sgdt (%esp)
movl 2(%esp), %ecx /* GDT address */
/*
* Careful: ECX is a linear pointer, so we need to force base
* zero. %cs is the only known-linear segment we have right now.
*/
mov %cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al /* bits 16..23 */
mov %cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah /* bits 24..31 */
shl $16, %eax shl $16, %eax
addl $2*4, %esp
popl %ecx
addl %esp, %eax /* the adjusted stack pointer */ addl %esp, %eax /* the adjusted stack pointer */
pushl $__KERNEL_DS pushl $__KERNEL_DS
pushl %eax pushl %eax
lss (%esp), %esp /* switch to the normal stack segment */ lss (%esp), %esp /* switch to the normal stack segment */
#endif #endif
.endm .endm
.macro UNWIND_ESPFIX_STACK .macro UNWIND_ESPFIX_STACK
/* It's safe to clobber %eax, all other regs need to be preserved */
#ifdef CONFIG_X86_ESPFIX32 #ifdef CONFIG_X86_ESPFIX32
movl %ss, %eax movl %ss, %eax
/* see if on espfix stack */ /* see if on espfix stack */
cmpw $__ESPFIX_SS, %ax cmpw $__ESPFIX_SS, %ax
jne 27f jne .Lno_fixup_\@
movl $__KERNEL_DS, %eax
movl %eax, %ds
movl %eax, %es
/* switch to normal stack */ /* switch to normal stack */
FIXUP_ESPFIX_STACK FIXUP_ESPFIX_STACK
27: .Lno_fixup_\@:
#endif #endif
.endm .endm
...@@ -1341,11 +1374,6 @@ END(spurious_interrupt_bug) ...@@ -1341,11 +1374,6 @@ END(spurious_interrupt_bug)
#ifdef CONFIG_XEN_PV #ifdef CONFIG_XEN_PV
ENTRY(xen_hypervisor_callback) ENTRY(xen_hypervisor_callback)
pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
/* /*
* Check to see if we got the event in the critical * Check to see if we got the event in the critical
* region in xen_iret_direct, after we've reenabled * region in xen_iret_direct, after we've reenabled
...@@ -1353,16 +1381,17 @@ ENTRY(xen_hypervisor_callback) ...@@ -1353,16 +1381,17 @@ ENTRY(xen_hypervisor_callback)
* iret instruction's behaviour where it delivers a * iret instruction's behaviour where it delivers a
* pending interrupt when enabling interrupts: * pending interrupt when enabling interrupts:
*/ */
movl PT_EIP(%esp), %eax cmpl $xen_iret_start_crit, (%esp)
cmpl $xen_iret_start_crit, %eax
jb 1f jb 1f
cmpl $xen_iret_end_crit, %eax cmpl $xen_iret_end_crit, (%esp)
jae 1f jae 1f
call xen_iret_crit_fixup
jmp xen_iret_crit_fixup 1:
pushl $-1 /* orig_ax = -1 => not a system call */
ENTRY(xen_do_upcall) SAVE_ALL
1: mov %esp, %eax ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
mov %esp, %eax
call xen_evtchn_do_upcall call xen_evtchn_do_upcall
#ifndef CONFIG_PREEMPTION #ifndef CONFIG_PREEMPTION
call xen_maybe_preempt_hcall call xen_maybe_preempt_hcall
...@@ -1449,10 +1478,9 @@ END(page_fault) ...@@ -1449,10 +1478,9 @@ END(page_fault)
common_exception_read_cr2: common_exception_read_cr2:
/* the function address is in %gs's slot on the stack */ /* the function address is in %gs's slot on the stack */
SAVE_ALL switch_stacks=1 skip_gs=1 SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
ENCODE_FRAME_POINTER ENCODE_FRAME_POINTER
UNWIND_ESPFIX_STACK
/* fixup %gs */ /* fixup %gs */
GS_TO_REG %ecx GS_TO_REG %ecx
...@@ -1474,9 +1502,8 @@ END(common_exception_read_cr2) ...@@ -1474,9 +1502,8 @@ END(common_exception_read_cr2)
common_exception: common_exception:
/* the function address is in %gs's slot on the stack */ /* the function address is in %gs's slot on the stack */
SAVE_ALL switch_stacks=1 skip_gs=1 SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
ENCODE_FRAME_POINTER ENCODE_FRAME_POINTER
UNWIND_ESPFIX_STACK
/* fixup %gs */ /* fixup %gs */
GS_TO_REG %ecx GS_TO_REG %ecx
...@@ -1515,6 +1542,10 @@ ENTRY(nmi) ...@@ -1515,6 +1542,10 @@ ENTRY(nmi)
ASM_CLAC ASM_CLAC
#ifdef CONFIG_X86_ESPFIX32 #ifdef CONFIG_X86_ESPFIX32
/*
* ESPFIX_SS is only ever set on the return to user path
* after we've switched to the entry stack.
*/
pushl %eax pushl %eax
movl %ss, %eax movl %ss, %eax
cmpw $__ESPFIX_SS, %ax cmpw $__ESPFIX_SS, %ax
...@@ -1550,6 +1581,11 @@ ENTRY(nmi) ...@@ -1550,6 +1581,11 @@ ENTRY(nmi)
movl %ebx, %esp movl %ebx, %esp
.Lnmi_return: .Lnmi_return:
#ifdef CONFIG_X86_ESPFIX32
testl $CS_FROM_ESPFIX, PT_CS(%esp)
jnz .Lnmi_from_espfix
#endif
CHECK_AND_APPLY_ESPFIX CHECK_AND_APPLY_ESPFIX
RESTORE_ALL_NMI cr3_reg=%edi pop=4 RESTORE_ALL_NMI cr3_reg=%edi pop=4
jmp .Lirq_return jmp .Lirq_return
...@@ -1557,23 +1593,42 @@ ENTRY(nmi) ...@@ -1557,23 +1593,42 @@ ENTRY(nmi)
#ifdef CONFIG_X86_ESPFIX32 #ifdef CONFIG_X86_ESPFIX32
.Lnmi_espfix_stack: .Lnmi_espfix_stack:
/* /*
* create the pointer to lss back * Create the pointer to LSS back
*/ */
pushl %ss pushl %ss
pushl %esp pushl %esp
addl $4, (%esp) addl $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3 /* Copy the (short) IRET frame */
pushl 16(%esp) pushl 4*4(%esp) # flags
.endr pushl 4*4(%esp) # cs
pushl %eax pushl 4*4(%esp) # ip
SAVE_ALL_NMI cr3_reg=%edi
pushl %eax # orig_ax
SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1
ENCODE_FRAME_POINTER ENCODE_FRAME_POINTER
FIXUP_ESPFIX_STACK # %eax == %esp
/* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */
xorl $(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp)
xorl %edx, %edx # zero error code xorl %edx, %edx # zero error code
call do_nmi movl %esp, %eax # pt_regs pointer
jmp .Lnmi_from_sysenter_stack
.Lnmi_from_espfix:
RESTORE_ALL_NMI cr3_reg=%edi RESTORE_ALL_NMI cr3_reg=%edi
lss 12+4(%esp), %esp # back to espfix stack /*
* Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to
* fix up the gap and long frame:
*
* 3 - original frame (exception)
* 2 - ESPFIX block (above)
* 6 - gap (FIXUP_FRAME)
* 5 - long frame (FIXUP_FRAME)
* 1 - orig_ax
*/
lss (1+5+6)*4(%esp), %esp # back to espfix stack
jmp .Lirq_return jmp .Lirq_return
#endif #endif
END(nmi) END(nmi)
......
...@@ -78,8 +78,12 @@ struct cpu_entry_area { ...@@ -78,8 +78,12 @@ struct cpu_entry_area {
/* /*
* The GDT is just below entry_stack and thus serves (on x86_64) as * The GDT is just below entry_stack and thus serves (on x86_64) as
* a a read-only guard page. * a read-only guard page. On 32-bit the GDT must be writeable, so
* it needs an extra guard page.
*/ */
#ifdef CONFIG_X86_32
char guard_entry_stack[PAGE_SIZE];
#endif
struct entry_stack_page entry_stack_page; struct entry_stack_page entry_stack_page;
/* /*
...@@ -94,7 +98,6 @@ struct cpu_entry_area { ...@@ -94,7 +98,6 @@ struct cpu_entry_area {
*/ */
struct cea_exception_stacks estacks; struct cea_exception_stacks estacks;
#endif #endif
#ifdef CONFIG_CPU_SUP_INTEL
/* /*
* Per CPU debug store for Intel performance monitoring. Wastes a * Per CPU debug store for Intel performance monitoring. Wastes a
* full page at the moment. * full page at the moment.
...@@ -105,11 +108,13 @@ struct cpu_entry_area { ...@@ -105,11 +108,13 @@ struct cpu_entry_area {
* Reserve enough fixmap PTEs. * Reserve enough fixmap PTEs.
*/ */
struct debug_store_buffers cpu_debug_buffers; struct debug_store_buffers cpu_debug_buffers;
#endif
}; };
#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) #define CPU_ENTRY_AREA_ARRAY_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
/* Total size includes the readonly IDT mapping page as well: */
#define CPU_ENTRY_AREA_TOTAL_SIZE (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE)
DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
...@@ -117,13 +122,14 @@ DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); ...@@ -117,13 +122,14 @@ DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
extern void setup_cpu_entry_areas(void); extern void setup_cpu_entry_areas(void);
extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
/* Single page reserved for the readonly IDT mapping: */
#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE #define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE
#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) #define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) #define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
#define CPU_ENTRY_AREA_MAP_SIZE \ #define CPU_ENTRY_AREA_MAP_SIZE \
(CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE)
extern struct cpu_entry_area *get_cpu_entry_area(int cpu); extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
......
...@@ -44,11 +44,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ ...@@ -44,11 +44,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
* Define this here and validate with BUILD_BUG_ON() in pgtable_32.c * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
* to avoid include recursion hell * to avoid include recursion hell
*/ */
#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) #define CPU_ENTRY_AREA_PAGES (NR_CPUS * 39)
/* The +1 is for the readonly IDT page: */
#define CPU_ENTRY_AREA_BASE \ #define CPU_ENTRY_AREA_BASE \
((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \ ((FIXADDR_TOT_START - PAGE_SIZE*(CPU_ENTRY_AREA_PAGES+1)) & PMD_MASK)
& PMD_MASK)
#define LDT_BASE_ADDR \ #define LDT_BASE_ADDR \
((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
......
...@@ -31,6 +31,18 @@ ...@@ -31,6 +31,18 @@
*/ */
#define SEGMENT_RPL_MASK 0x3 #define SEGMENT_RPL_MASK 0x3
/*
* When running on Xen PV, the actual privilege level of the kernel is 1,
* not 0. Testing the Requested Privilege Level in a segment selector to
* determine whether the context is user mode or kernel mode with
* SEGMENT_RPL_MASK is wrong because the PV kernel's privilege level
* matches the 0x3 mask.
*
* Testing with USER_SEGMENT_RPL_MASK is valid for both native and Xen PV
* kernels because privilege level 2 is never used.
*/
#define USER_SEGMENT_RPL_MASK 0x2
/* User mode is privilege level 3: */ /* User mode is privilege level 3: */
#define USER_RPL 0x3 #define USER_RPL 0x3
......
...@@ -65,6 +65,9 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = { ...@@ -65,6 +65,9 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = {
.ss = __KERNEL_DS, .ss = __KERNEL_DS,
.ds = __USER_DS, .ds = __USER_DS,
.fs = __KERNEL_PERCPU, .fs = __KERNEL_PERCPU,
#ifndef CONFIG_X86_32_LAZY_GS
.gs = __KERNEL_STACK_CANARY,
#endif
.__cr3 = __pa_nodebug(swapper_pg_dir), .__cr3 = __pa_nodebug(swapper_pg_dir),
}; };
......
...@@ -571,6 +571,16 @@ ENTRY(initial_page_table) ...@@ -571,6 +571,16 @@ ENTRY(initial_page_table)
# error "Kernel PMDs should be 1, 2 or 3" # error "Kernel PMDs should be 1, 2 or 3"
# endif # endif
.align PAGE_SIZE /* needs to be page-sized too */ .align PAGE_SIZE /* needs to be page-sized too */
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* PTI needs another page so sync_initial_pagetable() works correctly
* and does not scribble over the data which is placed behind the
* actual initial_page_table. See clone_pgd_range().
*/
.fill 1024, 4, 0
#endif
#endif #endif
.data .data
......
...@@ -178,7 +178,9 @@ static __init void setup_cpu_entry_area_ptes(void) ...@@ -178,7 +178,9 @@ static __init void setup_cpu_entry_area_ptes(void)
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
unsigned long start, end; unsigned long start, end;
BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); /* The +1 is for the readonly IDT: */
BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
start = CPU_ENTRY_AREA_BASE; start = CPU_ENTRY_AREA_BASE;
......
...@@ -126,10 +126,9 @@ hyper_iret: ...@@ -126,10 +126,9 @@ hyper_iret:
.globl xen_iret_start_crit, xen_iret_end_crit .globl xen_iret_start_crit, xen_iret_end_crit
/* /*
* This is called by xen_hypervisor_callback in entry.S when it sees * This is called by xen_hypervisor_callback in entry_32.S when it sees
* that the EIP at the time of interrupt was between * that the EIP at the time of interrupt was between
* xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in * xen_iret_start_crit and xen_iret_end_crit.
* %eax so we can do a more refined determination of what to do.
* *
* The stack format at this point is: * The stack format at this point is:
* ---------------- * ----------------
...@@ -138,70 +137,46 @@ hyper_iret: ...@@ -138,70 +137,46 @@ hyper_iret:
* eflags } outer exception info * eflags } outer exception info
* cs } * cs }
* eip } * eip }
* ---------------- <- edi (copy dest)
* eax : outer eax if it hasn't been restored
* ---------------- * ----------------
* eflags } nested exception info * eax : outer eax if it hasn't been restored
* cs } (no ss/esp because we're nested
* eip } from the same ring)
* orig_eax }<- esi (copy src)
* - - - - - - - -
* fs }
* es }
* ds } SAVE_ALL state
* eax }
* : :
* ebx }<- esp
* ---------------- * ----------------
* eflags }
* cs } nested exception info
* eip }
* return address : (into xen_hypervisor_callback)
* *
* In order to deliver the nested exception properly, we need to shift * In order to deliver the nested exception properly, we need to discard the
* everything from the return addr up to the error code so it sits * nested exception frame such that when we handle the exception, we do it
* just under the outer exception info. This means that when we * in the context of the outer exception rather than starting a new one.
* handle the exception, we do it in the context of the outer
* exception rather than starting a new one.
* *
* The only caveat is that if the outer eax hasn't been restored yet * The only caveat is that if the outer eax hasn't been restored yet (i.e.
* (ie, it's still on stack), we need to insert its value into the * it's still on stack), we need to restore its value here.
* SAVE_ALL state before going on, since it's usermode state which we
* eventually need to restore.
*/ */
ENTRY(xen_iret_crit_fixup) ENTRY(xen_iret_crit_fixup)
/* /*
* Paranoia: Make sure we're really coming from kernel space. * Paranoia: Make sure we're really coming from kernel space.
* One could imagine a case where userspace jumps into the * One could imagine a case where userspace jumps into the
* critical range address, but just before the CPU delivers a * critical range address, but just before the CPU delivers a
* GP, it decides to deliver an interrupt instead. Unlikely? * PF, it decides to deliver an interrupt instead. Unlikely?
* Definitely. Easy to avoid? Yes. The Intel documents * Definitely. Easy to avoid? Yes.
* explicitly say that the reported EIP for a bad jump is the
* jump instruction itself, not the destination, but some
* virtual environments get this wrong.
*/ */
movl PT_CS(%esp), %ecx testb $2, 2*4(%esp) /* nested CS */
andl $SEGMENT_RPL_MASK, %ecx jnz 2f
cmpl $USER_RPL, %ecx
je 2f
lea PT_ORIG_EAX(%esp), %esi
lea PT_EFLAGS(%esp), %edi
/* /*
* If eip is before iret_restore_end then stack * If eip is before iret_restore_end then stack
* hasn't been restored yet. * hasn't been restored yet.
*/ */
cmp $iret_restore_end, %eax cmpl $iret_restore_end, 1*4(%esp)
jae 1f jae 1f
movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */ movl 4*4(%esp), %eax /* load outer EAX */
movl %eax, PT_EAX(%esp) ret $4*4 /* discard nested EIP, CS, and EFLAGS as
* well as the just restored EAX */
lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */ 1:
ret $3*4 /* discard nested EIP, CS, and EFLAGS */
/* set up the copy */
1: std
mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
rep movsl
cld
lea 4(%edi), %esp /* point esp to new frame */
2: jmp xen_do_upcall
2:
ret
END(xen_iret_crit_fixup)
...@@ -257,7 +257,8 @@ int main() ...@@ -257,7 +257,8 @@ int main()
err(1, "sigaltstack"); err(1, "sigaltstack");
sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND | SA_ONSTACK); sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND | SA_ONSTACK);
nr = SYS_getpid; nr = SYS_getpid;
asm volatile ("mov %[ss], %%ss; SYSENTER" : "+a" (nr) /* Clear EBP first to make sure we segfault cleanly. */
asm volatile ("xorl %%ebp, %%ebp; mov %[ss], %%ss; SYSENTER" : "+a" (nr)
: [ss] "m" (ss) : "flags", "rcx" : [ss] "m" (ss) : "flags", "rcx"
#ifdef __x86_64__ #ifdef __x86_64__
, "r11" , "r11"
......
...@@ -451,6 +451,19 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) ...@@ -451,6 +451,19 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL; ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL;
ctx->uc_mcontext.gregs[REG_CX] = 0; ctx->uc_mcontext.gregs[REG_CX] = 0;
#ifdef __i386__
/*
* Make sure the kernel doesn't inadvertently use DS or ES-relative
* accesses in a region where user DS or ES is loaded.
*
* Skip this for 64-bit builds because long mode doesn't care about
* DS and ES and skipping it increases test coverage a little bit,
* since 64-bit kernels can still run the 32-bit build.
*/
ctx->uc_mcontext.gregs[REG_DS] = 0;
ctx->uc_mcontext.gregs[REG_ES] = 0;
#endif
memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
requested_regs[REG_CX] = *ssptr(ctx); /* The asm code does this. */ requested_regs[REG_CX] = *ssptr(ctx); /* The asm code does this. */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment