Commit 1255f440 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'x86_paravirt_for_v5.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 paravirt updates from Borislav Petkov:
 "Part one of a major conversion of the paravirt infrastructure to our
  kernel patching facilities and getting rid of the custom-grown ones"

* tag 'x86_paravirt_for_v5.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/pv: Rework arch_local_irq_restore() to not use popf
  x86/xen: Drop USERGS_SYSRET64 paravirt call
  x86/pv: Switch SWAPGS to ALTERNATIVE
  x86/xen: Use specific Xen pv interrupt entry for DF
  x86/xen: Use specific Xen pv interrupt entry for MCE
parents 4f7a4028 ab234a26
......@@ -46,14 +46,6 @@
.code64
.section .entry.text, "ax"
#ifdef CONFIG_PARAVIRT_XXL
SYM_CODE_START(native_usergs_sysret64)
UNWIND_HINT_EMPTY
swapgs
sysretq
SYM_CODE_END(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT_XXL */
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
......@@ -123,7 +115,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
* In the Xen PV case we must use iret anyway.
*/
ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
X86_FEATURE_XENPV
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
......@@ -215,7 +212,8 @@ syscall_return_via_sysret:
popq %rdi
popq %rsp
USERGS_SYSRET64
swapgs
sysretq
SYM_CODE_END(entry_SYSCALL_64)
/*
......@@ -669,7 +667,7 @@ native_irq_return_ldt:
*/
pushq %rdi /* Stash user RDI */
SWAPGS /* to kernel GS */
swapgs /* to kernel GS */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
movq PER_CPU_VAR(espfix_waddr), %rdi
......@@ -699,7 +697,7 @@ native_irq_return_ldt:
orq PER_CPU_VAR(espfix_stack), %rax
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
SWAPGS /* to user GS */
swapgs /* to user GS */
popq %rdi /* Restore user RDI */
movq %rax, %rsp
......@@ -943,7 +941,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
ret
.Lparanoid_entry_swapgs:
SWAPGS
swapgs
/*
* The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
......@@ -1001,7 +999,7 @@ SYM_CODE_START_LOCAL(paranoid_exit)
jnz restore_regs_and_return_to_kernel
/* We are returning to a context with user GSBASE */
SWAPGS_UNSAFE_STACK
swapgs
jmp restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)
......@@ -1426,7 +1424,7 @@ nmi_no_fsgsbase:
jnz nmi_restore
nmi_swapgs:
SWAPGS_UNSAFE_STACK
swapgs
nmi_restore:
POP_REGS
......
......@@ -585,6 +585,9 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
#else
DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check);
#endif
#ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
#endif
#endif
/* NMI */
......@@ -605,6 +608,9 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug);
/* #DF */
DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
#ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
#endif
/* #VC */
#ifdef CONFIG_AMD_MEM_ENCRYPT
......
......@@ -35,15 +35,6 @@ extern __always_inline unsigned long native_save_fl(void)
return flags;
}
extern inline void native_restore_fl(unsigned long flags);
extern inline void native_restore_fl(unsigned long flags)
{
asm volatile("push %0 ; popf"
: /* no output */
:"g" (flags)
:"memory", "cc");
}
static __always_inline void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
......@@ -79,11 +70,6 @@ static __always_inline unsigned long arch_local_save_flags(void)
return native_save_fl();
}
static __always_inline void arch_local_irq_restore(unsigned long flags)
{
native_restore_fl(flags);
}
static __always_inline void arch_local_irq_disable(void)
{
native_irq_disable();
......@@ -131,25 +117,7 @@ static __always_inline unsigned long arch_local_irq_save(void)
#define SAVE_FLAGS(x) pushfq; popq %rax
#endif
#define SWAPGS swapgs
/*
* Currently paravirt can't handle swapgs nicely when we
* don't have a stack we can rely on (such as a user space
* stack). So we either find a way around these or just fault
* and emulate if a guest tries to call swapgs directly.
*
* Either way, this is a good way to document that we don't
* have a reliable stack. x86_64 only.
*/
#define SWAPGS_UNSAFE_STACK swapgs
#define INTERRUPT_RETURN jmp native_iret
#define USERGS_SYSRET64 \
swapgs; \
sysretq;
#define USERGS_SYSRET32 \
swapgs; \
sysretl
#else
#define INTERRUPT_RETURN iret
......@@ -170,6 +138,20 @@ static __always_inline int arch_irqs_disabled(void)
return arch_irqs_disabled_flags(flags);
}
static __always_inline void arch_local_irq_restore(unsigned long flags)
{
if (!arch_irqs_disabled_flags(flags))
arch_local_irq_enable();
}
#else
#ifdef CONFIG_X86_64
#ifdef CONFIG_XEN_PV
#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
#else
#define SWAPGS swapgs
#endif
#endif
#endif /* !__ASSEMBLY__ */
#endif
......@@ -648,11 +648,6 @@ static inline notrace unsigned long arch_local_save_flags(void)
return PVOP_CALLEE0(unsigned long, irq.save_fl);
}
static inline notrace void arch_local_irq_restore(unsigned long f)
{
PVOP_VCALLEE1(irq.restore_fl, f);
}
static inline notrace void arch_local_irq_disable(void)
{
PVOP_VCALLEE0(irq.irq_disable);
......@@ -776,31 +771,6 @@ extern void default_banner(void);
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
/*
* If swapgs is used while the userspace stack is still current,
* there's no way to call a pvop. The PV replacement *must* be
* inlined, or the swapgs instruction must be trapped and emulated.
*/
#define SWAPGS_UNSAFE_STACK \
PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs)
/*
* Note: swapgs is very special, and in practise is either going to be
* implemented with a single "swapgs" instruction or something very
* special. Either way, we don't need to save any registers for
* it.
*/
#define SWAPGS \
PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \
ANNOTATE_RETPOLINE_SAFE; \
call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \
)
#define USERGS_SYSRET64 \
PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \
ANNOTATE_RETPOLINE_SAFE; \
jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);)
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(clobbers) \
PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \
......
......@@ -156,20 +156,10 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter);
/*
* Switch to usermode gs and return to 64-bit usermode using
* sysret. Only used in 64-bit kernels to return to 64-bit
* processes. Usermode register state, including %rsp, must
* already be restored.
*/
void (*usergs_sysret64)(void);
/* Normal iret. Jump to this with the standard iret stack
frame set up. */
void (*iret)(void);
void (*swapgs)(void);
void (*start_context_switch)(struct task_struct *prev);
void (*end_context_switch)(struct task_struct *next);
#endif
......@@ -178,16 +168,13 @@ struct pv_cpu_ops {
struct pv_irq_ops {
#ifdef CONFIG_PARAVIRT_XXL
/*
* Get/set interrupt state. save_fl and restore_fl are only
* expected to use X86_EFLAGS_IF; all other bits
* returned from save_fl are undefined, and may be ignored by
* restore_fl.
* Get/set interrupt state. save_fl is expected to use X86_EFLAGS_IF;
* all other bits returned from save_fl are undefined.
*
* NOTE: These functions callers expect the callee to preserve
* more registers than the standard C calling convention.
*/
struct paravirt_callee_save save_fl;
struct paravirt_callee_save restore_fl;
struct paravirt_callee_save irq_disable;
struct paravirt_callee_save irq_enable;
......
......@@ -13,9 +13,6 @@ int main(void)
{
#ifdef CONFIG_PARAVIRT
#ifdef CONFIG_PARAVIRT_XXL
OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template,
cpu.usergs_sysret64);
OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs);
#ifdef CONFIG_DEBUG_ENTRY
OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
#endif
......
......@@ -13,14 +13,3 @@ SYM_FUNC_START(native_save_fl)
ret
SYM_FUNC_END(native_save_fl)
EXPORT_SYMBOL(native_save_fl)
/*
* void native_restore_fl(unsigned long flags)
* %eax/%rdi: flags
*/
SYM_FUNC_START(native_restore_fl)
push %_ASM_ARG1
popf
ret
SYM_FUNC_END(native_restore_fl)
EXPORT_SYMBOL(native_restore_fl)
......@@ -135,8 +135,7 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
else if (opfunc == _paravirt_ident_64)
ret = paravirt_patch_ident_64(insn_buff, len);
else if (type == PARAVIRT_PATCH(cpu.iret) ||
type == PARAVIRT_PATCH(cpu.usergs_sysret64))
else if (type == PARAVIRT_PATCH(cpu.iret))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
#endif
......@@ -170,7 +169,6 @@ static u64 native_steal_clock(int cpu)
/* These are in entry.S */
extern void native_iret(void);
extern void native_usergs_sysret64(void);
static struct resource reserve_ioports = {
.start = 0,
......@@ -310,9 +308,7 @@ struct paravirt_patch_template pv_ops = {
.cpu.load_sp0 = native_load_sp0,
.cpu.usergs_sysret64 = native_usergs_sysret64,
.cpu.iret = native_iret,
.cpu.swapgs = native_swapgs,
#ifdef CONFIG_X86_IOPL_IOPERM
.cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
......@@ -324,7 +320,6 @@ struct paravirt_patch_template pv_ops = {
/* Irq ops. */
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
.irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.irq.safe_halt = native_safe_halt,
......
......@@ -25,10 +25,7 @@ struct patch_xxl {
const unsigned char mmu_read_cr2[3];
const unsigned char mmu_read_cr3[3];
const unsigned char mmu_write_cr3[3];
const unsigned char irq_restore_fl[2];
const unsigned char cpu_wbinvd[2];
const unsigned char cpu_usergs_sysret64[6];
const unsigned char cpu_swapgs[3];
const unsigned char mov64[3];
};
......@@ -39,11 +36,7 @@ static const struct patch_xxl patch_data_xxl = {
.mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax
.mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax
.mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3
.irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq
.cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd
.cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8,
0x48, 0x0f, 0x07 }, // swapgs; sysretq
.cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs
.mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax
};
......@@ -76,7 +69,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
switch (type) {
#ifdef CONFIG_PARAVIRT_XXL
PATCH_CASE(irq, restore_fl, xxl, insn_buff, len);
PATCH_CASE(irq, save_fl, xxl, insn_buff, len);
PATCH_CASE(irq, irq_enable, xxl, insn_buff, len);
PATCH_CASE(irq, irq_disable, xxl, insn_buff, len);
......@@ -85,8 +77,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len);
PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len);
PATCH_CASE(cpu, swapgs, xxl, insn_buff, len);
PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
#endif
......
......@@ -567,10 +567,16 @@ void noist_exc_debug(struct pt_regs *regs);
DEFINE_IDTENTRY_RAW(xenpv_exc_nmi)
{
/* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */
/* On Xen PV, NMI doesn't use IST. The C part is the same as native. */
exc_nmi(regs);
}
DEFINE_IDTENTRY_RAW_ERRORCODE(xenpv_exc_double_fault)
{
/* On Xen PV, DF doesn't use IST. The C part is the same as native. */
exc_double_fault(regs, error_code);
}
DEFINE_IDTENTRY_RAW(xenpv_exc_debug)
{
/*
......@@ -590,6 +596,20 @@ DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap)
BUG();
}
#ifdef CONFIG_X86_MCE
DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
{
/*
* There's no IST on Xen PV, but we still need to dispatch
* to the correct handler.
*/
if (user_mode(regs))
noist_exc_machine_check(regs);
else
exc_machine_check(regs);
}
#endif
struct trap_array_entry {
void (*orig)(void);
void (*xen)(void);
......@@ -608,9 +628,9 @@ struct trap_array_entry {
static struct trap_array_entry trap_array[] = {
TRAP_ENTRY_REDIR(exc_debug, true ),
TRAP_ENTRY(exc_double_fault, true ),
TRAP_ENTRY_REDIR(exc_double_fault, true ),
#ifdef CONFIG_X86_MCE
TRAP_ENTRY(exc_machine_check, true ),
TRAP_ENTRY_REDIR(exc_machine_check, true ),
#endif
TRAP_ENTRY_REDIR(exc_nmi, true ),
TRAP_ENTRY(exc_int3, false ),
......@@ -1015,8 +1035,6 @@ void __init xen_setup_vcpu_info_placement(void)
*/
if (xen_have_vcpu_info_placement) {
pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
pv_ops.irq.restore_fl =
__PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_ops.irq.irq_disable =
__PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
pv_ops.irq.irq_enable =
......@@ -1053,7 +1071,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_pmc = xen_read_pmc,
.iret = xen_iret,
.usergs_sysret64 = xen_sysret64,
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
......@@ -1078,9 +1095,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
#endif
.io_delay = xen_io_delay,
/* Xen takes care of %gs when switching to usermode for us */
.swapgs = paravirt_nop,
.start_context_switch = paravirt_start_context_switch,
.end_context_switch = xen_end_context_switch,
};
......
......@@ -42,28 +42,6 @@ asmlinkage __visible unsigned long xen_save_fl(void)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
__visible void xen_restore_fl(unsigned long flags)
{
struct vcpu_info *vcpu;
/* convert from IF type flag */
flags = !(flags & X86_EFLAGS_IF);
/* See xen_irq_enable() for why preemption must be disabled. */
preempt_disable();
vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
if (flags == 0) {
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
preempt_enable();
} else
preempt_enable_no_resched();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
asmlinkage __visible void xen_irq_disable(void)
{
/* There's a one instruction preempt window here. We need to
......@@ -118,7 +96,6 @@ static void xen_halt(void)
static const struct pv_irq_ops xen_irq_ops __initconst = {
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
......
......@@ -72,34 +72,6 @@ SYM_FUNC_START(xen_save_fl_direct)
ret
SYM_FUNC_END(xen_save_fl_direct)
/*
* In principle the caller should be passing us a value return from
* xen_save_fl_direct, but for robustness sake we test only the
* X86_EFLAGS_IF flag rather than the whole byte. After setting the
* interrupt mask state, it checks for unmasked pending events and
* enters the hypervisor to get them delivered if so.
*/
SYM_FUNC_START(xen_restore_fl_direct)
FRAME_BEGIN
testw $X86_EFLAGS_IF, %di
setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/*
* Preempt here doesn't matter because that will deal with any
* pending interrupts. The pending check may end up being run
* on the wrong CPU, but that doesn't hurt.
*/
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jnz 1f
call check_events
1:
FRAME_END
ret
SYM_FUNC_END(xen_restore_fl_direct)
/*
* Force an event check by making a hypercall, but preserve regs
* before making the call.
......@@ -161,7 +133,7 @@ xen_pv_trap asm_exc_overflow
xen_pv_trap asm_exc_bounds
xen_pv_trap asm_exc_invalid_op
xen_pv_trap asm_exc_device_not_available
xen_pv_trap asm_exc_double_fault
xen_pv_trap asm_xenpv_exc_double_fault
xen_pv_trap asm_exc_coproc_segment_overrun
xen_pv_trap asm_exc_invalid_tss
xen_pv_trap asm_exc_segment_not_present
......@@ -172,7 +144,7 @@ xen_pv_trap asm_exc_spurious_interrupt_bug
xen_pv_trap asm_exc_coprocessor_error
xen_pv_trap asm_exc_alignment_check
#ifdef CONFIG_X86_MCE
xen_pv_trap asm_exc_machine_check
xen_pv_trap asm_xenpv_exc_machine_check
#endif /* CONFIG_X86_MCE */
xen_pv_trap asm_exc_simd_coprocessor_error
#ifdef CONFIG_IA32_EMULATION
......@@ -215,26 +187,6 @@ SYM_CODE_START(xen_iret)
jmp hypercall_iret
SYM_CODE_END(xen_iret)
SYM_CODE_START(xen_sysret64)
/*
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back.
*
* tss.sp2 is scratch space.
*/
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
pushq $__USER_DS
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
pushq %r11
pushq $__USER_CS
pushq %rcx
pushq $VGCF_in_syscall
jmp hypercall_iret
SYM_CODE_END(xen_sysret64)
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
......
......@@ -131,15 +131,12 @@ static inline void __init xen_efi_init(struct boot_params *boot_params)
__visible void xen_irq_enable_direct(void);
__visible void xen_irq_disable_direct(void);
__visible unsigned long xen_save_fl_direct(void);
__visible void xen_restore_fl_direct(unsigned long);
__visible unsigned long xen_read_cr2(void);
__visible unsigned long xen_read_cr2_direct(void);
/* These are not functions, and cannot be called normally */
__visible void xen_iret(void);
__visible void xen_sysret32(void);
__visible void xen_sysret64(void);
extern int xen_panic_handler_init(void);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment