Commit b5df1b3a authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
 "The main changes are the PCID fixes from Andy, but there's also two
  hyperv fixes and two paravirt updates"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/hyper-v: Remove duplicated HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED definition
  x86/hyper-V: Allocate the IDT entry early in boot
  paravirt: Switch maintainer
  x86/paravirt: Remove no longer used paravirt functions
  x86/mm/64: Initialize CR4.PCIDE early
  x86/hibernate/64: Mask off CR3's PCID bits in the saved CR3
  x86/mm: Get rid of VM_BUG_ON in switch_tlb_irqs_off()
parents 9888e4d4 1278f58c
......@@ -10135,7 +10135,7 @@ F: include/uapi/linux/ppdev.h
F: Documentation/parport*.txt
PARAVIRT_OPS INTERFACE
M: Jeremy Fitzhardinge <jeremy@goop.org>
M: Juergen Gross <jgross@suse.com>
M: Chris Wright <chrisw@sous-sol.org>
M: Alok Kataria <akataria@vmware.com>
M: Rusty Russell <rusty@rustcorp.com.au>
......@@ -10143,7 +10143,7 @@ L: virtualization@lists.linux-foundation.org
S: Supported
F: Documentation/virtual/paravirt_ops.txt
F: arch/*/kernel/paravirt*
F: arch/*/include/asm/paravirt.h
F: arch/*/include/asm/paravirt*.h
F: include/linux/hypervisor.h
PARIDE DRIVERS FOR PARALLEL PORT IDE DEVICES
......
......@@ -121,7 +121,6 @@ static inline int desc_empty(const void *ptr)
#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
#define store_gdt(dtr) native_store_gdt(dtr)
#define store_idt(dtr) native_store_idt(dtr)
#define store_tr(tr) (tr = native_store_tr())
#define load_TLS(t, cpu) native_load_tls(t, cpu)
......@@ -228,7 +227,7 @@ static inline void native_store_gdt(struct desc_ptr *dtr)
asm volatile("sgdt %0":"=m" (*dtr));
}
static inline void native_store_idt(struct desc_ptr *dtr)
static inline void store_idt(struct desc_ptr *dtr)
{
asm volatile("sidt %0":"=m" (*dtr));
}
......
......@@ -71,11 +71,6 @@ static inline void write_cr3(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}
static inline unsigned long __read_cr4(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
static inline void __write_cr4(unsigned long x)
{
PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
......@@ -228,10 +223,6 @@ static inline void set_ldt(const void *addr, unsigned entries)
{
PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
}
static inline void store_idt(struct desc_ptr *dtr)
{
PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
}
static inline unsigned long paravirt_store_tr(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
......@@ -365,12 +356,6 @@ static inline void paravirt_release_p4d(unsigned long pfn)
PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
}
static inline void pte_update(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}
static inline pte_t __pte(pteval_t val)
{
pteval_t ret;
......@@ -472,28 +457,6 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
}
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
if (sizeof(pmdval_t) > sizeof(long))
/* 5 arg words */
pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
else
PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp,
native_pmd_val(pmd));
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
if (sizeof(pudval_t) > sizeof(long))
/* 5 arg words */
pv_mmu_ops.set_pud_at(mm, addr, pudp, pud);
else
PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp,
native_pud_val(pud));
}
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
pmdval_t val = native_pmd_val(pmd);
......
......@@ -107,7 +107,6 @@ struct pv_cpu_ops {
unsigned long (*read_cr0)(void);
void (*write_cr0)(unsigned long);
unsigned long (*read_cr4)(void);
void (*write_cr4)(unsigned long);
#ifdef CONFIG_X86_64
......@@ -119,8 +118,6 @@ struct pv_cpu_ops {
void (*load_tr_desc)(void);
void (*load_gdt)(const struct desc_ptr *);
void (*load_idt)(const struct desc_ptr *);
/* store_gdt has been removed. */
void (*store_idt)(struct desc_ptr *);
void (*set_ldt)(const void *desc, unsigned entries);
unsigned long (*store_tr)(void);
void (*load_tls)(struct thread_struct *t, unsigned int cpu);
......@@ -245,12 +242,6 @@ struct pv_mmu_ops {
void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmdval);
void (*set_pud_at)(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pudval);
void (*pte_update)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
......
......@@ -55,8 +55,6 @@ extern pmdval_t early_pmd_flags;
#else /* !CONFIG_PARAVIRT */
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud)
#define set_pte_atomic(ptep, pte) \
native_set_pte_atomic(ptep, pte)
......@@ -87,8 +85,6 @@ extern pmdval_t early_pmd_flags;
#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
#define pmd_clear(pmd) native_pmd_clear(pmd)
#define pte_update(mm, addr, ptep) do { } while (0)
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
......@@ -979,31 +975,18 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
native_set_pte(ptep, pte);
}
static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp , pmd_t pmd)
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
native_set_pmd(pmdp, pmd);
}
static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
native_set_pud(pudp, pud);
}
#ifndef CONFIG_PARAVIRT
/*
* Rules for using pte_update - it must be called after any PTE update which
* has not been done using the set_pte / clear_pte interfaces. It is used by
* shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
* updates should either be sets, clears, or set_pte_atomic for P->P
* transitions, which means this hook should only be called for user PTEs.
* This hook implies a P->P protection or access change has taken place, which
* requires a subsequent TLB flush.
*/
#define pte_update(mm, addr, ptep) do { } while (0)
#endif
/*
* We only update the dirty/accessed state if we set
* the dirty bit by hand in the kernel, since the hardware
......@@ -1031,7 +1014,6 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pte_t pte = native_ptep_get_and_clear(ptep);
pte_update(mm, addr, ptep);
return pte;
}
......@@ -1058,7 +1040,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
pte_update(mm, addr, ptep);
}
#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
......
......@@ -135,6 +135,11 @@ static inline void native_wbinvd(void)
extern asmlinkage void native_load_gs_index(unsigned);
static inline unsigned long __read_cr4(void)
{
return native_read_cr4();
}
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
......@@ -173,11 +178,6 @@ static inline void write_cr3(unsigned long x)
native_write_cr3(x);
}
static inline unsigned long __read_cr4(void)
{
return native_read_cr4();
}
static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
......
......@@ -152,12 +152,6 @@
/* Recommend using the newer ExProcessorMasks interface */
#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
/*
* HV_VP_SET available
*/
#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11)
/*
* Crash notification flag.
*/
......
......@@ -169,21 +169,21 @@ static int __init x86_mpx_setup(char *s)
__setup("nompx", x86_mpx_setup);
#ifdef CONFIG_X86_64
static int __init x86_pcid_setup(char *s)
static int __init x86_nopcid_setup(char *s)
{
/* require an exact match without trailing characters */
if (strlen(s))
return 0;
/* nopcid doesn't accept parameters */
if (s)
return -EINVAL;
/* do not emit a message if the feature is not present */
if (!boot_cpu_has(X86_FEATURE_PCID))
return 1;
return 0;
setup_clear_cpu_cap(X86_FEATURE_PCID);
pr_info("nopcid: PCID feature disabled\n");
return 1;
return 0;
}
__setup("nopcid", x86_pcid_setup);
early_param("nopcid", x86_nopcid_setup);
#endif
static int __init x86_noinvpcid_setup(char *s)
......@@ -329,38 +329,6 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
}
static void setup_pcid(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_PCID)) {
if (cpu_has(c, X86_FEATURE_PGE)) {
/*
* We'd like to use cr4_set_bits_and_update_boot(),
* but we can't. CR4.PCIDE is special and can only
* be set in long mode, and the early CPU init code
* doesn't know this and would try to restore CR4.PCIDE
* prior to entering long mode.
*
* Instead, we rely on the fact that hotplug, resume,
* etc all fully restore CR4 before they write anything
* that could have nonzero PCID bits to CR3. CR4.PCIDE
* has no effect on the page tables themselves, so we
* don't need it to be restored early.
*/
cr4_set_bits(X86_CR4_PCIDE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't
* work if PCID is on but PGE is not. Since that
* combination doesn't exist on real hardware, there's
* no reason to try to fully support it, but it's
* polite to avoid corrupting data if we're on
* an improperly configured VM.
*/
clear_cpu_cap(c, X86_FEATURE_PCID);
}
}
}
/*
* Protection Keys are not available in 32-bit mode.
*/
......@@ -1175,9 +1143,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smep(c);
setup_smap(c);
/* Set up PCID */
setup_pcid(c);
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
......
......@@ -59,8 +59,6 @@ void hyperv_vector_handler(struct pt_regs *regs)
void hv_setup_vmbus_irq(void (*handler)(void))
{
vmbus_handler = handler;
/* Setup the IDT for hypervisor callback */
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
}
void hv_remove_vmbus_irq(void)
......@@ -251,6 +249,8 @@ static void __init ms_hyperv_init_platform(void)
*/
x86_platform.apic_post_init = hyperv_init;
hyperv_setup_mmu_ops();
/* Setup the IDT for hypervisor callback */
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
#endif
}
......
......@@ -327,7 +327,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.set_debugreg = native_set_debugreg,
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
.read_cr4 = native_read_cr4,
.write_cr4 = native_write_cr4,
#ifdef CONFIG_X86_64
.read_cr8 = native_read_cr8,
......@@ -343,7 +342,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.set_ldt = native_set_ldt,
.load_gdt = native_load_gdt,
.load_idt = native_load_idt,
.store_idt = native_store_idt,
.store_tr = native_store_tr,
.load_tls = native_load_tls,
#ifdef CONFIG_X86_64
......@@ -411,8 +409,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
.set_pmd = native_set_pmd,
.set_pmd_at = native_set_pmd_at,
.pte_update = paravirt_nop,
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
......@@ -424,7 +420,6 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.pmd_clear = native_pmd_clear,
#endif
.set_pud = native_set_pud,
.set_pud_at = native_set_pud_at,
.pmd_val = PTE_IDENT,
.make_pmd = PTE_IDENT,
......
......@@ -1178,8 +1178,11 @@ void __init setup_arch(char **cmdline_p)
* with the current CR4 value. This may not be necessary, but
* auditing all the early-boot CR4 manipulation would be needed to
* rule it out.
*
* Mask off features that don't work outside long mode (just
* PCIDE for now).
*/
mmu_cr4_features = __read_cr4();
mmu_cr4_features = __read_cr4() & ~X86_CR4_PCIDE;
memblock_set_current_limit(get_max_mapped());
......
......@@ -226,10 +226,12 @@ static int enable_start_cpu0;
static void notrace start_secondary(void *unused)
{
/*
* Don't put *anything* before cpu_init(), SMP booting is too
* fragile that we want to limit the things done here to the
* most necessary things.
* Don't put *anything* except direct CPU state initialization
* before cpu_init(), SMP booting is too fragile that we want to
* limit the things done here to the most necessary things.
*/
if (boot_cpu_has(X86_FEATURE_PCID))
__write_cr4(__read_cr4() | X86_CR4_PCIDE);
cpu_init();
x86_cpuinit.early_percpu_clock_init();
preempt_disable();
......
......@@ -5192,7 +5192,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
native_store_idt(&dt);
store_idt(&dt);
vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
vmx->host_idt_base = dt.address;
......
......@@ -19,6 +19,7 @@
#include <asm/microcode.h>
#include <asm/kaslr.h>
#include <asm/hypervisor.h>
#include <asm/cpufeature.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
......@@ -193,6 +194,38 @@ static void __init probe_page_size_mask(void)
}
}
static void setup_pcid(void)
{
#ifdef CONFIG_X86_64
if (boot_cpu_has(X86_FEATURE_PCID)) {
if (boot_cpu_has(X86_FEATURE_PGE)) {
/*
* This can't be cr4_set_bits_and_update_boot() --
* the trampoline code can't handle CR4.PCIDE and
* it wouldn't do any good anyway. Despite the name,
* cr4_set_bits_and_update_boot() doesn't actually
* cause the bits in question to remain set all the
* way through the secondary boot asm.
*
* Instead, we brute-force it and set CR4.PCIDE
* manually in start_secondary().
*/
cr4_set_bits(X86_CR4_PCIDE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't
* work if PCID is on but PGE is not. Since that
* combination doesn't exist on real hardware, there's
* no reason to try to fully support it, but it's
* polite to avoid corrupting data if we're on
* an improperly configured VM.
*/
setup_clear_cpu_cap(X86_FEATURE_PCID);
}
}
#endif
}
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
......@@ -592,6 +625,7 @@ void __init init_mem_mapping(void)
unsigned long end;
probe_page_size_mask();
setup_pcid();
#ifdef CONFIG_X86_64
end = max_pfn << PAGE_SHIFT;
......
......@@ -426,10 +426,8 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
{
int changed = !pte_same(*ptep, entry);
if (changed && dirty) {
if (changed && dirty)
*ptep = entry;
pte_update(vma->vm_mm, address, ptep);
}
return changed;
}
......@@ -486,9 +484,6 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *) &ptep->pte);
if (ret)
pte_update(vma->vm_mm, addr, ptep);
return ret;
}
......
......@@ -121,8 +121,28 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* hypothetical buggy code that directly switches to swapper_pg_dir
* without going through leave_mm() / switch_mm_irqs_off() or that
* does something like write_cr3(read_cr3_pa()).
*
* Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
* isn't free.
*/
VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
#ifdef CONFIG_DEBUG_VM
if (WARN_ON_ONCE(__read_cr3() !=
(__sme_pa(real_prev->pgd) | prev_asid))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
* Try to recover instead by ignoring the error and doing
* a global flush to minimize the chance of corruption.
*
* (This is far from being a fully correct recovery.
* Architecturally, the CPU could prefetch something
* back into an incorrect ASID slot and leave it there
* to cause trouble down the road. It's better than
* nothing, though.)
*/
__flush_tlb_all();
}
#endif
if (real_prev == next) {
VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
......
......@@ -295,7 +295,26 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
return -EOVERFLOW;
rdr->jump_address = (unsigned long)restore_registers;
rdr->jump_address_phys = __pa_symbol(restore_registers);
rdr->cr3 = restore_cr3;
/*
* The restore code fixes up CR3 and CR4 in the following sequence:
*
* [in hibernation asm]
* 1. CR3 <= temporary page tables
* 2. CR4 <= mmu_cr4_features (from the kernel that restores us)
* 3. CR3 <= rdr->cr3
* 4. CR4 <= mmu_cr4_features (from us, i.e. the image kernel)
* [in restore_processor_state()]
* 5. CR4 <= saved CR4
* 6. CR3 <= saved CR3
*
* Our mmu_cr4_features has CR4.PCIDE=0, and toggling
* CR4.PCIDE while CR3's PCID bits are nonzero is illegal, so
* rdr->cr3 needs to point to valid page tables but must not
* have any of the PCID bits set.
*/
rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK;
rdr->magic = RESTORE_MAGIC;
hibernation_e820_save(rdr->e820_digest);
......
......@@ -1038,7 +1038,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_cr0 = xen_read_cr0,
.write_cr0 = xen_write_cr0,
.read_cr4 = native_read_cr4,
.write_cr4 = xen_write_cr4,
#ifdef CONFIG_X86_64
......@@ -1073,7 +1072,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.alloc_ldt = xen_alloc_ldt,
.free_ldt = xen_free_ldt,
.store_idt = native_store_idt,
.store_tr = xen_store_tr,
.write_ldt_entry = xen_write_ldt_entry,
......
......@@ -2409,8 +2409,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.flush_tlb_single = xen_flush_tlb_single,
.flush_tlb_others = xen_flush_tlb_others,
.pte_update = paravirt_nop,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment