Commit ec6f5e0e authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'x86-urgent-2020-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Thomas Gleixner:
 "A set of x86 and membarrier fixes:

   - Correct a few problems in the x86 and the generic membarrier
     implementation. Small corrections for assumptions about visibility
     which have turned out not to be true.

   - Make the PAT bits for memory encryption correct vs 4K and 2M/1G
     page table entries as they are at a different location.

   - Fix a concurrency issue in the local bandwidth readout of
     resource control leading to incorrect values.

   - Fix the ordering of allocating a vector for an interrupt. The order
     failed to respect the provided cpumask when the first attempt of
     allocating node local in the mask fails. It then tries the node
     instead of trying the full provided mask first. This leads to
     erroneous error messages and breaking the (user) supplied affinity
     request. Reorder it.

   - Make the INT3 padding detection in optprobe work correctly"

* tag 'x86-urgent-2020-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/kprobes: Fix optprobe to detect INT3 padding correctly
  x86/apic/vector: Fix ordering in vector assignment
  x86/resctrl: Fix incorrect local bandwidth when mba_sc is enabled
  x86/mm/mem_encrypt: Fix definition of PMD_FLAGS_DEC_WP
  membarrier: Execute SYNC_CORE on the calling thread
  membarrier: Explicitly sync remote cores when SYNC_CORE is requested
  membarrier: Add an actual barrier before rseq_preempt()
  x86/membarrier: Get rid of a dubious optimization
parents d2360a39 0d07c0ec
...@@ -155,6 +155,7 @@ enum page_cache_mode { ...@@ -155,6 +155,7 @@ enum page_cache_mode {
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) #define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
#define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT) #define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
#define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)
#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) #define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))
......
...@@ -98,12 +98,13 @@ static inline void sync_core_before_usermode(void) ...@@ -98,12 +98,13 @@ static inline void sync_core_before_usermode(void)
/* With PTI, we unconditionally serialize before running user code. */ /* With PTI, we unconditionally serialize before running user code. */
if (static_cpu_has(X86_FEATURE_PTI)) if (static_cpu_has(X86_FEATURE_PTI))
return; return;
/* /*
* Return from interrupt and NMI is done through iret, which is core * Even if we're in an interrupt, we might reschedule before returning,
* serializing. * in which case we could switch to a different thread in the same mm
* and return using SYSRET or SYSEXIT. Instead of trying to keep
* track of our need to sync the core, just sync right away.
*/ */
if (in_irq() || in_nmi())
return;
sync_core(); sync_core();
} }
......
...@@ -273,20 +273,24 @@ static int assign_irq_vector_any_locked(struct irq_data *irqd) ...@@ -273,20 +273,24 @@ static int assign_irq_vector_any_locked(struct irq_data *irqd)
const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
int node = irq_data_get_node(irqd); int node = irq_data_get_node(irqd);
if (node == NUMA_NO_NODE) if (node != NUMA_NO_NODE) {
goto all; /* Try the intersection of @affmsk and node mask */
/* Try the intersection of @affmsk and node mask */ cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk); if (!assign_vector_locked(irqd, vector_searchmask))
if (!assign_vector_locked(irqd, vector_searchmask)) return 0;
return 0; }
/* Try the node mask */
if (!assign_vector_locked(irqd, cpumask_of_node(node)))
return 0;
all:
/* Try the full affinity mask */ /* Try the full affinity mask */
cpumask_and(vector_searchmask, affmsk, cpu_online_mask); cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
if (!assign_vector_locked(irqd, vector_searchmask)) if (!assign_vector_locked(irqd, vector_searchmask))
return 0; return 0;
if (node != NUMA_NO_NODE) {
/* Try the node mask */
if (!assign_vector_locked(irqd, cpumask_of_node(node)))
return 0;
}
/* Try the full online mask */ /* Try the full online mask */
return assign_vector_locked(irqd, cpu_online_mask); return assign_vector_locked(irqd, cpu_online_mask);
} }
......
...@@ -279,7 +279,6 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) ...@@ -279,7 +279,6 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
return; return;
chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width);
m->chunks += chunks;
cur_bw = (chunks * r->mon_scale) >> 20; cur_bw = (chunks * r->mon_scale) >> 20;
if (m->delta_comp) if (m->delta_comp)
...@@ -450,15 +449,14 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) ...@@ -450,15 +449,14 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
} }
if (is_mbm_local_enabled()) { if (is_mbm_local_enabled()) {
rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
__mon_event_count(rmid, &rr);
/* /*
* Call the MBA software controller only for the * Call the MBA software controller only for the
* control groups and when user has enabled * control groups and when user has enabled
* the software controller explicitly. * the software controller explicitly.
*/ */
if (!is_mba_sc(NULL)) if (is_mba_sc(NULL))
__mon_event_count(rmid, &rr);
else
mbm_bw_count(rmid, &rr); mbm_bw_count(rmid, &rr);
} }
} }
......
...@@ -272,6 +272,19 @@ static int insn_is_indirect_jump(struct insn *insn) ...@@ -272,6 +272,19 @@ static int insn_is_indirect_jump(struct insn *insn)
return ret; return ret;
} }
/*
 * Report whether every byte in the half-open range [addr, eaddr) is an
 * INT3 opcode, i.e. whether the range looks like INT3 padding.
 *
 * Each byte is fetched with get_kernel_nofault(); a failed fetch or any
 * byte other than INT3_INSN_OPCODE makes the result false. An empty range
 * (addr >= eaddr) yields true.
 */
static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
{
	unsigned long cur = addr;
	unsigned char byte;

	while (cur < eaddr) {
		/* Unreadable memory cannot be treated as padding. */
		if (get_kernel_nofault(byte, (void *)cur) < 0)
			return false;

		/* A single non-INT3 byte disqualifies the whole range. */
		if (byte != INT3_INSN_OPCODE)
			return false;

		cur++;
	}

	return true;
}
/* Decode whole function to ensure any instructions don't jump into target */ /* Decode whole function to ensure any instructions don't jump into target */
static int can_optimize(unsigned long paddr) static int can_optimize(unsigned long paddr)
{ {
...@@ -310,9 +323,14 @@ static int can_optimize(unsigned long paddr) ...@@ -310,9 +323,14 @@ static int can_optimize(unsigned long paddr)
return 0; return 0;
kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
insn_get_length(&insn); insn_get_length(&insn);
/* Another subsystem puts a breakpoint */ /*
* In the case of detecting unknown breakpoint, this could be
* a padding INT3 between functions. Let's check that all the
* rest of the bytes are also INT3.
*/
if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
return 0; return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;
/* Recover address */ /* Recover address */
insn.kaddr = (void *)addr; insn.kaddr = (void *)addr;
insn.next_byte = (void *)(addr + insn.length); insn.next_byte = (void *)(addr + insn.length);
......
...@@ -45,8 +45,8 @@ ...@@ -45,8 +45,8 @@
#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) #define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
#define PMD_FLAGS_DEC PMD_FLAGS_LARGE #define PMD_FLAGS_DEC PMD_FLAGS_LARGE
#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ #define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \
(_PAGE_PAT | _PAGE_PWT)) (_PAGE_PAT_LARGE | _PAGE_PWT))
#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) #define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
......
...@@ -474,8 +474,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, ...@@ -474,8 +474,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/* /*
* The membarrier system call requires a full memory barrier and * The membarrier system call requires a full memory barrier and
* core serialization before returning to user-space, after * core serialization before returning to user-space, after
* storing to rq->curr. Writing to CR3 provides that full * storing to rq->curr, when changing mm. This is because
* memory barrier and core serializing instruction. * membarrier() sends IPIs to all CPUs that are in the target mm
* to make them issue memory barriers. However, if another CPU
* switches to/from the target mm concurrently with
* membarrier(), it can cause that CPU not to receive an IPI
* when it really should issue a memory barrier. Writing to CR3
* provides that full memory barrier and core serializing
* instruction.
*/ */
if (real_prev == next) { if (real_prev == next) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
......
...@@ -38,8 +38,33 @@ static void ipi_mb(void *info) ...@@ -38,8 +38,33 @@ static void ipi_mb(void *info)
smp_mb(); /* IPIs should be serializing but paranoid. */ smp_mb(); /* IPIs should be serializing but paranoid. */
} }
/*
 * IPI callback: issue a full memory barrier, then call
 * sync_core_before_usermode(). @info is unused.
 */
static void ipi_sync_core(void *info)
{
	/*
	 * The smp_mb() in membarrier after all the IPIs is supposed to
	 * ensure that memory accesses on remote CPUs that occur before
	 * the IPI become visible to membarrier()'s caller -- see
	 * scenario B in the big comment at the top of this file.
	 *
	 * A sync_core() would provide this guarantee, but
	 * sync_core_before_usermode() might end up being deferred until
	 * after membarrier()'s smp_mb().
	 */
	smp_mb();	/* IPIs should be serializing but paranoid. */

	sync_core_before_usermode();
}
static void ipi_rseq(void *info) static void ipi_rseq(void *info)
{ {
/*
* Ensure that all stores done by the calling thread are visible
* to the current task before the current task resumes. We could
* probably optimize this away on most architectures, but by the
* time we've already sent an IPI, the cost of the extra smp_mb()
* is negligible.
*/
smp_mb();
rseq_preempt(current); rseq_preempt(current);
} }
...@@ -154,6 +179,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) ...@@ -154,6 +179,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
if (!(atomic_read(&mm->membarrier_state) & if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM; return -EPERM;
ipi_func = ipi_sync_core;
} else if (flags == MEMBARRIER_FLAG_RSEQ) { } else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ)) if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL; return -EINVAL;
...@@ -168,7 +194,8 @@ static int membarrier_private_expedited(int flags, int cpu_id) ...@@ -168,7 +194,8 @@ static int membarrier_private_expedited(int flags, int cpu_id)
return -EPERM; return -EPERM;
} }
if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
(atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
return 0; return 0;
/* /*
...@@ -187,8 +214,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) ...@@ -187,8 +214,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
goto out; goto out;
if (cpu_id == raw_smp_processor_id())
goto out;
rcu_read_lock(); rcu_read_lock();
p = rcu_dereference(cpu_rq(cpu_id)->curr); p = rcu_dereference(cpu_rq(cpu_id)->curr);
if (!p || p->mm != mm) { if (!p || p->mm != mm) {
...@@ -203,16 +228,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) ...@@ -203,16 +228,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
for_each_online_cpu(cpu) { for_each_online_cpu(cpu) {
struct task_struct *p; struct task_struct *p;
/*
* Skipping the current CPU is OK even through we can be
* migrated at any point. The current CPU, at the point
* where we read raw_smp_processor_id(), is ensured to
* be in program order with respect to the caller
* thread. Therefore, we can skip this CPU from the
* iteration.
*/
if (cpu == raw_smp_processor_id())
continue;
p = rcu_dereference(cpu_rq(cpu)->curr); p = rcu_dereference(cpu_rq(cpu)->curr);
if (p && p->mm == mm) if (p && p->mm == mm)
__cpumask_set_cpu(cpu, tmpmask); __cpumask_set_cpu(cpu, tmpmask);
...@@ -220,12 +235,38 @@ static int membarrier_private_expedited(int flags, int cpu_id) ...@@ -220,12 +235,38 @@ static int membarrier_private_expedited(int flags, int cpu_id)
rcu_read_unlock(); rcu_read_unlock();
} }
preempt_disable(); if (cpu_id >= 0) {
if (cpu_id >= 0) /*
* smp_call_function_single() will call ipi_func() if cpu_id
* is the calling CPU.
*/
smp_call_function_single(cpu_id, ipi_func, NULL, 1); smp_call_function_single(cpu_id, ipi_func, NULL, 1);
else } else {
smp_call_function_many(tmpmask, ipi_func, NULL, 1); /*
preempt_enable(); * For regular membarrier, we can save a few cycles by
* skipping the current cpu -- we're about to do smp_mb()
* below, and if we migrate to a different cpu, this cpu
* and the new cpu will execute a full barrier in the
* scheduler.
*
* For SYNC_CORE, we do need a barrier on the current cpu --
* otherwise, if we are migrated and replaced by a different
* task in the same mm just before, during, or after
* membarrier, we will end up with some thread in the mm
* running without a core sync.
*
* For RSEQ, don't rseq_preempt() the caller. User code
* is not supposed to issue syscalls at all from inside an
* rseq critical section.
*/
if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
preempt_disable();
smp_call_function_many(tmpmask, ipi_func, NULL, true);
preempt_enable();
} else {
on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
}
}
out: out:
if (cpu_id < 0) if (cpu_id < 0)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment