Commit 67535736 authored by Andy Lutomirski's avatar Andy Lutomirski Committed by Ingo Molnar

Revert "x86/mm: Stop calling leave_mm() in idle code"

This reverts commit 43858b4f.

The reason I removed the leave_mm() calls in question is because the
heuristic wasn't needed after that patch.  With the original version
of my PCID series, we never flushed a "lazy cpu" (i.e. a CPU running
kernel thread) due a flush on the loaded mm.

Unfortunately, that caused architectural issues, so now I've
reinstated these flushes on non-PCID systems in:

    commit b956575b ("x86/mm: Flush more aggressively in lazy TLB mode").

That, in turn, gives us a power management and occasionally
performance regression as compared to old kernels: a process that
goes into a deep idle state on a given CPU and gets its mm flushed
due to activity on a different CPU will wake the idle CPU.

Reinstate the old ugly heuristic: if a CPU goes into ACPI C3 or an
intel_idle state that is likely to cause a TLB flush gets its mm
switched to init_mm before going idle.

FWIW, this heuristic is lousy.  Whether we should change CR3 before
idle isn't a good hint except insofar as the performance hit is a bit
lower if the TLB is getting flushed by the idle code anyway.  What we
really want to know is whether we anticipate being idle long enough
that the mm is likely to be flushed before we wake up.  This is more a
matter of the expected latency than the idle state that gets chosen.
This heuristic also completely fails on systems that don't know
whether the TLB will be flushed (e.g. AMD systems?).  OTOH it may be a
bit obsolete anyway -- PCID systems don't presently benefit from this
heuristic at all.

We also shouldn't do this callback from innermost bit of the idle code
due to the RCU nastiness it causes.  All the information need is
available before rcu_idle_enter() needs to happen.
Signed-off-by: default avatarAndy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 43858b4f "x86/mm: Stop calling leave_mm() in idle code"
Link: http://lkml.kernel.org/r/c513bbd4e653747213e05bc7062de000bf0202a5.1509793738.git.luto@kernel.orgSigned-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent 5f479447
...@@ -112,6 +112,8 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf) ...@@ -112,6 +112,8 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP; buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP;
} }
#define acpi_unlazy_tlb(x)
#ifdef CONFIG_ACPI_NUMA #ifdef CONFIG_ACPI_NUMA
extern cpumask_t early_cpu_possible_map; extern cpumask_t early_cpu_possible_map;
#define for_each_possible_early_cpu(cpu) \ #define for_each_possible_early_cpu(cpu) \
......
...@@ -150,6 +150,8 @@ static inline void disable_acpi(void) { } ...@@ -150,6 +150,8 @@ static inline void disable_acpi(void) { }
extern int x86_acpi_numa_init(void); extern int x86_acpi_numa_init(void);
#endif /* CONFIG_ACPI_NUMA */ #endif /* CONFIG_ACPI_NUMA */
#define acpi_unlazy_tlb(x) leave_mm(x)
#ifdef CONFIG_ACPI_APEI #ifdef CONFIG_ACPI_APEI
static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)
{ {
......
...@@ -85,6 +85,7 @@ void leave_mm(int cpu) ...@@ -85,6 +85,7 @@ void leave_mm(int cpu)
switch_mm(NULL, &init_mm, NULL); switch_mm(NULL, &init_mm, NULL);
} }
EXPORT_SYMBOL_GPL(leave_mm);
void switch_mm(struct mm_struct *prev, struct mm_struct *next, void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk) struct task_struct *tsk)
...@@ -195,12 +196,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, ...@@ -195,12 +196,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
write_cr3(build_cr3(next, new_asid)); write_cr3(build_cr3(next, new_asid));
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
TLB_FLUSH_ALL); /*
* NB: This gets called via leave_mm() in the idle path
* where RCU functions differently. Tracing normally
* uses RCU, so we need to use the _rcuidle variant.
*
* (There is no good reason for this. The idle code should
* be rearranged to call this before rcu_idle_enter().)
*/
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else { } else {
/* The new ASID is already up to date. */ /* The new ASID is already up to date. */
write_cr3(build_cr3_noflush(next, new_asid)); write_cr3(build_cr3_noflush(next, new_asid));
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
/* See above wrt _rcuidle. */
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
} }
this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm, next);
......
...@@ -710,6 +710,8 @@ static DEFINE_RAW_SPINLOCK(c3_lock); ...@@ -710,6 +710,8 @@ static DEFINE_RAW_SPINLOCK(c3_lock);
static void acpi_idle_enter_bm(struct acpi_processor *pr, static void acpi_idle_enter_bm(struct acpi_processor *pr,
struct acpi_processor_cx *cx, bool timer_bc) struct acpi_processor_cx *cx, bool timer_bc)
{ {
acpi_unlazy_tlb(smp_processor_id());
/* /*
* Must be done before busmaster disable as we might need to * Must be done before busmaster disable as we might need to
* access HPET ! * access HPET !
......
...@@ -913,15 +913,16 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, ...@@ -913,15 +913,16 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
struct cpuidle_state *state = &drv->states[index]; struct cpuidle_state *state = &drv->states[index];
unsigned long eax = flg2MWAIT(state->flags); unsigned long eax = flg2MWAIT(state->flags);
unsigned int cstate; unsigned int cstate;
int cpu = smp_processor_id();
cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
/* /*
* NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition * leave_mm() to avoid costly and often unnecessary wakeups
* will probably flush the TLB. It's not guaranteed to flush * for flushing the user TLB's associated with the active mm.
* the TLB, though, so it's not clear that we can do anything
* useful with this knowledge.
*/ */
if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED)
leave_mm(cpu);
if (!(lapic_timer_reliable_states & (1 << (cstate)))) if (!(lapic_timer_reliable_states & (1 << (cstate))))
tick_broadcast_enter(); tick_broadcast_enter();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment