Commit c4885bbb authored by Pingfan Liu, committed by Catalin Marinas

arm64/mm: save memory access in check_and_switch_context() fast switch path

On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
using the per-cpu offset stored in the tpidr_el1 system register. In
some cases we generate a per-cpu address with a sequence like:

  cpu_ptr = &per_cpu(ptr, smp_processor_id());

This potentially incurs a cache miss for both `cpu_number` and the
in-memory `__per_cpu_offset` array. The same address can be generated
more efficiently as:

  cpu_ptr = this_cpu_ptr(ptr);

This needs only the offset already held in tpidr_el1 and requires no
additional loads from memory.
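
For illustration only, here is a minimal sketch of the two access patterns
on a hypothetical per-cpu counter. The `counter` variable and the two helper
functions below are not part of this patch, and preemption is assumed to be
disabled by the caller (as it is on the context-switch path):

  #include <linux/atomic.h>
  #include <linux/percpu.h>
  #include <linux/smp.h>

  /* Hypothetical per-cpu variable, used only to contrast the two patterns. */
  static DEFINE_PER_CPU(atomic64_t, counter);

  static void slow_access(void)
  {
          /* Reads cpu_number via tpidr_el1, then indexes __per_cpu_offset[]. */
          atomic64_t *p = &per_cpu(counter, smp_processor_id());

          atomic64_inc(p);
  }

  static void fast_access(void)
  {
          /* Uses the offset already held in tpidr_el1; no extra memory loads. */
          atomic64_t *p = this_cpu_ptr(&counter);

          atomic64_inc(p);
  }

The patch below moves the `active_asids` accesses in
check_and_switch_context() from the first pattern to the second.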

The following two test cases show a small performance improvement measured
on a 46-CPU Qualcomm machine running a 5.8.0-rc4 kernel.

Test 1: (about 0.3% improvement)
    #cat b.sh
    make clean && make all -j138
    #perf stat --repeat 10 --null --sync sh b.sh

    - before this patch
     Performance counter stats for 'sh b.sh' (10 runs):

                298.62 +- 1.86 seconds time elapsed  ( +-  0.62% )

    - after this patch
     Performance counter stats for 'sh b.sh' (10 runs):

               297.734 +- 0.954 seconds time elapsed  ( +-  0.32% )

Test 2: (about 1.69% improvement)
     'perf stat -r 10 perf bench sched messaging'
        Then sum the total time of 'sched/messaging' manually.

    - before this patch
      total 0.707 sec for 10 times
    - after this patch
      total 0.695 sec for 10 times

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Link: https://lore.kernel.org/r/1594389852-19949-1-git-send-email-kernelfans@gmail.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
parent ea0eada4
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -175,7 +175,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
  * take CPU migration into account.
  */
 #define destroy_context(mm)		do { } while(0)
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
+void check_and_switch_context(struct mm_struct *mm);
 
 #define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.id, 0); 0; })
@@ -214,8 +214,6 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 
 static inline void __switch_mm(struct mm_struct *next)
 {
-	unsigned int cpu = smp_processor_id();
-
 	/*
 	 * init_mm.pgd does not contain any user mappings and it is always
 	 * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
@@ -225,7 +223,7 @@ static inline void __switch_mm(struct mm_struct *next)
 		return;
 	}
 
-	check_and_switch_context(next, cpu);
+	check_and_switch_context(next);
 }
 
 static inline void
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -198,9 +198,10 @@ static u64 new_context(struct mm_struct *mm)
 	return idx2asid(asid) | generation;
 }
 
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
+void check_and_switch_context(struct mm_struct *mm)
 {
 	unsigned long flags;
+	unsigned int cpu;
 	u64 asid, old_active_asid;
 
 	if (system_supports_cnp())
@@ -222,9 +223,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 	 * relaxed xchg in flush_context will treat us as reserved
 	 * because atomic RmWs are totally ordered for a given location.
 	 */
-	old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
+	old_active_asid = atomic64_read(this_cpu_ptr(&active_asids));
 	if (old_active_asid && asid_gen_match(asid) &&
-	    atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
+	    atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids),
 				     old_active_asid, asid))
 		goto switch_mm_fastpath;
 
@@ -236,10 +237,11 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 		atomic64_set(&mm->context.id, asid);
 	}
 
+	cpu = smp_processor_id();
 	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
 		local_flush_tlb_all();
 
-	atomic64_set(&per_cpu(active_asids, cpu), asid);
+	atomic64_set(this_cpu_ptr(&active_asids), asid);
 	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 
 switch_mm_fastpath: