Commit 223e23e8 authored by Will Deacon, committed by Catalin Marinas

arm64: lib: improve copy_page to deal with 128 bytes at a time

We want to avoid lots of different copy_page implementations, settling
for something that is "good enough" everywhere and hopefully easy to
understand and maintain whilst we're at it.

This patch reworks our copy_page implementation based on discussions
with Cavium on the list and benchmarking on Cortex-A processors so that:

  - The loop is unrolled to copy 128 bytes per iteration

  - The reads are offset so that we read from the next 128-byte block
    in the same iteration that we store the previous block

  - Explicit prefetch instructions are removed for now, since they hurt
    performance on CPUs with hardware prefetching

  - The loop exit condition is calculated at the start of the loop
Signed-off-by: Will Deacon <will.deacon@arm.com>
Tested-by: Andrew Pinski <apinski@cavium.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
parent d5370f75
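To make the new structure easier to follow, here is a rough C sketch (not part of the commit) of the loop shape the assembly in the diff below implements: one 128-byte block is loaded up front, the main loop stores the previous block while loading the next one, and a final store sequence drains the pipeline. The function name, the uint64_t view of the page and the 4K PAGE_SIZE value are illustrative assumptions; the authoritative version is the assembly itself.

/*
 * Illustrative sketch only: the software-pipelined structure of the
 * reworked copy_page, 128 bytes per iteration.
 */
#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096	/* assumption: 4K pages for the example */

static void copy_page_sketch(uint64_t *dst, const uint64_t *src)
{
	uint64_t blk[16];	/* 16 x 8 bytes = one 128-byte block */
	size_t remaining, i;

	/* Prime the pipeline: load the first 128-byte block. */
	for (i = 0; i < 16; i++)
		blk[i] = src[i];
	src += 16;

	/* Exit condition computed up front: PAGE_SIZE - 128 bytes remain. */
	for (remaining = PAGE_SIZE - 128; remaining > 0; remaining -= 128) {
		for (i = 0; i < 16; i++) {
			dst[i] = blk[i];	/* store the previous block */
			blk[i] = src[i];	/* load the next block */
		}
		dst += 16;
		src += 16;
	}

	/* Drain the pipeline: store the final block. */
	for (i = 0; i < 16; i++)
		dst[i] = blk[i];
}

Interleaving the loads for the next block with the stores of the previous one gives the CPU independent work to overlap load and store latency, which is what the offset reads in the assembly achieve without relying on explicit prefetch instructions.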
@@ -27,20 +27,50 @@
  *	x1 - src
  */
 ENTRY(copy_page)
-	/* Assume cache line size is 64 bytes. */
-	prfm	pldl1strm, [x1, #64]
-1:	ldp	x2, x3, [x1]
+	ldp	x2, x3, [x1]
 	ldp	x4, x5, [x1, #16]
 	ldp	x6, x7, [x1, #32]
 	ldp	x8, x9, [x1, #48]
-	add	x1, x1, #64
-	prfm	pldl1strm, [x1, #64]
+	ldp	x10, x11, [x1, #64]
+	ldp	x12, x13, [x1, #80]
+	ldp	x14, x15, [x1, #96]
+	ldp	x16, x17, [x1, #112]
+
+	mov	x18, #(PAGE_SIZE - 128)
+	add	x1, x1, #128
+1:
+	subs	x18, x18, #128
+
 	stnp	x2, x3, [x0]
+	ldp	x2, x3, [x1]
 	stnp	x4, x5, [x0, #16]
+	ldp	x4, x5, [x1, #16]
 	stnp	x6, x7, [x0, #32]
+	ldp	x6, x7, [x1, #32]
 	stnp	x8, x9, [x0, #48]
-	add	x0, x0, #64
-	tst	x1, #(PAGE_SIZE - 1)
-	b.ne	1b
+	ldp	x8, x9, [x1, #48]
+	stnp	x10, x11, [x0, #64]
+	ldp	x10, x11, [x1, #64]
+	stnp	x12, x13, [x0, #80]
+	ldp	x12, x13, [x1, #80]
+	stnp	x14, x15, [x0, #96]
+	ldp	x14, x15, [x1, #96]
+	stnp	x16, x17, [x0, #112]
+	ldp	x16, x17, [x1, #112]
+
+	add	x0, x0, #128
+	add	x1, x1, #128
+
+	b.gt	1b
+
+	stnp	x2, x3, [x0]
+	stnp	x4, x5, [x0, #16]
+	stnp	x6, x7, [x0, #32]
+	stnp	x8, x9, [x0, #48]
+	stnp	x10, x11, [x0, #64]
+	stnp	x12, x13, [x0, #80]
+	stnp	x14, x15, [x0, #96]
+	stnp	x16, x17, [x0, #112]
+
 	ret
 ENDPROC(copy_page)