Commit cf8fb553, authored by Anton Blanchard, committed by Benjamin Herrenschmidt

powerpc: Optimise the 64bit optimised __clear_user

I blame Mikey for this. He elevated my slightly dubious testcase:

# dd if=/dev/zero of=/dev/null bs=1M count=10000

to benchmark status. And naturally we need to be number 1 at creating
zeros. So let's improve __clear_user some more.

As Paul suggests we can use dcbz for large lengths. This patch gets
the destination cacheline aligned then uses dcbz on whole cachelines.

Before:
10485760000 bytes (10 GB) copied, 0.414744 s, 25.3 GB/s

After:
10485760000 bytes (10 GB) copied, 0.268597 s, 39.0 GB/s

39 GB/s, a new record.
Signed-off-by: Anton Blanchard <anton@samba.org>
Tested-by: Olof Johansson <olof@lixom.net>
Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
parent b4c3a872
...@@ -19,6 +19,12 @@ ...@@ -19,6 +19,12 @@
*/ */
#include <asm/ppc_asm.h> #include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
/*
 * TOC entry holding the address of ppc64_caches.  The long-clear path
 * fetches it with "ld r5,PPC64_CACHES@toc(r2)" and then reads the L1
 * data-cache line size fields through the DCACHEL1* offsets.
 * Assembly switches back to .text afterwards.
 */
.section ".toc","aw"
PPC64_CACHES:
.tc ppc64_caches[TC],ppc64_caches
.section ".text"
/** /**
* __clear_user: - Zero a block of memory in user space, with less checking. * __clear_user: - Zero a block of memory in user space, with less checking.
...@@ -94,9 +100,14 @@ err1; stw r0,0(r3) ...@@ -94,9 +100,14 @@ err1; stw r0,0(r3)
addi r3,r3,4 addi r3,r3,4
3: sub r4,r4,r6 3: sub r4,r4,r6
srdi r6,r4,5
cmpdi r4,32 cmpdi r4,32
cmpdi cr1,r4,512
blt .Lshort_clear blt .Lshort_clear
bgt cr1,.Llong_clear
.Lmedium_clear:
srdi r6,r4,5
mtctr r6 mtctr r6
/* Do 32 byte chunks */ /* Do 32 byte chunks */
...@@ -139,3 +150,53 @@ err1; stb r0,0(r3) ...@@ -139,3 +150,53 @@ err1; stb r0,0(r3)
10: li r3,0 10: li r3,0
blr blr
/*
 * .Llong_clear: zero a large region using dcbz on whole data-cache lines.
 *
 * Entered from the length dispatch when more than 512 bytes remain
 * (cmpdi cr1,r4,512 / bgt cr1).
 *
 * Register roles inside this block:
 *   r0  - source operand of every std below
 *         NOTE(review): presumably holds zero, set up before this block.
 *   r3  - current destination pointer
 *   r4  - bytes still to clear
 *   r5  - address of ppc64_caches, then reused as the number of bytes
 *         needed to reach the next cache-line boundary
 *   r6  - scratch / loop trip counts moved into CTR
 *   r7  - log2 of the L1 data-cache line size
 *   r8  - copy of r3 taken before each err1-tagged loop
 *         NOTE(review): presumably consumed by the err1 fault fixup,
 *         which is not visible here - confirm.
 *   r9  - L1 data-cache line size in bytes
 *   r10 - first 4 * line size (bail-out threshold), then line size - 1
 *         (alignment / remainder mask)
 *
 * Leaves by branching to .Lmedium_clear or .Lshort_clear with r3/r4
 * describing whatever is left after the last full cache line.
 */
.Llong_clear:
	ld	r5,PPC64_CACHES@toc(r2)

	/*
	 * Skip the single 8-byte store when CR7 bit 0 is clear.
	 * NOTE(review): cr7 is set outside this block; presumably the bit
	 * flags that 8 more bytes are needed for 16-byte alignment - confirm.
	 */
	bf	cr7*4+0,11f
err2;	std	r0,0(r3)
	addi	r3,r3,8
	addi	r4,r4,-8

	/* Destination is 16 byte aligned, need to get it cacheline aligned */
11:	lwz	r7,DCACHEL1LOGLINESIZE(r5)
	lwz	r9,DCACHEL1LINESIZE(r5)

	/*
	 * With worst case alignment the long clear loop takes a minimum
	 * of 1 byte less than 2 cachelines.
	 *
	 * The threshold actually tested below is 4 * line size (sldi by 2),
	 * which is more conservative than that minimum.  It also guarantees
	 * that at least one whole cache line is left after the alignment
	 * loop, so the dcbz loop never starts with CTR == 0.
	 */
	sldi	r10,r9,2
	cmpd	r4,r10
	blt	.Lmedium_clear

	/* r5 = (-r3) & (line size - 1): bytes up to the next line boundary */
	neg	r6,r3
	addi	r10,r9,-1
	and.	r5,r6,r10
	beq	13f			/* already cache-line aligned */

	/* Clear 16 bytes per iteration until r3 is cache-line aligned */
	srdi	r6,r5,4
	mtctr	r6
	mr	r8,r3
12:
err1;	std	r0,0(r3)
err1;	std	r0,8(r3)
	addi	r3,r3,16
	bdnz	12b

	/* r4 is only adjusted once the whole alignment loop has completed */
	sub	r4,r4,r5

	/* r6 = number of whole cache lines remaining */
13:	srd	r6,r4,r7
	mtctr	r6
	mr	r8,r3
14:
	/*
	 * The first operand of dcbz is the RA field, where 0 means "no base
	 * register" (literal zero), not the contents of r0.  Write it as a
	 * plain 0 so it is not mistaken for a register operand.
	 */
err1;	dcbz	0,r3
	add	r3,r3,r9
	bdnz	14b

	/* Keep only the sub-cache-line remainder and hand it to the tail paths */
	and	r4,r4,r10

	cmpdi	r4,32
	blt	.Lshort_clear
	b	.Lmedium_clear
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment