Commit 59f87247 authored by Paul Mackerras, committed by Linus Torvalds

[PATCH] Better memset

Anton noticed in some traces that we were spending an awfully long time
doing a memset.  The ppc64 memset is basically unchanged from the ppc32
version, and it only does 4-byte stores and doesn't unroll the loop. 
Here's a memset that performs a bit better.

I have been using it for 3 weeks now, and Anton has tested it on a
variety of machines, without problems. 
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent fe3a3a65
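For readers who don't speak PowerPC assembly, here is a rough C sketch of
the strategy the new routine implements; the function name and loop
structure are mine, not code from this patch, and the real assembly
replaces the head and tail loops with the CR-bit dispatch described after
the diff:

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only -- not the kernel code. */
void *memset_sketch(void *dst, int c, size_t n)
{
	uint8_t *p = dst;
	/* Replicate the fill byte into all 8 bytes of a doubleword
	 * (the rlwimi/rldimi sequence in the assembly). */
	uint64_t v = (uint8_t)c * 0x0101010101010101ULL;

	/* Head: store bytes until the destination is 8-byte aligned. */
	while (((uintptr_t)p & 7) && n) {
		*p++ = (uint8_t)c;
		n--;
	}
	/* Body: the unrolled "4:" loop, 64 bytes per iteration. */
	while (n >= 64) {
		uint64_t *q = (uint64_t *)p;
		q[0] = v; q[1] = v; q[2] = v; q[3] = v;
		q[4] = v; q[5] = v; q[6] = v; q[7] = v;
		p += 64;
		n -= 64;
	}
	/* Tail: doubleword stores, then whatever bytes remain. */
	while (n >= 8) {
		*(uint64_t *)p = v;
		p += 8;
		n -= 8;
	}
	while (n--)
		*p++ = (uint8_t)c;
	return dst;
}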
@@ -66,28 +66,69 @@ _GLOBAL(strlen)
 	blr
 _GLOBAL(memset)
+	neg	r0,r3
 	rlwimi	r4,r4,8,16,23
+	andi.	r0,r0,7			/* # bytes to be 8-byte aligned */
 	rlwimi	r4,r4,16,0,15
-	addi	r6,r3,-4
-	cmplwi	0,r5,4
-	blt	7f
-	stwu	r4,4(r6)
-	beqlr
-	andi.	r0,r6,3
-	add	r5,r0,r5
-	subf	r6,r0,r6
-	srwi	r0,r5,2
+	cmplw	cr1,r5,r0		/* do we get that far? */
+	rldimi	r4,r4,32,0
+	mtcrf	1,r0
+	mr	r6,r3
+	blt	cr1,8f
+	beq+	3f			/* if already 8-byte aligned */
+	subf	r5,r0,r5
+	bf	31,1f
+	stb	r4,0(r6)
+	addi	r6,r6,1
+1:	bf	30,2f
+	sth	r4,0(r6)
+	addi	r6,r6,2
+2:	bf	29,3f
+	stw	r4,0(r6)
+	addi	r6,r6,4
+3:	srdi.	r0,r5,6
+	clrldi	r5,r5,58
 	mtctr	r0
-	bdz	6f
-1:	stwu	r4,4(r6)
-	bdnz	1b
-6:	andi.	r5,r5,3
-7:	cmpwi	0,r5,0
-	beqlr
-	mtctr	r5
-	addi	r6,r6,3
-8:	stbu	r4,1(r6)
-	bdnz	8b
+	beq	5f
+4:	std	r4,0(r6)
+	std	r4,8(r6)
+	std	r4,16(r6)
+	std	r4,24(r6)
+	std	r4,32(r6)
+	std	r4,40(r6)
+	std	r4,48(r6)
+	std	r4,56(r6)
+	addi	r6,r6,64
+	bdnz	4b
+5:	srwi.	r0,r5,3
+	clrlwi	r5,r5,29
+	mtcrf	1,r0
+	beq	8f
+	bf	29,6f
+	std	r4,0(r6)
+	std	r4,8(r6)
+	std	r4,16(r6)
+	std	r4,24(r6)
+	addi	r6,r6,32
+6:	bf	30,7f
+	std	r4,0(r6)
+	std	r4,8(r6)
+	addi	r6,r6,16
+7:	bf	31,8f
+	std	r4,0(r6)
+	addi	r6,r6,8
+8:	cmpwi	r5,0
+	mtcrf	1,r5
+	beqlr+
+	bf	29,9f
+	stw	r4,0(r6)
+	addi	r6,r6,4
+9:	bf	30,10f
+	sth	r4,0(r6)
+	addi	r6,r6,2
+10:	bflr	31
+	stb	r4,0(r6)
 	blr
 _GLOBAL(memmove)
......
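The trick that keeps the head and tail cheap: mtcrf 1,r0 copies the low
four bits of r0 into condition-register field 7, so the bf 31 / bf 30 /
bf 29 branches test the 1-, 2- and 4-byte bits of a count directly, with
no compare instructions. The head goes byte, halfword, word so alignment
increases at each step; the final tail runs the same tests largest first.
As a hypothetical C analogue of that final tail (names are mine, v holds
the replicated fill byte):

#include <stddef.h>
#include <stdint.h>

/* Sketch of the "8:" tail: each low bit of the remaining count n
 * selects exactly one store, mirroring mtcrf 1,r5 / bf 29..31. */
static uint8_t *store_tail(uint8_t *p, uint64_t v, size_t n)
{
	if (n & 4) { *(uint32_t *)p = (uint32_t)v; p += 4; }	/* bf 29,9f */
	if (n & 2) { *(uint16_t *)p = (uint16_t)v; p += 2; }	/* bf 30,10f */
	if (n & 1)						/* bflr 31 */
		*p = (uint8_t)v;
	return p;
}

The 32/16/8-byte stores between the unrolled loop and this tail apply the
same pattern to the doubleword count produced by srwi. r0,r5,3.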