Commit a6faa972, authored by Nicolas Pitre and committed by Russell King

[ARM PATCH] 1363/1: memcpy with preload support and other optimisations

Patch from Nicolas Pitre

This improves on what I did with patch #1362/1 by adding preloads for 
architectures that support it.  On an XScale PXA255 this provides a 20% 
performance gain.

Tested with all combinations of sizes and alignments.
parent 49744e17
......@@ -27,15 +27,15 @@
/*
* Prototype: void memcpy(void *to,const void *from,unsigned long n);
* ARM3: can't use memcopy here!!!
*/
ENTRY(memcpy)
ENTRY(memmove)
ENTER
cmp r1, r0
bcc 19f
bcc 23f
subs r2, r2, #4
blt 6f
PLD( pld [r1, #0] )
ands ip, r0, #3
bne 7f
ands ip, r1, #3
......@@ -43,29 +43,42 @@ ENTRY(memmove)
1: subs r2, r2, #8
blt 5f
subs r2, r2, #0x14
blt 3f
2: ldmia r1!,{r3 - r9, ip}
stmia r0!,{r3 - r9, ip}
subs r2, r2, #20
blt 4f
PLD( pld [r1, #28] )
PLD( subs r2, r2, #64 )
PLD( blt 3f )
2: PLD( pld [r1, #60] )
PLD( pld [r1, #92] )
ldmia r1!, {r3 - r9, ip}
subs r2, r2, #32
stmgeia r0!, {r3 - r9, ip}
ldmgeia r1!, {r3 - r9, ip}
subges r2, r2, #32
stmia r0!, {r3 - r9, ip}
bge 2b
cmn r2, #16
3: PLD( ldmia r1!, {r3 - r9, ip} )
PLD( adds r2, r2, #32 )
PLD( stmgeia r0!, {r3 - r9, ip} )
PLD( ldmgeia r1!, {r3 - r9, ip} )
PLD( subges r2, r2, #32 )
PLD( stmia r0!, {r3 - r9, ip} )
4: cmn r2, #16
ldmgeia r1!, {r3 - r6}
subge r2, r2, #16
stmgeia r0!, {r3 - r6}
subge r2, r2, #0x10
3: adds r2, r2, #0x14
4: ldmgeia r1!, {r3 - r5}
adds r2, r2, #20
ldmgeia r1!, {r3 - r5}
subge r2, r2, #12
stmgeia r0!, {r3 - r5}
subges r2, r2, #12
bge 4b
5: adds r2, r2, #8
blt 6f
subs r2, r2, #4
ldrlt r3, [r1], #4
ldmgeia r1!, {r4, r5}
subge r2, r2, #4
strlt r3, [r0], #4
stmgeia r0!, {r4, r5}
subge r2, r2, #4
6: adds r2, r2, #4
EXITEQ
......@@ -94,13 +107,19 @@ ENTRY(memmove)
8: bic r1, r1, #3
ldr r7, [r1], #4
cmp ip, #2
bgt 15f
beq 11f
bgt 18f
beq 13f
cmp r2, #12
blt 10f
blt 11f
PLD( pld [r1, #12] )
sub r2, r2, #12
9: mov r3, r7, pull #8
PLD( subs r2, r2, #32 )
PLD( blt 10f )
PLD( pld [r1, #28] )
9: PLD( pld [r1, #44] )
10: mov r3, r7, pull #8
ldmia r1!, {r4 - r7}
subs r2, r2, #16
orr r3, r3, r4, push #24
mov r4, r4, pull #8
orr r4, r4, r5, push #24
......@@ -109,24 +128,32 @@ ENTRY(memmove)
mov r6, r6, pull #8
orr r6, r6, r7, push #24
stmia r0!, {r3 - r6}
subs r2, r2, #16
bge 9b
PLD( cmn r2, #32 )
PLD( bge 10b )
PLD( add r2, r2, #32 )
adds r2, r2, #12
blt 100f
10: mov r3, r7, pull #8
blt 12f
11: mov r3, r7, pull #8
ldr r7, [r1], #4
subs r2, r2, #4
orr r3, r3, r7, push #24
str r3, [r0], #4
bge 10b
100: sub r1, r1, #3
bge 11b
12: sub r1, r1, #3
b 6b
11: cmp r2, #12
blt 13f /* */
13: cmp r2, #12
blt 16f
PLD( pld [r1, #12] )
sub r2, r2, #12
12: mov r3, r7, pull #16
PLD( subs r2, r2, #32 )
PLD( blt 15f )
PLD( pld [r1, #28] )
14: PLD( pld [r1, #44] )
15: mov r3, r7, pull #16
ldmia r1!, {r4 - r7}
subs r2, r2, #16
orr r3, r3, r4, push #16
mov r4, r4, pull #16
orr r4, r4, r5, push #16
......@@ -135,24 +162,32 @@ ENTRY(memmove)
mov r6, r6, pull #16
orr r6, r6, r7, push #16
stmia r0!, {r3 - r6}
subs r2, r2, #16
bge 12b
bge 14b
PLD( cmn r2, #32 )
PLD( bge 15b )
PLD( add r2, r2, #32 )
adds r2, r2, #12
blt 14f
13: mov r3, r7, pull #16
blt 17f
16: mov r3, r7, pull #16
ldr r7, [r1], #4
subs r2, r2, #4
orr r3, r3, r7, push #16
str r3, [r0], #4
bge 13b
14: sub r1, r1, #2
bge 16b
17: sub r1, r1, #2
b 6b
15: cmp r2, #12
blt 17f
18: cmp r2, #12
blt 21f
PLD( pld [r1, #12] )
sub r2, r2, #12
16: mov r3, r7, pull #24
PLD( subs r2, r2, #32 )
PLD( blt 20f )
PLD( pld [r1, #28] )
19: PLD( pld [r1, #44] )
20: mov r3, r7, pull #24
ldmia r1!, {r4 - r7}
subs r2, r2, #16
orr r3, r3, r4, push #8
mov r4, r4, pull #24
orr r4, r4, r5, push #8
......@@ -161,55 +196,72 @@ ENTRY(memmove)
mov r6, r6, pull #24
orr r6, r6, r7, push #8
stmia r0!, {r3 - r6}
subs r2, r2, #16
bge 16b
bge 19b
PLD( cmn r2, #32 )
PLD( bge 20b )
PLD( add r2, r2, #32 )
adds r2, r2, #12
blt 18f
17: mov r3, r7, pull #24
blt 22f
21: mov r3, r7, pull #24
ldr r7, [r1], #4
subs r2, r2, #4
orr r3, r3, r7, push #8
str r3, [r0], #4
bge 17b
18: sub r1, r1, #1
bge 21b
22: sub r1, r1, #1
b 6b
19: add r1, r1, r2
23: add r1, r1, r2
add r0, r0, r2
subs r2, r2, #4
blt 24f
blt 29f
PLD( pld [r1, #-4] )
ands ip, r0, #3
bne 25f
bne 30f
ands ip, r1, #3
bne 26f
bne 31f
20: subs r2, r2, #8
blt 23f
subs r2, r2, #0x14
blt 22f
21: ldmdb r1!, {r3 - r9, ip}
stmdb r0!, {r3 - r9, ip}
24: subs r2, r2, #8
blt 28f
subs r2, r2, #20
blt 27f
PLD( pld [r1, #-32] )
PLD( subs r2, r2, #64 )
PLD( blt 26f )
25: PLD( pld [r1, #-64] )
PLD( pld [r1, #-96] )
ldmdb r1!, {r3 - r9, ip}
subs r2, r2, #32
bge 21b
22: cmn r2, #16
stmgedb r0!, {r3 - r9, ip}
ldmgedb r1!, {r3 - r9, ip}
subges r2, r2, #32
stmdb r0!, {r3 - r9, ip}
bge 25b
26: PLD( ldmdb r1!, {r3 - r9, ip} )
PLD( adds r2, r2, #32 )
PLD( stmgedb r0!, {r3 - r9, ip} )
PLD( ldmgedb r1!, {r3 - r9, ip} )
PLD( subges r2, r2, #32 )
PLD( stmdb r0!, {r3 - r9, ip} )
27: cmn r2, #16
ldmgedb r1!, {r3 - r6}
stmgedb r0!, {r3 - r6}
subge r2, r2, #16
stmgedb r0!, {r3 - r6}
adds r2, r2, #20
ldmgedb r1!, {r3 - r5}
stmgedb r0!, {r3 - r5}
subge r2, r2, #12
23: adds r2, r2, #8
blt 24f
stmgedb r0!, {r3 - r5}
28: adds r2, r2, #8
blt 29f
subs r2, r2, #4
ldrlt r3, [r1, #-4]!
ldmgedb r1!, {r4, r5}
subge r2, r2, #4
strlt r3, [r0, #-4]!
stmgedb r0!, {r4, r5}
subge r2, r2, #4
24: adds r2, r2, #4
29: adds r2, r2, #4
EXITEQ
cmp r2, #2
ldrb r3, [r1, #-1]!
......@@ -220,7 +272,7 @@ ENTRY(memmove)
strgtb r5, [r0, #-1]!
EXIT
25: cmp ip, #2
30: cmp ip, #2
ldrb r3, [r1, #-1]!
ldrgeb r4, [r1, #-1]!
ldrgtb r5, [r1, #-1]!
......@@ -228,20 +280,26 @@ ENTRY(memmove)
strgeb r4, [r0, #-1]!
strgtb r5, [r0, #-1]!
subs r2, r2, ip
blt 24b
blt 29b
ands ip, r1, #3
beq 20b
beq 24b
26: bic r1, r1, #3
31: bic r1, r1, #3
ldr r3, [r1], #0
cmp ip, #2
blt 34f
beq 30f
blt 41f
beq 36f
cmp r2, #12
blt 28f
blt 34f
PLD( pld [r1, #-16] )
sub r2, r2, #12
27: mov r7, r3, push #8
PLD( subs r2, r2, #32 )
PLD( blt 33f )
PLD( pld [r1, #-32] )
32: PLD( pld [r1, #-48] )
33: mov r7, r3, push #8
ldmdb r1!, {r3, r4, r5, r6}
subs r2, r2, #16
orr r7, r7, r6, pull #24
mov r6, r6, push #8
orr r6, r6, r5, pull #24
......@@ -250,24 +308,32 @@ ENTRY(memmove)
mov r4, r4, push #8
orr r4, r4, r3, pull #24
stmdb r0!, {r4, r5, r6, r7}
subs r2, r2, #16
bge 27b
bge 32b
PLD( cmn r2, #32 )
PLD( bge 33b )
PLD( add r2, r2, #32 )
adds r2, r2, #12
blt 29f
28: mov ip, r3, push #8
blt 35f
34: mov ip, r3, push #8
ldr r3, [r1, #-4]!
subs r2, r2, #4
orr ip, ip, r3, pull #24
str ip, [r0, #-4]!
bge 28b
29: add r1, r1, #3
b 24b
bge 34b
35: add r1, r1, #3
b 29b
30: cmp r2, #12
blt 32f
36: cmp r2, #12
blt 39f
PLD( pld [r1, #-16] )
sub r2, r2, #12
31: mov r7, r3, push #16
PLD( subs r2, r2, #32 )
PLD( blt 38f )
PLD( pld [r1, #-32] )
37: PLD( pld [r1, #-48] )
38: mov r7, r3, push #16
ldmdb r1!, {r3, r4, r5, r6}
subs r2, r2, #16
orr r7, r7, r6, pull #16
mov r6, r6, push #16
orr r6, r6, r5, pull #16
......@@ -276,24 +342,32 @@ ENTRY(memmove)
mov r4, r4, push #16
orr r4, r4, r3, pull #16
stmdb r0!, {r4, r5, r6, r7}
subs r2, r2, #16
bge 31b
bge 37b
PLD( cmn r2, #32 )
PLD( bge 38b )
PLD( add r2, r2, #32 )
adds r2, r2, #12
blt 33f
32: mov ip, r3, push #16
blt 40f
39: mov ip, r3, push #16
ldr r3, [r1, #-4]!
subs r2, r2, #4
orr ip, ip, r3, pull #16
str ip, [r0, #-4]!
bge 32b
33: add r1, r1, #2
b 24b
bge 39b
40: add r1, r1, #2
b 29b
34: cmp r2, #12
blt 36f
41: cmp r2, #12
blt 44f
PLD( pld [r1, #-16] )
sub r2, r2, #12
35: mov r7, r3, push #24
PLD( subs r2, r2, #32 )
PLD( blt 43f )
PLD( pld [r1, #-32] )
42: PLD( pld [r1, #-48] )
43: mov r7, r3, push #24
ldmdb r1!, {r3, r4, r5, r6}
subs r2, r2, #16
orr r7, r7, r6, pull #8
mov r6, r6, push #24
orr r6, r6, r5, pull #8
......@@ -302,17 +376,18 @@ ENTRY(memmove)
mov r4, r4, push #24
orr r4, r4, r3, pull #8
stmdb r0!, {r4, r5, r6, r7}
subs r2, r2, #16
bge 35b
bge 42b
PLD( cmn r2, #32 )
PLD( bge 43b )
PLD( add r2, r2, #32 )
adds r2, r2, #12
blt 37f
36: mov ip, r3, push #24
blt 45f
44: mov ip, r3, push #24
ldr r3, [r1, #-4]!
subs r2, r2, #4
orr ip, ip, r3, pull #8
str ip, [r0, #-4]!
bge 36b
37: add r1, r1, #1
b 24b
bge 44b
45: add r1, r1, #1
b 29b
.align
......@@ -26,3 +26,13 @@
#define push lsr
#define byte(x) ((3-x)*8)
#endif
/*
 * PLD(): wrapper for data-preload code.
 *
 * When __LINUX_ARM_ARCH__ is 5 or higher the wrapped text is emitted
 * unchanged; for anything lower the macro expands to nothing, so the
 * wrapped instructions are dropped from the build.
 */
#if __LINUX_ARM_ARCH__ < 5
#define PLD(code...)
#else
#define PLD(code...)	code
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment