Commit 32ee1e18 authored by Anton Blanchard, committed by Benjamin Herrenschmidt

powerpc: Fix endian issues in VMX copy loops

Fix the permute loops for little endian.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
parent 8b5ede69
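Both files gain the same LVS/VPERM wrappers so one source assembles correctly for either endianness: on big-endian parts lvsl builds the permute control and vperm takes its inputs in ascending address order, while on little-endian parts lvsr plus swapped inputs selects the same bytes. The sketch below is not part of the commit; the loop structure and register setup are simplified for illustration, but it shows the aligned-load-plus-permute idiom that the macros wrap:

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

	/* Copy one 16-byte block from an unaligned source (r4) to a
	 * 16-byte-aligned destination (r3).
	 */
	LVS(vr16,0,r4)		/* permute control from the source offset */
	lvx	vr0,0,r4	/* aligned quadword covering the first bytes */
	addi	r4,r4,16
	lvx	vr1,0,r4	/* next aligned quadword */
	VPERM(vr8,vr0,vr1,vr16)	/* merge the two into the wanted 16 bytes */
	stvx	vr8,0,r3	/* stvx ignores the low 4 bits of the EA */

The copy loops below carry the most recently loaded quadword forward in a register, so each further 16 bytes costs one lvx, one VPERM and one stvx.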
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -19,6 +19,14 @@
  */
 #include <asm/ppc_asm.h>
 
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
+#endif
+
 	.macro err1
 100:
 	.section __ex_table,"a"
@@ -552,13 +560,13 @@ err3;	stw	r7,4(r3)
 	li	r10,32
 	li	r11,48
 
-	lvsl	vr16,0,r4	/* Setup permute control vector */
+	LVS(vr16,0,r4)		/* Setup permute control vector */
 err3;	lvx	vr0,0,r4
 	addi	r4,r4,16
 
 	bf	cr7*4+3,5f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 err3;	stvx	vr8,r0,r3
 	addi	r3,r3,16
@@ -566,9 +574,9 @@ err3;	stvx	vr8,r0,r3
 
 5:	bf	cr7*4+2,6f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 err3;	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -576,13 +584,13 @@ err3;	stvx	vr9,r3,r9
 
 6:	bf	cr7*4+1,7f
 err3;	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 err3;	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 err3;	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 err3;	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -611,21 +619,21 @@ err3;	stvx	vr11,r3,r11
 	.align	5
 8:
 err4;	lvx	vr7,r0,r4
-	vperm	vr8,vr0,vr7,vr16
+	VPERM(vr8,vr0,vr7,vr16)
 err4;	lvx	vr6,r4,r9
-	vperm	vr9,vr7,vr6,vr16
+	VPERM(vr9,vr7,vr6,vr16)
 err4;	lvx	vr5,r4,r10
-	vperm	vr10,vr6,vr5,vr16
+	VPERM(vr10,vr6,vr5,vr16)
 err4;	lvx	vr4,r4,r11
-	vperm	vr11,vr5,vr4,vr16
+	VPERM(vr11,vr5,vr4,vr16)
 err4;	lvx	vr3,r4,r12
-	vperm	vr12,vr4,vr3,vr16
+	VPERM(vr12,vr4,vr3,vr16)
 err4;	lvx	vr2,r4,r14
-	vperm	vr13,vr3,vr2,vr16
+	VPERM(vr13,vr3,vr2,vr16)
 err4;	lvx	vr1,r4,r15
-	vperm	vr14,vr2,vr1,vr16
+	VPERM(vr14,vr2,vr1,vr16)
 err4;	lvx	vr0,r4,r16
-	vperm	vr15,vr1,vr0,vr16
+	VPERM(vr15,vr1,vr0,vr16)
 	addi	r4,r4,128
 err4;	stvx	vr8,r0,r3
 err4;	stvx	vr9,r3,r9
@@ -649,13 +657,13 @@ err4;	stvx	vr15,r3,r16
 
 	bf	cr7*4+1,9f
 err3;	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 err3;	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 err3;	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 err3;	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -665,9 +673,9 @@ err3;	stvx	vr11,r3,r11
 
 9:	bf	cr7*4+2,10f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 err3;	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -675,7 +683,7 @@ err3;	stvx	vr9,r3,r9
 
 10:	bf	cr7*4+3,11f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 err3;	stvx	vr8,r0,r3
 	addi	r3,r3,16
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -20,6 +20,15 @@
 #include <asm/ppc_asm.h>
 
 _GLOBAL(memcpy_power7)
+
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
+#endif
+
 #ifdef CONFIG_ALTIVEC
 	cmpldi	r5,16
 	cmpldi	cr1,r5,4096
@@ -485,13 +494,13 @@ _GLOBAL(memcpy_power7)
 	li	r10,32
 	li	r11,48
 
-	lvsl	vr16,0,r4	/* Setup permute control vector */
+	LVS(vr16,0,r4)		/* Setup permute control vector */
 	lvx	vr0,0,r4
 	addi	r4,r4,16
 
 	bf	cr7*4+3,5f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 	stvx	vr8,r0,r3
 	addi	r3,r3,16
@@ -499,9 +508,9 @@ _GLOBAL(memcpy_power7)
 
 5:	bf	cr7*4+2,6f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -509,13 +518,13 @@ _GLOBAL(memcpy_power7)
 
 6:	bf	cr7*4+1,7f
 	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -544,21 +553,21 @@ _GLOBAL(memcpy_power7)
 	.align	5
 8:
 	lvx	vr7,r0,r4
-	vperm	vr8,vr0,vr7,vr16
+	VPERM(vr8,vr0,vr7,vr16)
 	lvx	vr6,r4,r9
-	vperm	vr9,vr7,vr6,vr16
+	VPERM(vr9,vr7,vr6,vr16)
 	lvx	vr5,r4,r10
-	vperm	vr10,vr6,vr5,vr16
+	VPERM(vr10,vr6,vr5,vr16)
 	lvx	vr4,r4,r11
-	vperm	vr11,vr5,vr4,vr16
+	VPERM(vr11,vr5,vr4,vr16)
 	lvx	vr3,r4,r12
-	vperm	vr12,vr4,vr3,vr16
+	VPERM(vr12,vr4,vr3,vr16)
 	lvx	vr2,r4,r14
-	vperm	vr13,vr3,vr2,vr16
+	VPERM(vr13,vr3,vr2,vr16)
 	lvx	vr1,r4,r15
-	vperm	vr14,vr2,vr1,vr16
+	VPERM(vr14,vr2,vr1,vr16)
 	lvx	vr0,r4,r16
-	vperm	vr15,vr1,vr0,vr16
+	VPERM(vr15,vr1,vr0,vr16)
 	addi	r4,r4,128
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -582,13 +591,13 @@ _GLOBAL(memcpy_power7)
 
 	bf	cr7*4+1,9f
 	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -598,9 +607,9 @@ _GLOBAL(memcpy_power7)
 
 9:	bf	cr7*4+2,10f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -608,7 +617,7 @@ _GLOBAL(memcpy_power7)
 
 10:	bf	cr7*4+3,11f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 	stvx	vr8,r0,r3
 	addi	r3,r3,16