internal/bytealg: improve asm for memequal on ppc64x

This includes two changes to the memequal function. Previously the asm implementation on ppc64x for Equal called the internal function memequal using a BL, whereas the other asm implementations for bytes functions on ppc64x used BR. The BR is preferred because the BL causes the calling function to stack a frame. This changes Equal so it uses BR and is consistent with the others. This also uses vsx instructions where possible to improve performance of the compares for sizes over 32. Here are results from the sizes affected: Equal/32 8.40ns ± 0% 7.66ns ± 0% -8.81% (p=0.029 n=4+4) Equal/4K 193ns ± 0% 144ns ± 0% -25.39% (p=0.029 n=4+4) Equal/4M 346µs ± 0% 277µs ± 0% -20.08% (p=0.029 n=4+4) Equal/64M 7.66ms ± 1% 7.27ms ± 0% -5.10% (p=0.029 n=4+4) Change-Id: Ib6ee2cdc3e5d146e2705e3338858b8e965d25420 Reviewed-on: https://go-review.googlesource.com/c/143060 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com> Reviewed-by: David Chase <drchase@google.com>

internal/bytealg: improve asm for memequal on ppc64x
This includes two changes to the memequal function. Previously the asm implementation on ppc64x for Equal called the internal function memequal using a BL, whereas the other asm implementations for bytes functions on ppc64x used BR. The BR is preferred because the BL causes the calling function to stack a frame. This changes Equal so it uses BR and is consistent with the others. This also uses vsx instructions where possible to improve performance of the compares for sizes over 32. Here are results from the sizes affected: Equal/32 8.40ns ± 0% 7.66ns ± 0% -8.81% (p=0.029 n=4+4) Equal/4K 193ns ± 0% 144ns ± 0% -25.39% (p=0.029 n=4+4) Equal/4M 346µs ± 0% 277µs ± 0% -20.08% (p=0.029 n=4+4) Equal/64M 7.66ms ± 1% 7.27ms ± 0% -5.10% (p=0.029 n=4+4) Change-Id: Ib6ee2cdc3e5d146e2705e3338858b8e965d25420 Reviewed-on: https://go-review.googlesource.com/c/143060 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com> Reviewed-by: David Chase <drchase@google.com>
6994731e · Lynn Boger · 5c472132 · 6994731e
Commit 6994731e authored Aug 27, 2018 by Lynn Boger
Hide whitespace changes
Inline Side-by-side

Showing with 26 additions and 38 deletions

src/internal/bytealg/equal_ppc64x.s src/internal/bytealg/equal_ppc64x.s +26 -38

No files found.
--- a/src/internal/bytealg/equal_ppc64x.s
+++ b/src/internal/bytealg/equal_ppc64x.s
@@ -7,17 +7,15 @@
 #include "go_asm.h"
 #include "textflag.h"

-TEXT ·Equal(SB),NOSPLIT,$0-49
+TEXT ·Equal(SB),NOSPLIT|NOFRAME,$0-49
 	MOVD	a_len+8(FP), R4
 	MOVD	b_len+32(FP), R5
 	CMP	R5, R4		// unequal lengths are not equal
 	BNE	noteq
 	MOVD	a_base+0(FP), R3
 	MOVD	b_base+24(FP), R4
-	BL	memeqbody<>(SB)
-
-	MOVBZ	R9,ret+48(FP)
-	RET
+	MOVD	$ret+48(FP), R10
+	BR	memeqbody<>(SB)

 noteq:
 	MOVBZ	$0,ret+48(FP)
@@ -28,7 +26,7 @@ equal:
 	MOVBZ	R3,ret+48(FP)
 	RET

-TEXT bytes·Equal(SB),NOSPLIT,$0-49
+TEXT bytes·Equal(SB),NOSPLIT|NOFRAME,$0-49
 	FUNCDATA $0, ·Equal·args_stackmap(SB)
 	MOVD	a_len+8(FP), R4
 	MOVD	b_len+32(FP), R5
@@ -36,10 +34,8 @@ TEXT bytes·Equal(SB),NOSPLIT,$0-49
 	BNE	noteq
 	MOVD	a_base+0(FP), R3
 	MOVD	b_base+24(FP), R4
-	BL	memeqbody<>(SB)
-
-	MOVBZ	R9,ret+48(FP)
-	RET
+	MOVD	$ret+48(FP), R10
+	BR	memeqbody<>(SB)

 noteq:
 	MOVBZ	$0,ret+48(FP)
@@ -51,25 +47,23 @@ equal:
 	RET

 // memequal(a, b unsafe.Pointer, size uintptr) bool
-TEXT runtime·memequal(SB),NOSPLIT,$0-25
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
 	MOVD    a+0(FP), R3
 	MOVD    b+8(FP), R4
 	MOVD    size+16(FP), R5
+	MOVD    $ret+24(FP), R10

-	BL	memeqbody<>(SB)
-	MOVB    R9, ret+24(FP)
-	RET
+	BR	memeqbody<>(SB)

 // memequal_varlen(a, b unsafe.Pointer) bool
-TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
 	MOVD	a+0(FP), R3
 	MOVD	b+8(FP), R4
 	CMP	R3, R4
 	BEQ	eq
 	MOVD	8(R11), R5    // compiler stores size at offset 8 in the closure
-	BL	memeqbody<>(SB)
-	MOVB	R9, ret+16(FP)
-	RET
+	MOVD    $ret+16(FP), R10
+	BR	memeqbody<>(SB)
 eq:
 	MOVD	$1, R3
 	MOVB	R3, ret+16(FP)
@@ -79,7 +73,7 @@ eq:
 // R3 = s1
 // R4 = s2
 // R5 = len
-// R9 = return value
+// R10 = addr of return value (byte)
 TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD    R5,CTR
 	CMP     R5,$8		// only optimize >=8
@@ -92,26 +86,19 @@ TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
 setup32a:                       // 8 byte aligned, >= 32 bytes
 	SRADCC  $5,R5,R6        // number of 32 byte chunks to compare
 	MOVD	R6,CTR
+	MOVD	$16,R14		// index for VSX loads and stores
 loop32a:
-	MOVD    0(R3),R6        // doublewords to compare
-	MOVD    0(R4),R7
-	MOVD	8(R3),R8	//
-	MOVD	8(R4),R9
-	CMP     R6,R7           // bytes batch?
-	BNE     noteq
-	MOVD	16(R3),R6
-	MOVD	16(R4),R7
-	CMP     R8,R9		// bytes match?
-	MOVD	24(R3),R8
-	MOVD	24(R4),R9
-	BNE     noteq
-	CMP     R6,R7           // bytes match?
-	BNE	noteq
+	LXVD2X  (R3+R0), VS32	// VS32 = V0
+	LXVD2X  (R4+R0), VS33	// VS33 = V1
+	VCMPEQUBCC V0, V1, V2	// compare, setting CR6
+	BGE     CR6, noteq
+	LXVD2X  (R3+R14), VS32
+	LXVD2X  (R4+R14), VS33
+	VCMPEQUBCC V0, V1, V2
+	BGE     CR6, noteq
 	ADD     $32,R3		// bump up to next 32
 	ADD     $32,R4
-	CMP     R8,R9           // bytes match?
-	BC      8,2,loop32a	// br ctr and cr
-	BNE	noteq
+	BC      16, 0, loop32a  // br ctr and cr
 	ANDCC	$24,R5,R6       // Any 8 byte chunks?
 	BEQ	leftover	// and result is 0
 setup8a:
@@ -145,9 +132,10 @@ simple:
 	BNE	noteq
 	BR	equal
 noteq:
-	MOVD    $0, R9
+	MOVB    $0, (R10)
 	RET
 equal:
-	MOVD    $1, R9
+	MOVD	$1, R3
+	MOVB	R3, (R10)
 	RET