runtime: improve IndexByte for ppc64x

This change adds a better implementation of IndexByte for ppc64x. Improvement for bytes·IndexByte: benchmark old ns/op new ns/op delta BenchmarkIndexByte/10-16 12.5 8.48 -32.16% BenchmarkIndexByte/32-16 34.4 9.85 -71.37% BenchmarkIndexByte/4K-16 3089 217 -92.98% BenchmarkIndexByte/4M-16 3154810 207051 -93.44% BenchmarkIndexByte/64M-16 50564811 5579093 -88.97% benchmark old MB/s new MB/s speedup BenchmarkIndexByte/10-16 800.41 1179.64 1.47x BenchmarkIndexByte/32-16 930.60 3249.10 3.49x BenchmarkIndexByte/4K-16 1325.71 18832.53 14.21x BenchmarkIndexByte/4M-16 1329.49 20257.29 15.24x BenchmarkIndexByte/64M-16 1327.19 12028.63 9.06x Improvement for strings·IndexByte: benchmark old ns/op new ns/op delta BenchmarkIndexByte-16 25.9 7.69 -70.31% Fixes #19030 Change-Id: Ifb82bbb3d643ec44b98eaa2d08a07f47e5c2fd11 Reviewed-on: https://go-review.googlesource.com/37670 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>

runtime: improve IndexByte for ppc64x
This change adds a better implementation of IndexByte for ppc64x. Improvement for bytes·IndexByte: benchmark old ns/op new ns/op delta BenchmarkIndexByte/10-16 12.5 8.48 -32.16% BenchmarkIndexByte/32-16 34.4 9.85 -71.37% BenchmarkIndexByte/4K-16 3089 217 -92.98% BenchmarkIndexByte/4M-16 3154810 207051 -93.44% BenchmarkIndexByte/64M-16 50564811 5579093 -88.97% benchmark old MB/s new MB/s speedup BenchmarkIndexByte/10-16 800.41 1179.64 1.47x BenchmarkIndexByte/32-16 930.60 3249.10 3.49x BenchmarkIndexByte/4K-16 1325.71 18832.53 14.21x BenchmarkIndexByte/4M-16 1329.49 20257.29 15.24x BenchmarkIndexByte/64M-16 1327.19 12028.63 9.06x Improvement for strings·IndexByte: benchmark old ns/op new ns/op delta BenchmarkIndexByte-16 25.9 7.69 -70.31% Fixes #19030 Change-Id: Ifb82bbb3d643ec44b98eaa2d08a07f47e5c2fd11 Reviewed-on: https://go-review.googlesource.com/37670 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
d60166d5 · Carlos Eduardo Seo · Lynn Boger · d5dc4905 · d60166d5
Commit d60166d5 authored Feb 27, 2017 by Carlos Eduardo Seo Committed by Lynn Boger Mar 16, 2017
Hide whitespace changes
Inline Side-by-side

Showing with 165 additions and 35 deletions

src/runtime/asm_ppc64x.s src/runtime/asm_ppc64x.s +165 -35

No files found.
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -1113,53 +1113,183 @@ equal:
 	MOVBZ	R3,ret+48(FP)
 	RET

-TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
-	MOVD	s+0(FP), R3
-	MOVD	s_len+8(FP), R4
-	MOVBZ	c+24(FP), R5	// byte to find
-	MOVD	R3, R6		// store base for later
-	SUB	$1, R3
-	ADD	R3, R4		// end-1
+TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	s+0(FP), R3		// R3 = byte array pointer
+	MOVD	s_len+8(FP), R4		// R4 = length
+	MOVBZ	c+24(FP), R5		// R5 = byte
+	MOVD	$ret+32(FP), R14	// R14 = &ret
+	BR	runtime·indexbytebody<>(SB)
+
+TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
+	MOVD	s+0(FP), R3	  // R3 = string
+	MOVD	s_len+8(FP), R4	  // R4 = length
+	MOVBZ	c+16(FP), R5	  // R5 = byte
+	MOVD	$ret+24(FP), R14  // R14 = &ret
+	BR	runtime·indexbytebody<>(SB)
+
+TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+	DCBT	(R3)		// Prepare cache line.
+	MOVD	R3,R10		// Save base address for calculating the index later.
+	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
+	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
+
+	// Calculate last acceptable address and check for possible overflow
+	// using a saturated add.
+	// Overflows set last acceptable address to 0xffffffffffffffff.
+	ADD	R4,R3,R7
+	SUBC	R3,R7,R6
+	SUBE	R0,R0,R9
+	MOVW	R9,R6
+	OR	R6,R7,R7
+
+	RLDIMI	$16,R5,$32,R5
+	CMPU	R4,$32		// Check if it's a small string (<32 bytes). Those will be processed differently.
+	MOVD	$-1,R9
+	WORD $0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
+	RLDIMI	$32,R5,$0,R5
+	ADD	$-1,R7,R7
+#ifdef GOARCH_ppc64le
+	SLD	R6,R9,R9	// Prepare mask for Little Endian
+#else
+	SRD	R6,R9,R9	// Same for Big Endian
+#endif
+	BLE	small_string	// Jump to the small string case if it's <32 bytes.
+
+	// Case for length >32 bytes
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base
+	RLDICL	$0,R7,$61,R4	// length-1
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
+	BNE	CR7,done
+
+	// Check for doubleword alignment and jump to the loop setup if aligned.
+	MOVFL	R8,CR7
+	BC	12,28,loop_setup
+
+	// Not aligned, so handle the second doubleword
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR7
+	BNE	CR7,done
+
+loop_setup:
+	// We are now aligned to a 16-byte boundary. We will load two doublewords
+	// per loop iteration. The last doubleword is in R7, so our loop counter
+	// starts at (R7-R8)/16.
+	SUB	R8,R7,R6
+	SRD	$4,R6,R6
+	MOVD	R6,CTR

+	// Note: when we have an align directive, align this loop to 32 bytes so
+	// it fits in a single icache sector.
 loop:
-	CMP	R3, R4
+	// Load two doublewords, then compare and merge in a single register. We
+	// will check two doublewords per iteration, then find out which of them
+	// contains the byte later. This speeds up the search.
+	MOVD	8(R8),R12
+	MOVDU	16(R8),R11
+	CMPB	R12,R5,R3
+	CMPB	R11,R5,R9
+	OR	R3,R9,R6
+	CMPU	R6,$0,CR7
+	BNE	CR7,found
+	BC	16,0,loop
+
+	// Counter zeroed, but we may have another doubleword to read
+	CMPU	R8,R7
 	BEQ	notfound
-	MOVBZU	1(R3), R7
-	CMP	R7, R5
-	BNE	loop

-	SUB	R6, R3		// remove base
-	MOVD	R3, ret+32(FP)
-	RET
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	BNE	CR6,done

 notfound:
-	MOVD	$-1, R3
-	MOVD	R3, ret+32(FP)
+	MOVD	$-1,R3
+	MOVD	R3,(R14)
 	RET

-TEXT strings·IndexByte(SB),NOSPLIT,$0-32
-	MOVD	p+0(FP), R3
-	MOVD	b_len+8(FP), R4
-	MOVBZ	c+16(FP), R5	// byte to find
-	MOVD	R3, R6		// store base for later
-	SUB	$1, R3
-	ADD	R3, R4		// end-1
+found:
+	// One of the doublewords from the loop contains the byte we are looking
+	// for. Check the first doubleword and adjust the address if found.
+	CMPU	R3,$0,CR6
+	ADD	$-8,R8,R8
+	BNE	CR6,done
+
+	// Not found, so it must be in the second doubleword of the merged pair.
+	MOVD	R9,R3
+	ADD	$8,R8,R8
+
+done:
+	// At this point, R3 has 0xFF in the same position as the byte we are
+	// looking for in the doubleword. Use that to calculate the exact index
+	// of the byte.
+#ifdef GOARCH_ppc64le
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	CMPU	R8,R7		// Check if we are at the last doubleword.
+	SRD	$3,R11		// Convert trailing zeros to bytes.
+	ADD	R11,R8,R3
+	CMPU	R11,R4,CR7	// If at the last doubleword, check the byte offset.
+	BNE	return
+	BLE	CR7,return
+	MOVD	$-1,R3
+	MOVD	R3,(R14)
+	RET

-loop:
-	CMP	R3, R4
+return:
+	SUB	R10,R3		// Calculate index.
+	MOVD	R3,(R14)
+	RET
+
+small_string:
+	// We unroll this loop for better performance.
+	CMPU	R4,$0		// Check for length=0
 	BEQ	notfound
-	MOVBZU	1(R3), R7
-	CMP	R7, R5
-	BNE	loop

-	SUB	R6, R3		// remove base
-	MOVD	R3, ret+24(FP)
-	RET
+	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
+	CMPB	R12,R5,R3	// Check for a match.
+	AND	R9,R3,R3	// Mask bytes below s_base.
+	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
+	RLDICL	$0,R7,$61,R4	// length-1
+	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
+        CMPU	R8,R7
+	BNE	CR7,done
+	BEQ	notfound	// Hit length.
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound

-notfound:
-	MOVD	$-1, R3
-	MOVD	R3, ret+24(FP)
-	RET
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BEQ	notfound
+
+	MOVDU	8(R8),R12
+	CMPB	R12,R5,R3
+	CMPU	R3,$0,CR6
+	CMPU	R8,R7
+	BNE	CR6,done
+	BR	notfound

 TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
 	MOVD	s1_base+0(FP), R5