Commit be943df5 authored by Carlos Eduardo Seo, committed by Lynn Boger

runtime: improve IndexByte for ppc64x

This change adds a better implementation of IndexByte in asm that uses the
vector registers/instructions on ppc64x.

benchmark                            old ns/op     new ns/op     delta
BenchmarkIndexByte/10-8              9.70          9.37          -3.40%
BenchmarkIndexByte/32-8              10.9          10.9          +0.00%
BenchmarkIndexByte/4K-8              254           92.8          -63.46%
BenchmarkIndexByte/4M-8              249246        118435        -52.48%
BenchmarkIndexByte/64M-8             10737987      7383096       -31.24%

benchmark                            old MB/s     new MB/s     speedup
BenchmarkIndexByte/10-8              1030.63      1067.24      1.04x
BenchmarkIndexByte/32-8              2922.69      2928.53      1.00x
BenchmarkIndexByte/4K-8              16065.95     44156.45     2.75x
BenchmarkIndexByte/4M-8              16827.96     35414.21     2.10x
BenchmarkIndexByte/64M-8             6249.67      9089.53      1.45x
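The tables above come from the standard Go benchmark harness. As a rough, illustrative sketch of how to reproduce them (not the actual benchmark source), the benchmark below mirrors the sizes in the table and goes through bytes.IndexByte, the public entry point that reaches this assembly; placing the target byte at the end exercises the full scan.

package indexbyte_test

import (
	"bytes"
	"fmt"
	"testing"
)

func BenchmarkIndexByte(b *testing.B) {
	for _, n := range []int{10, 32, 4 << 10, 4 << 20, 64 << 20} {
		buf := make([]byte, n)
		buf[n-1] = 1 // worst case: the byte we want is at the very end
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				if bytes.IndexByte(buf, 1) != n-1 {
					b.Fatal("wrong index")
				}
			}
		})
	}
}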

Change-Id: I81dbdd620f7bb4e395ce4d1f2a14e8e91e39f9a1
Reviewed-on: https://go-review.googlesource.com/71710
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
parent 4fcc8359
@@ -932,6 +932,12 @@ label1:
 	// <mnemonic> VRT,VRA,VRB,VRC
 	VPERM V3, V2, V1, V0
 
+	// Vector bit permute, VX-form
+	// <MNEMONIC> VRA,VRB,VRT produces
+	// <mnemonic> VRT,VRA,VRB
+	VBPERMQ V3,V1,V2
+	VBPERMD V3,V1,V2
+
 	// Vector select, VA-form
 	// <MNEMONIC> VRA,VRB,VRC,VRT produces
 	// <mnemonic> VRT,VRA,VRB,VRC
...
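The operand swap noted in the comments is the usual Go assembler convention. As for what the new instruction computes, here is a scalar Go model of vbpermq, written for illustration only (bit numbering is big-endian as in the ISA, bit 0 being the MSB): each of the 16 control bytes selects one bit of the 128-bit source, and the selected bits are packed into bits 48-63 of the result doubleword, which is what MFVRD can later move to a GPR.

package main

import "fmt"

// vbpermq is a scalar model of the instruction, for illustration only.
// Each control byte below 128 selects one bit of the 128-bit source;
// the 16 selected bits are packed into bits 48-63 of the result.
func vbpermq(src, ctl [16]byte) (r uint64) {
	for i, idx := range ctl {
		if idx < 128 {
			bit := src[idx/8] >> (7 - idx%8) & 1
			r |= uint64(bit) << (15 - uint(i))
		}
	}
	return r
}

func main() {
	var src [16]byte
	src[0] = 0x80 // set source bit 0 (the MSB of the first byte)
	ctl := [16]byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
	fmt.Printf("%#x\n", vbpermq(src, ctl)) // 0x8000: selected bit 0 lands in bit 48
}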
@@ -859,6 +859,8 @@ const (
 	AVCMPNEZB
 	AVCMPNEZBCC
 	AVPERM
+	AVBPERMQ
+	AVBPERMD
 	AVSEL
 	AVSPLT
 	AVSPLTB
...
@@ -474,6 +474,8 @@ var Anames = []string{
 	"VCMPNEZB",
 	"VCMPNEZBCC",
 	"VPERM",
+	"VBPERMQ",
+	"VBPERMD",
 	"VSEL",
 	"VSPLT",
 	"VSPLTB",
...
@@ -421,6 +421,9 @@ var optab = []Optab{
 	/* Vector permute */
 	{AVPERM, C_VREG, C_VREG, C_VREG, C_VREG, 83, 4, 0}, /* vector permute, va-form */
 
+	/* Vector bit permute */
+	{AVBPERMQ, C_VREG, C_VREG, C_NONE, C_VREG, 82, 4, 0}, /* vector bit permute, vx-form */
+
 	/* Vector select */
 	{AVSEL, C_VREG, C_VREG, C_VREG, C_VREG, 83, 4, 0}, /* vector select, va-form */
@@ -1378,6 +1381,9 @@ func buildop(ctxt *obj.Link) {
 	case AVPERM: /* vperm */
 		opset(AVPERM, r0)
 
+	case AVBPERMQ: /* vbpermq, vbpermd */
+		opset(AVBPERMD, r0)
+
 	case AVSEL: /* vsel */
 		opset(AVSEL, r0)
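The opset call above is what lets VBPERMD reuse the optab entry added for VBPERMQ. A simplified, self-contained sketch of that registration pattern follows; the types and map here are stand-ins for illustration, not the real asm9.go code.

package main

import "fmt"

// As and Optab are stand-ins for the real types in cmd/internal/obj/ppc64.
type As int

const (
	AVBPERMQ As = iota
	AVBPERMD
)

type Optab struct{ desc string }

// oprange maps an opcode to the encodings that can assemble it.
var oprange = map[As][]Optab{}

// opset gives instruction a the same encodings as the canonical opcode a0.
func opset(a, a0 As) { oprange[a] = oprange[a0] }

func main() {
	oprange[AVBPERMQ] = []Optab{{"vector bit permute, vx-form"}}
	opset(AVBPERMD, AVBPERMQ) // mirrors the buildop case above
	fmt.Println(oprange[AVBPERMD][0].desc)
}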
@@ -4165,6 +4171,11 @@ func (c *ctxt9) oprrr(a obj.As) uint32 {
 	case AVSRAD:
 		return OPVX(4, 964, 0, 0) /* vsrad - v2.07 */
 
+	case AVBPERMQ:
+		return OPVC(4, 1356, 0, 0) /* vbpermq - v2.07 */
+	case AVBPERMD:
+		return OPVC(4, 1484, 0, 0) /* vbpermd - v3.00 */
+
 	case AVCLZB:
 		return OPVX(4, 1794, 0, 0) /* vclzb - v2.07 */
 	case AVCLZH:
...
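The values returned here are instruction-word templates: the primary opcode (4 for vector ops) sits in the top six bits and the extended opcode fills the low-order field, so the assembler only needs to OR the register numbers in later. A small sketch under that assumption; opvc below is a simplified stand-in for the real helper and ignores the oe and rc bits.

package main

import "fmt"

// opvc is a simplified stand-in: primary opcode in bits 0-5,
// extended opcode in the low bits, oe/rc ignored for this sketch.
func opvc(o, xo uint32) uint32 { return o<<26 | xo }

func main() {
	fmt.Printf("vbpermq template: %#08x\n", opvc(4, 1356)) // 0x1000054c
	fmt.Printf("vbpermd template: %#08x\n", opvc(4, 1484)) // 0x100005cc
}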
@@ -1084,24 +1084,17 @@ TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
 TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 	DCBT	(R3)		// Prepare cache line.
-	MOVD	R3,R10		// Save base address for calculating the index later.
+	MOVD	R3,R17		// Save base address for calculating the index later.
 	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
 	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
-
-	// Calculate last acceptable address and check for possible overflow
-	// using a saturated add.
-	// Overflows set last acceptable address to 0xffffffffffffffff.
-	ADD	R4,R3,R7
-	SUBC	R3,R7,R6
-	SUBE	R0,R0,R9
-	MOVW	R9,R6
-	OR	R6,R7,R7
+	ADD	R4,R3,R7	// Last acceptable address in R7.
 
 	RLDIMI	$16,R5,$32,R5
 	CMPU	R4,$32		// Check if it's a small string (<32 bytes). Those will be processed differently.
 	MOVD	$-1,R9
 	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
 	RLDIMI	$32,R5,$0,R5
+	MOVD	R7,R10		// Save last acceptable address in R10 for later.
 	ADD	$-1,R7,R7
 #ifdef GOARCH_ppc64le
 	SLD	R6,R9,R9	// Prepare mask for Little Endian
@@ -1110,56 +1103,142 @@ TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 #endif
 	BLE	small_string	// Jump to the small string case if it's <32 bytes.
 
-	// Case for length >32 bytes
+	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+	// in V0, V1 and V10, then branch to the preloop.
+	ANDCC	$63,R3,R11
+	BEQ	CR0,qw_align
+	RLDICL	$0,R3,$61,R11
+
 	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
 	CMPB	R12,R5,R3	// Check for a match.
 	AND	R9,R3,R3	// Mask bytes below s_base
-	RLDICL	$0,R7,$61,R4	// length-1
+	RLDICL	$0,R7,$61,R6	// length-1
 	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
 	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
 	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+	ADD	R4,R11,R4
 
-	// Check for doubleword alignment and jump to the loop setup if aligned.
-	MOVFL	R8,CR7
-	BC	12,28,loop_setup
-
-	// Not aligned, so handle the second doubleword
-	MOVDU	8(R8),R12
+	// Check for quadword alignment
+	ANDCC	$15,R8,R11
+	BEQ	CR0,qw_align
+
+	// Not aligned, so handle the next doubleword
+	MOVD	0(R8),R12
 	CMPB	R12,R5,R3
 	CMPU	R3,$0,CR7
 	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+
+	// Either quadword or 64-byte aligned at this point. We can use LVX.
+qw_align:
+
+	// Set up auxiliary data for the vectorized algorithm.
+	VSPLTISB	$0,V0		// Replicate 0 across V0
+	VSPLTISB	$3,V10		// Use V10 as control for VBPERMQ
+	MTVRD	R5,V1
+	LVSL	(R0+R0),V11
+	VSLB	V11,V10,V10
+	VSPLTB	$7,V1,V1	// Replicate byte across V1
+	CMPU	R4,$64		// If len <= 64, don't use the vectorized loop
+	BLE	tail
+
+	// We will load 4 quadwords per iteration in the loop, so check for
+	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+
+	// Not 64-byte aligned. Load one quadword at a time until aligned.
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	ADD	$-16,R4,R4
+
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	ADD	$-16,R4,R4
+
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$-16,R4,R4
+	ADD	$16,R8,R8
+
+	// 64-byte aligned. Prepare for the main loop.
+preloop:
+	CMPU	R4,$64
+	BLE	tail		// If len <= 64, don't use the vectorized loop
+
+	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
+	// per loop iteration. The last doubleword is in R10, so our loop counter
+	// starts at (R10-R8)/64.
+	SUB	R8,R10,R6
+	SRD	$6,R6,R9	// Loop counter in R9
+	MOVD	R9,CTR
 
-loop_setup:
-	// We are now aligned to a 16-byte boundary. We will load two doublewords
-	// per loop iteration. The last doubleword is in R7, so our loop counter
-	// starts at (R7-R8)/16.
-	SUB	R8,R7,R6
-	SRD	$4,R6,R6
-	MOVD	R6,CTR
+	MOVD	$16,R11		// Load offsets for the vector loads
+	MOVD	$32,R9
+	MOVD	$48,R7
 
-	// Note: when we have an align directive, align this loop to 32 bytes so
-	// it fits in a single icache sector.
+	// Main loop: load 64 bytes per iteration
 loop:
-	// Load two doublewords, then compare and merge in a single register. We
-	// will check two doublewords per iteration, then find out which of them
-	// contains the byte later. This speeds up the search.
-	MOVD	8(R8),R12
-	MOVDU	16(R8),R11
-	CMPB	R12,R5,R3
-	CMPB	R11,R5,R9
-	OR	R3,R9,R6
-	CMPU	R6,$0,CR7
-	BNE	CR7,found
-	BC	16,0,loop
-
-	// Counter zeroed, but we may have another doubleword to read
-	CMPU	R8,R7
-	BEQ	notfound
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	BNE	CR6,done
+	LVX	(R8+R0),V2	// Load 4 16-byte vectors
+	LVX	(R11+R8),V3
+	LVX	(R9+R8),V4
+	LVX	(R7+R8),V5
+	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
+	VCMPEQUB	V1,V3,V7
+	VCMPEQUB	V1,V4,V8
+	VCMPEQUB	V1,V5,V9
+	VOR	V6,V7,V11	// Compress the result in a single vector
+	VOR	V8,V9,V12
+	VOR	V11,V12,V11
+	VCMPEQUBCC	V0,V11,V11	// Check for byte
+	BGE	CR6,found
+	ADD	$64,R8,R8
+	BC	16,0,loop	// bdnz loop
+
+	// Handle the trailing bytes or R4 <= 64
+	RLDICL	$0,R6,$58,R4
+tail:
+	CMPU	R4,$0
+	BEQ	notfound
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
 
 notfound:
 	MOVD	$-1,R3
@@ -1167,15 +1246,68 @@ notfound:
 	RET
 
 found:
-	// One of the doublewords from the loop contains the byte we are looking
-	// for. Check the first doubleword and adjust the address if found.
-	CMPU	R3,$0,CR6
-	ADD	$-8,R8,R8
-	BNE	CR6,done
-
-	// Not found, so it must be in the second doubleword of the merged pair.
-	MOVD	R9,R3
-	ADD	$8,R8,R8
+	// We will now compress the results into a single doubleword,
+	// so it can be moved to a GPR for the final index calculation.
+
+	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+	// first bit of each byte into bits 48-63.
+	VBPERMQ	V6,V10,V6
+	VBPERMQ	V7,V10,V7
+	VBPERMQ	V8,V10,V8
+	VBPERMQ	V9,V10,V9
+
+	// Shift each 16-bit component into its correct position for
+	// merging into a single doubleword.
+#ifdef GOARCH_ppc64le
+	VSLDOI	$2,V7,V7,V7
+	VSLDOI	$4,V8,V8,V8
+	VSLDOI	$6,V9,V9,V9
+#else
+	VSLDOI	$6,V6,V6,V6
+	VSLDOI	$4,V7,V7,V7
+	VSLDOI	$2,V8,V8,V8
+#endif
+
+	// Merge V6-V9 into a single doubleword and move to a GPR.
+	VOR	V6,V7,V11
+	VOR	V8,V9,V4
+	VOR	V4,V11,V4
+	MFVRD	V4,R3
+
+#ifdef GOARCH_ppc64le
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	ADD	R8,R11,R3	// Calculate byte address
+
+return:
+	SUB	R17,R3
+	MOVD	R3,(R14)
+	RET
+
+found_qw_align:
+	// Use the same algorithm as above. Compress the result into
+	// a single doubleword and move it to a GPR for the final
+	// calculation.
+	VBPERMQ	V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+	MFVRD	V6,R3
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11
+#else
+	VSLDOI	$6,V6,V6,V6
+	MFVRD	V6,R3
+	CNTLZD	R3,R11
+#endif
+	ADD	R8,R11,R3
+	CMPU	R11,R4
+	BLT	return
+	BR	notfound
 
 done:
 	// At this point, R3 has 0xFF in the same position as the byte we are
@@ -1191,17 +1323,10 @@ done:
 	CMPU	R8,R7		// Check if we are at the last doubleword.
 	SRD	$3,R11		// Convert trailing zeros to bytes.
 	ADD	R11,R8,R3
-	CMPU	R11,R4,CR7	// If at the last doubleword, check the byte offset.
+	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
 	BNE	return
 	BLE	CR7,return
-	MOVD	$-1,R3
-	MOVD	R3,(R14)
-	RET
-
-return:
-	SUB	R10,R3		// Calculate index.
-	MOVD	R3,(R14)
-	RET
+	BR	notfound
 
 small_string:
 	// We unroll this loop for better performance.
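Both the found: and done: paths reduce the search to counting zeros in a doubleword mask. The little-endian sequence above (ADD $-1, ANDN, POPCNTD) is a trailing-zero count in disguise; a short Go model of that identity, checked against math/bits:

package main

import (
	"fmt"
	"math/bits"
)

// trailingZeros models the GOARCH_ppc64le sequence: ADD $-1 computes
// x-1, ANDN keeps only the bits of x-1 that are clear in x (exactly
// the bits below the lowest set bit), and POPCNTD counts them.
func trailingZeros(x uint64) int {
	return bits.OnesCount64((x - 1) &^ x)
}

func main() {
	fmt.Println(trailingZeros(0x50))        // 4
	fmt.Println(bits.TrailingZeros64(0x50)) // 4, the library equivalent
}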
@@ -1212,9 +1337,9 @@ small_string:
 	CMPB	R12,R5,R3	// Check for a match.
 	AND	R9,R3,R3	// Mask bytes below s_base.
 	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
-	RLDICL	$0,R7,$61,R4	// length-1
+	RLDICL	$0,R7,$61,R6	// length-1
 	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
 	CMPU	R8,R7
 	BNE	CR7,done
 	BEQ	notfound	// Hit length.
@@ -1242,7 +1367,6 @@ small_string:
 	MOVDU	8(R8),R12
 	CMPB	R12,R5,R3
 	CMPU	R3,$0,CR6
-	CMPU	R8,R7
 	BNE	CR6,done
 	BR	notfound
...
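For readers who do not speak ppc64 assembly, the control flow of the new indexbytebody can be summarized in scalar Go. This is an illustrative model only: it scans 64-byte chunks by index rather than by aligned address, and scanBlock stands in for the LVX/VCMPEQUB/VBPERMQ machinery.

package main

import (
	"fmt"
	"math/bits"
)

// scanBlock stands in for one vectorized step: compare every byte,
// compress the matches into a one-bit-per-byte mask (VBPERMQ + VOR +
// MFVRD in the assembly), and locate the first hit with a zero count.
func scanBlock(block []byte, c byte) int {
	var mask uint64
	for i, b := range block {
		if b == c {
			mask |= 1 << uint(i)
		}
	}
	if mask == 0 {
		return -1
	}
	return bits.TrailingZeros64(mask)
}

// indexByteModel mirrors the new structure: whole 64-byte blocks in the
// main loop, then a shorter tail, returning -1 when nothing matches.
func indexByteModel(s []byte, c byte) int {
	for i := 0; i < len(s); i += 64 {
		end := i + 64
		if end > len(s) {
			end = len(s)
		}
		if j := scanBlock(s[i:end], c); j >= 0 {
			return i + j
		}
	}
	return -1
}

func main() {
	fmt.Println(indexByteModel([]byte("hello, vector world"), 'v')) // 7
}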