Commit be943df5 authored by Carlos Eduardo Seo, committed by Lynn Boger

runtime: improve IndexByte for ppc64x

This change adds a better implementation of IndexByte in asm that uses the
vector registers/instructions on ppc64x.

benchmark                            old ns/op     new ns/op     delta
BenchmarkIndexByte/10-8              9.70          9.37          -3.40%
BenchmarkIndexByte/32-8              10.9          10.9          +0.00%
BenchmarkIndexByte/4K-8              254           92.8          -63.46%
BenchmarkIndexByte/4M-8              249246        118435        -52.48%
BenchmarkIndexByte/64M-8             10737987      7383096       -31.24%

benchmark                            old MB/s     new MB/s     speedup
BenchmarkIndexByte/10-8              1030.63      1067.24      1.04x
BenchmarkIndexByte/32-8              2922.69      2928.53      1.00x
BenchmarkIndexByte/4K-8              16065.95     44156.45     2.75x
BenchmarkIndexByte/4M-8              16827.96     35414.21     2.10x
BenchmarkIndexByte/64M-8             6249.67      9089.53      1.45x
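The tables above come from the standard Go benchmark harness. As a rough, illustrative sketch of how to reproduce them (not the actual benchmark source), the benchmark below mirrors the sizes in the table and goes through bytes.IndexByte, the public entry point that reaches this assembly; placing the target byte at the end exercises the full scan.

package indexbyte_test

import (
	"bytes"
	"fmt"
	"testing"
)

func BenchmarkIndexByte(b *testing.B) {
	for _, n := range []int{10, 32, 4 << 10, 4 << 20, 64 << 20} {
		buf := make([]byte, n)
		buf[n-1] = 1 // worst case: the byte we want is at the very end
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				if bytes.IndexByte(buf, 1) != n-1 {
					b.Fatal("wrong index")
				}
			}
		})
	}
}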

Change-Id: I81dbdd620f7bb4e395ce4d1f2a14e8e91e39f9a1
Reviewed-on: https://go-review.googlesource.com/71710
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
parent 4fcc8359
@@ -932,6 +932,12 @@ label1:
 	// <mnemonic> VRT,VRA,VRB,VRC
 	VPERM V3, V2, V1, V0
 
+	// Vector bit permute, VX-form
+	// <MNEMONIC> VRA,VRB,VRT produces
+	// <mnemonic> VRT,VRA,VRB
+	VBPERMQ V3,V1,V2
+	VBPERMD V3,V1,V2
+
 	// Vector select, VA-form
 	// <MNEMONIC> VRA,VRB,VRC,VRT produces
 	// <mnemonic> VRT,VRA,VRB,VRC
...
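The operand swap noted in the comments is the usual Go assembler convention. As for what the new instruction computes, here is a scalar Go model of vbpermq, written for illustration only (bit numbering is big-endian as in the ISA, bit 0 being the MSB): each of the 16 control bytes selects one bit of the 128-bit source, and the selected bits are packed into bits 48-63 of the result doubleword, which is what MFVRD can later move to a GPR.

package main

import "fmt"

// vbpermq is a scalar model of the instruction, for illustration only.
// Each control byte below 128 selects one bit of the 128-bit source;
// the 16 selected bits are packed into bits 48-63 of the result.
func vbpermq(src, ctl [16]byte) (r uint64) {
	for i, idx := range ctl {
		if idx < 128 {
			bit := src[idx/8] >> (7 - idx%8) & 1
			r |= uint64(bit) << (15 - uint(i))
		}
	}
	return r
}

func main() {
	var src [16]byte
	src[0] = 0x80 // set source bit 0 (the MSB of the first byte)
	ctl := [16]byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
	fmt.Printf("%#x\n", vbpermq(src, ctl)) // 0x8000: selected bit 0 lands in bit 48
}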
@@ -859,6 +859,8 @@ const (
 	AVCMPNEZB
 	AVCMPNEZBCC
 	AVPERM
+	AVBPERMQ
+	AVBPERMD
 	AVSEL
 	AVSPLT
 	AVSPLTB
...
@@ -474,6 +474,8 @@ var Anames = []string{
 	"VCMPNEZB",
 	"VCMPNEZBCC",
 	"VPERM",
+	"VBPERMQ",
+	"VBPERMD",
 	"VSEL",
 	"VSPLT",
 	"VSPLTB",
...
@@ -421,6 +421,9 @@ var optab = []Optab{
 	/* Vector permute */
 	{AVPERM, C_VREG, C_VREG, C_VREG, C_VREG, 83, 4, 0}, /* vector permute, va-form */
 
+	/* Vector bit permute */
+	{AVBPERMQ, C_VREG, C_VREG, C_NONE, C_VREG, 82, 4, 0}, /* vector bit permute, vx-form */
+
 	/* Vector select */
 	{AVSEL, C_VREG, C_VREG, C_VREG, C_VREG, 83, 4, 0}, /* vector select, va-form */
@@ -1378,6 +1381,9 @@ func buildop(ctxt *obj.Link) {
 	case AVPERM: /* vperm */
 		opset(AVPERM, r0)
 
+	case AVBPERMQ: /* vbpermq, vbpermd */
+		opset(AVBPERMD, r0)
+
 	case AVSEL: /* vsel */
 		opset(AVSEL, r0)
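The opset call above is what lets VBPERMD reuse the optab entry added for VBPERMQ. A simplified, self-contained sketch of that registration pattern follows; the types and map here are stand-ins for illustration, not the real asm9.go code.

package main

import "fmt"

// As and Optab are stand-ins for the real types in cmd/internal/obj/ppc64.
type As int

const (
	AVBPERMQ As = iota
	AVBPERMD
)

type Optab struct{ desc string }

// oprange maps an opcode to the encodings that can assemble it.
var oprange = map[As][]Optab{}

// opset gives instruction a the same encodings as the canonical opcode a0.
func opset(a, a0 As) { oprange[a] = oprange[a0] }

func main() {
	oprange[AVBPERMQ] = []Optab{{"vector bit permute, vx-form"}}
	opset(AVBPERMD, AVBPERMQ) // mirrors the buildop case above
	fmt.Println(oprange[AVBPERMD][0].desc)
}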
@@ -4165,6 +4171,11 @@ func (c *ctxt9) oprrr(a obj.As) uint32 {
 	case AVSRAD:
 		return OPVX(4, 964, 0, 0) /* vsrad - v2.07 */
 
+	case AVBPERMQ:
+		return OPVC(4, 1356, 0, 0) /* vbpermq - v2.07 */
+	case AVBPERMD:
+		return OPVC(4, 1484, 0, 0) /* vbpermd - v3.00 */
+
 	case AVCLZB:
 		return OPVX(4, 1794, 0, 0) /* vclzb - v2.07 */
 	case AVCLZH:
...
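The values returned here are instruction-word templates: the primary opcode (4 for vector ops) sits in the top six bits and the extended opcode fills the low-order field, so the assembler only needs to OR the register numbers in later. A small sketch under that assumption; opvc below is a simplified stand-in for the real helper and ignores the oe and rc bits.

package main

import "fmt"

// opvc is a simplified stand-in: primary opcode in bits 0-5,
// extended opcode in the low bits, oe/rc ignored for this sketch.
func opvc(o, xo uint32) uint32 { return o<<26 | xo }

func main() {
	fmt.Printf("vbpermq template: %#08x\n", opvc(4, 1356)) // 0x1000054c
	fmt.Printf("vbpermd template: %#08x\n", opvc(4, 1484)) // 0x100005cc
}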
@@ -1084,24 +1084,17 @@ TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
 TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 	DCBT	(R3)		// Prepare cache line.
-	MOVD	R3,R10		// Save base address for calculating the index later.
+	MOVD	R3,R17		// Save base address for calculating the index later.
 	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
 	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
-
-	// Calculate last acceptable address and check for possible overflow
-	// using a saturated add.
-	// Overflows set last acceptable address to 0xffffffffffffffff.
-	ADD	R4,R3,R7
-	SUBC	R3,R7,R6
-	SUBE	R0,R0,R9
-	MOVW	R9,R6
-	OR	R6,R7,R7
+	ADD	R4,R3,R7	// Last acceptable address in R7.
 
 	RLDIMI	$16,R5,$32,R5
 	CMPU	R4,$32		// Check if it's a small string (<32 bytes). Those will be processed differently.
 	MOVD	$-1,R9
 	WORD	$0x54661EB8	// Calculate padding in R6 (rlwinm r6,r3,3,26,28).
 	RLDIMI	$32,R5,$0,R5
+	MOVD	R7,R10		// Save last acceptable address in R10 for later.
 	ADD	$-1,R7,R7
 #ifdef GOARCH_ppc64le
 	SLD	R6,R9,R9	// Prepare mask for Little Endian
@@ -1110,56 +1103,142 @@ TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 #endif
 	BLE	small_string	// Jump to the small string case if it's <32 bytes.
 
-	// Case for length >32 bytes
+	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+	// in V0, V1 and V10, then branch to the preloop.
+	ANDCC	$63,R3,R11
+	BEQ	CR0,qw_align
+	RLDICL	$0,R3,$61,R11
+
 	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
 	CMPB	R12,R5,R3	// Check for a match.
 	AND	R9,R3,R3	// Mask bytes below s_base
-	RLDICL	$0,R7,$61,R4	// length-1
+	RLDICL	$0,R7,$61,R6	// length-1
 	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
 	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
 	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+	ADD	R4,R11,R4
 
-	// Check for doubleword alignment and jump to the loop setup if aligned.
-	MOVFL	R8,CR7
-	BC	12,28,loop_setup
-
-	// Not aligned, so handle the second doubleword
-	MOVDU	8(R8),R12
+	// Check for quadword alignment
+	ANDCC	$15,R8,R11
+	BEQ	CR0,qw_align
+
+	// Not aligned, so handle the next doubleword
+	MOVD	0(R8),R12
 	CMPB	R12,R5,R3
 	CMPU	R3,$0,CR7
 	BNE	CR7,done
+	ADD	$8,R8,R8
+	ADD	$-8,R4,R4
+
+	// Either quadword or 64-byte aligned at this point. We can use LVX.
+qw_align:
+
+	// Set up auxiliary data for the vectorized algorithm.
+	VSPLTISB	$0,V0		// Replicate 0 across V0
+	VSPLTISB	$3,V10		// Use V10 as control for VBPERMQ
+	MTVRD	R5,V1
+	LVSL	(R0+R0),V11
+	VSLB	V11,V10,V10
+	VSPLTB	$7,V1,V1	// Replicate byte across V1
+	CMPU	R4,$64		// If len <= 64, don't use the vectorized loop
+	BLE	tail
+
+	// We will load 4 quadwords per iteration in the loop, so check for
+	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+
+	// Not 64-byte aligned. Load one quadword at a time until aligned.
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	ADD	$-16,R4,R4
+
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	ADD	$-16,R4,R4
+
+	ANDCC	$63,R8,R11
+	BEQ	CR0,preloop
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
+	BNE	CR6,found_qw_align
+	ADD	$-16,R4,R4
+	ADD	$16,R8,R8
+
+	// 64-byte aligned. Prepare for the main loop.
+preloop:
+	CMPU	R4,$64
+	BLE	tail		// If len <= 64, don't use the vectorized loop
+
+	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
+	// per loop iteration. The last doubleword is in R10, so our loop counter
+	// starts at (R10-R8)/64.
+	SUB	R8,R10,R6
+	SRD	$6,R6,R9	// Loop counter in R9
+	MOVD	R9,CTR
 
-loop_setup:
-	// We are now aligned to a 16-byte boundary. We will load two doublewords
-	// per loop iteration. The last doubleword is in R7, so our loop counter
-	// starts at (R7-R8)/16.
-	SUB	R8,R7,R6
-	SRD	$4,R6,R6
-	MOVD	R6,CTR
+	MOVD	$16,R11		// Load offsets for the vector loads
+	MOVD	$32,R9
+	MOVD	$48,R7
 
-	// Note: when we have an align directive, align this loop to 32 bytes so
-	// it fits in a single icache sector.
+	// Main loop: load 64 bytes per iteration
 loop:
-	// Load two doublewords, then compare and merge in a single register. We
-	// will check two doublewords per iteration, then find out which of them
-	// contains the byte later. This speeds up the search.
-	MOVD	8(R8),R12
-	MOVDU	16(R8),R11
-	CMPB	R12,R5,R3
-	CMPB	R11,R5,R9
-	OR	R3,R9,R6
-	CMPU	R6,$0,CR7
-	BNE	CR7,found
-	BC	16,0,loop
-
-	// Counter zeroed, but we may have another doubleword to read
-	CMPU	R8,R7
-	BEQ	notfound
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	BNE	CR6,done
+	LVX	(R8+R0),V2	// Load 4 16-byte vectors
+	LVX	(R11+R8),V3
+	LVX	(R9+R8),V4
+	LVX	(R7+R8),V5
+	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
+	VCMPEQUB	V1,V3,V7
+	VCMPEQUB	V1,V4,V8
+	VCMPEQUB	V1,V5,V9
+	VOR	V6,V7,V11	// Compress the result in a single vector
+	VOR	V8,V9,V12
+	VOR	V11,V12,V11
+	VCMPEQUBCC	V0,V11,V11	// Check for byte
+	BGE	CR6,found
+	ADD	$64,R8,R8
+	BC	16,0,loop	// bdnz loop
+
+	// Handle the trailing bytes or R4 <= 64
+	RLDICL	$0,R6,$58,R4
+tail:
+	CMPU	R4,$0
+	BEQ	notfound
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
+	ADD	$16,R8,R8
+	CMPU	R4,$16,CR6
+	BLE	CR6,notfound
+	ADD	$-16,R4,R4
+
+	LVX	(R8+R0),V4
+	VCMPEQUBCC	V1,V4,V6
+	BNE	CR6,found_qw_align
 
 notfound:
 	MOVD	$-1,R3
@@ -1167,15 +1246,68 @@ notfound:
 	RET
 
 found:
-	// One of the doublewords from the loop contains the byte we are looking
-	// for. Check the first doubleword and adjust the address if found.
-	CMPU	R3,$0,CR6
-	ADD	$-8,R8,R8
-	BNE	CR6,done
-
-	// Not found, so it must be in the second doubleword of the merged pair.
-	MOVD	R9,R3
-	ADD	$8,R8,R8
+	// We will now compress the results into a single doubleword,
+	// so it can be moved to a GPR for the final index calculation.
+
+	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+	// first bit of each byte into bits 48-63.
+	VBPERMQ	V6,V10,V6
+	VBPERMQ	V7,V10,V7
+	VBPERMQ	V8,V10,V8
+	VBPERMQ	V9,V10,V9
+
+	// Shift each 16-bit component into its correct position for
+	// merging into a single doubleword.
+#ifdef GOARCH_ppc64le
+	VSLDOI	$2,V7,V7,V7
+	VSLDOI	$4,V8,V8,V8
+	VSLDOI	$6,V9,V9,V9
+#else
+	VSLDOI	$6,V6,V6,V6
+	VSLDOI	$4,V7,V7,V7
+	VSLDOI	$2,V8,V8,V8
+#endif
+
+	// Merge V6-V9 into a single doubleword and move to a GPR.
+	VOR	V6,V7,V11
+	VOR	V8,V9,V4
+	VOR	V4,V11,V4
+	MFVRD	V4,R3
+
+#ifdef GOARCH_ppc64le
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
+#else
+	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
+#endif
+	ADD	R8,R11,R3	// Calculate byte address
+
+return:
+	SUB	R17,R3
+	MOVD	R3,(R14)
+	RET
+
+found_qw_align:
+	// Use the same algorithm as above. Compress the result into
+	// a single doubleword and move it to a GPR for the final
+	// calculation.
+	VBPERMQ	V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+	MFVRD	V6,R3
+	ADD	$-1,R3,R11
+	ANDN	R3,R11,R11
+	POPCNTD	R11,R11
+#else
+	VSLDOI	$6,V6,V6,V6
+	MFVRD	V6,R3
+	CNTLZD	R3,R11
+#endif
+	ADD	R8,R11,R3
+	CMPU	R11,R4
+	BLT	return
+	BR	notfound
 
 done:
 	// At this point, R3 has 0xFF in the same position as the byte we are
@@ -1191,17 +1323,10 @@ done:
 	CMPU	R8,R7		// Check if we are at the last doubleword.
 	SRD	$3,R11		// Convert trailing zeros to bytes.
 	ADD	R11,R8,R3
-	CMPU	R11,R4,CR7	// If at the last doubleword, check the byte offset.
+	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
 	BNE	return
 	BLE	CR7,return
-	MOVD	$-1,R3
-	MOVD	R3,(R14)
-	RET
-
-return:
-	SUB	R10,R3		// Calculate index.
-	MOVD	R3,(R14)
-	RET
+	BR	notfound
 
 small_string:
 	// We unroll this loop for better performance.
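Both the found: and done: paths reduce the search to counting zeros in a doubleword mask. The little-endian sequence above (ADD $-1, ANDN, POPCNTD) is a trailing-zero count in disguise; a short Go model of that identity, checked against math/bits:

package main

import (
	"fmt"
	"math/bits"
)

// trailingZeros models the GOARCH_ppc64le sequence: ADD $-1 computes
// x-1, ANDN keeps only the bits of x-1 that are clear in x (exactly
// the bits below the lowest set bit), and POPCNTD counts them.
func trailingZeros(x uint64) int {
	return bits.OnesCount64((x - 1) &^ x)
}

func main() {
	fmt.Println(trailingZeros(0x50))        // 4
	fmt.Println(bits.TrailingZeros64(0x50)) // 4, the library equivalent
}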
@@ -1212,9 +1337,9 @@ small_string:
 	CMPB	R12,R5,R3	// Check for a match.
 	AND	R9,R3,R3	// Mask bytes below s_base.
 	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
-	RLDICL	$0,R7,$61,R4	// length-1
+	RLDICL	$0,R7,$61,R6	// length-1
 	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
 	CMPU	R8,R7
 	BNE	CR7,done
 	BEQ	notfound	// Hit length.
@@ -1242,7 +1367,6 @@ small_string:
 	MOVDU	8(R8),R12
 	CMPB	R12,R5,R3
 	CMPU	R3,$0,CR6
-	CMPU	R8,R7
 	BNE	CR6,done
 	BR	notfound
...
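For readers who do not speak ppc64 assembly, the control flow of the new indexbytebody can be summarized in scalar Go. This is an illustrative model only: it scans 64-byte chunks by index rather than by aligned address, and scanBlock stands in for the LVX/VCMPEQUB/VBPERMQ machinery.

package main

import (
	"fmt"
	"math/bits"
)

// scanBlock stands in for one vectorized step: compare every byte,
// compress the matches into a one-bit-per-byte mask (VBPERMQ + VOR +
// MFVRD in the assembly), and locate the first hit with a zero count.
func scanBlock(block []byte, c byte) int {
	var mask uint64
	for i, b := range block {
		if b == c {
			mask |= 1 << uint(i)
		}
	}
	if mask == 0 {
		return -1
	}
	return bits.TrailingZeros64(mask)
}

// indexByteModel mirrors the new structure: whole 64-byte blocks in the
// main loop, then a shorter tail, returning -1 when nothing matches.
func indexByteModel(s []byte, c byte) int {
	for i := 0; i < len(s); i += 64 {
		end := i + 64
		if end > len(s) {
			end = len(s)
		}
		if j := scanBlock(s[i:end], c); j >= 0 {
			return i + j
		}
	}
	return -1
}

func main() {
	fmt.Println(indexByteModel([]byte("hello, vector world"), 'v')) // 7
}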