internal/bytealg: add SIMD byte count implementation for s390x

Add a 'single lane' SIMD implemementation of the single byte count function for use on machines that support the vector facility. This allows up to 16 bytes to be counted per loop iteration. We can probably improve performance further by adding more 'lanes' (i.e. counting more bytes in parallel) however this will increase the complexity of the function so I'm not sure it is worth doing yet. name old speed new speed delta pkg:strings goos:linux goarch:s390x CountByte/10 789MB/s ± 0% 1131MB/s ± 0% +43.44% (p=0.000 n=9+9) CountByte/32 936MB/s ± 0% 3236MB/s ± 0% +245.87% (p=0.000 n=8+9) CountByte/4096 1.06GB/s ± 0% 21.26GB/s ± 0% +1907.07% (p=0.000 n=10+10) CountByte/4194304 1.06GB/s ± 0% 20.54GB/s ± 0% +1838.50% (p=0.000 n=10+10) CountByte/67108864 1.06GB/s ± 0% 18.31GB/s ± 0% +1629.51% (p=0.000 n=10+10) pkg:bytes goos:linux goarch:s390x CountSingle/10 800MB/s ± 0% 986MB/s ± 0% +23.21% (p=0.000 n=9+10) CountSingle/32 925MB/s ± 0% 2744MB/s ± 0% +196.55% (p=0.000 n=9+10) CountSingle/4K 1.26GB/s ± 0% 19.44GB/s ± 0% +1445.59% (p=0.000 n=10+10) CountSingle/4M 1.26GB/s ± 0% 20.28GB/s ± 0% +1510.26% (p=0.000 n=8+10) CountSingle/64M 1.23GB/s ± 0% 17.78GB/s ± 0% +1350.67% (p=0.000 n=9+10) Change-Id: I230d57905db92a8fdfc50b1d5be338941ae3a7a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/199979 Run-TryBot: Michael Munday <mike.munday@ibm.com> Reviewed-by: Keith Randall <khr@golang.org>

internal/bytealg: add SIMD byte count implementation for s390x
Add a 'single lane' SIMD implemementation of the single byte count function for use on machines that support the vector facility. This allows up to 16 bytes to be counted per loop iteration. We can probably improve performance further by adding more 'lanes' (i.e. counting more bytes in parallel) however this will increase the complexity of the function so I'm not sure it is worth doing yet. name old speed new speed delta pkg:strings goos:linux goarch:s390x CountByte/10 789MB/s ± 0% 1131MB/s ± 0% +43.44% (p=0.000 n=9+9) CountByte/32 936MB/s ± 0% 3236MB/s ± 0% +245.87% (p=0.000 n=8+9) CountByte/4096 1.06GB/s ± 0% 21.26GB/s ± 0% +1907.07% (p=0.000 n=10+10) CountByte/4194304 1.06GB/s ± 0% 20.54GB/s ± 0% +1838.50% (p=0.000 n=10+10) CountByte/67108864 1.06GB/s ± 0% 18.31GB/s ± 0% +1629.51% (p=0.000 n=10+10) pkg:bytes goos:linux goarch:s390x CountSingle/10 800MB/s ± 0% 986MB/s ± 0% +23.21% (p=0.000 n=9+10) CountSingle/32 925MB/s ± 0% 2744MB/s ± 0% +196.55% (p=0.000 n=9+10) CountSingle/4K 1.26GB/s ± 0% 19.44GB/s ± 0% +1445.59% (p=0.000 n=10+10) CountSingle/4M 1.26GB/s ± 0% 20.28GB/s ± 0% +1510.26% (p=0.000 n=8+10) CountSingle/64M 1.23GB/s ± 0% 17.78GB/s ± 0% +1350.67% (p=0.000 n=9+10) Change-Id: I230d57905db92a8fdfc50b1d5be338941ae3a7a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/199979 Run-TryBot: Michael Munday <mike.munday@ibm.com> Reviewed-by: Keith Randall <khr@golang.org>
b6245cef · Michael Munday · d9ee4b28 · b6245cef · b6245cef · b6245cef
Commit b6245cef authored Oct 07, 2019 by Michael Munday
3 changed files
--- a/src/internal/bytealg/count_generic.go
+++ b/src/internal/bytealg/count_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build !amd64,!arm,!arm64,!ppc64le,!ppc64
+// +build !amd64,!arm,!arm64,!ppc64le,!ppc64,!s390x

 package bytealg


--- a/src/internal/bytealg/count_native.go
+++ b/src/internal/bytealg/count_native.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build amd64 arm arm64 ppc64le ppc64
+// +build amd64 arm arm64 ppc64le ppc64 s390x

 package bytealg


--- a/src/internal/bytealg/count_s390x.s
+++ b/src/internal/bytealg/count_s390x.s
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// condition code masks
+#define EQ 8
+#define NE 7
+
+// register assignments
+#define R_ZERO R0
+#define R_VAL  R1
+#define R_TMP  R2
+#define R_PTR  R3
+#define R_LEN  R4
+#define R_CHAR R5
+#define R_RET  R6
+#define R_ITER R7
+#define R_CNT  R8
+#define R_MPTR R9
+
+// vector register assignments
+#define V_ZERO V0
+#define V_CHAR V1
+#define V_MASK V2
+#define V_VAL  V3
+#define V_CNT  V4
+
+// mask for trailing bytes in vector implementation
+GLOBL countbytemask<>(SB), RODATA, $16
+DATA countbytemask<>+0(SB)/8, $0x0101010101010101
+DATA countbytemask<>+8(SB)/8, $0x0101010101010101
+
+// func Count(b []byte, c byte) int
+TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
+	LMG   b+0(FP), R_PTR, R_LEN
+	MOVBZ c+24(FP), R_CHAR
+	MOVD  $ret+32(FP), R_RET
+	BR    countbytebody<>(SB)
+
+// func CountString(s string, c byte) int
+TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
+	LMG   s+0(FP), R_PTR, R_LEN
+	MOVBZ c+16(FP), R_CHAR
+	MOVD  $ret+24(FP), R_RET
+	BR    countbytebody<>(SB)
+
+// input:
+// R_PTR  = address of array of bytes
+// R_LEN  = number of bytes in array
+// R_CHAR = byte value to count zero (extended to register width)
+// R_RET  = address of return value
+TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
+	MOVD  $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP
+	MOVD  $countbytemask<>(SB), R_MPTR
+	CGIJ  $EQ, R_LEN, $0, ret0 // return if length is 0.
+	SRD   $4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks
+	MOVBZ (R_TMP), R_TMP       // load bool indicating support for vector facility
+	CGIJ  $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available
+
+	// Start of vector code (have vector facility).
+	//
+	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
+	// vector 'load with length' (VLL). It will be in the range [-1,14].
+	// Also replicate c across a 16-byte vector and initialize V_ZERO.
+	ANDW  $0xf, R_LEN
+	VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
+	VZERO V_ZERO             // V_ZERO = [1]uint128{0}
+	ADDW  $-1, R_LEN
+	VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}
+
+	// Jump to loop if we have more than 15 bytes to process.
+	CGIJ $NE, R_ITER, $0, vxchunks
+
+	// Load 1-15 bytes and corresponding mask.
+	// Note: only the low 32-bits of R_LEN are used for the index.
+	VLL R_LEN, (R_PTR), V_VAL
+	VLL R_LEN, (R_MPTR), V_MASK
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
+	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+
+	// Return rightmost (lowest) 64-bit part of accumulator.
+	VSTEG $1, V_CNT, (R_RET)
+	RET
+
+vxchunks:
+	// Load 0x01 into every byte element in the 16-byte mask vector.
+	VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
+	VZERO  V_CNT      // intial uint128 count of 0
+
+vxloop:
+	// Load input bytes in 16-byte chunks.
+	VL (R_PTR), V_VAL
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
+	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
+
+	// Increment input string address.
+	MOVD $16(R_PTR), R_PTR
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
+
+	// Repeat until all 16-byte chunks are done.
+	BRCTG R_ITER, vxloop
+
+	// Skip to end if there are no trailing bytes.
+	CIJ $EQ, R_LEN, $-1, vxret
+
+	// Load 1-15 bytes and corresponding mask.
+	// Note: only the low 32-bits of R_LEN are used for the index.
+	VLL R_LEN, (R_PTR), V_VAL
+	VLL R_LEN, (R_MPTR), V_MASK
+
+	// Compare each byte in input chunk against byte to be counted.
+	// Each byte element will be set to either 0 (no match) or 1 (match).
+	VCEQB V_CHAR, V_VAL, V_VAL
+	VN    V_MASK, V_VAL, V_VAL
+
+	// Accumulate matched byte count in 128-bit integer value.
+	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
+	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
+	VAQ    V_VAL, V_CNT, V_CNT  // accumulate
+
+vxret:
+	// Return rightmost (lowest) 64-bit part of accumulator.
+	VSTEG $1, V_CNT, (R_RET)
+	RET
+
+novx:
+	// Start of non-vector code (the vector facility not available).
+	//
+	// Initialise counter and constant zero.
+	MOVD $0, R_CNT
+	MOVD $0, R_ZERO
+
+loop:
+	// Read 1-byte from input and compare.
+	// Note: avoid putting LOCGR in critical path.
+	MOVBZ (R_PTR), R_VAL
+	MOVD  $1, R_TMP
+	MOVD  $1(R_PTR), R_PTR
+	CMPW  R_VAL, R_CHAR
+	LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
+	ADD   R_TMP, R_CNT       // accumulate 64-bit result
+
+	// Repeat until all bytes have been checked.
+	BRCTG R_LEN, loop
+
+ret:
+	MOVD R_CNT, (R_RET)
+	RET
+
+ret0:
+	MOVD $0, (R_RET)
+	RET