Commit b6245cef authored by Michael Munday's avatar Michael Munday

internal/bytealg: add SIMD byte count implementation for s390x

Add a 'single lane' SIMD implemementation of the single byte count
function for use on machines that support the vector facility. This
allows up to 16 bytes to be counted per loop iteration.

We can probably improve performance further by adding more 'lanes'
(i.e. counting more bytes in parallel) however this will increase
the complexity of the function so I'm not sure it is worth doing
yet.

name                old speed      new speed       delta
pkg:strings goos:linux goarch:s390x
CountByte/10         789MB/s ± 0%   1131MB/s ± 0%    +43.44%  (p=0.000 n=9+9)
CountByte/32         936MB/s ± 0%   3236MB/s ± 0%   +245.87%  (p=0.000 n=8+9)
CountByte/4096      1.06GB/s ± 0%  21.26GB/s ± 0%  +1907.07%  (p=0.000 n=10+10)
CountByte/4194304   1.06GB/s ± 0%  20.54GB/s ± 0%  +1838.50%  (p=0.000 n=10+10)
CountByte/67108864  1.06GB/s ± 0%  18.31GB/s ± 0%  +1629.51%  (p=0.000 n=10+10)
pkg:bytes goos:linux goarch:s390x
CountSingle/10       800MB/s ± 0%    986MB/s ± 0%    +23.21%  (p=0.000 n=9+10)
CountSingle/32       925MB/s ± 0%   2744MB/s ± 0%   +196.55%  (p=0.000 n=9+10)
CountSingle/4K      1.26GB/s ± 0%  19.44GB/s ± 0%  +1445.59%  (p=0.000 n=10+10)
CountSingle/4M      1.26GB/s ± 0%  20.28GB/s ± 0%  +1510.26%  (p=0.000 n=8+10)
CountSingle/64M     1.23GB/s ± 0%  17.78GB/s ± 0%  +1350.67%  (p=0.000 n=9+10)

Change-Id: I230d57905db92a8fdfc50b1d5be338941ae3a7a1
Reviewed-on: https://go-review.googlesource.com/c/go/+/199979
Run-TryBot: Michael Munday <mike.munday@ibm.com>
Reviewed-by: default avatarKeith Randall <khr@golang.org>
parent d9ee4b28
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!arm,!arm64,!ppc64le,!ppc64
// +build !amd64,!arm,!arm64,!ppc64le,!ppc64,!s390x
package bytealg
......
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64 arm arm64 ppc64le ppc64
// +build amd64 arm arm64 ppc64le ppc64 s390x
package bytealg
......
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// condition code masks
#define EQ 8
#define NE 7
// register assignments
#define R_ZERO R0
#define R_VAL R1
#define R_TMP R2
#define R_PTR R3
#define R_LEN R4
#define R_CHAR R5
#define R_RET R6
#define R_ITER R7
#define R_CNT R8
#define R_MPTR R9
// vector register assignments
#define V_ZERO V0
#define V_CHAR V1
#define V_MASK V2
#define V_VAL V3
#define V_CNT V4
// mask for trailing bytes in vector implementation
GLOBL countbytemask<>(SB), RODATA, $16
DATA countbytemask<>+0(SB)/8, $0x0101010101010101
DATA countbytemask<>+8(SB)/8, $0x0101010101010101
// func Count(b []byte, c byte) int
TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
LMG b+0(FP), R_PTR, R_LEN
MOVBZ c+24(FP), R_CHAR
MOVD $ret+32(FP), R_RET
BR countbytebody<>(SB)
// func CountString(s string, c byte) int
TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
LMG s+0(FP), R_PTR, R_LEN
MOVBZ c+16(FP), R_CHAR
MOVD $ret+24(FP), R_RET
BR countbytebody<>(SB)
// input:
// R_PTR = address of array of bytes
// R_LEN = number of bytes in array
// R_CHAR = byte value to count zero (extended to register width)
// R_RET = address of return value
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
MOVD $internalcpu·S390X+const_offsetS390xHasVX(SB), R_TMP
MOVD $countbytemask<>(SB), R_MPTR
CGIJ $EQ, R_LEN, $0, ret0 // return if length is 0.
SRD $4, R_LEN, R_ITER // R_ITER is the number of 16-byte chunks
MOVBZ (R_TMP), R_TMP // load bool indicating support for vector facility
CGIJ $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available
// Start of vector code (have vector facility).
//
// Set R_LEN to be the length mod 16 minus 1 to use as an index for
// vector 'load with length' (VLL). It will be in the range [-1,14].
// Also replicate c across a 16-byte vector and initialize V_ZERO.
ANDW $0xf, R_LEN
VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
VZERO V_ZERO // V_ZERO = [1]uint128{0}
ADDW $-1, R_LEN
VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}
// Jump to loop if we have more than 15 bytes to process.
CGIJ $NE, R_ITER, $0, vxchunks
// Load 1-15 bytes and corresponding mask.
// Note: only the low 32-bits of R_LEN are used for the index.
VLL R_LEN, (R_PTR), V_VAL
VLL R_LEN, (R_MPTR), V_MASK
// Compare each byte in input chunk against byte to be counted.
// Each byte element will be set to either 0 (no match) or 1 (match).
VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
// Accumulate matched byte count in 128-bit integer value.
VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} [1]uint128{x0+x1+x2+x3}
// Return rightmost (lowest) 64-bit part of accumulator.
VSTEG $1, V_CNT, (R_RET)
RET
vxchunks:
// Load 0x01 into every byte element in the 16-byte mask vector.
VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
VZERO V_CNT // intial uint128 count of 0
vxloop:
// Load input bytes in 16-byte chunks.
VL (R_PTR), V_VAL
// Compare each byte in input chunk against byte to be counted.
// Each byte element will be set to either 0 (no match) or 1 (match).
VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
VN V_MASK, V_VAL, V_VAL // mask out most significant 7 bits
// Increment input string address.
MOVD $16(R_PTR), R_PTR
// Accumulate matched byte count in 128-bit integer value.
VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} [1]uint128{x0+x1+x2+x3}
VAQ V_VAL, V_CNT, V_CNT // accumulate
// Repeat until all 16-byte chunks are done.
BRCTG R_ITER, vxloop
// Skip to end if there are no trailing bytes.
CIJ $EQ, R_LEN, $-1, vxret
// Load 1-15 bytes and corresponding mask.
// Note: only the low 32-bits of R_LEN are used for the index.
VLL R_LEN, (R_PTR), V_VAL
VLL R_LEN, (R_MPTR), V_MASK
// Compare each byte in input chunk against byte to be counted.
// Each byte element will be set to either 0 (no match) or 1 (match).
VCEQB V_CHAR, V_VAL, V_VAL
VN V_MASK, V_VAL, V_VAL
// Accumulate matched byte count in 128-bit integer value.
VSUMB V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} [1]uint128{x0+x1+x2+x3}
VAQ V_VAL, V_CNT, V_CNT // accumulate
vxret:
// Return rightmost (lowest) 64-bit part of accumulator.
VSTEG $1, V_CNT, (R_RET)
RET
novx:
// Start of non-vector code (the vector facility not available).
//
// Initialise counter and constant zero.
MOVD $0, R_CNT
MOVD $0, R_ZERO
loop:
// Read 1-byte from input and compare.
// Note: avoid putting LOCGR in critical path.
MOVBZ (R_PTR), R_VAL
MOVD $1, R_TMP
MOVD $1(R_PTR), R_PTR
CMPW R_VAL, R_CHAR
LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
ADD R_TMP, R_CNT // accumulate 64-bit result
// Repeat until all bytes have been checked.
BRCTG R_LEN, loop
ret:
MOVD R_CNT, (R_RET)
RET
ret0:
MOVD $0, (R_RET)
RET
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment