Commit 9a14cd9e authored by Wei Xiao, committed by Brad Fitzpatrick

bytes: add optimized countByte for arm64

Use SIMD instructions when counting a single byte.
Inspired by the runtime IndexByte implementation.

Benchmark results of bytes, where 1 byte in every 8 is the one we are looking for:

name               old time/op   new time/op    delta
CountSingle/10-8    96.1ns ± 1%    38.8ns ± 0%    -59.64%  (p=0.000 n=9+7)
CountSingle/32-8     172ns ± 2%      36ns ± 1%    -79.27%  (p=0.000 n=10+10)
CountSingle/4K-8    18.2µs ± 1%     0.9µs ± 0%    -95.17%  (p=0.000 n=9+10)
CountSingle/4M-8    18.4ms ± 0%     0.9ms ± 0%    -95.00%  (p=0.000 n=10+9)
CountSingle/64M-8    284ms ± 4%      19ms ± 0%    -93.40%  (p=0.000 n=10+10)

name               old speed     new speed      delta
CountSingle/10-8   104MB/s ± 1%   258MB/s ± 0%   +147.99%  (p=0.000 n=9+10)
CountSingle/32-8   185MB/s ± 1%   897MB/s ± 1%   +385.33%  (p=0.000 n=9+10)
CountSingle/4K-8   225MB/s ± 1%  4658MB/s ± 0%  +1967.40%  (p=0.000 n=9+10)
CountSingle/4M-8   228MB/s ± 0%  4555MB/s ± 0%  +1901.71%  (p=0.000 n=10+9)
CountSingle/64M-8  236MB/s ± 4%  3575MB/s ± 0%  +1414.69%  (p=0.000 n=10+10)

Change-Id: Ifccb51b3c8658c49773fe05147c3cf3aead361e5
Reviewed-on: https://go-review.googlesource.com/71111
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 63ef3cde
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bytes

func countByte(s []byte, c byte) int // bytes_arm64.s

// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
func Index(s, sep []byte) int {
	n := len(sep)
	switch {
	case n == 0:
		return 0
	case n == 1:
		return IndexByte(s, sep[0])
	case n == len(s):
		if Equal(sep, s) {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	}
	c := sep[0]
	i := 0
	fails := 0
	t := s[:len(s)-n+1]
	for i < len(t) {
		if t[i] != c {
			o := IndexByte(t[i:], c)
			if o < 0 {
				break
			}
			i += o
		}
		if Equal(s[i:i+n], sep) {
			return i
		}
		i++
		fails++
		if fails >= 4+i>>4 && i < len(t) {
			// Give up on IndexByte, it isn't skipping ahead
			// far enough to be better than Rabin-Karp.
			// Experiments (using IndexPeriodic) suggest
			// the cutover is about 16 byte skips.
			// TODO: if large prefixes of sep are matching
			// we should cutover at even larger average skips,
			// because Equal becomes that much more expensive.
			// This code does not take that effect into account.
			j := indexRabinKarp(s[i:], sep)
			if j < 0 {
				return -1
			}
			return i + j
		}
	}
	return -1
}

// Count counts the number of non-overlapping instances of sep in s.
// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
func Count(s, sep []byte) int {
	if len(sep) == 1 {
		return countByte(s, sep[0])
	}
	return countGeneric(s, sep)
}
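A quick illustration (an editor's sketch, not part of the commit): with a one-byte sep, Count above dispatches to the assembly countByte, so a call like the following exercises the new fast path on arm64. The buffer layout mirrors the benchmarks in the commit message, with one matching byte in every 8.

package main

import (
	"bytes"
	"fmt"
)

func main() {
	// One matching byte in every 8, as in the CountSingle benchmarks.
	buf := make([]byte, 4096)
	for i := 7; i < len(buf); i += 8 {
		buf[i] = 'x'
	}
	// len(sep) == 1, so Count takes the countByte fast path on arm64.
	fmt.Println(bytes.Count(buf, []byte{'x'})) // 512
}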
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// countByte(s []byte, c byte) int
TEXT bytes·countByte(SB),NOSPLIT,$0-40
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	c+24(FP), R1
	// R11 = count of byte to search
	MOVD	$0, R11
	// short path to handle 0-byte case
	CBZ	R2, done
	CMP	$0x20, R2
	// jump directly to tail if length < 32
	BLO	tail
	ANDS	$0x1f, R0, R9
	BEQ	chunk
	// Work with not 32-byte aligned head
	BIC	$0x1f, R0, R3
	ADD	$0x20, R3
head_loop:
	MOVBU.P	1(R0), R5
	CMP	R5, R1
	CINC	EQ, R11, R11
	SUB	$1, R2, R2
	CMP	R0, R3
	BNE	head_loop
	// Work with 32-byte aligned chunks
chunk:
	BIC	$0x1f, R2, R9
	// The first chunk can also be the last
	CBZ	R9, tail
	// R3 = end of 32-byte chunks
	ADD	R0, R9, R3
	MOVD	$1, R5
	VMOV	R5, V5.B16
	// R2 = length of tail
	SUB	R9, R2, R2
	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
	VMOV	R1, V0.B16
	// Clear the low 64-bit element of V7 and V8
	VEOR	V7.B8, V7.B8, V7.B8
	VEOR	V8.B8, V8.B8, V8.B8
	// Count the target byte in 32-byte chunk
chunk_loop:
	VLD1.P	(R0), [V1.B16, V2.B16]
	CMP	R0, R3
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// Clear the higher 7 bits
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	// Count lanes match the requested byte
	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
	VUADDLV	V6.B16, V7
	// Accumulate the count in low 64-bit element of V8 when inside the loop
	VADD	V7, V8
	BNE	chunk_loop
	VMOV	V8.D[0], R6
	ADD	R6, R11, R11
	CBZ	R2, done
tail:
	// Work with tail shorter than 32 bytes
	MOVBU.P	1(R0), R5
	SUB	$1, R2, R2
	CMP	R5, R1
	CINC	EQ, R11, R11
	CBNZ	R2, tail
done:
	MOVD	R11, ret+32(FP)
	RET
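For readers who do not follow ARM64 assembly, here is a rough pure-Go sketch (illustrative only, not code from this CL) of the control flow above: count in 32-byte chunks, then finish the tail byte by byte. The assembly does the chunk step with SIMD — VCMEQ produces all-ones lanes for matches, VAND masks each lane down to 1, VADDP folds the two 16-byte vectors into one, and VUADDLV sums the lanes — and it also peels off an unaligned head, which this sketch omits.

package main

import "fmt"

// countByteSketch mirrors the shape of the assembly: a 32-byte chunk
// loop followed by a byte-at-a-time tail. Illustrative only; the real
// code replaces the inner chunk loop with vector instructions.
func countByteSketch(s []byte, c byte) int {
	n := 0
	i := 0
	for ; i+32 <= len(s); i += 32 { // chunk loop ("chunk_loop" label above)
		for _, b := range s[i : i+32] {
			if b == c {
				n++
			}
		}
	}
	for ; i < len(s); i++ { // tail loop ("tail" label above)
		if s[i] == c {
			n++
		}
	}
	return n
}

func main() {
	fmt.Println(countByteSketch([]byte("abcabcab"), 'a')) // 3
}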
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!s390x
// +build !amd64,!s390x,!arm64
package bytes
@@ -51,6 +51,11 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8
SHA1P V11.S4, V10, V9 // 49110b5e
VADDV V0.S4, V0 // 00b8b14e
VMOVI $82, V0.B16 // 40e6024f
VUADDLV V6.B16, V6 // c638306e
VADD V1, V2, V3 // 4384e15e
VADD V1, V3, V3 // 6384e15e
VSUB V12, V30, V30 // de87ec7e
VSUB V12, V20, V30 // 9e86ec7e
// LTYPE1 imsr ',' spreg ','
// {
@@ -745,6 +745,8 @@ const (
AVMOVS
AVADDV
AVMOVI
AVUADDLV
AVSUB
ALAST
AB = obj.AJMP
ABL = obj.ACALL
@@ -380,5 +380,7 @@ var Anames = []string{
"VMOVS",
"VADDV",
"VMOVI",
"VUADDLV",
"VSUB",
"LAST",
}
@@ -579,6 +579,9 @@ var optab = []Optab{
{ASHA1SU0, C_ARNG, C_ARNG, C_ARNG, 1, 4, 0, 0, 0},
{ASHA256H, C_ARNG, C_VREG, C_VREG, 1, 4, 0, 0, 0},
{AVADDP, C_ARNG, C_ARNG, C_ARNG, 72, 4, 0, 0, 0},
{AVADD, C_ARNG, C_ARNG, C_ARNG, 72, 4, 0, 0, 0},
{AVADD, C_VREG, C_VREG, C_VREG, 89, 4, 0, 0, 0},
{AVADD, C_VREG, C_NONE, C_VREG, 89, 4, 0, 0, 0},
{AVLD1, C_ZOREG, C_NONE, C_LIST, 81, 4, 0, 0, 0},
{AVLD1, C_LOREG, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST},
{AVMOV, C_ELEM, C_NONE, C_REG, 73, 4, 0, 0, 0},
@@ -2063,9 +2066,11 @@ func buildop(ctxt *obj.Link) {
oprangeset(AVAND, t)
oprangeset(AVCMEQ, t)
oprangeset(AVORR, t)
oprangeset(AVADD, t)
oprangeset(AVEOR, t)
case AVADD:
oprangeset(AVSUB, t)
case AAESD:
oprangeset(AAESE, t)
oprangeset(AAESMC, t)
@@ -2083,6 +2088,9 @@ func buildop(ctxt *obj.Link) {
case ASHA1SU0:
oprangeset(ASHA256SU1, t)
case AVADDV:
oprangeset(AVUADDLV, t)
case ASHA1H,
AVMOV,
AVLD1,
@@ -2090,7 +2098,6 @@ func buildop(ctxt *obj.Link) {
AVST1,
AVDUP,
AVMOVS,
AVADDV,
AVMOVI:
break
@@ -3612,7 +3619,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
o1 |= uint32(p.From.Offset)
o1 |= uint32(r&31) << 5
case 85: /* vaddv Vn.<T>, Vd*/
case 85: /* vaddv/vuaddlv Vn.<T>, Vd*/
af := int((p.From.Reg >> 5) & 15)
o1 = c.oprrr(p, p.As)
rf := int((p.From.Reg) & 31)
@@ -3681,6 +3688,27 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) {
rel.Type = objabi.R_ADDRARM64
o3 |= 2<<30 | 5<<27 | 2<<23 | 1<<22 | uint32(p.To.Offset&31)<<10 | (REGTMP&31)<<5 | uint32(p.To.Reg&31)
case 89: /* vadd/vsub Vm, Vn, Vd */
switch p.As {
case AVADD:
o1 = 5<<28 | 7<<25 | 7<<21 | 1<<15 | 1<<10
case AVSUB:
o1 = 7<<28 | 7<<25 | 7<<21 | 1<<15 | 1<<10
default:
c.ctxt.Diag("bad opcode: %v\n", p)
break
}
rf := int(p.From.Reg)
rt := int(p.To.Reg)
r := int(p.Reg)
if r == 0 {
r = rt
}
o1 |= (uint32(rf&31) << 16) | (uint32(r&31) << 5) | uint32(rt&31)
// This is supposed to be something that stops execution.
// It's not supposed to be reached, ever, but if it is, we'd
// like to be able to tell how we got there. Assemble as
@@ -4245,6 +4273,9 @@ func (c *ctxt7) oprrr(p *obj.Prog, a obj.As) uint32 {
case AVADDV:
return 7<<25 | 3<<20 | 3<<15 | 7<<11
case AVUADDLV:
return 1<<29 | 7<<25 | 3<<20 | 7<<11
}
c.ctxt.Diag("%v: bad rrr %d %v", p, a, a)
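As a sanity check on the new case 89 encoding above (an editor's sketch, not part of the change): OR-ing the fixed opcode bits with the register fields reproduces the machine word that the assembler test expects for VADD V1, V2, V3.

package main

import "fmt"

// encodeScalarVADD mirrors case 89 for AVADD: the fixed opcode bits
// OR'd with Vm in bits [20:16], Vn in [9:5], and Vd in [4:0].
func encodeScalarVADD(vm, vn, vd uint32) uint32 {
	o1 := uint32(5<<28 | 7<<25 | 7<<21 | 1<<15 | 1<<10)
	return o1 | (vm&31)<<16 | (vn&31)<<5 | vd&31
}

func main() {
	// VADD V1, V2, V3 from the assembler test above.
	fmt.Printf("%08x\n", encodeScalarVADD(1, 2, 3))
	// Prints 5ee18443, i.e. the little-endian bytes 43 84 e1 5e
	// expected by the "VADD V1, V2, V3 // 4384e15e" test line.
}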
@@ -15,6 +15,10 @@ Go Assembly for ARM64 Reference Manual
// TODO
3. Alphabetical list of SIMD instructions
VADD: Add (scalar)
VADD <Vm>, <Vn>, <Vd>
Add corresponding low 64-bit elements in <Vm> and <Vn>,
place the result into low 64-bit element of <Vd>.
VADD: Add (vector).
VADD <Vm>.<T>, <Vn>.<T>, <Vd>.<T>
@@ -115,6 +119,16 @@ Go Assembly for ARM64 Reference Manual
<T> Is an arrangement specifier and can have the following values:
B8, B16, H4, H8, S2, S4, D1, D2
VSUB: Sub (scalar)
VSUB <Vm>, <Vn>, <Vd>
Subtract low 64-bit element in <Vm> from the corresponding element in <Vn>,
place the result into low 64-bit element of <Vd>.
VUADDLV: Unsigned sum Long across Vector.
VUADDLV <Vn>.<T>, Vd
<T> Is an arrangement specifier and can have the following values:
B8, B16, H4, H8, S4
4. Alphabetical list of cryptographic extension instructions
SHA1C, SHA1M, SHA1P: SHA1 hash update.
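To make the new doc entries concrete, here is a small Go model (an editor's illustration, assuming nothing beyond the descriptions above) of the two operations countByte leans on: scalar VADD adding the low 64-bit elements, and VUADDLV summing unsigned byte lanes into a wider result.

package main

import "fmt"

// addScalarD models VADD <Vm>, <Vn>, <Vd>: add the low 64-bit elements.
func addScalarD(vm, vn uint64) uint64 {
	return vm + vn
}

// uaddlvB16 models VUADDLV Vn.B16, Vd: sum all 16 unsigned byte lanes,
// widening so the per-chunk counts in countByte cannot overflow.
func uaddlvB16(v [16]byte) uint64 {
	var sum uint64
	for _, b := range v {
		sum += uint64(b)
	}
	return sum
}

func main() {
	var lanes [16]byte
	lanes[0], lanes[5] = 1, 1 // two matching lanes after VCMEQ+VAND
	fmt.Println(addScalarD(uaddlvB16(lanes), 3)) // 5
}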