Commit d206af1e authored by Josselin Costanzi's avatar Josselin Costanzi Committed by Josh Bleecher Snyder

strings: optimize Count for amd64

Move optimized Count implementation from bytes to runtime. Use in
both bytes and strings packages.
Add CountByte benchmark to strings.

Strings benchmarks:
name                       old time/op    new time/op    delta
CountHard1-4                 226µs ± 1%      226µs ± 2%      ~     (p=0.247 n=10+10)
CountHard2-4                 316µs ± 1%      315µs ± 0%      ~     (p=0.133 n=9+10)
CountHard3-4                 919µs ± 1%      920µs ± 1%      ~     (p=0.968 n=10+9)
CountTorture-4              15.4µs ± 1%     15.7µs ± 1%    +2.47%  (p=0.000 n=10+9)
CountTortureOverlapping-4   9.60ms ± 0%     9.65ms ± 1%      ~     (p=0.247 n=10+10)
CountByte/10-4              26.3ns ± 1%     10.9ns ± 1%   -58.71%  (p=0.000 n=9+9)
CountByte/32-4              42.7ns ± 0%     14.2ns ± 0%   -66.64%  (p=0.000 n=10+10)
CountByte/4096-4            3.07µs ± 0%     0.31µs ± 2%   -89.99%  (p=0.000 n=9+10)
CountByte/4194304-4         3.48ms ± 1%     0.34ms ± 1%   -90.09%  (p=0.000 n=10+9)
CountByte/67108864-4        55.6ms ± 1%      7.0ms ± 0%   -87.49%  (p=0.000 n=9+8)

name                      old speed      new speed       delta
CountByte/10-4             380MB/s ± 1%    919MB/s ± 1%  +142.21%  (p=0.000 n=9+9)
CountByte/32-4             750MB/s ± 0%   2247MB/s ± 0%  +199.62%  (p=0.000 n=10+10)
CountByte/4096-4          1.33GB/s ± 0%  13.32GB/s ± 2%  +898.13%  (p=0.000 n=9+10)
CountByte/4194304-4       1.21GB/s ± 1%  12.17GB/s ± 1%  +908.87%  (p=0.000 n=10+9)
CountByte/67108864-4      1.21GB/s ± 1%   9.65GB/s ± 0%  +699.29%  (p=0.000 n=9+8)

Fixes #19411

Change-Id: I8d2d409f0fa6df6d03b60790aa86e540b4a4e3b0
Reviewed-on: https://go-review.googlesource.com/38693Reviewed-by: default avatarKeith Randall <khr@golang.org>
parent 03d1aa60
......@@ -8,9 +8,10 @@ package bytes
// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
// indexShortStr requires 2 <= len(c) <= shortStringLen
func indexShortStr(s, c []byte) int // ../runtime/asm_$GOARCH.s
func supportAVX2() bool // ../runtime/asm_$GOARCH.s
func supportPOPCNT() bool // ../runtime/asm_$GOARCH.s
func indexShortStr(s, c []byte) int // ../runtime/asm_$GOARCH.s
func supportAVX2() bool // ../runtime/asm_$GOARCH.s
func supportPOPCNT() bool // ../runtime/asm_$GOARCH.s
func countByte(s []byte, c byte) int // ../runtime/asm_$GOARCH.s
var shortStringLen int
......@@ -95,9 +96,6 @@ func Index(s, sep []byte) int {
return -1
}
// Special case for when we must count occurrences of a single byte.
func countByte(s []byte, c byte) int
// Count counts the number of non-overlapping instances of sep in s.
// If sep is an empty slice, Count returns 1 + the number of Unicode code points in s.
func Count(s, sep []byte) int {
......
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// We use:
// SI: data
// BX: data len
// AL: byte sought
// This requires the POPCNT instruction
TEXT ·countByte(SB),NOSPLIT,$0-40
MOVQ s+0(FP), SI
MOVQ s_len+8(FP), BX
MOVB c+24(FP), AL
// Shuffle X0 around so that each byte contains
// the character we're looking for.
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
CMPQ BX, $16
JLT small
MOVQ $0, R12 // Accumulator
MOVQ SI, DI
CMPQ BX, $32
JA avx2
sse:
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
JMP sseloopentry
sseloop:
// Move the next 16-byte chunk of the data into X1.
MOVOU (DI), X1
// Compare bytes in X0 to X1.
PCMPEQB X0, X1
// Take the top bit of each byte in X1 and put the result in DX.
PMOVMSKB X1, DX
// Count number of matching bytes
POPCNTL DX, DX
// Accumulate into R12
ADDQ DX, R12
// Advance to next block.
ADDQ $16, DI
sseloopentry:
CMPQ DI, AX
JBE sseloop
// Get the number of bytes to consider in the last 16 bytes
ANDQ $15, BX
JZ end
// Create mask to ignore overlap between previous 16 byte block
// and the next.
MOVQ $16,CX
SUBQ BX, CX
MOVQ $0xFFFF, R10
SARQ CL, R10
SALQ CL, R10
// Process the last 16-byte chunk. This chunk may overlap with the
// chunks we've already searched so we need to mask part of it.
MOVOU (AX), X1
PCMPEQB X0, X1
PMOVMSKB X1, DX
// Apply mask
ANDQ R10, DX
POPCNTL DX, DX
ADDQ DX, R12
end:
MOVQ R12, ret+32(FP)
RET
// handle for lengths < 16
small:
TESTQ BX, BX
JEQ endzero
// Check if we'll load across a page boundary.
LEAQ 16(SI), AX
TESTW $0xff0, AX
JEQ endofpage
// We must ignore high bytes as they aren't part of our slice.
// Create mask.
MOVB BX, CX
MOVQ $1, R10
SALQ CL, R10
SUBQ $1, R10
// Load data
MOVOU (SI), X1
// Compare target byte with each byte in data.
PCMPEQB X0, X1
// Move result bits to integer register.
PMOVMSKB X1, DX
// Apply mask
ANDQ R10, DX
POPCNTL DX, DX
// Directly return DX, we don't need to accumulate
// since we have <16 bytes.
MOVQ DX, ret+32(FP)
RET
endzero:
MOVQ $0, ret+32(FP)
RET
endofpage:
// We must ignore low bytes as they aren't part of our slice.
MOVQ $16,CX
SUBQ BX, CX
MOVQ $0xFFFF, R10
SARQ CL, R10
SALQ CL, R10
// Load data into the high end of X1.
MOVOU -16(SI)(BX*1), X1
// Compare target byte with each byte in data.
PCMPEQB X0, X1
// Move result bits to integer register.
PMOVMSKB X1, DX
// Apply mask
ANDQ R10, DX
// Directly return DX, we don't need to accumulate
// since we have <16 bytes.
POPCNTL DX, DX
MOVQ DX, ret+32(FP)
RET
avx2:
CMPB runtime·support_avx2(SB), $1
JNE sse
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1
avx2_loop:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, DX
POPCNTL DX, DX
ADDQ DX, R12
ADDQ $32, DI
CMPQ DI, R11
JLE avx2_loop
// If last block is already processed,
// skip to the end.
CMPQ DI, R11
JEQ endavx
// Load address of the last 32 bytes.
// There is an overlap with the previous block.
MOVQ R11, DI
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, DX
// Exit AVX mode.
VZEROUPPER
// Create mask to ignore overlap between previous 32 byte block
// and the next.
ANDQ $31, BX
MOVQ $32,CX
SUBQ BX, CX
MOVQ $0xFFFFFFFF, R10
SARQ CL, R10
SALQ CL, R10
// Apply mask
ANDQ R10, DX
POPCNTL DX, DX
ADDQ DX, R12
MOVQ R12, ret+32(FP)
RET
endavx:
// Exit AVX mode.
VZEROUPPER
MOVQ R12, ret+32(FP)
RET
......@@ -17,7 +17,10 @@ runtime/asm_amd64.s: [GOARCH] cannot check cross-package assembly function: Comp
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package bytes
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportAVX2 is in package strings
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportAVX2 is in package bytes
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportPOPCNT is in package strings
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: supportPOPCNT is in package bytes
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: countByte is in package strings
runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: countByte is in package bytes
// Intentionally missing declarations. These are special assembly routines.
// Some are jumped into from other routines, with values in specific registers.
......
......@@ -1702,6 +1702,11 @@ TEXT bytes·supportAVX2(SB),NOSPLIT,$0-1
MOVB AX, ret+0(FP)
RET
TEXT strings·supportPOPCNT(SB),NOSPLIT,$0-1
MOVBLZX runtime·support_popcnt(SB), AX
MOVB AX, ret+0(FP)
RET
TEXT bytes·supportPOPCNT(SB),NOSPLIT,$0-1
MOVBLZX runtime·support_popcnt(SB), AX
MOVB AX, ret+0(FP)
......@@ -2131,6 +2136,196 @@ eqret:
MOVB $0, ret+48(FP)
RET
TEXT bytes·countByte(SB),NOSPLIT,$0-40
MOVQ s+0(FP), SI
MOVQ s_len+8(FP), BX
MOVB c+24(FP), AL
LEAQ ret+32(FP), R8
JMP runtime·countByte(SB)
TEXT strings·countByte(SB),NOSPLIT,$0-32
MOVQ s+0(FP), SI
MOVQ s_len+8(FP), BX
MOVB c+16(FP), AL
LEAQ ret+24(FP), R8
JMP runtime·countByte(SB)
// input:
// SI: data
// BX: data len
// AL: byte sought
// R8: address to put result
// This requires the POPCNT instruction
TEXT runtime·countByte(SB),NOSPLIT,$0
// Shuffle X0 around so that each byte contains
// the character we're looking for.
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
CMPQ BX, $16
JLT small
MOVQ $0, R12 // Accumulator
MOVQ SI, DI
CMPQ BX, $32
JA avx2
sse:
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
JMP sseloopentry
sseloop:
// Move the next 16-byte chunk of the data into X1.
MOVOU (DI), X1
// Compare bytes in X0 to X1.
PCMPEQB X0, X1
// Take the top bit of each byte in X1 and put the result in DX.
PMOVMSKB X1, DX
// Count number of matching bytes
POPCNTL DX, DX
// Accumulate into R12
ADDQ DX, R12
// Advance to next block.
ADDQ $16, DI
sseloopentry:
CMPQ DI, AX
JBE sseloop
// Get the number of bytes to consider in the last 16 bytes
ANDQ $15, BX
JZ end
// Create mask to ignore overlap between previous 16 byte block
// and the next.
MOVQ $16,CX
SUBQ BX, CX
MOVQ $0xFFFF, R10
SARQ CL, R10
SALQ CL, R10
// Process the last 16-byte chunk. This chunk may overlap with the
// chunks we've already searched so we need to mask part of it.
MOVOU (AX), X1
PCMPEQB X0, X1
PMOVMSKB X1, DX
// Apply mask
ANDQ R10, DX
POPCNTL DX, DX
ADDQ DX, R12
end:
MOVQ R12, (R8)
RET
// handle for lengths < 16
small:
TESTQ BX, BX
JEQ endzero
// Check if we'll load across a page boundary.
LEAQ 16(SI), AX
TESTW $0xff0, AX
JEQ endofpage
// We must ignore high bytes as they aren't part of our slice.
// Create mask.
MOVB BX, CX
MOVQ $1, R10
SALQ CL, R10
SUBQ $1, R10
// Load data
MOVOU (SI), X1
// Compare target byte with each byte in data.
PCMPEQB X0, X1
// Move result bits to integer register.
PMOVMSKB X1, DX
// Apply mask
ANDQ R10, DX
POPCNTL DX, DX
// Directly return DX, we don't need to accumulate
// since we have <16 bytes.
MOVQ DX, (R8)
RET
endzero:
MOVQ $0, (R8)
RET
endofpage:
// We must ignore low bytes as they aren't part of our slice.
MOVQ $16,CX
SUBQ BX, CX
MOVQ $0xFFFF, R10
SARQ CL, R10
SALQ CL, R10
// Load data into the high end of X1.
MOVOU -16(SI)(BX*1), X1
// Compare target byte with each byte in data.
PCMPEQB X0, X1
// Move result bits to integer register.
PMOVMSKB X1, DX
// Apply mask
ANDQ R10, DX
// Directly return DX, we don't need to accumulate
// since we have <16 bytes.
POPCNTL DX, DX
MOVQ DX, (R8)
RET
avx2:
CMPB runtime·support_avx2(SB), $1
JNE sse
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1
avx2_loop:
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, DX
POPCNTL DX, DX
ADDQ DX, R12
ADDQ $32, DI
CMPQ DI, R11
JLE avx2_loop
// If last block is already processed,
// skip to the end.
CMPQ DI, R11
JEQ endavx
// Load address of the last 32 bytes.
// There is an overlap with the previous block.
MOVQ R11, DI
VMOVDQU (DI), Y2
VPCMPEQB Y1, Y2, Y3
VPMOVMSKB Y3, DX
// Exit AVX mode.
VZEROUPPER
// Create mask to ignore overlap between previous 32 byte block
// and the next.
ANDQ $31, BX
MOVQ $32,CX
SUBQ BX, CX
MOVQ $0xFFFFFFFF, R10
SARQ CL, R10
SALQ CL, R10
// Apply mask
ANDQ R10, DX
POPCNTL DX, DX
ADDQ DX, R12
MOVQ R12, (R8)
RET
endavx:
// Exit AVX mode.
VZEROUPPER
MOVQ R12, (R8)
RET
TEXT runtime·return0(SB), NOSPLIT, $0
MOVL $0, AX
RET
......
......@@ -72,9 +72,8 @@ func hashStrRev(sep string) (uint32, uint32) {
return hash, pow
}
// Count counts the number of non-overlapping instances of substr in s.
// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
func Count(s, substr string) int {
// countGeneric implements Count.
func countGeneric(s, substr string) int {
// special case
if len(substr) == 0 {
return utf8.RuneCountInString(s) + 1
......
......@@ -8,8 +8,10 @@ package strings
// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
// indexShortStr requires 2 <= len(c) <= shortStringLen
func indexShortStr(s, c string) int // ../runtime/asm_$GOARCH.s
func supportAVX2() bool // ../runtime/asm_$GOARCH.s
func indexShortStr(s, c string) int // ../runtime/asm_$GOARCH.s
func supportAVX2() bool // ../runtime/asm_$GOARCH.s
func supportPOPCNT() bool // ../runtime/asm_$GOARCH.s
func countByte(s string, c byte) int // ../runtime/asm_$GOARCH.s
var shortStringLen int
......@@ -93,3 +95,12 @@ func Index(s, substr string) int {
}
return -1
}
// Count counts the number of non-overlapping instances of substr in s.
// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
func Count(s, substr string) int {
if len(substr) == 1 && supportPOPCNT() {
return countByte(s, byte(substr[0]))
}
return countGeneric(s, substr)
}
......@@ -45,3 +45,9 @@ func Index(s, substr string) int {
}
return -1
}
// Count counts the number of non-overlapping instances of substr in s.
// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
func Count(s, substr string) int {
return countGeneric(s, substr)
}
......@@ -96,3 +96,9 @@ func Index(s, substr string) int {
}
return -1
}
// Count counts the number of non-overlapping instances of substr in s.
// If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
func Count(s, substr string) int {
return countGeneric(s, substr)
}
......@@ -1457,6 +1457,24 @@ func BenchmarkCountTortureOverlapping(b *testing.B) {
}
}
func BenchmarkCountByte(b *testing.B) {
indexSizes := []int{10, 32, 4 << 10, 4 << 20, 64 << 20}
benchStr := Repeat(benchmarkString,
(indexSizes[len(indexSizes)-1]+len(benchmarkString)-1)/len(benchmarkString))
benchFunc := func(b *testing.B, benchStr string) {
b.SetBytes(int64(len(benchStr)))
for i := 0; i < b.N; i++ {
Count(benchStr, "=")
}
}
for _, size := range indexSizes {
b.Run(fmt.Sprintf("%d", size), func(b *testing.B) {
benchFunc(b, benchStr[:size])
})
}
}
var makeFieldsInput = func() string {
x := make([]byte, 1<<20)
// Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment