Commit f8ef6ed2 authored by erifan01's avatar erifan01 Committed by Cherry Zhang

internal/bytealg: optimize Index (substring lengths from 9 to 32) on arm64

The current code is not optimized for cases where the length of the
substring to be searched is between 9 bytes and 32 bytes. This CL
optimizes the situations.

Benchmark:
name                             old time/op  new time/op  delta
pkg:strings goos:linux goarch:arm64
IndexHard1-8                     1.06ms ± 0%  1.06ms ± 0%   -0.44%  (p=0.000 n=7+8)
IndexHard2-8                     1.25ms ± 1%  1.26ms ± 2%     ~     (p=0.328 n=8+8)
IndexHard3-8                     2.85ms ± 1%  1.18ms ± 1%  -58.59%  (p=0.000 n=8+8)
IndexHard4-8                     2.90ms ± 1%  2.87ms ± 1%   -0.96%  (p=0.021 n=8+8)

pkg:bytes goos:linux goarch:arm64
IndexByte/4M-8                      726124.200000ns +- 6%     560021.400000ns +-20%  -22.88%  (p=0.008 n=5+5)
IndexRune/4M-8                      928768.600000ns +- 0%     793144.600000ns +- 6%  -14.60%  (p=0.008 n=5+5)

Change-Id: Ieebeb784ae69b2a0642ea96e9486a1d120923568
Reviewed-on: https://go-review.googlesource.com/109895Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 8c4170b2
......@@ -9,8 +9,8 @@ package bytealg
const MaxBruteForce = 16
func init() {
// 8 bytes can be completely loaded into 1 register.
MaxLen = 8
// Optimize cases where the length of the substring is less than 32 bytes
MaxLen = 32
}
// Cutover reports the number of failures of IndexByte we should tolerate
......
......@@ -25,7 +25,7 @@ TEXT ·IndexString(SB),NOSPLIT,$0-40
// R0: haystack
// R1: length of haystack
// R2: needle
// R3: length of needle (2 <= len <= 8)
// R3: length of needle (2 <= len <= 32)
// R9: address to put result
TEXT indexbody<>(SB),NOSPLIT,$0-56
// main idea is to load 'sep' into separate register(s)
......@@ -35,9 +35,12 @@ TEXT indexbody<>(SB),NOSPLIT,$0-56
// R4 contains the start of last substring for comparsion
ADD R0, R4, R4
ADD $1, R0, R8
CMP $8, R3
BHI greater_8
TBZ $3, R3, len_2_7
len_8:
// R5 contains 8-byte sep
// R5 contains 8-byte of sep
MOVD (R2), R5
loop_8:
// R6 contains substring for comparison
......@@ -52,7 +55,7 @@ len_2_7:
TBZ $1, R3, len_4_5
TBZ $0, R3, len_6
len_7:
// R5 and R6 contain 7-byte sep
// R5 and R6 contain 7-byte of sep
MOVWU (R2), R5
// 1-byte overlap with R5
MOVWU 3(R2), R6
......@@ -67,7 +70,7 @@ loop_7:
BNE loop_7
B found
len_6:
// R5 and R6 contain 6-byte sep
// R5 and R6 contain 6-byte of sep
MOVWU (R2), R5
MOVHU 4(R2), R6
loop_6:
......@@ -83,7 +86,7 @@ loop_6:
len_4_5:
TBZ $0, R3, len_4
len_5:
// R5 and R7 contain 5-byte sep
// R5 and R7 contain 5-byte of sep
MOVWU (R2), R5
MOVBU 4(R2), R7
loop_5:
......@@ -97,7 +100,7 @@ loop_5:
BNE loop_5
B found
len_4:
// R5 contains 4-byte sep
// R5 contains 4-byte of sep
MOVWU (R2), R5
loop_4:
CMP R4, R0
......@@ -109,7 +112,7 @@ loop_4:
len_2_3:
TBZ $0, R3, len_2
len_3:
// R6 and R7 contain 3-byte sep
// R6 and R7 contain 3-byte of sep
MOVHU (R2), R6
MOVBU 2(R2), R7
loop_3:
......@@ -123,7 +126,7 @@ loop_3:
BNE loop_3
B found
len_2:
// R5 contains 2-byte sep
// R5 contains 2-byte of sep
MOVHU (R2), R5
loop_2:
CMP R4, R0
......@@ -139,3 +142,65 @@ not_found:
MOVD $-1, R0
MOVD R0, (R9)
RET
greater_8:
SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes
CMP $16, R3
BHI greater_16
len_9_16:
MOVD.P 8(R2), R5 // R5 contains the first 8-byte of sep
SUB $16, R3, R7 // len(sep) - 16, offset of R2 for last 8 bytes
MOVD (R2)(R7), R6 // R6 contains the last 8-byte of sep
loop_9_16:
// search the first 8 bytes first
CMP R4, R0
BHI not_found
MOVD.P 1(R0), R7
CMP R5, R7
BNE loop_9_16
MOVD (R0)(R11), R7
CMP R6, R7 // compare the last 8 bytes
BNE loop_9_16
B found
greater_16:
CMP $24, R3
BHI len_25_32
len_17_24:
LDP.P 16(R2), (R5, R6) // R5 and R6 contain the first 16-byte of sep
SUB $24, R3, R10 // len(sep) - 24
MOVD (R2)(R10), R7 // R7 contains the last 8-byte of sep
loop_17_24:
// search the first 16 bytes first
CMP R4, R0
BHI not_found
MOVD.P 1(R0), R10
CMP R5, R10
BNE loop_17_24
MOVD 7(R0), R10
CMP R6, R10
BNE loop_17_24
MOVD (R0)(R11), R10
CMP R7, R10 // compare the last 8 bytes
BNE loop_17_24
B found
len_25_32:
LDP.P 16(R2), (R5, R6)
MOVD.P 8(R2), R7 // R5, R6 and R7 contain the first 24-byte of sep
SUB $32, R3, R12 // len(sep) - 32
MOVD (R2)(R12), R10 // R10 contains the last 8-byte of sep
loop_25_32:
// search the first 24 bytes first
CMP R4, R0
BHI not_found
MOVD.P 1(R0), R12
CMP R5, R12
BNE loop_25_32
MOVD 7(R0), R12
CMP R6, R12
BNE loop_25_32
MOVD 15(R0), R12
CMP R7, R12
BNE loop_25_32
MOVD (R0)(R11), R12
CMP R10, R12 // compare the last 8 bytes
BNE loop_25_32
B found
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment