Commit d4e936cf authored by erifan01's avatar erifan01 Committed by Cherry Zhang

internal/bytealg: optimize IndexString on arm64

This CL adjusts the order of the branch instructions of the
code to make it easier for the LIKELY branch to happen.

Benchmarks:
name                            old time/op    new time/op    delta
pkg:strings goos:linux goarch:arm64
IndexHard2-8                      2.17ms ± 1%    1.23ms ± 0%  -43.34%  (p=0.008 n=5+5)
CountHard2-8                      2.13ms ± 1%    1.21ms ± 2%  -43.31%  (p=0.008 n=5+5)

pkg:bytes goos:linux goarch:arm64
IndexRune/4M-8                     661µs ±22%     513µs ± 0%  -22.32%  (p=0.008 n=5+5)
IndexEasy/4M-8                     672µs ±23%     513µs ± 0%  -23.71%  (p=0.016 n=5+4)

Change-Id: Ib96f095edf77747edc8a971e79f5c1428e5808ce
Reviewed-on: https://go-review.googlesource.com/109015Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 5d4267e4
...@@ -41,12 +41,12 @@ len_8: ...@@ -41,12 +41,12 @@ len_8:
MOVD (R2), R5 MOVD (R2), R5
loop_8: loop_8:
// R6 contains substring for comparison // R6 contains substring for comparison
CMP R4, R0
BHI not_found
MOVD.P 1(R0), R6 MOVD.P 1(R0), R6
CMP R5, R6 CMP R5, R6
BEQ found BNE loop_8
CMP R4, R0 B found
BLS loop_8
JMP not_found
len_2_7: len_2_7:
TBZ $2, R3, len_2_3 TBZ $2, R3, len_2_3
TBZ $1, R3, len_4_5 TBZ $1, R3, len_4_5
...@@ -57,31 +57,29 @@ len_7: ...@@ -57,31 +57,29 @@ len_7:
// 1-byte overlap with R5 // 1-byte overlap with R5
MOVWU 3(R2), R6 MOVWU 3(R2), R6
loop_7: loop_7:
CMP R4, R0
BHI not_found
MOVWU.P 1(R0), R3 MOVWU.P 1(R0), R3
CMP R5, R3 CMP R5, R3
BNE not_equal_7 BNE loop_7
MOVWU 2(R0), R3 MOVWU 2(R0), R3
CMP R6, R3 CMP R6, R3
BEQ found BNE loop_7
not_equal_7: B found
CMP R4, R0
BLS loop_7
JMP not_found
len_6: len_6:
// R5 and R6 contain 6-byte sep // R5 and R6 contain 6-byte sep
MOVWU (R2), R5 MOVWU (R2), R5
MOVHU 4(R2), R6 MOVHU 4(R2), R6
loop_6: loop_6:
CMP R4, R0
BHI not_found
MOVWU.P 1(R0), R3 MOVWU.P 1(R0), R3
CMP R5, R3 CMP R5, R3
BNE not_equal_6 BNE loop_6
MOVHU 3(R0), R3 MOVHU 3(R0), R3
CMP R6, R3 CMP R6, R3
BEQ found BNE loop_6
not_equal_6: B found
CMP R4, R0
BLS loop_6
JMP not_found
len_4_5: len_4_5:
TBZ $0, R3, len_4 TBZ $0, R3, len_4
len_5: len_5:
...@@ -89,26 +87,25 @@ len_5: ...@@ -89,26 +87,25 @@ len_5:
MOVWU (R2), R5 MOVWU (R2), R5
MOVBU 4(R2), R7 MOVBU 4(R2), R7
loop_5: loop_5:
CMP R4, R0
BHI not_found
MOVWU.P 1(R0), R3 MOVWU.P 1(R0), R3
CMP R5, R3 CMP R5, R3
BNE not_equal_5 BNE loop_5
MOVBU 3(R0), R3 MOVBU 3(R0), R3
CMP R7, R3 CMP R7, R3
BEQ found BNE loop_5
not_equal_5: B found
CMP R4, R0
BLS loop_5
JMP not_found
len_4: len_4:
// R5 contains 4-byte sep // R5 contains 4-byte sep
MOVWU (R2), R5 MOVWU (R2), R5
loop_4: loop_4:
CMP R4, R0
BHI not_found
MOVWU.P 1(R0), R6 MOVWU.P 1(R0), R6
CMP R5, R6 CMP R5, R6
BEQ found BNE loop_4
CMP R4, R0 B found
BLS loop_4
JMP not_found
len_2_3: len_2_3:
TBZ $0, R3, len_2 TBZ $0, R3, len_2
len_3: len_3:
...@@ -116,30 +113,29 @@ len_3: ...@@ -116,30 +113,29 @@ len_3:
MOVHU (R2), R6 MOVHU (R2), R6
MOVBU 2(R2), R7 MOVBU 2(R2), R7
loop_3: loop_3:
CMP R4, R0
BHI not_found
MOVHU.P 1(R0), R3 MOVHU.P 1(R0), R3
CMP R6, R3 CMP R6, R3
BNE not_equal_3 BNE loop_3
MOVBU 1(R0), R3 MOVBU 1(R0), R3
CMP R7, R3 CMP R7, R3
BEQ found BNE loop_3
not_equal_3: B found
CMP R4, R0
BLS loop_3
JMP not_found
len_2: len_2:
// R5 contains 2-byte sep // R5 contains 2-byte sep
MOVHU (R2), R5 MOVHU (R2), R5
loop_2: loop_2:
CMP R4, R0
BHI not_found
MOVHU.P 1(R0), R6 MOVHU.P 1(R0), R6
CMP R5, R6 CMP R5, R6
BEQ found BNE loop_2
CMP R4, R0
BLS loop_2
not_found:
MOVD $-1, R0
MOVD R0, (R9)
RET
found: found:
SUB R8, R0, R0 SUB R8, R0, R0
MOVD R0, (R9) MOVD R0, (R9)
RET RET
not_found:
MOVD $-1, R0
MOVD R0, (R9)
RET
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment