Commit 78ddf274 authored by Wei Xiao's avatar Wei Xiao Committed by Cherry Zhang

bytes: add optimized Equal for arm64

Use SIMD instructions when comparing chunks bigger than 16 bytes.
Benchmark results of bytes:

name                 old time/op    new time/op    delta
Equal/0-8              6.52ns ± 1%    5.51ns ± 0%   -15.43%  (p=0.000 n=8+9)
Equal/1-8              11.5ns ± 0%    10.5ns ± 0%    -8.70%  (p=0.000 n=10+10)
Equal/6-8              19.0ns ± 0%    13.5ns ± 0%   -28.95%  (p=0.000 n=10+10)
Equal/9-8              31.0ns ± 0%    13.5ns ± 0%   -56.45%  (p=0.000 n=10+10)
Equal/15-8             40.0ns ± 0%    15.5ns ± 0%   -61.25%  (p=0.000 n=10+10)
Equal/16-8             41.5ns ± 0%    14.5ns ± 0%   -65.06%  (p=0.000 n=10+10)
Equal/20-8             47.5ns ± 0%    17.0ns ± 0%   -64.21%  (p=0.000 n=10+10)
Equal/32-8             65.6ns ± 0%    17.0ns ± 0%   -74.09%  (p=0.000 n=10+10)
Equal/4K-8             6.17µs ± 0%    0.57µs ± 1%   -90.76%  (p=0.000 n=10+10)
Equal/4M-8             6.41ms ± 0%    1.11ms ±14%   -82.71%  (p=0.000 n=8+10)
Equal/64M-8             104ms ± 0%      33ms ± 0%   -68.64%  (p=0.000 n=10+10)
EqualPort/1-8          13.0ns ± 0%    13.0ns ± 0%      ~     (all equal)
EqualPort/6-8          22.0ns ± 0%    22.7ns ± 0%    +3.06%  (p=0.000 n=8+9)
EqualPort/32-8         78.1ns ± 0%    78.1ns ± 0%      ~     (all equal)
EqualPort/4K-8         7.54µs ± 0%    7.61µs ± 0%    +0.92%  (p=0.000 n=10+8)
EqualPort/4M-8         8.16ms ± 2%    8.05ms ± 1%    -1.31%  (p=0.023 n=10+10)
EqualPort/64M-8         142ms ± 0%     142ms ± 0%    +0.37%  (p=0.000 n=10+10)
CompareBytesEqual-8    39.0ns ± 0%    41.6ns ± 2%    +6.67%  (p=0.000 n=9+10)

name                 old speed      new speed      delta
Equal/1-8            86.9MB/s ± 0%  95.2MB/s ± 0%    +9.53%  (p=0.000 n=8+8)
Equal/6-8             315MB/s ± 0%   444MB/s ± 0%   +40.74%  (p=0.000 n=9+10)
Equal/9-8             290MB/s ± 0%   666MB/s ± 0%  +129.63%  (p=0.000 n=8+10)
Equal/15-8            375MB/s ± 0%   967MB/s ± 0%  +158.09%  (p=0.000 n=10+10)
Equal/16-8            385MB/s ± 0%  1103MB/s ± 0%  +186.24%  (p=0.000 n=10+9)
Equal/20-8            421MB/s ± 0%  1175MB/s ± 0%  +179.44%  (p=0.000 n=9+10)
Equal/32-8            488MB/s ± 0%  1881MB/s ± 0%  +285.34%  (p=0.000 n=10+8)
Equal/4K-8            664MB/s ± 0%  7181MB/s ± 1%  +981.32%  (p=0.000 n=10+10)
Equal/4M-8            654MB/s ± 0%  3822MB/s ±16%  +484.15%  (p=0.000 n=8+10)
Equal/64M-8           645MB/s ± 0%  2056MB/s ± 0%  +218.90%  (p=0.000 n=10+10)
EqualPort/1-8        76.8MB/s ± 0%  76.7MB/s ± 0%    -0.09%  (p=0.023 n=10+10)
EqualPort/6-8         272MB/s ± 0%   264MB/s ± 0%    -2.94%  (p=0.000 n=8+10)
EqualPort/32-8        410MB/s ± 0%   410MB/s ± 0%    +0.01%  (p=0.004 n=9+10)
EqualPort/4K-8        543MB/s ± 0%   538MB/s ± 0%    -0.91%  (p=0.000 n=9+9)
EqualPort/4M-8        514MB/s ± 2%   521MB/s ± 1%    +1.31%  (p=0.023 n=10+10)
EqualPort/64M-8       473MB/s ± 0%   472MB/s ± 0%    -0.37%  (p=0.000 n=10+10)

Benchmark results of go1:

name                     old time/op    new time/op    delta
BinaryTree17-8              6.53s ± 0%     6.52s ± 2%    ~     (p=0.286 n=4+5)
Fannkuch11-8                6.35s ± 1%     6.33s ± 0%    ~     (p=0.690 n=5+5)
FmtFprintfEmpty-8           108ns ± 1%      99ns ± 1%  -8.31%  (p=0.008 n=5+5)
FmtFprintfString-8          172ns ± 1%     188ns ± 0%  +9.43%  (p=0.016 n=5+4)
FmtFprintfInt-8             207ns ± 0%     202ns ± 0%  -2.42%  (p=0.008 n=5+5)
FmtFprintfIntInt-8          277ns ± 1%     271ns ± 1%  -2.02%  (p=0.008 n=5+5)
FmtFprintfPrefixedInt-8     386ns ± 0%     380ns ± 0%  -1.55%  (p=0.008 n=5+5)
FmtFprintfFloat-8           492ns ± 0%     494ns ± 1%    ~     (p=0.175 n=4+5)
FmtManyArgs-8              1.32µs ± 1%    1.31µs ± 2%    ~     (p=0.651 n=5+5)
GobDecode-8                16.8ms ± 2%    16.9ms ± 1%    ~     (p=0.310 n=5+5)
GobEncode-8                14.1ms ± 1%    14.1ms ± 1%    ~     (p=1.000 n=5+5)
Gzip-8                      788ms ± 0%     789ms ± 0%    ~     (p=0.548 n=5+5)
Gunzip-8                   83.6ms ± 0%    83.6ms ± 0%    ~     (p=0.548 n=5+5)
HTTPClientServer-8          120µs ± 0%     120µs ± 1%    ~     (p=0.690 n=5+5)
JSONEncode-8               33.2ms ± 0%    33.6ms ± 0%  +1.20%  (p=0.008 n=5+5)
JSONDecode-8                152ms ± 1%     146ms ± 1%  -3.70%  (p=0.008 n=5+5)
Mandelbrot200-8            10.0ms ± 0%    10.0ms ± 0%    ~     (p=0.151 n=5+5)
GoParse-8                  7.97ms ± 0%    8.06ms ± 0%  +1.15%  (p=0.008 n=5+5)
RegexpMatchEasy0_32-8       233ns ± 1%     239ns ± 4%    ~     (p=0.135 n=5+5)
RegexpMatchEasy0_1K-8      1.86µs ± 0%    1.86µs ± 0%    ~     (p=0.167 n=5+5)
RegexpMatchEasy1_32-8       250ns ± 0%     263ns ± 1%  +5.28%  (p=0.008 n=5+5)
RegexpMatchEasy1_1K-8      2.28µs ± 0%    2.13µs ± 0%  -6.64%  (p=0.000 n=4+5)
RegexpMatchMedium_32-8      332ns ± 1%     319ns ± 0%  -3.97%  (p=0.008 n=5+5)
RegexpMatchMedium_1K-8     85.5µs ± 2%    79.1µs ± 1%  -7.42%  (p=0.008 n=5+5)
RegexpMatchHard_32-8       4.34µs ± 1%    4.42µs ± 7%    ~     (p=0.881 n=5+5)
RegexpMatchHard_1K-8        130µs ± 1%     127µs ± 0%  -2.18%  (p=0.008 n=5+5)
Revcomp-8                   1.35s ± 1%     1.34s ± 0%  -0.58%  (p=0.016 n=5+4)
Template-8                  160ms ± 2%     158ms ± 1%    ~     (p=0.222 n=5+5)
TimeParse-8                 795ns ± 2%     772ns ± 2%  -2.87%  (p=0.024 n=5+5)
TimeFormat-8                782ns ± 0%     784ns ± 0%    ~     (p=0.198 n=5+5)

name                     old speed      new speed      delta
GobDecode-8              45.8MB/s ± 2%  45.5MB/s ± 1%    ~     (p=0.310 n=5+5)
GobEncode-8              54.3MB/s ± 1%  54.4MB/s ± 1%    ~     (p=0.984 n=5+5)
Gzip-8                   24.6MB/s ± 0%  24.6MB/s ± 0%    ~     (p=0.540 n=5+5)
Gunzip-8                  232MB/s ± 0%   232MB/s ± 0%    ~     (p=0.548 n=5+5)
JSONEncode-8             58.4MB/s ± 0%  57.7MB/s ± 0%  -1.19%  (p=0.008 n=5+5)
JSONDecode-8             12.8MB/s ± 1%  13.3MB/s ± 1%  +3.85%  (p=0.008 n=5+5)
GoParse-8                7.27MB/s ± 0%  7.18MB/s ± 0%  -1.13%  (p=0.008 n=5+5)
RegexpMatchEasy0_32-8     137MB/s ± 1%   134MB/s ± 4%    ~     (p=0.151 n=5+5)
RegexpMatchEasy0_1K-8     551MB/s ± 0%   550MB/s ± 0%    ~     (p=0.222 n=5+5)
RegexpMatchEasy1_32-8     128MB/s ± 0%   121MB/s ± 1%  -5.09%  (p=0.008 n=5+5)
RegexpMatchEasy1_1K-8     449MB/s ± 0%   481MB/s ± 0%  +7.12%  (p=0.016 n=4+5)
RegexpMatchMedium_32-8   3.00MB/s ± 0%  3.13MB/s ± 0%  +4.33%  (p=0.016 n=4+5)
RegexpMatchMedium_1K-8   12.0MB/s ± 2%  12.9MB/s ± 1%  +7.98%  (p=0.008 n=5+5)
RegexpMatchHard_32-8     7.38MB/s ± 1%  7.25MB/s ± 7%    ~     (p=0.952 n=5+5)
RegexpMatchHard_1K-8     7.88MB/s ± 1%  8.05MB/s ± 0%  +2.21%  (p=0.008 n=5+5)
Revcomp-8                 188MB/s ± 1%   189MB/s ± 0%  +0.58%  (p=0.016 n=5+4)
Template-8               12.2MB/s ± 2%  12.3MB/s ± 1%    ~     (p=0.183 n=5+5)

Change-Id: I65e79f3f8f8b2914678311c4f1b0a2d98459e220
Reviewed-on: https://go-review.googlesource.com/71110Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
parent 0c68b79e
......@@ -723,26 +723,18 @@ TEXT runtime·abort(SB),NOSPLIT,$-8-0
B (ZR)
UNDEF
// memequal(p, q unsafe.Pointer, size uintptr) bool
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT,$-8-25
MOVD a+0(FP), R1
MOVD size+16(FP), R1
// short path to handle 0-byte case
CBZ R1, equal
MOVD a+0(FP), R0
MOVD b+8(FP), R2
MOVD size+16(FP), R3
ADD R1, R3, R6
MOVD $ret+24(FP), R8
B runtime·memeqbody<>(SB)
equal:
MOVD $1, R0
MOVB R0, ret+24(FP)
CMP R1, R2
BEQ done
loop:
CMP R1, R6
BEQ done
MOVBU.P 1(R1), R4
MOVBU.P 1(R2), R5
CMP R4, R5
BEQ loop
MOVB $0, ret+24(FP)
done:
RET
// memequal_varlen(a, b unsafe.Pointer) bool
......@@ -865,28 +857,110 @@ notfound:
MOVD R0, ret+24(FP)
RET
// TODO: share code with memequal?
// Equal(a, b []byte) bool
TEXT bytes·Equal(SB),NOSPLIT,$0-49
MOVD a_len+8(FP), R1
MOVD b_len+32(FP), R3
CMP R1, R3 // unequal lengths are not equal
BNE notequal
CMP R1, R3
// unequal lengths are not equal
BNE not_equal
// short path to handle 0-byte case
CBZ R1, equal
MOVD a+0(FP), R0
MOVD b+24(FP), R2
ADD R0, R1 // end
loop:
CMP R0, R1
BEQ equal // reaches the end
MOVBU.P 1(R0), R4
MOVBU.P 1(R2), R5
CMP R4, R5
BEQ loop
notequal:
MOVD $ret+48(FP), R8
B runtime·memeqbody<>(SB)
equal:
MOVD $1, R0
MOVB R0, ret+48(FP)
RET
not_equal:
MOVB ZR, ret+48(FP)
RET
// input:
// R0: pointer a
// R1: data len
// R2: pointer b
// R8: address to put result
TEXT runtime·memeqbody<>(SB),NOSPLIT,$0
CMP $1, R1
// handle 1-byte special case for better performance
BEQ one
CMP $16, R1
// handle specially if length < 16
BLO tail
BIC $0x3f, R1, R3
CBZ R3, chunk16
// work with 64-byte chunks
ADD R3, R0, R6 // end of chunks
chunk64_loop:
VLD1.P (R0), [V0.D2, V1.D2, V2.D2, V3.D2]
VLD1.P (R2), [V4.D2, V5.D2, V6.D2, V7.D2]
VCMEQ V0.D2, V4.D2, V8.D2
VCMEQ V1.D2, V5.D2, V9.D2
VCMEQ V2.D2, V6.D2, V10.D2
VCMEQ V3.D2, V7.D2, V11.D2
VAND V8.B16, V9.B16, V8.B16
VAND V8.B16, V10.B16, V8.B16
VAND V8.B16, V11.B16, V8.B16
CMP R0, R6
VMOV V8.D[0], R4
VMOV V8.D[1], R5
CBZ R4, not_equal
CBZ R5, not_equal
BNE chunk64_loop
AND $0x3f, R1, R1
CBZ R1, equal
chunk16:
// work with 16-byte chunks
BIC $0xf, R1, R3
CBZ R3, tail
ADD R3, R0, R6 // end of chunks
chunk16_loop:
VLD1.P (R0), [V0.D2]
VLD1.P (R2), [V1.D2]
VCMEQ V0.D2, V1.D2, V2.D2
CMP R0, R6
VMOV V2.D[0], R4
VMOV V2.D[1], R5
CBZ R4, not_equal
CBZ R5, not_equal
BNE chunk16_loop
AND $0xf, R1, R1
CBZ R1, equal
tail:
// special compare of tail with length < 16
TBZ $3, R1, lt_8
MOVD.P 8(R0), R4
MOVD.P 8(R2), R5
CMP R4, R5
BNE not_equal
lt_8:
TBZ $2, R1, lt_4
MOVWU.P 4(R0), R4
MOVWU.P 4(R2), R5
CMP R4, R5
BNE not_equal
lt_4:
TBZ $1, R1, lt_2
MOVHU.P 2(R0), R4
MOVHU.P 2(R2), R5
CMP R4, R5
BNE not_equal
lt_2:
TBZ $0, R1, equal
one:
MOVBU (R0), R4
MOVBU (R2), R5
CMP R4, R5
BNE not_equal
equal:
MOVD $1, R0
MOVB R0, ret+48(FP)
MOVB R0, (R8)
RET
not_equal:
MOVB ZR, (R8)
RET
TEXT runtime·return0(SB), NOSPLIT, $0
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment