Commit d31ee641 authored by fanzha02's avatar fanzha02 Committed by Cherry Zhang

internal/bytealg: add optimized Compare for arm64

Use LDP instructions to load 16 bytes per loop when the source length is long. Specially
process the 8 bytes length, 4 bytes length and 2 bytes length to get a better performance.

Benchmark result:
name                           old time/op   new time/op    delta
BytesCompare/1-8                21.0ns ± 0%    10.5ns ± 0%      ~     (p=0.079 n=4+5)
BytesCompare/2-8                11.5ns ± 0%    10.5ns ± 0%    -8.70%  (p=0.008 n=5+5)
BytesCompare/4-8                13.5ns ± 0%    10.0ns ± 0%   -25.93%  (p=0.008 n=5+5)
BytesCompare/8-8                28.8ns ± 0%     9.5ns ± 0%      ~     (p=0.079 n=4+5)
BytesCompare/16-8               40.5ns ± 0%    10.5ns ± 0%   -74.07%  (p=0.008 n=5+5)
BytesCompare/32-8               64.6ns ± 0%    12.5ns ± 0%   -80.65%  (p=0.008 n=5+5)
BytesCompare/64-8                112ns ± 0%      16ns ± 0%   -85.27%  (p=0.008 n=5+5)
BytesCompare/128-8               208ns ± 0%      24ns ± 0%   -88.22%  (p=0.008 n=5+5)
BytesCompare/256-8               400ns ± 0%      50ns ± 0%   -87.62%  (p=0.008 n=5+5)
BytesCompare/512-8               785ns ± 0%      82ns ± 0%   -89.61%  (p=0.008 n=5+5)
BytesCompare/1024-8             1.55µs ± 0%    0.14µs ± 0%      ~     (p=0.079 n=4+5)
BytesCompare/2048-8             3.09µs ± 0%    0.27µs ± 0%      ~     (p=0.079 n=4+5)
CompareBytesEqual-8             39.0ns ± 0%    12.0ns ± 0%   -69.23%  (p=0.008 n=5+5)
CompareBytesToNil-8             8.57ns ± 5%    8.23ns ± 2%    -3.99%  (p=0.016 n=5+5)
CompareBytesEmpty-8             7.37ns ± 0%    7.36ns ± 4%      ~     (p=0.690 n=5+5)
CompareBytesIdentical-8         7.39ns ± 0%    7.46ns ± 2%      ~     (p=0.667 n=5+5)
CompareBytesSameLength-8        17.0ns ± 0%    10.5ns ± 0%   -38.24%  (p=0.008 n=5+5)
CompareBytesDifferentLength-8   17.0ns ± 0%    10.5ns ± 0%   -38.24%  (p=0.008 n=5+5)
CompareBytesBigUnaligned-8      1.58ms ± 0%    0.19ms ± 0%   -88.31%  (p=0.016 n=4+5)
CompareBytesBig-8               1.59ms ± 0%    0.19ms ± 0%   -88.27%  (p=0.016 n=5+4)
CompareBytesBigIdentical-8      7.01ns ± 0%    6.60ns ± 3%    -5.91%  (p=0.008 n=5+5)

name                           old speed     new speed      delta
CompareBytesBigUnaligned-8     662MB/s ± 0%  5660MB/s ± 0%  +755.15%  (p=0.016 n=4+5)
CompareBytesBig-8              661MB/s ± 0%  5636MB/s ± 0%  +752.57%  (p=0.016 n=5+4)
CompareBytesBigIdentical-8     150TB/s ± 0%   159TB/s ± 3%    +6.27%  (p=0.008 n=5+5)

This is resubmit of CL90175.

Change-Id: Ie841daedb3123a68dd2554f27ebef0b3f8a855c2
Reviewed-on: https://go-review.googlesource.com/101635
Run-TryBot: Cherry Zhang <cherryyz@google.com>
Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
parent 60402856
...@@ -10,7 +10,7 @@ TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56 ...@@ -10,7 +10,7 @@ TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
MOVD a_len+8(FP), R0 MOVD a_len+8(FP), R0
MOVD b_base+24(FP), R3 MOVD b_base+24(FP), R3
MOVD b_len+32(FP), R1 MOVD b_len+32(FP), R1
ADD $56, RSP, R7 MOVD $ret+48(FP), R7
B cmpbody<>(SB) B cmpbody<>(SB)
TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56 TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
...@@ -18,7 +18,7 @@ TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56 ...@@ -18,7 +18,7 @@ TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
MOVD a_len+8(FP), R0 MOVD a_len+8(FP), R0
MOVD b_base+24(FP), R3 MOVD b_base+24(FP), R3
MOVD b_len+32(FP), R1 MOVD b_len+32(FP), R1
ADD $56, RSP, R7 MOVD $ret+48(FP), R7
B cmpbody<>(SB) B cmpbody<>(SB)
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
...@@ -26,7 +26,7 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 ...@@ -26,7 +26,7 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
MOVD a_len+8(FP), R0 MOVD a_len+8(FP), R0
MOVD b_base+16(FP), R3 MOVD b_base+16(FP), R3
MOVD b_len+24(FP), R1 MOVD b_len+24(FP), R1
ADD $40, RSP, R7 MOVD $ret+32(FP), R7
B cmpbody<>(SB) B cmpbody<>(SB)
// On entry: // On entry:
...@@ -37,30 +37,98 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 ...@@ -37,30 +37,98 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
// R7 points to return value (-1/0/1 will be written here) // R7 points to return value (-1/0/1 will be written here)
// //
// On exit: // On exit:
// R4, R5, and R6 are clobbered // R4, R5, R6, R8, R9 and R10 are clobbered
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
CMP R2, R3 CMP R2, R3
BEQ samebytes // same starting pointers; compare lengths BEQ samebytes // same starting pointers; compare lengths
CMP R0, R1 CMP R0, R1
CSEL LT, R1, R0, R6 // R6 is min(R0, R1) CSEL LT, R1, R0, R6 // R6 is min(R0, R1)
ADD R2, R6 // R2 is current byte in a, R6 is last byte in a to compare CMP $0, R6
loop: BEQ samebytes
CMP R2, R6 BIC $0xf, R6, R10
BEQ samebytes // all compared bytes were the same; compare lengths CBZ R10, small // length < 16
MOVBU.P 1(R2), R4 ADD R2, R10 // end of chunk16
MOVBU.P 1(R3), R5 // length >= 16
chunk16_loop:
LDP.P 16(R2), (R4, R8)
LDP.P 16(R3), (R5, R9)
CMP R4, R5 CMP R4, R5
BEQ loop BNE cmp
// bytes differed CMP R8, R9
BNE cmpnext
CMP R10, R2
BNE chunk16_loop
AND $0xf, R6, R6
CBZ R6, samebytes
SUBS $8, R6
BLT tail
// the length of tail > 8 bytes
MOVD.P 8(R2), R4
MOVD.P 8(R3), R5
CMP R4, R5
BNE cmp
SUB $8, R6
// compare last 8 bytes
tail:
MOVD (R2)(R6), R4
MOVD (R3)(R6), R5
CMP R4, R5
BEQ samebytes
cmp:
REV R4, R4
REV R5, R5
CMP R4, R5
ret:
MOVD $1, R4 MOVD $1, R4
CSNEG LT, R4, R4, R4 CNEG HI, R4, R4
MOVD R4, (R7) MOVD R4, (R7)
RET RET
small:
TBZ $3, R6, lt_8
MOVD (R2), R4
MOVD (R3), R5
CMP R4, R5
BNE cmp
SUBS $8, R6
BEQ samebytes
ADD $8, R2
ADD $8, R3
SUB $8, R6
B tail
lt_8:
TBZ $2, R6, lt_4
MOVWU (R2), R4
MOVWU (R3), R5
CMPW R4, R5
BNE cmp
SUBS $4, R6
BEQ samebytes
ADD $4, R2
ADD $4, R3
lt_4:
TBZ $1, R6, lt_2
MOVHU (R2), R4
MOVHU (R3), R5
CMPW R4, R5
BNE cmp
ADD $2, R2
ADD $2, R3
lt_2:
TBZ $0, R6, samebytes
one:
MOVBU (R2), R4
MOVBU (R3), R5
CMPW R4, R5
BNE ret
samebytes: samebytes:
MOVD $1, R4 CMP R1, R0
CMP R0, R1 CSET NE, R4
CSNEG LT, R4, R4, R4 CNEG LO, R4, R4
CSEL EQ, ZR, R4, R4
MOVD R4, (R7) MOVD R4, (R7)
RET RET
cmpnext:
REV R8, R4
REV R9, R5
CMP R4, R5
B ret
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment