Commit baec1487, authored by Lynn Boger, committed by Minux Ma

bytes: Equal perf improvements on ppc64le/ppc64

The existing implementations of Equal and similar
functions in the bytes package operate on one byte at
a time.  This performs poorly on ppc64/ppc64le, especially
when the byte buffers are large.  This change improves
those functions by loading and comparing double words where
possible.  The common code has been moved to a function
that can be shared by the other functions in this
file which perform the same type of comparison.
Further optimizations are done for the case where
>= 32 bytes are being compared.  The new function
memeqbody is used by memequal, memequal_varlen, Equal, and eqstring.

Results when running the bytes tests with -test.bench=Equal:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkEqual1               164.83       129.49       0.79x
BenchmarkEqual6               563.51       445.47       0.79x
BenchmarkEqual9               656.15       1099.00      1.67x
BenchmarkEqual15              591.93       1024.30      1.73x
BenchmarkEqual16              613.25       1914.12      3.12x
BenchmarkEqual20              682.37       1687.04      2.47x
BenchmarkEqual32              807.96       3843.29      4.76x
BenchmarkEqual4K              1076.25      23280.51     21.63x
BenchmarkEqual4M              1079.30      13120.14     12.16x
BenchmarkEqual64M             1073.28      10876.92     10.13x

It was determined that the degradation in the smaller byte tests
was due to unfavorable code alignment of the single byte loop.

Fixes #14368

Change-Id: I0dd87382c28887c70f4fbe80877a8ba03c31d7cd
Reviewed-on: https://go-review.googlesource.com/20249
Reviewed-by: Minux Ma <minux@golang.org>
parent 516c6b40
...@@ -795,33 +795,13 @@ TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0 ...@@ -795,33 +795,13 @@ TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0 TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
MOVW (R0), R1 MOVW (R0), R1
// memequal(p, q unsafe.Pointer, size uintptr) bool TEXT runtime·memequal(SB),NOSPLIT,$0-25
TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25 MOVD a+0(FP), R3
MOVD a+0(FP), R3 MOVD b+8(FP), R4
MOVD b+8(FP), R4 MOVD size+16(FP), R5
CMP R3, R4
BEQ eq
MOVD size+16(FP), R5
SUB $1, R3
SUB $1, R4
ADD R3, R5, R8
loop:
CMP R3, R8
BNE test
MOVD $1, R3
MOVB R3, ret+24(FP)
RET
test:
MOVBZU 1(R3), R6
MOVBZU 1(R4), R7
CMP R6, R7
BEQ loop
MOVB R0, ret+24(FP) BL runtime·memeqbody(SB)
RET MOVB R9, ret+24(FP)
eq:
MOVD $1, R1
MOVB R1, ret+24(FP)
RET RET
// memequal_varlen(a, b unsafe.Pointer) bool // memequal_varlen(a, b unsafe.Pointer) bool
...@@ -831,75 +811,129 @@ TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17 ...@@ -831,75 +811,129 @@ TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
CMP R3, R4 CMP R3, R4
BEQ eq BEQ eq
MOVD 8(R11), R5 // compiler stores size at offset 8 in the closure MOVD 8(R11), R5 // compiler stores size at offset 8 in the closure
MOVD R3, FIXED_FRAME+0(R1) BL runtime·memeqbody(SB)
MOVD R4, FIXED_FRAME+8(R1) MOVB R9, ret+16(FP)
MOVD R5, FIXED_FRAME+16(R1)
BL runtime·memequal(SB)
MOVBZ FIXED_FRAME+24(R1), R3
MOVB R3, ret+16(FP)
RET RET
eq: eq:
MOVD $1, R3 MOVD $1, R3
MOVB R3, ret+16(FP) MOVB R3, ret+16(FP)
RET RET
// memeqbody is an efficient memory-equality routine for ppc64,
// factored out for reuse where possible.
// Arguments are passed in registers, not on the stack:
//   R3 = s1  (pointer to the first buffer)
//   R4 = s2  (pointer to the second buffer)
//   R5 = len (number of bytes to compare)
// Result:
//   R9 = 1 if all len bytes match, 0 otherwise
// Clobbered: R3 and R4 (advanced as bytes are consumed), R6, R7, R8,
// R9 (used as scratch before it receives the result), CTR, and the
// condition register field written by CMP/ANDCC/SRADCC.
// R5 is only read.
//
// Strategy:
//   len <  8 : compare one byte at a time (simplecheck/simple).
//   len >= 32: compare 32 bytes per iteration (loop32a), then any
//              remaining doublewords (loop8), then leftover bytes.
//   otherwise: compare doublewords (loop8), then leftover bytes.
TEXT runtime·memeqbody(SB),NOSPLIT|NOFRAME,$0-0
MOVD R5,CTR // CTR = len; used as the loop count if we take the byte loop
CMP R5,$8 // fewer than 8 bytes: take the byte-at-a-time path
BLT simplecheck
DCBT (R3) // cache hint
DCBT (R4)
CMP R5,$32 // optimize >= 32
MOVD R5,R6 // setup8a expects the byte count in R6
BLT setup8a // 8 byte moves only
// len >= 32.
// NOTE(review): the original label comment said "8 byte aligned", but no
// alignment check is visible in this routine — confirm the assumption.
setup32a:
SRADCC $5,R5,R6 // number of 32 byte chunks to compare
MOVD R6,CTR
// Each iteration compares four doubleword pairs. Loads for the next
// pair are issued before the branch on the previous compare; the loads
// and ADDs used here do not modify the condition register, so each BNE
// still tests the CMP that precedes it.
loop32a:
MOVD 0(R3),R6 // doublewords to compare
MOVD 0(R4),R7
MOVD 8(R3),R8 // second pair of doublewords
MOVD 8(R4),R9
CMP R6,R7 // bytes match?
BNE noteq
MOVD 16(R3),R6
MOVD 16(R4),R7
CMP R8,R9 // bytes match?
MOVD 24(R3),R8
MOVD 24(R4),R9
BNE noteq // tests the R8/R9 compare made before the two loads above
CMP R6,R7 // bytes match?
BNE noteq
ADD $32,R3 // bump up to next 32
ADD $32,R4
CMP R8,R9 // bytes match?
BC 8,2,loop32a // decrement CTR; loop while CTR != 0 and the last pair matched
BNE noteq // fell out of the loop on a mismatch rather than CTR == 0
ANDCC $24,R5,R6 // R6 = bytes left in whole 8 byte chunks (0, 8, 16 or 24)
BEQ leftover // no whole doublewords remain
// Entered with R6 = byte count to cover with doubleword compares
// (either len itself when 8 <= len < 32, or the masked remainder above).
setup8a:
SRADCC $3,R6,R6 // get the 8 byte count
BEQ leftover // shifted value is 0
MOVD R6,CTR
loop8:
MOVD 0(R3),R6 // doublewords to compare
ADD $8,R3
MOVD 0(R4),R7
ADD $8,R4
CMP R6,R7 // match?
BC 8,2,loop8 // decrement CTR; loop while CTR != 0 and the doublewords matched
BNE noteq // loop ended on a mismatch
leftover:
ANDCC $7,R5,R6 // check for leftover bytes
BEQ equal
MOVD R6,CTR // CTR = number of trailing bytes (1..7)
BR simple
// Reached only when len < 8; CTR already holds len from the first instruction.
simplecheck:
CMP R5,$0
BEQ equal // zero-length buffers compare equal
// Byte-at-a-time loop; runs CTR times unless a mismatch is found first.
simple:
MOVBZ 0(R3), R6
ADD $1,R3
MOVBZ 0(R4), R7
ADD $1,R4
CMP R6, R7
BNE noteq
BC 8,2,simple // bytes matched (BNE above not taken), so this loops while CTR != 0
BNE noteq // not taken in practice: the equal condition from the CMP above is still set
BR equal
noteq:
MOVD $0, R9 // result: not equal
RET
equal:
MOVD $1, R9 // result: equal
RET
// eqstring tests whether two strings are equal. // eqstring tests whether two strings are equal.
// The compiler guarantees that strings passed // The compiler guarantees that strings passed
// to eqstring have equal length. // to eqstring have equal length.
// See runtime_test.go:eqstring_generic for // See runtime_test.go:eqstring_generic for
// equivalent Go code. // equivalent Go code.
TEXT runtime·eqstring(SB),NOSPLIT,$0-33 TEXT runtime·eqstring(SB),NOSPLIT,$0-33
MOVD s1str+0(FP), R3 MOVD s1str+0(FP), R3
MOVD s2str+16(FP), R4 MOVD s2str+16(FP), R4
MOVD $1, R5 MOVD $1, R5
MOVB R5, ret+32(FP) MOVB R5, ret+32(FP)
CMP R3, R4 CMP R3, R4
BNE 2(PC) BNE 2(PC)
RET RET
MOVD s1len+8(FP), R5 MOVD s1len+8(FP), R5
SUB $1, R3 BL runtime·memeqbody(SB)
SUB $1, R4 MOVB R9, ret+32(FP)
ADD R3, R5, R8
loop:
CMP R3, R8
BNE 2(PC)
RET
MOVBZU 1(R3), R6
MOVBZU 1(R4), R7
CMP R6, R7
BEQ loop
MOVB R0, ret+32(FP)
RET RET
// TODO: share code with memequal?
TEXT bytes·Equal(SB),NOSPLIT,$0-49 TEXT bytes·Equal(SB),NOSPLIT,$0-49
MOVD a_len+8(FP), R3 MOVD a_len+8(FP), R4
MOVD b_len+32(FP), R4 MOVD b_len+32(FP), R5
CMP R5, R4 // unequal lengths are not equal
CMP R3, R4 // unequal lengths are not equal
BNE noteq BNE noteq
MOVD a+0(FP), R3
MOVD b+24(FP), R4
BL runtime·memeqbody(SB)
MOVD a+0(FP), R5 MOVBZ R9,ret+48(FP)
MOVD b+24(FP), R6 RET
SUB $1, R5
SUB $1, R6
ADD R5, R3 // end-1
loop:
CMP R5, R3
BEQ equal // reached the end
MOVBZU 1(R5), R4
MOVBZU 1(R6), R7
CMP R4, R7
BEQ loop
noteq: noteq:
MOVBZ R0, ret+48(FP) MOVBZ $0,ret+48(FP)
RET RET
equal: equal:
MOVD $1, R3 MOVD $1,R3
MOVBZ R3, ret+48(FP) MOVBZ R3,ret+48(FP)
RET RET
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment