Commit d979ac33 authored by Lynn Boger's avatar Lynn Boger

crypto/cipher: improve xorBytesVSX asm for ppc64x

This improves the performance of xorBytesVSX in crypto/cipher by
unrolling the loop that does the stores. Improvement on power9:

name                 old time/op    new time/op    delta
XORBytes/8Bytes        17.9ns ± 0%    18.2ns ± 0%   +1.53%  (p=0.029 n=4+4)
XORBytes/128Bytes      24.4ns ± 0%    22.5ns ± 0%   -7.79%  (p=0.029 n=4+4)
XORBytes/2048Bytes      131ns ± 0%     109ns ± 0%  -16.79%  (p=0.029 n=4+4)
XORBytes/32768Bytes    1.74µs ± 0%    1.43µs ± 8%  -18.04%  (p=0.029 n=4+4)

Change-Id: I75bd625d3ae9daa7bda54c523028671ab036b13d
Reviewed-on: https://go-review.googlesource.com/c/go/+/197058
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarCarlos Eduardo Seo <cseo@linux.vnet.ibm.com>
parent a3426f25
......@@ -13,45 +13,66 @@ TEXT ·xorBytesVSX(SB), NOSPLIT, $0
MOVD b+16(FP), R5 // R5 = b
MOVD n+24(FP), R6 // R6 = n
CMPU R6, $16, CR7 // Check if n 16 bytes
CMPU R6, $32, CR7 // Check if n 32 bytes
MOVD R0, R8 // R8 = index
CMPU R6, $8, CR6 // Check if 8 n < 16 bytes
BGE CR7, preloop16
BLT CR6, small
CMPU R6, $8, CR6 // Check if 8 n < 32 bytes
BLT CR6, small // Smaller than 8
BLT CR7, xor16 // Case for 16 n < 32 bytes
// Case for 8 n < 16 bytes
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
XOR R14, R15, R16 // R16 = a[] ^ b[]
SUB $8, R6 // n = n - 8
MOVD R16, (R3)(R8) // Store to dst
ADD $8, R8
// Check if we're finished
CMP R6, R0
BGT small
JMP done
// Case for n 16 bytes
preloop16:
SRD $4, R6, R7 // Setup loop counter
// Case for n 32 bytes
preloop32:
SRD $5, R6, R7 // Setup loop counter
MOVD R7, CTR
ANDCC $15, R6, R9 // Check for tailing bytes for later
loop16:
MOVD $16, R10
ANDCC $31, R6, R9 // Check for tailing bytes for later
loop32:
LXVD2X (R4)(R8), VS32 // VS32 = a[i,...,i+15]
LXVD2X (R4)(R10), VS34
LXVD2X (R5)(R8), VS33 // VS33 = b[i,...,i+15]
XXLXOR VS32, VS33, VS34 // VS34 = a[] ^ b[]
STXVD2X VS34, (R3)(R8) // Store to dst
ADD $16, R8 // Update index
BC 16, 0, loop16 // bdnz loop16
LXVD2X (R5)(R10), VS35
XXLXOR VS32, VS33, VS32 // VS34 = a[] ^ b[]
XXLXOR VS34, VS35, VS34
STXVD2X VS32, (R3)(R8) // Store to dst
STXVD2X VS34, (R3)(R10)
ADD $32, R8 // Update index
ADD $32, R10
BC 16, 0, loop32 // bdnz loop16
BEQ CR0, done
SLD $4, R7
SUB R7, R6 // R6 = n - (R7 * 16)
MOVD R9, R6
CMP R6, $8
BLT small
xor16:
CMP R6, $16
BLT xor8
LXVD2X (R4)(R8), VS32
LXVD2X (R5)(R8), VS33
XXLXOR VS32, VS33, VS32
STXVD2X VS32, (R3)(R8)
ADD $16, R8
ADD $-16, R6
CMP R6, $8
BLT small
xor8:
// Case for 8 n < 16 bytes
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
XOR R14, R15, R16 // R16 = a[] ^ b[]
SUB $8, R6 // n = n - 8
MOVD R16, (R3)(R8) // Store to dst
ADD $8, R8
// Check if we're finished
CMP R6, R0
BGT small
RET
// Case for n < 8 bytes and tailing bytes from the
// previous cases.
small:
CMP R6, R0
BEQ done
MOVD R6, CTR // Setup loop counter
loop:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment