Commit 6ac8ccf4 authored by Ilya Tocar's avatar Ilya Tocar Committed by Russ Cox

crypto/sha512: Add AVX2 version for AMD64

name          old time/op    new time/op     delta
Hash8Bytes-6     913ns ± 0%      667ns ± 0%  -26.91%  (p=0.000 n=10+10)
Hash1K-6        6.58µs ± 0%     4.23µs ± 0%  -35.69%  (p=0.000 n=10+9)
Hash8K-6        45.9µs ± 0%     28.1µs ± 0%  -38.93%  (p=0.000 n=10+10)

name          old speed      new speed       delta
Hash8Bytes-6  8.76MB/s ± 0%  11.99MB/s ± 0%  +36.87%  (p=0.000 n=10+8)
Hash1K-6       156MB/s ± 0%    242MB/s ± 0%  +55.49%  (p=0.000 n=10+9)
Hash8K-6       178MB/s ± 0%    292MB/s ± 0%  +63.74%  (p=0.000 n=10+10)

Change-Id: Ic9211d68b02935b2195995f264ec57d6bc36f713
Reviewed-on: https://go-review.googlesource.com/36630
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarRuss Cox <rsc@golang.org>
parent 630e93ed
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64
package sha512
//go:noescape
func blockAVX2(dig *digest, p []byte)
//go:noescape
func blockAMD64(dig *digest, p []byte)
//go:noescape
func checkAVX2() bool
var hasAVX2 = checkAVX2()
func block(dig *digest, p []byte) {
if hasAVX2 {
blockAVX2(dig, p)
} else {
blockAMD64(dig, p)
}
}
......@@ -141,7 +141,7 @@
MSGSCHEDULE1(index); \
SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
TEXT ·block(SB),0,$648-32
TEXT ·blockAMD64(SB),0,$648-32
MOVQ p_base+8(FP), SI
MOVQ p_len+16(FP), DX
SHRQ $7, DX
......@@ -271,3 +271,1216 @@ loop:
end:
RET
// Version bellow is based on "Fast SHA512 Implementations on Intel
// Architecture Processors" White-paper
// http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-sha512-implementations-ia-processors-paper.pdf
// AVX2 version by Intel, same algorithm in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha512-avx2-asm.S
// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
// David Cote <david.m.cote@intel.com>
// Aleksey Sidorov <aleksey.sidorov@intel.com>
#define YFER_SIZE (4*8)
#define SRND_SIZE (1*8)
#define INP_SIZE (1*8)
#define frame_YFER (0)
#define frame_SRND (frame_YFER + YFER_SIZE)
#define frame_INP (frame_SRND + SRND_SIZE)
#define frame_INPEND (frame_INP + INP_SIZE)
#define addm(p1, p2) \
ADDQ p1, p2; \
MOVQ p2, p1
#define COPY_YMM_AND_BSWAP(p1, p2, p3) \
VMOVDQU p2, p1; \
VPSHUFB p3, p1, p1
#define MY_VPALIGNR(YDST, YSRC1, YSRC2, RVAL) \
VPERM2F128 $0x3, YSRC2, YSRC1, YDST; \
VPALIGNR $RVAL, YSRC2, YDST, YDST
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x00(SB)/8, $0x0001020304050607
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x08(SB)/8, $0x08090a0b0c0d0e0f
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x10(SB)/8, $0x1011121314151617
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x18(SB)/8, $0x18191a1b1c1d1e1f
GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), (NOPTR+RODATA), $32
DATA MASK_YMM_LO<>+0x00(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x08(SB)/8, $0x0000000000000000
DATA MASK_YMM_LO<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA MASK_YMM_LO<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
GLOBL MASK_YMM_LO<>(SB), (NOPTR+RODATA), $32
TEXT ·blockAVX2(SB), NOSPLIT, $56-32
MOVQ dig+0(FP), SI
MOVQ p_base+8(FP), DI
MOVQ p_len+16(FP), DX
SHRQ $7, DX
SHLQ $7, DX
JZ done_hash
ADDQ DI, DX
MOVQ DX, frame_INPEND(SP)
MOVQ (0*8)(SI), AX
MOVQ (1*8)(SI), BX
MOVQ (2*8)(SI), CX
MOVQ (3*8)(SI), R8
MOVQ (4*8)(SI), DX
MOVQ (5*8)(SI), R9
MOVQ (6*8)(SI), R10
MOVQ (7*8)(SI), R11
MOVQ $PSHUFFLE_BYTE_FLIP_MASK<>(SB), R12
VMOVDQU (R12), Y9
loop0:
MOVQ ·_K+0(SB), BP
// byte swap first 16 dwords
COPY_YMM_AND_BSWAP(Y4, (0*32)(DI), Y9)
COPY_YMM_AND_BSWAP(Y5, (1*32)(DI), Y9)
COPY_YMM_AND_BSWAP(Y6, (2*32)(DI), Y9)
COPY_YMM_AND_BSWAP(Y7, (3*32)(DI), Y9)
MOVQ DI, frame_INP(SP)
// schedule 64 input dwords, by doing 12 rounds of 4 each
MOVQ $4, frame_SRND(SP)
loop1:
VPADDQ (BP), Y4, Y0
VMOVDQU Y0, frame_YFER(SP)
MY_VPALIGNR(Y0, Y7, Y6, 8)
VPADDQ Y4, Y0, Y0
MY_VPALIGNR(Y1, Y5, Y4, 8)
VPSRLQ $1, Y1, Y2
VPSLLQ $(64-1), Y1, Y3
VPOR Y2, Y3, Y3
VPSRLQ $7, Y1, Y8
MOVQ AX, DI
RORXQ $41, DX, R13
RORXQ $18, DX, R14
ADDQ frame_YFER(SP), R11
ORQ CX, DI
MOVQ R9, R15
RORXQ $34, AX, R12
XORQ R14, R13
XORQ R10, R15
RORXQ $14, DX, R14
ANDQ DX, R15
XORQ R14, R13
RORXQ $39, AX, R14
ADDQ R11, R8
ANDQ BX, DI
XORQ R12, R14
RORXQ $28, AX, R12
XORQ R10, R15
XORQ R12, R14
MOVQ AX, R12
ANDQ CX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R11
ADDQ R15, R8
ADDQ R15, R11
ADDQ DI, R11
VPSRLQ $8, Y1, Y2
VPSLLQ $(64-8), Y1, Y1
VPOR Y2, Y1, Y1
VPXOR Y8, Y3, Y3
VPXOR Y1, Y3, Y1
VPADDQ Y1, Y0, Y0
VPERM2F128 $0x0, Y0, Y0, Y4
MOVQ $MASK_YMM_LO<>(SB), R13
VPAND (R13), Y0, Y0
VPERM2F128 $0x11, Y7, Y7, Y2
VPSRLQ $6, Y2, Y8
MOVQ R11, DI
RORXQ $41, R8, R13
RORXQ $18, R8, R14
ADDQ 1*8+frame_YFER(SP), R10
ORQ BX, DI
MOVQ DX, R15
RORXQ $34, R11, R12
XORQ R14, R13
XORQ R9, R15
RORXQ $14, R8, R14
XORQ R14, R13
RORXQ $39, R11, R14
ANDQ R8, R15
ADDQ R10, CX
ANDQ AX, DI
XORQ R12, R14
RORXQ $28, R11, R12
XORQ R9, R15
XORQ R12, R14
MOVQ R11, R12
ANDQ BX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R10
ADDQ R15, CX
ADDQ R15, R10
ADDQ DI, R10
VPSRLQ $19, Y2, Y3
VPSLLQ $(64-19), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y2, Y3
VPSLLQ $(64-61), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y4, Y4
VPSRLQ $6, Y4, Y8
MOVQ R10, DI
RORXQ $41, CX, R13
ADDQ 2*8+frame_YFER(SP), R9
RORXQ $18, CX, R14
ORQ AX, DI
MOVQ R8, R15
XORQ DX, R15
RORXQ $34, R10, R12
XORQ R14, R13
ANDQ CX, R15
RORXQ $14, CX, R14
ADDQ R9, BX
ANDQ R11, DI
XORQ R14, R13
RORXQ $39, R10, R14
XORQ DX, R15
XORQ R12, R14
RORXQ $28, R10, R12
XORQ R12, R14
MOVQ R10, R12
ANDQ AX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R9
ADDQ R15, BX
ADDQ R15, R9
ADDQ DI, R9
VPSRLQ $19, Y4, Y3
VPSLLQ $(64-19), Y4, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y4, Y3
VPSLLQ $(64-61), Y4, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y0, Y2
VPBLENDD $0xF0, Y2, Y4, Y4
MOVQ R9, DI
RORXQ $41, BX, R13
RORXQ $18, BX, R14
ADDQ 3*8+frame_YFER(SP), DX
ORQ R11, DI
MOVQ CX, R15
RORXQ $34, R9, R12
XORQ R14, R13
XORQ R8, R15
RORXQ $14, BX, R14
ANDQ BX, R15
ADDQ DX, AX
ANDQ R10, DI
XORQ R14, R13
XORQ R8, R15
RORXQ $39, R9, R14
ADDQ R13, R15
XORQ R12, R14
ADDQ R15, AX
RORXQ $28, R9, R12
XORQ R12, R14
MOVQ R9, R12
ANDQ R11, R12
ORQ R12, DI
ADDQ R14, DX
ADDQ R15, DX
ADDQ DI, DX
VPADDQ 1*32(BP), Y5, Y0
VMOVDQU Y0, frame_YFER(SP)
MY_VPALIGNR(Y0, Y4, Y7, 8)
VPADDQ Y5, Y0, Y0
MY_VPALIGNR(Y1, Y6, Y5, 8)
VPSRLQ $1, Y1, Y2
VPSLLQ $(64-1), Y1, Y3
VPOR Y2, Y3, Y3
VPSRLQ $7, Y1, Y8
MOVQ DX, DI
RORXQ $41, AX, R13
RORXQ $18, AX, R14
ADDQ frame_YFER(SP), R8
ORQ R10, DI
MOVQ BX, R15
RORXQ $34, DX, R12
XORQ R14, R13
XORQ CX, R15
RORXQ $14, AX, R14
ANDQ AX, R15
XORQ R14, R13
RORXQ $39, DX, R14
ADDQ R8, R11
ANDQ R9, DI
XORQ R12, R14
RORXQ $28, DX, R12
XORQ CX, R15
XORQ R12, R14
MOVQ DX, R12
ANDQ R10, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R8
ADDQ R15, R11
ADDQ R15, R8
ADDQ DI, R8
VPSRLQ $8, Y1, Y2
VPSLLQ $(64-8), Y1, Y1
VPOR Y2, Y1, Y1
VPXOR Y8, Y3, Y3
VPXOR Y1, Y3, Y1
VPADDQ Y1, Y0, Y0
VPERM2F128 $0x0, Y0, Y0, Y5
MOVQ $MASK_YMM_LO<>(SB), R13
VPAND (R13), Y0, Y0
VPERM2F128 $0x11, Y4, Y4, Y2
VPSRLQ $6, Y2, Y8
MOVQ R8, DI
RORXQ $41, R11, R13
RORXQ $18, R11, R14
ADDQ 1*8+frame_YFER(SP), CX
ORQ R9, DI
MOVQ AX, R15
RORXQ $34, R8, R12
XORQ R14, R13
XORQ BX, R15
RORXQ $14, R11, R14
XORQ R14, R13
RORXQ $39, R8, R14
ANDQ R11, R15
ADDQ CX, R10
ANDQ DX, DI
XORQ R12, R14
RORXQ $28, R8, R12
XORQ BX, R15
XORQ R12, R14
MOVQ R8, R12
ANDQ R9, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, CX
ADDQ R15, R10
ADDQ R15, CX
ADDQ DI, CX
VPSRLQ $19, Y2, Y3
VPSLLQ $(64-19), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y2, Y3
VPSLLQ $(64-61), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y5, Y5
VPSRLQ $6, Y5, Y8
MOVQ CX, DI
RORXQ $41, R10, R13
ADDQ 2*8+frame_YFER(SP), BX
RORXQ $18, R10, R14
ORQ DX, DI
MOVQ R11, R15
XORQ AX, R15
RORXQ $34, CX, R12
XORQ R14, R13
ANDQ R10, R15
RORXQ $14, R10, R14
ADDQ BX, R9
ANDQ R8, DI
XORQ R14, R13
RORXQ $39, CX, R14
XORQ AX, R15
XORQ R12, R14
RORXQ $28, CX, R12
XORQ R12, R14
MOVQ CX, R12
ANDQ DX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, BX
ADDQ R15, R9
ADDQ R15, BX
ADDQ DI, BX
VPSRLQ $19, Y5, Y3
VPSLLQ $(64-19), Y5, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y5, Y3
VPSLLQ $(64-61), Y5, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y0, Y2
VPBLENDD $0xF0, Y2, Y5, Y5
MOVQ BX, DI
RORXQ $41, R9, R13
RORXQ $18, R9, R14
ADDQ 3*8+frame_YFER(SP), AX
ORQ R8, DI
MOVQ R10, R15
RORXQ $34, BX, R12
XORQ R14, R13
XORQ R11, R15
RORXQ $14, R9, R14
ANDQ R9, R15
ADDQ AX, DX
ANDQ CX, DI
XORQ R14, R13
XORQ R11, R15
RORXQ $39, BX, R14
ADDQ R13, R15
XORQ R12, R14
ADDQ R15, DX
RORXQ $28, BX, R12
XORQ R12, R14
MOVQ BX, R12
ANDQ R8, R12
ORQ R12, DI
ADDQ R14, AX
ADDQ R15, AX
ADDQ DI, AX
VPADDQ 2*32(BP), Y6, Y0
VMOVDQU Y0, frame_YFER(SP)
MY_VPALIGNR(Y0, Y5, Y4, 8)
VPADDQ Y6, Y0, Y0
MY_VPALIGNR(Y1, Y7, Y6, 8)
VPSRLQ $1, Y1, Y2
VPSLLQ $(64-1), Y1, Y3
VPOR Y2, Y3, Y3
VPSRLQ $7, Y1, Y8
MOVQ AX, DI
RORXQ $41, DX, R13
RORXQ $18, DX, R14
ADDQ frame_YFER(SP), R11
ORQ CX, DI
MOVQ R9, R15
RORXQ $34, AX, R12
XORQ R14, R13
XORQ R10, R15
RORXQ $14, DX, R14
ANDQ DX, R15
XORQ R14, R13
RORXQ $39, AX, R14
ADDQ R11, R8
ANDQ BX, DI
XORQ R12, R14
RORXQ $28, AX, R12
XORQ R10, R15
XORQ R12, R14
MOVQ AX, R12
ANDQ CX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R11
ADDQ R15, R8
ADDQ R15, R11
ADDQ DI, R11
VPSRLQ $8, Y1, Y2
VPSLLQ $(64-8), Y1, Y1
VPOR Y2, Y1, Y1
VPXOR Y8, Y3, Y3
VPXOR Y1, Y3, Y1
VPADDQ Y1, Y0, Y0
VPERM2F128 $0x0, Y0, Y0, Y6
MOVQ $MASK_YMM_LO<>(SB), R13
VPAND (R13), Y0, Y0
VPERM2F128 $0x11, Y5, Y5, Y2
VPSRLQ $6, Y2, Y8
MOVQ R11, DI
RORXQ $41, R8, R13
RORXQ $18, R8, R14
ADDQ 1*8+frame_YFER(SP), R10
ORQ BX, DI
MOVQ DX, R15
RORXQ $34, R11, R12
XORQ R14, R13
XORQ R9, R15
RORXQ $14, R8, R14
XORQ R14, R13
RORXQ $39, R11, R14
ANDQ R8, R15
ADDQ R10, CX
ANDQ AX, DI
XORQ R12, R14
RORXQ $28, R11, R12
XORQ R9, R15
XORQ R12, R14
MOVQ R11, R12
ANDQ BX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R10
ADDQ R15, CX
ADDQ R15, R10
ADDQ DI, R10
VPSRLQ $19, Y2, Y3
VPSLLQ $(64-19), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y2, Y3
VPSLLQ $(64-61), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y6, Y6
VPSRLQ $6, Y6, Y8
MOVQ R10, DI
RORXQ $41, CX, R13
ADDQ 2*8+frame_YFER(SP), R9
RORXQ $18, CX, R14
ORQ AX, DI
MOVQ R8, R15
XORQ DX, R15
RORXQ $34, R10, R12
XORQ R14, R13
ANDQ CX, R15
RORXQ $14, CX, R14
ADDQ R9, BX
ANDQ R11, DI
XORQ R14, R13
RORXQ $39, R10, R14
XORQ DX, R15
XORQ R12, R14
RORXQ $28, R10, R12
XORQ R12, R14
MOVQ R10, R12
ANDQ AX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R9
ADDQ R15, BX
ADDQ R15, R9
ADDQ DI, R9
VPSRLQ $19, Y6, Y3
VPSLLQ $(64-19), Y6, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y6, Y3
VPSLLQ $(64-61), Y6, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y0, Y2
VPBLENDD $0xF0, Y2, Y6, Y6
MOVQ R9, DI
RORXQ $41, BX, R13
RORXQ $18, BX, R14
ADDQ 3*8+frame_YFER(SP), DX
ORQ R11, DI
MOVQ CX, R15
RORXQ $34, R9, R12
XORQ R14, R13
XORQ R8, R15
RORXQ $14, BX, R14
ANDQ BX, R15
ADDQ DX, AX
ANDQ R10, DI
XORQ R14, R13
XORQ R8, R15
RORXQ $39, R9, R14
ADDQ R13, R15
XORQ R12, R14
ADDQ R15, AX
RORXQ $28, R9, R12
XORQ R12, R14
MOVQ R9, R12
ANDQ R11, R12
ORQ R12, DI
ADDQ R14, DX
ADDQ R15, DX
ADDQ DI, DX
VPADDQ 3*32(BP), Y7, Y0
VMOVDQU Y0, frame_YFER(SP)
ADDQ $(4*32), BP
MY_VPALIGNR(Y0, Y6, Y5, 8)
VPADDQ Y7, Y0, Y0
MY_VPALIGNR(Y1, Y4, Y7, 8)
VPSRLQ $1, Y1, Y2
VPSLLQ $(64-1), Y1, Y3
VPOR Y2, Y3, Y3
VPSRLQ $7, Y1, Y8
MOVQ DX, DI
RORXQ $41, AX, R13
RORXQ $18, AX, R14
ADDQ frame_YFER(SP), R8
ORQ R10, DI
MOVQ BX, R15
RORXQ $34, DX, R12
XORQ R14, R13
XORQ CX, R15
RORXQ $14, AX, R14
ANDQ AX, R15
XORQ R14, R13
RORXQ $39, DX, R14
ADDQ R8, R11
ANDQ R9, DI
XORQ R12, R14
RORXQ $28, DX, R12
XORQ CX, R15
XORQ R12, R14
MOVQ DX, R12
ANDQ R10, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, R8
ADDQ R15, R11
ADDQ R15, R8
ADDQ DI, R8
VPSRLQ $8, Y1, Y2
VPSLLQ $(64-8), Y1, Y1
VPOR Y2, Y1, Y1
VPXOR Y8, Y3, Y3
VPXOR Y1, Y3, Y1
VPADDQ Y1, Y0, Y0
VPERM2F128 $0x0, Y0, Y0, Y7
MOVQ $MASK_YMM_LO<>(SB), R13
VPAND (R13), Y0, Y0
VPERM2F128 $0x11, Y6, Y6, Y2
VPSRLQ $6, Y2, Y8
MOVQ R8, DI
RORXQ $41, R11, R13
RORXQ $18, R11, R14
ADDQ 1*8+frame_YFER(SP), CX
ORQ R9, DI
MOVQ AX, R15
RORXQ $34, R8, R12
XORQ R14, R13
XORQ BX, R15
RORXQ $14, R11, R14
XORQ R14, R13
RORXQ $39, R8, R14
ANDQ R11, R15
ADDQ CX, R10
ANDQ DX, DI
XORQ R12, R14
RORXQ $28, R8, R12
XORQ BX, R15
XORQ R12, R14
MOVQ R8, R12
ANDQ R9, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, CX
ADDQ R15, R10
ADDQ R15, CX
ADDQ DI, CX
VPSRLQ $19, Y2, Y3
VPSLLQ $(64-19), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y2, Y3
VPSLLQ $(64-61), Y2, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y7, Y7
VPSRLQ $6, Y7, Y8
MOVQ CX, DI
RORXQ $41, R10, R13
ADDQ 2*8+frame_YFER(SP), BX
RORXQ $18, R10, R14
ORQ DX, DI
MOVQ R11, R15
XORQ AX, R15
RORXQ $34, CX, R12
XORQ R14, R13
ANDQ R10, R15
RORXQ $14, R10, R14
ADDQ BX, R9
ANDQ R8, DI
XORQ R14, R13
RORXQ $39, CX, R14
XORQ AX, R15
XORQ R12, R14
RORXQ $28, CX, R12
XORQ R12, R14
MOVQ CX, R12
ANDQ DX, R12
ADDQ R13, R15
ORQ R12, DI
ADDQ R14, BX
ADDQ R15, R9
ADDQ R15, BX
ADDQ DI, BX
VPSRLQ $19, Y7, Y3
VPSLLQ $(64-19), Y7, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPSRLQ $61, Y7, Y3
VPSLLQ $(64-61), Y7, Y1
VPOR Y1, Y3, Y3
VPXOR Y3, Y8, Y8
VPADDQ Y8, Y0, Y2
VPBLENDD $0xF0, Y2, Y7, Y7
MOVQ BX, DI
RORXQ $41, R9, R13
RORXQ $18, R9, R14
ADDQ 3*8+frame_YFER(SP), AX
ORQ R8, DI
MOVQ R10, R15
RORXQ $34, BX, R12
XORQ R14, R13
XORQ R11, R15
RORXQ $14, R9, R14
ANDQ R9, R15
ADDQ AX, DX
ANDQ CX, DI
XORQ R14, R13
XORQ R11, R15
RORXQ $39, BX, R14
ADDQ R13, R15
XORQ R12, R14
ADDQ R15, DX
RORXQ $28, BX, R12
XORQ R12, R14
MOVQ BX, R12
ANDQ R8, R12
ORQ R12, DI
ADDQ R14, AX
ADDQ R15, AX
ADDQ DI, AX
SUBQ $1, frame_SRND(SP)
JNE loop1
MOVQ $2, frame_SRND(SP)
loop2:
VPADDQ (BP), Y4, Y0
VMOVDQU Y0, frame_YFER(SP)
MOVQ R9, R15
RORXQ $41, DX, R13
RORXQ $18, DX, R14
XORQ R10, R15
XORQ R14, R13
RORXQ $14, DX, R14
ANDQ DX, R15
XORQ R14, R13
RORXQ $34, AX, R12
XORQ R10, R15
RORXQ $39, AX, R14
MOVQ AX, DI
XORQ R12, R14
RORXQ $28, AX, R12
ADDQ frame_YFER(SP), R11
ORQ CX, DI
XORQ R12, R14
MOVQ AX, R12
ANDQ BX, DI
ANDQ CX, R12
ADDQ R13, R15
ADDQ R11, R8
ORQ R12, DI
ADDQ R14, R11
ADDQ R15, R8
ADDQ R15, R11
MOVQ DX, R15
RORXQ $41, R8, R13
RORXQ $18, R8, R14
XORQ R9, R15
XORQ R14, R13
RORXQ $14, R8, R14
ANDQ R8, R15
ADDQ DI, R11
XORQ R14, R13
RORXQ $34, R11, R12
XORQ R9, R15
RORXQ $39, R11, R14
MOVQ R11, DI
XORQ R12, R14
RORXQ $28, R11, R12
ADDQ 8*1+frame_YFER(SP), R10
ORQ BX, DI
XORQ R12, R14
MOVQ R11, R12
ANDQ AX, DI
ANDQ BX, R12
ADDQ R13, R15
ADDQ R10, CX
ORQ R12, DI
ADDQ R14, R10
ADDQ R15, CX
ADDQ R15, R10
MOVQ R8, R15
RORXQ $41, CX, R13
RORXQ $18, CX, R14
XORQ DX, R15
XORQ R14, R13
RORXQ $14, CX, R14
ANDQ CX, R15
ADDQ DI, R10
XORQ R14, R13
RORXQ $34, R10, R12
XORQ DX, R15
RORXQ $39, R10, R14
MOVQ R10, DI
XORQ R12, R14
RORXQ $28, R10, R12
ADDQ 8*2+frame_YFER(SP), R9
ORQ AX, DI
XORQ R12, R14
MOVQ R10, R12
ANDQ R11, DI
ANDQ AX, R12
ADDQ R13, R15
ADDQ R9, BX
ORQ R12, DI
ADDQ R14, R9
ADDQ R15, BX
ADDQ R15, R9
MOVQ CX, R15
RORXQ $41, BX, R13
RORXQ $18, BX, R14
XORQ R8, R15
XORQ R14, R13
RORXQ $14, BX, R14
ANDQ BX, R15
ADDQ DI, R9
XORQ R14, R13
RORXQ $34, R9, R12
XORQ R8, R15
RORXQ $39, R9, R14
MOVQ R9, DI
XORQ R12, R14
RORXQ $28, R9, R12
ADDQ 8*3+frame_YFER(SP), DX
ORQ R11, DI
XORQ R12, R14
MOVQ R9, R12
ANDQ R10, DI
ANDQ R11, R12
ADDQ R13, R15
ADDQ DX, AX
ORQ R12, DI
ADDQ R14, DX
ADDQ R15, AX
ADDQ R15, DX
ADDQ DI, DX
VPADDQ 1*32(BP), Y5, Y0
VMOVDQU Y0, frame_YFER(SP)
ADDQ $(2*32), BP
MOVQ BX, R15
RORXQ $41, AX, R13
RORXQ $18, AX, R14
XORQ CX, R15
XORQ R14, R13
RORXQ $14, AX, R14
ANDQ AX, R15
XORQ R14, R13
RORXQ $34, DX, R12
XORQ CX, R15
RORXQ $39, DX, R14
MOVQ DX, DI
XORQ R12, R14
RORXQ $28, DX, R12
ADDQ frame_YFER(SP), R8
ORQ R10, DI
XORQ R12, R14
MOVQ DX, R12
ANDQ R9, DI
ANDQ R10, R12
ADDQ R13, R15
ADDQ R8, R11
ORQ R12, DI
ADDQ R14, R8
ADDQ R15, R11
ADDQ R15, R8
MOVQ AX, R15
RORXQ $41, R11, R13
RORXQ $18, R11, R14
XORQ BX, R15
XORQ R14, R13
RORXQ $14, R11, R14
ANDQ R11, R15
ADDQ DI, R8
XORQ R14, R13
RORXQ $34, R8, R12
XORQ BX, R15
RORXQ $39, R8, R14
MOVQ R8, DI
XORQ R12, R14
RORXQ $28, R8, R12
ADDQ 8*1+frame_YFER(SP), CX
ORQ R9, DI
XORQ R12, R14
MOVQ R8, R12
ANDQ DX, DI
ANDQ R9, R12
ADDQ R13, R15
ADDQ CX, R10
ORQ R12, DI
ADDQ R14, CX
ADDQ R15, R10
ADDQ R15, CX
MOVQ R11, R15
RORXQ $41, R10, R13
RORXQ $18, R10, R14
XORQ AX, R15
XORQ R14, R13
RORXQ $14, R10, R14
ANDQ R10, R15
ADDQ DI, CX
XORQ R14, R13
RORXQ $34, CX, R12
XORQ AX, R15
RORXQ $39, CX, R14
MOVQ CX, DI
XORQ R12, R14
RORXQ $28, CX, R12
ADDQ 8*2+frame_YFER(SP), BX
ORQ DX, DI
XORQ R12, R14
MOVQ CX, R12
ANDQ R8, DI
ANDQ DX, R12
ADDQ R13, R15
ADDQ BX, R9
ORQ R12, DI
ADDQ R14, BX
ADDQ R15, R9
ADDQ R15, BX
MOVQ R10, R15
RORXQ $41, R9, R13
RORXQ $18, R9, R14
XORQ R11, R15
XORQ R14, R13
RORXQ $14, R9, R14
ANDQ R9, R15
ADDQ DI, BX
XORQ R14, R13
RORXQ $34, BX, R12
XORQ R11, R15
RORXQ $39, BX, R14
MOVQ BX, DI
XORQ R12, R14
RORXQ $28, BX, R12
ADDQ 8*3+frame_YFER(SP), AX
ORQ R8, DI
XORQ R12, R14
MOVQ BX, R12
ANDQ CX, DI
ANDQ R8, R12
ADDQ R13, R15
ADDQ AX, DX
ORQ R12, DI
ADDQ R14, AX
ADDQ R15, DX
ADDQ R15, AX
ADDQ DI, AX
VMOVDQU Y6, Y4
VMOVDQU Y7, Y5
SUBQ $1, frame_SRND(SP)
JNE loop2
addm(8*0(SI),AX)
addm(8*1(SI),BX)
addm(8*2(SI),CX)
addm(8*3(SI),R8)
addm(8*4(SI),DX)
addm(8*5(SI),R9)
addm(8*6(SI),R10)
addm(8*7(SI),R11)
MOVQ frame_INP(SP), DI
ADDQ $128, DI
CMPQ DI, frame_INPEND(SP)
JNE loop0
done_hash:
VZEROUPPER
RET
// func checkAVX2() bool
// returns whether AVX2 is supported
TEXT ·checkAVX2(SB), NOSPLIT, $0
MOVB runtime·support_avx2(SB), AX
CMPB AX,$0
JNE check_bmi2
MOVB AX, ret+0(FP)
check_bmi2:
MOVB runtime·support_bmi2(SB), AX
MOVB AX, ret+0(FP)
RET
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64 s390x ppc64le
// +build s390x ppc64le
package sha512
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment