Commit fafadc52, authored by Ilya Tocar, committed by Russ Cox

crypto/sha1: Add AVX2 version for AMD64

name             old time/op    new time/op    delta
Hash8Bytes-48       271ns ± 8%     273ns ± 5%     ~     (p=0.313 n=19+19)
Hash320Bytes-48    1.04µs ± 7%    0.75µs ± 8%  -27.66%  (p=0.000 n=20+20)
Hash1K-48          2.72µs ± 6%    1.75µs ± 6%  -35.79%  (p=0.000 n=19+20)
Hash8K-48          19.9µs ± 7%    11.6µs ± 6%  -41.84%  (p=0.000 n=20+19)

name             old speed      new speed      delta
Hash8Bytes-48    29.5MB/s ± 8%  29.3MB/s ± 5%     ~     (p=0.314 n=19+19)
Hash320Bytes-48   307MB/s ± 7%   424MB/s ± 8%  +38.29%  (p=0.000 n=20+20)
Hash1K-48         377MB/s ± 6%   587MB/s ± 6%  +55.76%  (p=0.000 n=19+20)
Hash8K-48         413MB/s ± 7%   709MB/s ± 6%  +71.85%  (p=0.000 n=20+19)

Change-Id: I2963cf744eeb2e8191d4e4223fbf6f533a7fd405
Reviewed-on: https://go-review.googlesource.com/22607
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
Reviewed-by: Russ Cox <rsc@golang.org>
Run-TryBot: Russ Cox <rsc@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 2210d88a
......@@ -19,6 +19,7 @@ type sha1Test struct {
}
var golden = []sha1Test{
{"76245dbf96f661bd221046197ab8b9f063f11bad", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"},
{"da39a3ee5e6b4b0d3255bfef95601890afd80709", ""},
{"86f7e437faa5a7fce15d1ddcb9eaeaea377667b8", "a"},
{"da23614e02469a0d7c7bd1bdab5c9c474b1904dc", "ab"},
......@@ -120,6 +121,10 @@ func BenchmarkHash8Bytes(b *testing.B) {
benchmarkSize(b, 8)
}
// BenchmarkHash320Bytes runs benchmarkSize with a size of 320.
// 320 is above the 256-byte threshold at which block switches to blockAVX2,
// so this is presumably the smallest benchmark that exercises the AVX2 path
// (benchmarkSize itself is defined outside this view).
func BenchmarkHash320Bytes(b *testing.B) {
benchmarkSize(b, 320)
}
// BenchmarkHash1K runs benchmarkSize with a size of 1024.
func BenchmarkHash1K(b *testing.B) {
benchmarkSize(b, 1024)
}
......
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package sha1
// blockAVX2 is the AVX2 block routine; it is implemented in assembly.
//go:noescape
func blockAVX2(dig *digest, p []byte)
// blockAMD64 is the assembly block routine that was previously named block.
//go:noescape
func blockAMD64(dig *digest, p []byte)
// checkAVX2 is implemented in assembly and reports whether the AVX2 code
// path may be used on this machine.
func checkAVX2() bool
// hasAVX2 holds the result of checkAVX2, evaluated once when the package's
// variables are initialized, so block does not repeat the check per call.
var hasAVX2 = checkAVX2()
// block hashes the 64-byte blocks of p into dig, using the AVX2 routine
// when the CPU supports it and the input is at least 256 bytes long.
//
// blockAVX2 works on two blocks per iteration and, while it runs the rounds
// for the current pair, already loads the message data of the following pair
// (the PRECALC_0/PRECALC_1 loads at offset 0x80 from the block pointers).
// For the last pair those loads land beyond the slice it was given, so it
// must never be handed the tail of p: doing so reads past the end of p and
// can fault if p ends at a page boundary.
//
// To keep every load inside p, blockAVX2 only receives a prefix whose length
// is a multiple of 128 bytes, and the remaining 128 or 192 bytes are hashed
// with blockAMD64, which reads only the data it is given.
//
// NOTE(review): this assumes len(p) is a multiple of the 64-byte block size,
// as the assembly routines do; confirm against the callers.
func block(dig *digest, p []byte) {
	if !hasAVX2 || len(p) < 256 {
		blockAMD64(dig, p)
		return
	}

	// Leave at least two blocks for blockAMD64 and make the AVX2 part an
	// even number of blocks.
	safeLen := len(p) - 128
	if safeLen%128 != 0 {
		safeLen -= 64
	}
	blockAVX2(dig, p[:safeLen])
	blockAMD64(dig, p[safeLen:])
}
......@@ -2,6 +2,15 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// AVX2 version by Intel, same algorithm as code in Linux kernel:
// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
// Authors:
// Ilya Albrekht <ilya.albrekht@intel.com>
// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
// Ronen Zohar <ronen.zohar@intel.com>
// Chandramouli Narayanan <mouli@linux.intel.com>
#include "textflag.h"
// SHA1 block routine. See sha1block.go for Go equivalent.
......@@ -87,7 +96,7 @@
FUNC4(a, b, c, d, e); \
MIX(a, b, c, d, e, 0xCA62C1D6)
TEXT ·block(SB),NOSPLIT,$64-32
TEXT ·blockAMD64(SB),NOSPLIT,$64-32
MOVQ dig+0(FP), BP
MOVQ p_base+8(FP), SI
MOVQ p_len+16(FP), DX
......@@ -214,3 +223,1293 @@ end:
MOVL DX, (3*4)(DI)
MOVL BP, (4*4)(DI)
RET
// This is the implementation using AVX2. It is based on:
// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
// From http://software.intel.com/en-us/articles
// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
// This implementation is 2x unrolled, and interleaves vector instructions,
// used to precompute W, with scalar computation of current round
// for optimal scheduling.
// Trivial helper macros.
// UPDATE_HASH folds the result of one block into the digest state that R9
// points at: each of the five register arguments is added to the 32-bit word
// at (R9), 4(R9), 8(R9), 12(R9) and 16(R9) respectively, and the sum is
// written back to memory. Each register is left holding its updated word.
#define UPDATE_HASH(A,TB,C,D,E) \
ADDL (R9), A \
MOVL A, (R9) \
ADDL 4(R9), TB \
MOVL TB, 4(R9) \
ADDL 8(R9), C \
MOVL C, 8(R9) \
ADDL 12(R9), D \
MOVL D, 12(R9) \
ADDL 16(R9), E \
MOVL E, 16(R9)
// Helper macros for PRECALC, which does precomputations.
// They are the individual steps of PRECALC_00_15, kept separate so that
// they can also be issued one at a time between scalar rounds.
// PRECALC_0 loads 16 message bytes from OFFSET(R10) into X0.
#define PRECALC_0(OFFSET) \
VMOVDQU OFFSET(R10),X0
// PRECALC_1 inserts 16 message bytes from OFFSET(R13) into the upper
// 128 bits of Y0.
#define PRECALC_1(OFFSET) \
VINSERTI128 $1, OFFSET(R13), Y0, Y0
// PRECALC_2 shuffles the bytes of Y0 with the mask in Y10, result in YREG.
#define PRECALC_2(YREG) \
VPSHUFB Y10, Y0, YREG
// PRECALC_4 adds the constants at K_OFFSET(R8) to YREG, result in Y0.
#define PRECALC_4(YREG,K_OFFSET) \
VPADDD K_OFFSET(R8), YREG, Y0
// PRECALC_7 stores Y0 at byte offset OFFSET*2 from R14.
#define PRECALC_7(OFFSET) \
VMOVDQU Y0, (OFFSET*2)(R14)
// Message scheduling pre-compute for rounds 0-15
// R13 is a pointer to even 64-byte block
// R10 is a pointer to odd 64-byte block
// R14 is a pointer to temp buffer
// X0/Y0 is used as temp register
// YREG receives the byte-swapped message words
// OFFSET chooses 16 byte chunk within a block
// R8 is a pointer to constants block
// The K constants for these rounds are taken from offset 0x0 of that block
// Y10 holds the byte-swap shuffle mask
#define PRECALC_00_15(OFFSET,YREG) \
PRECALC_0(OFFSET) \
PRECALC_1(OFFSET) \
PRECALC_2(YREG) \
PRECALC_4(YREG,0x0) \
PRECALC_7(OFFSET)
// Helper macros for PRECALC_16_31.
// The computation is split into steps so that the steps can also be issued
// one at a time between scalar rounds (CALC_32 onwards do this).
// Y0 and Y9 are used as temporaries by all of the steps.
#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \ // w[i-14]
VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
VPXOR REG_SUB_8, REG, REG \
VPXOR REG_SUB_16, Y0, Y0
#define PRECALC_18(REG) \
VPXOR Y0, REG, REG \
VPSLLDQ $12, REG, Y9
// PRECALC_19 and PRECALC_20 together rotate each 32-bit lane of REG left
// by 1 (shift left 1, shift right 31, OR); the result is left in Y0.
#define PRECALC_19(REG) \
VPSLLD $1, REG, Y0 \
VPSRLD $31, REG, REG
#define PRECALC_20(REG) \
VPOR REG, Y0, Y0 \
VPSLLD $2, Y9, REG
#define PRECALC_21(REG) \
VPSRLD $30, Y9, Y9 \
VPXOR REG, Y0, Y0
// PRECALC_23 leaves the finished words in REG, adds the constants at
// K_OFFSET(R8) and stores the sums at OFFSET(R14).
#define PRECALC_23(REG,K_OFFSET,OFFSET) \
VPXOR Y9, Y0, REG \
VPADDD K_OFFSET(R8), REG, Y0 \
VMOVDQU Y0, (OFFSET)(R14)
// Message scheduling pre-compute for rounds 16-31
// the most recent w[i] values are kept in YMM registers
// pre-calculate K+w[i] values and store to mem
// for later load by ALU add instruction.
// "brute force" vectorization for rounds 16-31 only
// due to w[i]->w[i-3] dependency.
// REG receives the new w values; the REG_SUB_* registers are only read
// uses Y0 and Y9 as temp registers
// As always, R8 is a pointer to constants block
// and R14 is a pointer to temp buffer
#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
PRECALC_18(REG) \
PRECALC_19(REG) \
PRECALC_20(REG) \
PRECALC_21(REG) \
PRECALC_23(REG,K_OFFSET,OFFSET)
// Helper macros for PRECALC_32_79.
// As with the 16-31 helpers, each macro is one step of the computation so
// that the steps can be issued one at a time between scalar rounds.
// Y0 is the only temporary.
#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
#define PRECALC_33(REG_SUB_28,REG) \
VPXOR REG_SUB_28, REG, REG
#define PRECALC_34(REG_SUB_16) \
VPXOR REG_SUB_16, Y0, Y0
#define PRECALC_35(REG) \
VPXOR Y0, REG, REG
// PRECALC_36 and PRECALC_37 together rotate each 32-bit lane of REG left
// by 2 (shift left 2, shift right 30, OR); the result is left in REG.
#define PRECALC_36(REG) \
VPSLLD $2, REG, Y0
#define PRECALC_37(REG) \
VPSRLD $30, REG, REG \
VPOR REG, Y0, REG
// PRECALC_39 adds the constants at K_OFFSET(R8) to REG and stores the sums
// at OFFSET(R14).
#define PRECALC_39(REG,K_OFFSET,OFFSET) \
VPADDD K_OFFSET(R8), REG, Y0 \
VMOVDQU Y0, (OFFSET)(R14)
// Message scheduling pre-compute for rounds 32-79
// In SHA-1 specification we have:
// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
// Which is the same as:
// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
// This allows for more efficient vectorization,
// since w[i]->w[i-3] dependency is broken
// REG is both input and output: it is XORed with the other terms and
// rotated in place, so by the formula above it must hold w[i-32] on entry.
// The REG_SUB_* registers are only read; Y0 is used as temp register.
#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
PRECALC_32(REG_SUB_8,REG_SUB_4) \
PRECALC_33(REG_SUB_28,REG) \
PRECALC_34(REG_SUB_16) \
PRECALC_35(REG) \
PRECALC_36(REG) \
PRECALC_37(REG) \
PRECALC_39(REG,K_OFFSET,OFFSET)
// PRECALC runs the whole message schedule for the pair of blocks at R10/R13
// in one go, without interleaved scalar rounds: rounds 0-15, then 16-31,
// then 32-79. The W+K values for all 80 rounds are stored through R14 at
// offsets 0x0 to 0x27f.
// The K_OFFSET arguments select the constant group in the table at R8 that
// belongs to the rounds being computed.
#define PRECALC \
PRECALC_00_15(0,Y15) \
PRECALC_00_15(0x10,Y14) \
PRECALC_00_15(0x20,Y13) \
PRECALC_00_15(0x30,Y12) \
PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
// Macros calculating individual rounds have the general form
// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
// CALC_ROUND_{PRE,POST} macros follow
// CALC_F1_PRE adds the precomputed W+K word at OFFSET(R15) and the F value
// that the previous round left in REG_B to REG_E. It then starts the work for
// the next round: BP = ~REG_A & REG_C, R12 = REG_A rotated right by 27
// (i.e. left by 5) and REG_B = REG_A rotated right by 2.
// BP and R12 are scratch registers shared by all round macros.
#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
ADDL OFFSET(R15),REG_E \
ANDNL REG_C,REG_A,BP \
LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
RORXL $0x1b, REG_A, R12 \
RORXL $2, REG_A, REG_B // for next round
// Calculate F for the next round
#define CALC_F1_POST(REG_A,REG_B,REG_E) \
ANDL REG_B,REG_A \ // b&c
XORL BP, REG_A \ // F1 = (b&c) ^ (~b&d)
LEAL (REG_E)(R12*1), REG_E // E += A rotated left by 5 (held in R12)
// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
// CALC_0 is the first round of the first block of a pair. No previous round
// has prepared F for it, so it first computes F1 from SI, DI and AX into BX
// (and rotates SI right by 2) before running the usual PRE/POST halves.
// The interleaved PRECALC_0(0x80) starts loading message data for the next
// pair of blocks, 0x80 bytes beyond R10.
#define CALC_0 \
MOVL SI, BX \ // Precalculating first round
RORXL $2, SI, SI \
ANDNL AX, BX, BP \
ANDL DI, BX \
XORL BP, BX \
CALC_F1_PRE(0x0,CX,BX,DI,DX) \
PRECALC_0(0x80) \
CALC_F1_POST(CX,SI,DX)
#define CALC_1 \
CALC_F1_PRE(0x4,DX,CX,SI,AX) \
PRECALC_1(0x80) \
CALC_F1_POST(DX,BX,AX)
#define CALC_2 \
CALC_F1_PRE(0x8,AX,DX,BX,DI) \
PRECALC_2(Y15) \
CALC_F1_POST(AX,CX,DI)
#define CALC_3 \
CALC_F1_PRE(0xc,DI,AX,CX,SI) \
CALC_F1_POST(DI,DX,SI)
#define CALC_4 \
CALC_F1_PRE(0x20,SI,DI,DX,BX) \
PRECALC_4(Y15,0x0) \
CALC_F1_POST(SI,AX,BX)
#define CALC_5 \
CALC_F1_PRE(0x24,BX,SI,AX,CX) \
CALC_F1_POST(BX,DI,CX)
#define CALC_6 \
CALC_F1_PRE(0x28,CX,BX,DI,DX) \
CALC_F1_POST(CX,SI,DX)
#define CALC_7 \
CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
PRECALC_7(0x0) \
CALC_F1_POST(DX,BX,AX)
#define CALC_8 \
CALC_F1_PRE(0x40,AX,DX,BX,DI) \
PRECALC_0(0x90) \
CALC_F1_POST(AX,CX,DI)
#define CALC_9 \
CALC_F1_PRE(0x44,DI,AX,CX,SI) \
PRECALC_1(0x90) \
CALC_F1_POST(DI,DX,SI)
#define CALC_10 \
CALC_F1_PRE(0x48,SI,DI,DX,BX) \
PRECALC_2(Y14) \
CALC_F1_POST(SI,AX,BX)
#define CALC_11 \
CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
CALC_F1_POST(BX,DI,CX)
#define CALC_12 \
CALC_F1_PRE(0x60,CX,BX,DI,DX) \
PRECALC_4(Y14,0x0) \
CALC_F1_POST(CX,SI,DX)
#define CALC_13 \
CALC_F1_PRE(0x64,DX,CX,SI,AX) \
CALC_F1_POST(DX,BX,AX)
#define CALC_14 \
CALC_F1_PRE(0x68,AX,DX,BX,DI) \
CALC_F1_POST(AX,CX,DI)
#define CALC_15 \
CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
PRECALC_7(0x10) \
CALC_F1_POST(DI,DX,SI)
#define CALC_16 \
CALC_F1_PRE(0x80,SI,DI,DX,BX) \
PRECALC_0(0xa0) \
CALC_F1_POST(SI,AX,BX)
#define CALC_17 \
CALC_F1_PRE(0x84,BX,SI,AX,CX) \
PRECALC_1(0xa0) \
CALC_F1_POST(BX,DI,CX)
#define CALC_18 \
CALC_F1_PRE(0x88,CX,BX,DI,DX) \
PRECALC_2(Y13) \
CALC_F1_POST(CX,SI,DX)
// CALC_F2_PRE adds the precomputed W+K word at OFFSET(R15) and the F value
// that the previous round left in REG_B to REG_E, then sets
// R12 = REG_A rotated right by 27 (i.e. left by 5) and
// REG_B = REG_A rotated right by 2.
#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
ADDL OFFSET(R15),REG_E \
LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
RORXL $0x1b, REG_A, R12 \
RORXL $2, REG_A, REG_B // for next round
// CALC_F2_POST XORs REG_B and REG_C into REG_A, which makes REG_A the
// three-way XOR used as F by the next round, and adds R12 to REG_E.
#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
XORL REG_B, REG_A \
ADDL R12, REG_E \
XORL REG_C, REG_A
#define CALC_19 \
CALC_F2_PRE(0x8c,DX,CX,AX) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_20 \
CALC_F2_PRE(0xa0,AX,DX,DI) \
PRECALC_4(Y13,0x0) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_21 \
CALC_F2_PRE(0xa4,DI,AX,SI) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_22 \
CALC_F2_PRE(0xa8,SI,DI,BX) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_23 \
CALC_F2_PRE(0xac,BX,SI,CX) \
PRECALC_7(0x20) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_24 \
CALC_F2_PRE(0xc0,CX,BX,DX) \
PRECALC_0(0xb0) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_25 \
CALC_F2_PRE(0xc4,DX,CX,AX) \
PRECALC_1(0xb0) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_26 \
CALC_F2_PRE(0xc8,AX,DX,DI) \
PRECALC_2(Y12) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_27 \
CALC_F2_PRE(0xcc,DI,AX,SI) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_28 \
CALC_F2_PRE(0xe0,SI,DI,BX) \
PRECALC_4(Y12,0x0) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_29 \
CALC_F2_PRE(0xe4,BX,SI,CX) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_30 \
CALC_F2_PRE(0xe8,CX,BX,DX) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_31 \
CALC_F2_PRE(0xec,DX,CX,AX) \
PRECALC_7(0x30) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_32 \
CALC_F2_PRE(0x100,AX,DX,DI) \
PRECALC_16(Y15,Y14,Y12,Y8) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_33 \
CALC_F2_PRE(0x104,DI,AX,SI) \
PRECALC_17(Y15,Y13,Y8) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_34 \
CALC_F2_PRE(0x108,SI,DI,BX) \
PRECALC_18(Y8) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_35 \
CALC_F2_PRE(0x10c,BX,SI,CX) \
PRECALC_19(Y8) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_36 \
CALC_F2_PRE(0x120,CX,BX,DX) \
PRECALC_20(Y8) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_37 \
CALC_F2_PRE(0x124,DX,CX,AX) \
PRECALC_21(Y8) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_38 \
CALC_F2_PRE(0x128,AX,DX,DI) \
CALC_F2_POST(AX,CX,BX,DI)
// CALC_F3_PRE adds the precomputed W+K word at OFFSET(R15) to REG_E.
#define CALC_F3_PRE(OFFSET,REG_E) \
ADDL OFFSET(R15),REG_E
// CALC_F3_POST adds the F value that the previous round left in REG_TB to
// REG_E, then prepares the next round: REG_A becomes
// (REG_A & REG_B) | ((REG_A | REG_B) & REG_C), REG_TB becomes the old REG_A
// rotated right by 2, and the old REG_A rotated left by 5 (via R12) is added
// to REG_E. BP and R12 are scratch.
#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
MOVL REG_B, BP \
ORL REG_A, BP \
RORXL $0x1b, REG_A, R12 \
RORXL $2, REG_A, REG_TB \
ANDL REG_C, BP \ // Calculate F for the next round
ANDL REG_B, REG_A \
ORL BP, REG_A \
ADDL R12, REG_E
#define CALC_39 \
CALC_F3_PRE(0x12c,SI) \
PRECALC_23(Y8,0x0,0x80) \
CALC_F3_POST(DI,DX,CX,SI,AX)
#define CALC_40 \
CALC_F3_PRE(0x140,BX) \
PRECALC_16(Y14,Y13,Y8,Y7) \
CALC_F3_POST(SI,AX,DX,BX,DI)
#define CALC_41 \
CALC_F3_PRE(0x144,CX) \
PRECALC_17(Y14,Y12,Y7) \
CALC_F3_POST(BX,DI,AX,CX,SI)
#define CALC_42 \
CALC_F3_PRE(0x148,DX) \
PRECALC_18(Y7) \
CALC_F3_POST(CX,SI,DI,DX,BX)
#define CALC_43 \
CALC_F3_PRE(0x14c,AX) \
PRECALC_19(Y7) \
CALC_F3_POST(DX,BX,SI,AX,CX)
#define CALC_44 \
CALC_F3_PRE(0x160,DI) \
PRECALC_20(Y7) \
CALC_F3_POST(AX,CX,BX,DI,DX)
#define CALC_45 \
CALC_F3_PRE(0x164,SI) \
PRECALC_21(Y7) \
CALC_F3_POST(DI,DX,CX,SI,AX)
#define CALC_46 \
CALC_F3_PRE(0x168,BX) \
CALC_F3_POST(SI,AX,DX,BX,DI)
// CALC_47 is an F3 round that carries the final step of the rounds 16-31
// message schedule for Y7. It uses the PRECALC_23 helper for that step, the
// same way CALC_39, CALC_55 and CALC_63 do for Y8, Y5 and Y3; the expansion
// is instruction-for-instruction what was previously written out by hand.
#define CALC_47 \
CALC_F3_PRE(0x16c,CX) \
PRECALC_23(Y7,0x20,0xa0) \
CALC_F3_POST(BX,DI,AX,CX,SI)
#define CALC_48 \
CALC_F3_PRE(0x180,DX) \
PRECALC_16(Y13,Y12,Y7,Y5) \
CALC_F3_POST(CX,SI,DI,DX,BX)
#define CALC_49 \
CALC_F3_PRE(0x184,AX) \
PRECALC_17(Y13,Y8,Y5) \
CALC_F3_POST(DX,BX,SI,AX,CX)
#define CALC_50 \
CALC_F3_PRE(0x188,DI) \
PRECALC_18(Y5) \
CALC_F3_POST(AX,CX,BX,DI,DX)
#define CALC_51 \
CALC_F3_PRE(0x18c,SI) \
PRECALC_19(Y5) \
CALC_F3_POST(DI,DX,CX,SI,AX)
#define CALC_52 \
CALC_F3_PRE(0x1a0,BX) \
PRECALC_20(Y5) \
CALC_F3_POST(SI,AX,DX,BX,DI)
#define CALC_53 \
CALC_F3_PRE(0x1a4,CX) \
PRECALC_21(Y5) \
CALC_F3_POST(BX,DI,AX,CX,SI)
#define CALC_54 \
CALC_F3_PRE(0x1a8,DX) \
CALC_F3_POST(CX,SI,DI,DX,BX)
#define CALC_55 \
CALC_F3_PRE(0x1ac,AX) \
PRECALC_23(Y5,0x20,0xc0) \
CALC_F3_POST(DX,BX,SI,AX,CX)
#define CALC_56 \
CALC_F3_PRE(0x1c0,DI) \
PRECALC_16(Y12,Y8,Y5,Y3) \
CALC_F3_POST(AX,CX,BX,DI,DX)
#define CALC_57 \
CALC_F3_PRE(0x1c4,SI) \
PRECALC_17(Y12,Y7,Y3) \
CALC_F3_POST(DI,DX,CX,SI,AX)
#define CALC_58 \
CALC_F3_PRE(0x1c8,BX) \
PRECALC_18(Y3) \
CALC_F3_POST(SI,AX,DX,BX,DI)
#define CALC_59 \
CALC_F2_PRE(0x1cc,BX,SI,CX) \
PRECALC_19(Y3) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_60 \
CALC_F2_PRE(0x1e0,CX,BX,DX) \
PRECALC_20(Y3) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_61 \
CALC_F2_PRE(0x1e4,DX,CX,AX) \
PRECALC_21(Y3) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_62 \
CALC_F2_PRE(0x1e8,AX,DX,DI) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_63 \
CALC_F2_PRE(0x1ec,DI,AX,SI) \
PRECALC_23(Y3,0x20,0xe0) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_64 \
CALC_F2_PRE(0x200,SI,DI,BX) \
PRECALC_32(Y5,Y3) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_65 \
CALC_F2_PRE(0x204,BX,SI,CX) \
PRECALC_33(Y14,Y15) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_66 \
CALC_F2_PRE(0x208,CX,BX,DX) \
PRECALC_34(Y8) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_67 \
CALC_F2_PRE(0x20c,DX,CX,AX) \
PRECALC_35(Y15) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_68 \
CALC_F2_PRE(0x220,AX,DX,DI) \
PRECALC_36(Y15) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_69 \
CALC_F2_PRE(0x224,DI,AX,SI) \
PRECALC_37(Y15) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_70 \
CALC_F2_PRE(0x228,SI,DI,BX) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_71 \
CALC_F2_PRE(0x22c,BX,SI,CX) \
PRECALC_39(Y15,0x20,0x100) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_72 \
CALC_F2_PRE(0x240,CX,BX,DX) \
PRECALC_32(Y3,Y15) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_73 \
CALC_F2_PRE(0x244,DX,CX,AX) \
PRECALC_33(Y13,Y14) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_74 \
CALC_F2_PRE(0x248,AX,DX,DI) \
PRECALC_34(Y7) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_75 \
CALC_F2_PRE(0x24c,DI,AX,SI) \
PRECALC_35(Y14) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_76 \
CALC_F2_PRE(0x260,SI,DI,BX) \
PRECALC_36(Y14) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_77 \
CALC_F2_PRE(0x264,BX,SI,CX) \
PRECALC_37(Y14) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_78 \
CALC_F2_PRE(0x268,CX,BX,DX) \
CALC_F2_POST(CX,SI,DI,DX)
// CALC_79 is the last round of the first block of a pair (CALC runs
// UPDATE_HASH right after it). It adds W+K, the F value left in CX and
// DX rotated left by 5 to AX; unlike the regular F2 rounds it prepares
// nothing for a following round.
#define CALC_79 \
ADDL 0x26c(R15), AX \
LEAL (AX)(CX*1), AX \
RORXL $0x1b, DX, R12 \
PRECALC_39(Y14,0x20,0x120) \
ADDL R12, AX
// Similar to CALC_0: first round of the second block of a pair.
// F1 for this round is computed up front from CX, BX and SI into DX
// (and CX is rotated right by 2), then the usual PRE/POST halves run.
// The W+K words of the second block are read from the upper 16 bytes of each
// 32-byte slot in the buffer, hence the 0x10 starting offset.
#define CALC_80 \
MOVL CX, DX \
RORXL $2, CX, CX \
ANDNL SI, DX, BP \
ANDL BX, DX \
XORL BP, DX \
CALC_F1_PRE(0x10,AX,DX,BX,DI) \
PRECALC_32(Y15,Y14) \
CALC_F1_POST(AX,CX,DI)
#define CALC_81 \
CALC_F1_PRE(0x14,DI,AX,CX,SI) \
PRECALC_33(Y12,Y13) \
CALC_F1_POST(DI,DX,SI)
#define CALC_82 \
CALC_F1_PRE(0x18,SI,DI,DX,BX) \
PRECALC_34(Y5) \
CALC_F1_POST(SI,AX,BX)
#define CALC_83 \
CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
PRECALC_35(Y13) \
CALC_F1_POST(BX,DI,CX)
#define CALC_84 \
CALC_F1_PRE(0x30,CX,BX,DI,DX) \
PRECALC_36(Y13) \
CALC_F1_POST(CX,SI,DX)
#define CALC_85 \
CALC_F1_PRE(0x34,DX,CX,SI,AX) \
PRECALC_37(Y13) \
CALC_F1_POST(DX,BX,AX)
#define CALC_86 \
CALC_F1_PRE(0x38,AX,DX,BX,DI) \
CALC_F1_POST(AX,CX,DI)
#define CALC_87 \
CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
PRECALC_39(Y13,0x40,0x140) \
CALC_F1_POST(DI,DX,SI)
#define CALC_88 \
CALC_F1_PRE(0x50,SI,DI,DX,BX) \
PRECALC_32(Y14,Y13) \
CALC_F1_POST(SI,AX,BX)
#define CALC_89 \
CALC_F1_PRE(0x54,BX,SI,AX,CX) \
PRECALC_33(Y8,Y12) \
CALC_F1_POST(BX,DI,CX)
#define CALC_90 \
CALC_F1_PRE(0x58,CX,BX,DI,DX) \
PRECALC_34(Y3) \
CALC_F1_POST(CX,SI,DX)
#define CALC_91 \
CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
PRECALC_35(Y12) \
CALC_F1_POST(DX,BX,AX)
#define CALC_92 \
CALC_F1_PRE(0x70,AX,DX,BX,DI) \
PRECALC_36(Y12) \
CALC_F1_POST(AX,CX,DI)
#define CALC_93 \
CALC_F1_PRE(0x74,DI,AX,CX,SI) \
PRECALC_37(Y12) \
CALC_F1_POST(DI,DX,SI)
#define CALC_94 \
CALC_F1_PRE(0x78,SI,DI,DX,BX) \
CALC_F1_POST(SI,AX,BX)
#define CALC_95 \
CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
PRECALC_39(Y12,0x40,0x160) \
CALC_F1_POST(BX,DI,CX)
#define CALC_96 \
CALC_F1_PRE(0x90,CX,BX,DI,DX) \
PRECALC_32(Y13,Y12) \
CALC_F1_POST(CX,SI,DX)
#define CALC_97 \
CALC_F1_PRE(0x94,DX,CX,SI,AX) \
PRECALC_33(Y7,Y8) \
CALC_F1_POST(DX,BX,AX)
#define CALC_98 \
CALC_F1_PRE(0x98,AX,DX,BX,DI) \
PRECALC_34(Y15) \
CALC_F1_POST(AX,CX,DI)
#define CALC_99 \
CALC_F2_PRE(0x9c,DI,AX,SI) \
PRECALC_35(Y8) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_100 \
CALC_F2_PRE(0xb0,SI,DI,BX) \
PRECALC_36(Y8) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_101 \
CALC_F2_PRE(0xb4,BX,SI,CX) \
PRECALC_37(Y8) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_102 \
CALC_F2_PRE(0xb8,CX,BX,DX) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_103 \
CALC_F2_PRE(0xbc,DX,CX,AX) \
PRECALC_39(Y8,0x40,0x180) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_104 \
CALC_F2_PRE(0xd0,AX,DX,DI) \
PRECALC_32(Y12,Y8) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_105 \
CALC_F2_PRE(0xd4,DI,AX,SI) \
PRECALC_33(Y5,Y7) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_106 \
CALC_F2_PRE(0xd8,SI,DI,BX) \
PRECALC_34(Y14) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_107 \
CALC_F2_PRE(0xdc,BX,SI,CX) \
PRECALC_35(Y7) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_108 \
CALC_F2_PRE(0xf0,CX,BX,DX) \
PRECALC_36(Y7) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_109 \
CALC_F2_PRE(0xf4,DX,CX,AX) \
PRECALC_37(Y7) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_110 \
CALC_F2_PRE(0xf8,AX,DX,DI) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_111 \
CALC_F2_PRE(0xfc,DI,AX,SI) \
PRECALC_39(Y7,0x40,0x1a0) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_112 \
CALC_F2_PRE(0x110,SI,DI,BX) \
PRECALC_32(Y8,Y7) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_113 \
CALC_F2_PRE(0x114,BX,SI,CX) \
PRECALC_33(Y3,Y5) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_114 \
CALC_F2_PRE(0x118,CX,BX,DX) \
PRECALC_34(Y13) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_115 \
CALC_F2_PRE(0x11c,DX,CX,AX) \
PRECALC_35(Y5) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_116 \
CALC_F2_PRE(0x130,AX,DX,DI) \
PRECALC_36(Y5) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_117 \
CALC_F2_PRE(0x134,DI,AX,SI) \
PRECALC_37(Y5) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_118 \
CALC_F2_PRE(0x138,SI,DI,BX) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_119 \
CALC_F3_PRE(0x13c,CX) \
PRECALC_39(Y5,0x40,0x1c0) \
CALC_F3_POST(BX,DI,AX,CX,SI)
#define CALC_120 \
CALC_F3_PRE(0x150,DX) \
PRECALC_32(Y7,Y5) \
CALC_F3_POST(CX,SI,DI,DX,BX)
#define CALC_121 \
CALC_F3_PRE(0x154,AX) \
PRECALC_33(Y15,Y3) \
CALC_F3_POST(DX,BX,SI,AX,CX)
#define CALC_122 \
CALC_F3_PRE(0x158,DI) \
PRECALC_34(Y12) \
CALC_F3_POST(AX,CX,BX,DI,DX)
#define CALC_123 \
CALC_F3_PRE(0x15c,SI) \
PRECALC_35(Y3) \
CALC_F3_POST(DI,DX,CX,SI,AX)
#define CALC_124 \
CALC_F3_PRE(0x170,BX) \
PRECALC_36(Y3) \
CALC_F3_POST(SI,AX,DX,BX,DI)
#define CALC_125 \
CALC_F3_PRE(0x174,CX) \
PRECALC_37(Y3) \
CALC_F3_POST(BX,DI,AX,CX,SI)
#define CALC_126 \
CALC_F3_PRE(0x178,DX) \
CALC_F3_POST(CX,SI,DI,DX,BX)
#define CALC_127 \
CALC_F3_PRE(0x17c,AX) \
PRECALC_39(Y3,0x60,0x1e0) \
CALC_F3_POST(DX,BX,SI,AX,CX)
#define CALC_128 \
CALC_F3_PRE(0x190,DI) \
PRECALC_32(Y5,Y3) \
CALC_F3_POST(AX,CX,BX,DI,DX)
#define CALC_129 \
CALC_F3_PRE(0x194,SI) \
PRECALC_33(Y14,Y15) \
CALC_F3_POST(DI,DX,CX,SI,AX)
#define CALC_130 \
CALC_F3_PRE(0x198,BX) \
PRECALC_34(Y8) \
CALC_F3_POST(SI,AX,DX,BX,DI)
#define CALC_131 \
CALC_F3_PRE(0x19c,CX) \
PRECALC_35(Y15) \
CALC_F3_POST(BX,DI,AX,CX,SI)
#define CALC_132 \
CALC_F3_PRE(0x1b0,DX) \
PRECALC_36(Y15) \
CALC_F3_POST(CX,SI,DI,DX,BX)
#define CALC_133 \
CALC_F3_PRE(0x1b4,AX) \
PRECALC_37(Y15) \
CALC_F3_POST(DX,BX,SI,AX,CX)
#define CALC_134 \
CALC_F3_PRE(0x1b8,DI) \
CALC_F3_POST(AX,CX,BX,DI,DX)
#define CALC_135 \
CALC_F3_PRE(0x1bc,SI) \
PRECALC_39(Y15,0x60,0x200) \
CALC_F3_POST(DI,DX,CX,SI,AX)
#define CALC_136 \
CALC_F3_PRE(0x1d0,BX) \
PRECALC_32(Y3,Y15) \
CALC_F3_POST(SI,AX,DX,BX,DI)
#define CALC_137 \
CALC_F3_PRE(0x1d4,CX) \
PRECALC_33(Y13,Y14) \
CALC_F3_POST(BX,DI,AX,CX,SI)
#define CALC_138 \
CALC_F3_PRE(0x1d8,DX) \
PRECALC_34(Y7) \
CALC_F3_POST(CX,SI,DI,DX,BX)
#define CALC_139 \
CALC_F2_PRE(0x1dc,DX,CX,AX) \
PRECALC_35(Y14) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_140 \
CALC_F2_PRE(0x1f0,AX,DX,DI) \
PRECALC_36(Y14) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_141 \
CALC_F2_PRE(0x1f4,DI,AX,SI) \
PRECALC_37(Y14) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_142 \
CALC_F2_PRE(0x1f8,SI,DI,BX) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_143 \
CALC_F2_PRE(0x1fc,BX,SI,CX) \
PRECALC_39(Y14,0x60,0x220) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_144 \
CALC_F2_PRE(0x210,CX,BX,DX) \
PRECALC_32(Y15,Y14) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_145 \
CALC_F2_PRE(0x214,DX,CX,AX) \
PRECALC_33(Y12,Y13) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_146 \
CALC_F2_PRE(0x218,AX,DX,DI) \
PRECALC_34(Y5) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_147 \
CALC_F2_PRE(0x21c,DI,AX,SI) \
PRECALC_35(Y13) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_148 \
CALC_F2_PRE(0x230,SI,DI,BX) \
PRECALC_36(Y13) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_149 \
CALC_F2_PRE(0x234,BX,SI,CX) \
PRECALC_37(Y13) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_150 \
CALC_F2_PRE(0x238,CX,BX,DX) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_151 \
CALC_F2_PRE(0x23c,DX,CX,AX) \
PRECALC_39(Y13,0x60,0x240) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_152 \
CALC_F2_PRE(0x250,AX,DX,DI) \
PRECALC_32(Y14,Y13) \
CALC_F2_POST(AX,CX,BX,DI)
#define CALC_153 \
CALC_F2_PRE(0x254,DI,AX,SI) \
PRECALC_33(Y8,Y12) \
CALC_F2_POST(DI,DX,CX,SI)
#define CALC_154 \
CALC_F2_PRE(0x258,SI,DI,BX) \
PRECALC_34(Y3) \
CALC_F2_POST(SI,AX,DX,BX)
#define CALC_155 \
CALC_F2_PRE(0x25c,BX,SI,CX) \
PRECALC_35(Y12) \
CALC_F2_POST(BX,DI,AX,CX)
#define CALC_156 \
CALC_F2_PRE(0x270,CX,BX,DX) \
PRECALC_36(Y12) \
CALC_F2_POST(CX,SI,DI,DX)
#define CALC_157 \
CALC_F2_PRE(0x274,DX,CX,AX) \
PRECALC_37(Y12) \
CALC_F2_POST(DX,BX,SI,AX)
#define CALC_158 \
CALC_F2_PRE(0x278,AX,DX,DI) \
CALC_F2_POST(AX,CX,BX,DI)
// CALC_159 is the last round of the second block of a pair (CALC runs
// UPDATE_HASH right after it). It adds W+K, the F value left in AX and
// DI rotated left by 5 to SI, and issues the final store of the interleaved
// message schedule (offset 0x260 of the buffer at R14).
#define CALC_159 \
ADDL 0x27c(R15),SI \
LEAL (SI)(AX*1), SI \
RORXL $0x1b, DI, R12 \
PRECALC_39(Y12,0x60,0x260) \
ADDL R12, SI
// CALC is the body of blockAVX2. On entry R9 points at the five 32-bit
// state words, R10 and R13 at the two message blocks of the current pair,
// R11 holds the end marker computed by the caller, R8 points at the K
// constants and Y10 holds the byte-swap mask.
// The stack frame holds two W+K buffers addressed through R14 and R15.
// PRECALC fills the first one for the first pair of blocks; afterwards the
// rounds read their W+K words through R15 while the interleaved PRECALC_*
// steps store the values for the next pair through R14, and the two
// pointers are exchanged after every pair.
// R10 doubles as the termination flag: when the input is used up it is
// overwritten with R8, and the comparisons of R10 against R8 then leave the
// loop through VZEROUPPER and RET.
#define CALC \
MOVL (R9), CX \
MOVL 4(R9), SI \
MOVL 8(R9), DI \
MOVL 12(R9), AX \
MOVL 16(R9), DX \
MOVQ SP, R14 \
LEAQ (2*4*80+32)(SP), R15 \
PRECALC \ // Precalc WK for first 2 blocks
XCHGQ R15, R14 \
loop: \ // this loop is unrolled
CMPQ R10, R8 \ // we use R8 value (set below) as a signal of a last block
JNE begin \
VZEROUPPER \
RET \
begin: \
CALC_0 \
CALC_1 \
CALC_2 \
CALC_3 \
CALC_4 \
CALC_5 \
CALC_6 \
CALC_7 \
CALC_8 \
CALC_9 \
CALC_10 \
CALC_11 \
CALC_12 \
CALC_13 \
CALC_14 \
CALC_15 \
CALC_16 \
CALC_17 \
CALC_18 \
CALC_19 \
CALC_20 \
CALC_21 \
CALC_22 \
CALC_23 \
CALC_24 \
CALC_25 \
CALC_26 \
CALC_27 \
CALC_28 \
CALC_29 \
CALC_30 \
CALC_31 \
CALC_32 \
CALC_33 \
CALC_34 \
CALC_35 \
CALC_36 \
CALC_37 \
CALC_38 \
CALC_39 \
CALC_40 \
CALC_41 \
CALC_42 \
CALC_43 \
CALC_44 \
CALC_45 \
CALC_46 \
CALC_47 \
CALC_48 \
CALC_49 \
CALC_50 \
CALC_51 \
CALC_52 \
CALC_53 \
CALC_54 \
CALC_55 \
CALC_56 \
CALC_57 \
CALC_58 \
CALC_59 \
ADDQ $128, R10 \ // move to next odd-64-byte block
CMPQ R10, R11 \ // is current block the last one?
CMOVQCC R8, R10 \ // signal the last iteration smartly
CALC_60 \
CALC_61 \
CALC_62 \
CALC_63 \
CALC_64 \
CALC_65 \
CALC_66 \
CALC_67 \
CALC_68 \
CALC_69 \
CALC_70 \
CALC_71 \
CALC_72 \
CALC_73 \
CALC_74 \
CALC_75 \
CALC_76 \
CALC_77 \
CALC_78 \
CALC_79 \
UPDATE_HASH(AX,DX,BX,SI,DI) \
CMPQ R10, R8 \ // is current block the last one?
JE loop\
MOVL DX, CX \
CALC_80 \
CALC_81 \
CALC_82 \
CALC_83 \
CALC_84 \
CALC_85 \
CALC_86 \
CALC_87 \
CALC_88 \
CALC_89 \
CALC_90 \
CALC_91 \
CALC_92 \
CALC_93 \
CALC_94 \
CALC_95 \
CALC_96 \
CALC_97 \
CALC_98 \
CALC_99 \
CALC_100 \
CALC_101 \
CALC_102 \
CALC_103 \
CALC_104 \
CALC_105 \
CALC_106 \
CALC_107 \
CALC_108 \
CALC_109 \
CALC_110 \
CALC_111 \
CALC_112 \
CALC_113 \
CALC_114 \
CALC_115 \
CALC_116 \
CALC_117 \
CALC_118 \
CALC_119 \
CALC_120 \
CALC_121 \
CALC_122 \
CALC_123 \
CALC_124 \
CALC_125 \
CALC_126 \
CALC_127 \
CALC_128 \
CALC_129 \
CALC_130 \
CALC_131 \
CALC_132 \
CALC_133 \
CALC_134 \
CALC_135 \
CALC_136 \
CALC_137 \
CALC_138 \
CALC_139 \
ADDQ $128, R13 \ // move to next even-64-byte block
CMPQ R13, R11 \ // is current block the last one?
CMOVQCC R8, R10 \
CALC_140 \
CALC_141 \
CALC_142 \
CALC_143 \
CALC_144 \
CALC_145 \
CALC_146 \
CALC_147 \
CALC_148 \
CALC_149 \
CALC_150 \
CALC_151 \
CALC_152 \
CALC_153 \
CALC_154 \
CALC_155 \
CALC_156 \
CALC_157 \
CALC_158 \
CALC_159 \
UPDATE_HASH(SI,DI,DX,CX,BX) \
MOVL SI, R12 \ // Reset state for AVX2 reg permutation
MOVL DI, SI \
MOVL DX, DI \
MOVL BX, DX \
MOVL CX, AX \
MOVL R12, CX \
XCHGQ R15, R14 \
JMP loop
// func blockAVX2(dig *digest, p []byte)
// Sets up the registers that the CALC macro expects and then runs it.
// The 1408-byte frame provides the W+K scratch buffers used by CALC.
TEXT ·blockAVX2(SB),$1408-32
MOVQ dig+0(FP), DI
MOVQ p_base+8(FP), SI
MOVQ p_len+16(FP), DX
// Round the length down to a multiple of 64, the block size.
SHRQ $6, DX
SHLQ $6, DX
MOVQ $K_XMM_AR<>(SB), R8
// R9 = digest state, R10 = first block, R13 = second block.
MOVQ DI, R9
MOVQ SI, R10
LEAQ 64(SI), R13
// R11 = p_base + rounded length + 64, the end marker the loop compares
// its block pointers against.
ADDQ SI, DX
ADDQ $64, DX
MOVQ DX, R11
// If the second-block pointer is not below the end marker, redirect it to
// the constants table (presumably so that loads through R13 stay on valid
// memory).
CMPQ R13, R11
CMOVQCC R8, R13
// R8 is borrowed to load the byte-swap mask into Y10.
MOVQ $BSWAP_SHUFB_CTL<>(SB), R8
VMOVDQU (R8), Y10
MOVQ $K_XMM_AR<>(SB), R8 // restore R8
CALC // RET is inside macros
// func checkAVX2() bool
// Reports whether blockAVX2 may be used on this CPU.
// AVX2 alone is not sufficient: the round macros of blockAVX2 use ANDNL,
// which is a BMI1 instruction, and RORXL, which is a BMI2 instruction.
// A CPU or hypervisor that exposes AVX2 without those extensions would
// fault on them, so both feature bits are checked as well.
TEXT ·checkAVX2(SB),NOSPLIT,$0
CMPB runtime·support_avx2(SB), $1
JNE noavx2
// CPUID leaf 7, sub-leaf 0: EBX bit 3 reports BMI1, bit 8 reports BMI2.
// AVX2 is reported by the same leaf, so the leaf exists when AVX2 does.
MOVL $7, AX
MOVL $0, CX
CPUID
ANDL $0x108, BX
CMPL BX, $0x108
JNE noavx2
MOVB $1, ret+0(FP)
RET
noavx2:
MOVB $0, ret+0(FP)
RET
// K_XMM_AR holds the four SHA-1 round constants. Each constant is repeated
// in eight consecutive 32-bit words (32 bytes), so that one 256-bit VPADDD
// adds it to every lane; the groups start at offsets 0x00, 0x20, 0x40 and
// 0x60, which are the K_OFFSET values used by the PRECALC macros.
DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
GLOBL K_XMM_AR<>(SB),RODATA,$128
// BSWAP_SHUFB_CTL is the VPSHUFB control mask that blockAVX2 loads into Y10.
// Within every 32-bit word it selects the source bytes in reverse order,
// which converts the big-endian message words to the CPU's byte order.
// The same 16-byte pattern is stored twice, once per 128-bit lane.
DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64 amd64p32 arm 386 s390x
// +build amd64p32 arm 386 s390x
package sha1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment