Commit de85fc46 authored by Dave Watson, committed by Herbert Xu

crypto: aesni - Introduce gcm_context_data

Add the gcm_context_data structure to the avx asm routines.
This will be necessary to support both 256-bit keys and
scatter/gather.

The pre-computed HashKeys are now stored in the gcm_context_data
struct, which is expanded to hold the greater number of hashkeys
necessary for avx.

Loads and stores to the new struct are always done unaligned to
avoid compiler issues; see e5b954e8 ("Use unaligned loads from
gcm_context_data").
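For reference, a minimal sketch of the layout the asm offsets below
assume (the fields ahead of hash_keys follow the existing
struct gcm_context_data in aesni-intel_glue.c; only hash_keys grows
from 8 to 16 blocks in this patch, and kernel u8/u64 types are swapped
for stdint equivalents so the sketch compiles standalone):

  #include <stdint.h>
  #include <stddef.h>
  #include <assert.h>

  #define GCM_BLOCK_LEN 16

  /* Sketch of the expanded context; field order matches the existing
   * struct gcm_context_data in aesni-intel_glue.c. */
  struct gcm_context_data {
          uint8_t  aad_hash[GCM_BLOCK_LEN];
          uint64_t aad_length;
          uint64_t in_length;
          uint8_t  partial_block_enc_key[GCM_BLOCK_LEN];
          uint8_t  orig_IV[GCM_BLOCK_LEN];
          uint8_t  current_counter[GCM_BLOCK_LEN];
          uint64_t partial_block_len;
          uint64_t unused;
          /* Room for HashKey^1..^8 plus the eight Karatsuba XOR values
           * used by the avx routines (was GCM_BLOCK_LEN * 8). */
          uint8_t  hash_keys[GCM_BLOCK_LEN * 16];
  };

  int main(void)
  {
          /* The asm addresses hash_keys relative to the struct base:
           * HashKey = 16*6 ... HashKey_8_k = 16*21. */
          assert(offsetof(struct gcm_context_data, hash_keys) == 16 * 6);
          assert(sizeof(struct gcm_context_data) == 16 * 22);
          return 0;
  }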
Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent f9b1d646
...@@ -182,43 +182,22 @@ aad_shift_arr: ...@@ -182,43 +182,22 @@ aad_shift_arr:
.text .text
##define the fields of the gcm aes context HashKey = 16*6 # store HashKey <<1 mod poly here
#{ HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
# u8 expanded_keys[16*11] store expanded keys HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
#} gcm_ctx#
HashKey = 16*11 # store HashKey <<1 mod poly here
HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
#define arg1 %rdi #define arg1 %rdi
#define arg2 %rsi #define arg2 %rsi
...@@ -229,6 +208,7 @@ HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu ...@@ -229,6 +208,7 @@ HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu
#define arg7 STACK_OFFSET+8*1(%r14) #define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14) #define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14) #define arg9 STACK_OFFSET+8*3(%r14)
#define arg10 STACK_OFFSET+8*4(%r14)
i = 0 i = 0
j = 0 j = 0
...@@ -300,9 +280,9 @@ VARIABLE_OFFSET = 16*8 ...@@ -300,9 +280,9 @@ VARIABLE_OFFSET = 16*8
and $~63, %rsp # align rsp to 64 bytes and $~63, %rsp # align rsp to 64 bytes
vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
mov arg4, %r13 # save the number of bytes of plaintext/ciphertext mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
and $-16, %r13 # r13 = r13 - (r13 mod 16) and $-16, %r13 # r13 = r13 - (r13 mod 16)
mov %r13, %r12 mov %r13, %r12
...@@ -413,11 +393,11 @@ _eight_cipher_left\@: ...@@ -413,11 +393,11 @@ _eight_cipher_left\@:
_zero_cipher_left\@: _zero_cipher_left\@:
cmp $16, arg4 cmp $16, arg5
jl _only_less_than_16\@ jl _only_less_than_16\@
mov arg4, %r13 mov arg5, %r13
and $15, %r13 # r13 = (arg4 mod 16) and $15, %r13 # r13 = (arg5 mod 16)
je _multiple_of_16_bytes\@ je _multiple_of_16_bytes\@
...@@ -430,7 +410,7 @@ _zero_cipher_left\@: ...@@ -430,7 +410,7 @@ _zero_cipher_left\@:
sub $16, %r11 sub $16, %r11
add %r13, %r11 add %r13, %r11
vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block vmovdqu (arg4, %r11), %xmm1 # receive the last <16 Byte block
lea SHIFT_MASK+16(%rip), %r12 lea SHIFT_MASK+16(%rip), %r12
sub %r13, %r12 # adjust the shuffle mask pointer to be sub %r13, %r12 # adjust the shuffle mask pointer to be
...@@ -442,8 +422,8 @@ _zero_cipher_left\@: ...@@ -442,8 +422,8 @@ _zero_cipher_left\@:
_only_less_than_16\@: _only_less_than_16\@:
# check for 0 length # check for 0 length
mov arg4, %r13 mov arg5, %r13
and $15, %r13 # r13 = (arg4 mod 16) and $15, %r13 # r13 = (arg5 mod 16)
je _multiple_of_16_bytes\@ je _multiple_of_16_bytes\@
...@@ -461,7 +441,7 @@ _only_less_than_16\@: ...@@ -461,7 +441,7 @@ _only_less_than_16\@:
# number of bytes in plaintext mod 16) # number of bytes in plaintext mod 16)
_get_last_16_byte_loop\@: _get_last_16_byte_loop\@:
movb (arg3, %r11), %al movb (arg4, %r11), %al
movb %al, TMP1 (%rsp , %r11) movb %al, TMP1 (%rsp , %r11)
add $1, %r11 add $1, %r11
cmp %r13, %r11 cmp %r13, %r11
...@@ -506,14 +486,14 @@ _final_ghash_mul\@: ...@@ -506,14 +486,14 @@ _final_ghash_mul\@:
cmp $8, %r13 cmp $8, %r13
jle _less_than_8_bytes_left\@ jle _less_than_8_bytes_left\@
mov %rax, (arg2 , %r11) mov %rax, (arg3 , %r11)
add $8, %r11 add $8, %r11
vpsrldq $8, %xmm9, %xmm9 vpsrldq $8, %xmm9, %xmm9
vmovq %xmm9, %rax vmovq %xmm9, %rax
sub $8, %r13 sub $8, %r13
_less_than_8_bytes_left\@: _less_than_8_bytes_left\@:
movb %al, (arg2 , %r11) movb %al, (arg3 , %r11)
add $1, %r11 add $1, %r11
shr $8, %rax shr $8, %rax
sub $1, %r13 sub $1, %r13
...@@ -521,12 +501,12 @@ _less_than_8_bytes_left\@: ...@@ -521,12 +501,12 @@ _less_than_8_bytes_left\@:
############################# #############################
_multiple_of_16_bytes\@: _multiple_of_16_bytes\@:
mov arg7, %r12 # r12 = aadLen (number of bytes) mov arg8, %r12 # r12 = aadLen (number of bytes)
shl $3, %r12 # convert into number of bits shl $3, %r12 # convert into number of bits
vmovd %r12d, %xmm15 # len(A) in xmm15 vmovd %r12d, %xmm15 # len(A) in xmm15
shl $3, arg4 # len(C) in bits (*128) shl $3, arg5 # len(C) in bits (*128)
vmovq arg4, %xmm1 vmovq arg5, %xmm1
vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
...@@ -534,7 +514,7 @@ _multiple_of_16_bytes\@: ...@@ -534,7 +514,7 @@ _multiple_of_16_bytes\@:
\GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
mov arg5, %rax # rax = *Y0 mov arg6, %rax # rax = *Y0
vmovdqu (%rax), %xmm9 # xmm9 = Y0 vmovdqu (%rax), %xmm9 # xmm9 = Y0
ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
...@@ -544,8 +524,8 @@ _multiple_of_16_bytes\@: ...@@ -544,8 +524,8 @@ _multiple_of_16_bytes\@:
_return_T\@: _return_T\@:
mov arg8, %r10 # r10 = authTag mov arg9, %r10 # r10 = authTag
mov arg9, %r11 # r11 = auth_tag_len mov arg10, %r11 # r11 = auth_tag_len
cmp $16, %r11 cmp $16, %r11
je _T_16\@ je _T_16\@
...@@ -655,49 +635,49 @@ _return_T_done\@: ...@@ -655,49 +635,49 @@ _return_T_done\@:
vpshufd $0b01001110, \T5, \T1 vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1 vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_k(arg1) vmovdqu \T1, HashKey_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
vpshufd $0b01001110, \T5, \T1 vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1 vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_2_k(arg1) vmovdqu \T1, HashKey_2_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
vmovdqa \T5, HashKey_3(arg1) vmovdqu \T5, HashKey_3(arg2)
vpshufd $0b01001110, \T5, \T1 vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1 vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_3_k(arg1) vmovdqu \T1, HashKey_3_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
vmovdqa \T5, HashKey_4(arg1) vmovdqu \T5, HashKey_4(arg2)
vpshufd $0b01001110, \T5, \T1 vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1 vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_4_k(arg1) vmovdqu \T1, HashKey_4_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
vmovdqa \T5, HashKey_5(arg1) vmovdqu \T5, HashKey_5(arg2)
vpshufd $0b01001110, \T5, \T1 vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1 vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_5_k(arg1) vmovdqu \T1, HashKey_5_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
vmovdqa \T5, HashKey_6(arg1) vmovdqu \T5, HashKey_6(arg2)
vpshufd $0b01001110, \T5, \T1 vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1 vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_6_k(arg1) vmovdqu \T1, HashKey_6_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
vmovdqa \T5, HashKey_7(arg1) vmovdqu \T5, HashKey_7(arg2)
vpshufd $0b01001110, \T5, \T1 vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1 vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_7_k(arg1) vmovdqu \T1, HashKey_7_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
vmovdqa \T5, HashKey_8(arg1) vmovdqu \T5, HashKey_8(arg2)
vpshufd $0b01001110, \T5, \T1 vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1 vpxor \T5, \T1, \T1
vmovdqa \T1, HashKey_8_k(arg1) vmovdqu \T1, HashKey_8_k(arg2)
.endm .endm
...@@ -706,15 +686,15 @@ _return_T_done\@: ...@@ -706,15 +686,15 @@ _return_T_done\@:
## num_initial_blocks = b mod 4# ## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered ## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as a pointer only, not modified ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
i = (8-\num_initial_blocks) i = (8-\num_initial_blocks)
j = 0 j = 0
setreg setreg
mov arg6, %r10 # r10 = AAD mov arg7, %r10 # r10 = AAD
mov arg7, %r12 # r12 = aadLen mov arg8, %r12 # r12 = aadLen
mov %r12, %r11 mov %r12, %r11
...@@ -780,7 +760,7 @@ _get_AAD_done\@: ...@@ -780,7 +760,7 @@ _get_AAD_done\@:
xor %r11d, %r11d xor %r11d, %r11d
# start AES for num_initial_blocks blocks # start AES for num_initial_blocks blocks
mov arg5, %rax # rax = *Y0 mov arg6, %rax # rax = *Y0
vmovdqu (%rax), \CTR # CTR = Y0 vmovdqu (%rax), \CTR # CTR = Y0
vpshufb SHUF_MASK(%rip), \CTR, \CTR vpshufb SHUF_MASK(%rip), \CTR, \CTR
...@@ -833,9 +813,9 @@ _get_AAD_done\@: ...@@ -833,9 +813,9 @@ _get_AAD_done\@:
i = (9-\num_initial_blocks) i = (9-\num_initial_blocks)
setreg setreg
.rep \num_initial_blocks .rep \num_initial_blocks
vmovdqu (arg3, %r11), \T1 vmovdqu (arg4, %r11), \T1
vpxor \T1, reg_i, reg_i vpxor \T1, reg_i, reg_i
vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
add $16, %r11 add $16, %r11
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, reg_i vmovdqa \T1, reg_i
...@@ -936,58 +916,58 @@ _get_AAD_done\@: ...@@ -936,58 +916,58 @@ _get_AAD_done\@:
vaesenclast \T_key, \XMM7, \XMM7 vaesenclast \T_key, \XMM7, \XMM7
vaesenclast \T_key, \XMM8, \XMM8 vaesenclast \T_key, \XMM8, \XMM8
vmovdqu (arg3, %r11), \T1 vmovdqu (arg4, %r11), \T1
vpxor \T1, \XMM1, \XMM1 vpxor \T1, \XMM1, \XMM1
vmovdqu \XMM1, (arg2 , %r11) vmovdqu \XMM1, (arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM1 vmovdqa \T1, \XMM1
.endif .endif
vmovdqu 16*1(arg3, %r11), \T1 vmovdqu 16*1(arg4, %r11), \T1
vpxor \T1, \XMM2, \XMM2 vpxor \T1, \XMM2, \XMM2
vmovdqu \XMM2, 16*1(arg2 , %r11) vmovdqu \XMM2, 16*1(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM2 vmovdqa \T1, \XMM2
.endif .endif
vmovdqu 16*2(arg3, %r11), \T1 vmovdqu 16*2(arg4, %r11), \T1
vpxor \T1, \XMM3, \XMM3 vpxor \T1, \XMM3, \XMM3
vmovdqu \XMM3, 16*2(arg2 , %r11) vmovdqu \XMM3, 16*2(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM3 vmovdqa \T1, \XMM3
.endif .endif
vmovdqu 16*3(arg3, %r11), \T1 vmovdqu 16*3(arg4, %r11), \T1
vpxor \T1, \XMM4, \XMM4 vpxor \T1, \XMM4, \XMM4
vmovdqu \XMM4, 16*3(arg2 , %r11) vmovdqu \XMM4, 16*3(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM4 vmovdqa \T1, \XMM4
.endif .endif
vmovdqu 16*4(arg3, %r11), \T1 vmovdqu 16*4(arg4, %r11), \T1
vpxor \T1, \XMM5, \XMM5 vpxor \T1, \XMM5, \XMM5
vmovdqu \XMM5, 16*4(arg2 , %r11) vmovdqu \XMM5, 16*4(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM5 vmovdqa \T1, \XMM5
.endif .endif
vmovdqu 16*5(arg3, %r11), \T1 vmovdqu 16*5(arg4, %r11), \T1
vpxor \T1, \XMM6, \XMM6 vpxor \T1, \XMM6, \XMM6
vmovdqu \XMM6, 16*5(arg2 , %r11) vmovdqu \XMM6, 16*5(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM6 vmovdqa \T1, \XMM6
.endif .endif
vmovdqu 16*6(arg3, %r11), \T1 vmovdqu 16*6(arg4, %r11), \T1
vpxor \T1, \XMM7, \XMM7 vpxor \T1, \XMM7, \XMM7
vmovdqu \XMM7, 16*6(arg2 , %r11) vmovdqu \XMM7, 16*6(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM7 vmovdqa \T1, \XMM7
.endif .endif
vmovdqu 16*7(arg3, %r11), \T1 vmovdqu 16*7(arg4, %r11), \T1
vpxor \T1, \XMM8, \XMM8 vpxor \T1, \XMM8, \XMM8
vmovdqu \XMM8, 16*7(arg2 , %r11) vmovdqu \XMM8, 16*7(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM8 vmovdqa \T1, \XMM8
.endif .endif
...@@ -1012,7 +992,7 @@ _initial_blocks_done\@: ...@@ -1012,7 +992,7 @@ _initial_blocks_done\@:
# encrypt 8 blocks at a time # encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks # ghash the 8 previously encrypted ciphertext blocks
# arg1, arg2, arg3 are used as pointers only, not modified # arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value # r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
...@@ -1098,14 +1078,14 @@ _initial_blocks_done\@: ...@@ -1098,14 +1078,14 @@ _initial_blocks_done\@:
####################################################################### #######################################################################
vmovdqa HashKey_8(arg1), \T5 vmovdqu HashKey_8(arg2), \T5
vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
vpshufd $0b01001110, \T2, \T6 vpshufd $0b01001110, \T2, \T6
vpxor \T2, \T6, \T6 vpxor \T2, \T6, \T6
vmovdqa HashKey_8_k(arg1), \T5 vmovdqu HashKey_8_k(arg2), \T5
vpclmulqdq $0x00, \T5, \T6, \T6 vpclmulqdq $0x00, \T5, \T6, \T6
vmovdqu 16*3(arg1), \T1 vmovdqu 16*3(arg1), \T1
...@@ -1119,7 +1099,7 @@ _initial_blocks_done\@: ...@@ -1119,7 +1099,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8 vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP2(%rsp), \T1 vmovdqa TMP2(%rsp), \T1
vmovdqa HashKey_7(arg1), \T5 vmovdqu HashKey_7(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3 vpclmulqdq $0x00, \T5, \T1, \T3
...@@ -1127,7 +1107,7 @@ _initial_blocks_done\@: ...@@ -1127,7 +1107,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3 vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3 vpxor \T1, \T3, \T3
vmovdqa HashKey_7_k(arg1), \T5 vmovdqu HashKey_7_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3 vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6 vpxor \T3, \T6, \T6
...@@ -1144,7 +1124,7 @@ _initial_blocks_done\@: ...@@ -1144,7 +1124,7 @@ _initial_blocks_done\@:
####################################################################### #######################################################################
vmovdqa TMP3(%rsp), \T1 vmovdqa TMP3(%rsp), \T1
vmovdqa HashKey_6(arg1), \T5 vmovdqu HashKey_6(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3 vpclmulqdq $0x00, \T5, \T1, \T3
...@@ -1152,7 +1132,7 @@ _initial_blocks_done\@: ...@@ -1152,7 +1132,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3 vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3 vpxor \T1, \T3, \T3
vmovdqa HashKey_6_k(arg1), \T5 vmovdqu HashKey_6_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3 vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6 vpxor \T3, \T6, \T6
...@@ -1167,7 +1147,7 @@ _initial_blocks_done\@: ...@@ -1167,7 +1147,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8 vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP4(%rsp), \T1 vmovdqa TMP4(%rsp), \T1
vmovdqa HashKey_5(arg1), \T5 vmovdqu HashKey_5(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3 vpclmulqdq $0x00, \T5, \T1, \T3
...@@ -1175,7 +1155,7 @@ _initial_blocks_done\@: ...@@ -1175,7 +1155,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3 vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3 vpxor \T1, \T3, \T3
vmovdqa HashKey_5_k(arg1), \T5 vmovdqu HashKey_5_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3 vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6 vpxor \T3, \T6, \T6
...@@ -1191,7 +1171,7 @@ _initial_blocks_done\@: ...@@ -1191,7 +1171,7 @@ _initial_blocks_done\@:
vmovdqa TMP5(%rsp), \T1 vmovdqa TMP5(%rsp), \T1
vmovdqa HashKey_4(arg1), \T5 vmovdqu HashKey_4(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3 vpclmulqdq $0x00, \T5, \T1, \T3
...@@ -1199,7 +1179,7 @@ _initial_blocks_done\@: ...@@ -1199,7 +1179,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3 vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3 vpxor \T1, \T3, \T3
vmovdqa HashKey_4_k(arg1), \T5 vmovdqu HashKey_4_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3 vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6 vpxor \T3, \T6, \T6
...@@ -1214,7 +1194,7 @@ _initial_blocks_done\@: ...@@ -1214,7 +1194,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8 vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP6(%rsp), \T1 vmovdqa TMP6(%rsp), \T1
vmovdqa HashKey_3(arg1), \T5 vmovdqu HashKey_3(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3 vpclmulqdq $0x00, \T5, \T1, \T3
...@@ -1222,7 +1202,7 @@ _initial_blocks_done\@: ...@@ -1222,7 +1202,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3 vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3 vpxor \T1, \T3, \T3
vmovdqa HashKey_3_k(arg1), \T5 vmovdqu HashKey_3_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3 vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6 vpxor \T3, \T6, \T6
...@@ -1238,7 +1218,7 @@ _initial_blocks_done\@: ...@@ -1238,7 +1218,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8 vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP7(%rsp), \T1 vmovdqa TMP7(%rsp), \T1
vmovdqa HashKey_2(arg1), \T5 vmovdqu HashKey_2(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3 vpclmulqdq $0x00, \T5, \T1, \T3
...@@ -1246,7 +1226,7 @@ _initial_blocks_done\@: ...@@ -1246,7 +1226,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3 vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3 vpxor \T1, \T3, \T3
vmovdqa HashKey_2_k(arg1), \T5 vmovdqu HashKey_2_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3 vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6 vpxor \T3, \T6, \T6
...@@ -1263,7 +1243,7 @@ _initial_blocks_done\@: ...@@ -1263,7 +1243,7 @@ _initial_blocks_done\@:
vaesenc \T5, \XMM8, \XMM8 vaesenc \T5, \XMM8, \XMM8
vmovdqa TMP8(%rsp), \T1 vmovdqa TMP8(%rsp), \T1
vmovdqa HashKey(arg1), \T5 vmovdqu HashKey(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3 vpclmulqdq $0x00, \T5, \T1, \T3
...@@ -1271,7 +1251,7 @@ _initial_blocks_done\@: ...@@ -1271,7 +1251,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3 vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3 vpxor \T1, \T3, \T3
vmovdqa HashKey_k(arg1), \T5 vmovdqu HashKey_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3 vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6 vpxor \T3, \T6, \T6
...@@ -1284,13 +1264,13 @@ _initial_blocks_done\@: ...@@ -1284,13 +1264,13 @@ _initial_blocks_done\@:
j = 1 j = 1
setreg setreg
.rep 8 .rep 8
vpxor 16*i(arg3, %r11), \T5, \T2 vpxor 16*i(arg4, %r11), \T5, \T2
.if \ENC_DEC == ENC .if \ENC_DEC == ENC
vaesenclast \T2, reg_j, reg_j vaesenclast \T2, reg_j, reg_j
.else .else
vaesenclast \T2, reg_j, \T3 vaesenclast \T2, reg_j, \T3
vmovdqu 16*i(arg3, %r11), reg_j vmovdqu 16*i(arg4, %r11), reg_j
vmovdqu \T3, 16*i(arg2, %r11) vmovdqu \T3, 16*i(arg3, %r11)
.endif .endif
i = (i+1) i = (i+1)
j = (j+1) j = (j+1)
...@@ -1322,14 +1302,14 @@ _initial_blocks_done\@: ...@@ -1322,14 +1302,14 @@ _initial_blocks_done\@:
vpxor \T2, \T7, \T7 # first phase of the reduction complete vpxor \T2, \T7, \T7 # first phase of the reduction complete
####################################################################### #######################################################################
.if \ENC_DEC == ENC .if \ENC_DEC == ENC
vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
.endif .endif
####################################################################### #######################################################################
...@@ -1370,25 +1350,25 @@ _initial_blocks_done\@: ...@@ -1370,25 +1350,25 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM1, \T2 vpshufd $0b01001110, \XMM1, \T2
vpxor \XMM1, \T2, \T2 vpxor \XMM1, \T2, \T2
vmovdqa HashKey_8(arg1), \T5 vmovdqu HashKey_8(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM1, \T6 vpclmulqdq $0x11, \T5, \XMM1, \T6
vpclmulqdq $0x00, \T5, \XMM1, \T7 vpclmulqdq $0x00, \T5, \XMM1, \T7
vmovdqa HashKey_8_k(arg1), \T3 vmovdqu HashKey_8_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \XMM1 vpclmulqdq $0x00, \T3, \T2, \XMM1
###################### ######################
vpshufd $0b01001110, \XMM2, \T2 vpshufd $0b01001110, \XMM2, \T2
vpxor \XMM2, \T2, \T2 vpxor \XMM2, \T2, \T2
vmovdqa HashKey_7(arg1), \T5 vmovdqu HashKey_7(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM2, \T4 vpclmulqdq $0x11, \T5, \XMM2, \T4
vpxor \T4, \T6, \T6 vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM2, \T4 vpclmulqdq $0x00, \T5, \XMM2, \T4
vpxor \T4, \T7, \T7 vpxor \T4, \T7, \T7
vmovdqa HashKey_7_k(arg1), \T3 vmovdqu HashKey_7_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2 vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1 vpxor \T2, \XMM1, \XMM1
...@@ -1396,14 +1376,14 @@ _initial_blocks_done\@: ...@@ -1396,14 +1376,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM3, \T2 vpshufd $0b01001110, \XMM3, \T2
vpxor \XMM3, \T2, \T2 vpxor \XMM3, \T2, \T2
vmovdqa HashKey_6(arg1), \T5 vmovdqu HashKey_6(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM3, \T4 vpclmulqdq $0x11, \T5, \XMM3, \T4
vpxor \T4, \T6, \T6 vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM3, \T4 vpclmulqdq $0x00, \T5, \XMM3, \T4
vpxor \T4, \T7, \T7 vpxor \T4, \T7, \T7
vmovdqa HashKey_6_k(arg1), \T3 vmovdqu HashKey_6_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2 vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1 vpxor \T2, \XMM1, \XMM1
...@@ -1411,14 +1391,14 @@ _initial_blocks_done\@: ...@@ -1411,14 +1391,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM4, \T2 vpshufd $0b01001110, \XMM4, \T2
vpxor \XMM4, \T2, \T2 vpxor \XMM4, \T2, \T2
vmovdqa HashKey_5(arg1), \T5 vmovdqu HashKey_5(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM4, \T4 vpclmulqdq $0x11, \T5, \XMM4, \T4
vpxor \T4, \T6, \T6 vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM4, \T4 vpclmulqdq $0x00, \T5, \XMM4, \T4
vpxor \T4, \T7, \T7 vpxor \T4, \T7, \T7
vmovdqa HashKey_5_k(arg1), \T3 vmovdqu HashKey_5_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2 vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1 vpxor \T2, \XMM1, \XMM1
...@@ -1426,14 +1406,14 @@ _initial_blocks_done\@: ...@@ -1426,14 +1406,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM5, \T2 vpshufd $0b01001110, \XMM5, \T2
vpxor \XMM5, \T2, \T2 vpxor \XMM5, \T2, \T2
vmovdqa HashKey_4(arg1), \T5 vmovdqu HashKey_4(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM5, \T4 vpclmulqdq $0x11, \T5, \XMM5, \T4
vpxor \T4, \T6, \T6 vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM5, \T4 vpclmulqdq $0x00, \T5, \XMM5, \T4
vpxor \T4, \T7, \T7 vpxor \T4, \T7, \T7
vmovdqa HashKey_4_k(arg1), \T3 vmovdqu HashKey_4_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2 vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1 vpxor \T2, \XMM1, \XMM1
...@@ -1441,14 +1421,14 @@ _initial_blocks_done\@: ...@@ -1441,14 +1421,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM6, \T2 vpshufd $0b01001110, \XMM6, \T2
vpxor \XMM6, \T2, \T2 vpxor \XMM6, \T2, \T2
vmovdqa HashKey_3(arg1), \T5 vmovdqu HashKey_3(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM6, \T4 vpclmulqdq $0x11, \T5, \XMM6, \T4
vpxor \T4, \T6, \T6 vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM6, \T4 vpclmulqdq $0x00, \T5, \XMM6, \T4
vpxor \T4, \T7, \T7 vpxor \T4, \T7, \T7
vmovdqa HashKey_3_k(arg1), \T3 vmovdqu HashKey_3_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2 vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1 vpxor \T2, \XMM1, \XMM1
...@@ -1456,14 +1436,14 @@ _initial_blocks_done\@: ...@@ -1456,14 +1436,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM7, \T2 vpshufd $0b01001110, \XMM7, \T2
vpxor \XMM7, \T2, \T2 vpxor \XMM7, \T2, \T2
vmovdqa HashKey_2(arg1), \T5 vmovdqu HashKey_2(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM7, \T4 vpclmulqdq $0x11, \T5, \XMM7, \T4
vpxor \T4, \T6, \T6 vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM7, \T4 vpclmulqdq $0x00, \T5, \XMM7, \T4
vpxor \T4, \T7, \T7 vpxor \T4, \T7, \T7
vmovdqa HashKey_2_k(arg1), \T3 vmovdqu HashKey_2_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2 vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1 vpxor \T2, \XMM1, \XMM1
...@@ -1471,14 +1451,14 @@ _initial_blocks_done\@: ...@@ -1471,14 +1451,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM8, \T2 vpshufd $0b01001110, \XMM8, \T2
vpxor \XMM8, \T2, \T2 vpxor \XMM8, \T2, \T2
vmovdqa HashKey(arg1), \T5 vmovdqu HashKey(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM8, \T4 vpclmulqdq $0x11, \T5, \XMM8, \T4
vpxor \T4, \T6, \T6 vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM8, \T4 vpclmulqdq $0x00, \T5, \XMM8, \T4
vpxor \T4, \T7, \T7 vpxor \T4, \T7, \T7
vmovdqa HashKey_k(arg1), \T3 vmovdqu HashKey_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2 vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1 vpxor \T2, \XMM1, \XMM1
...@@ -1527,6 +1507,7 @@ _initial_blocks_done\@: ...@@ -1527,6 +1507,7 @@ _initial_blocks_done\@:
############################################################# #############################################################
#void aesni_gcm_precomp_avx_gen2 #void aesni_gcm_precomp_avx_gen2
# (gcm_data *my_ctx_data, # (gcm_data *my_ctx_data,
# gcm_context_data *data,
# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ # u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
############################################################# #############################################################
ENTRY(aesni_gcm_precomp_avx_gen2) ENTRY(aesni_gcm_precomp_avx_gen2)
...@@ -1543,7 +1524,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2) ...@@ -1543,7 +1524,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2)
sub $VARIABLE_OFFSET, %rsp sub $VARIABLE_OFFSET, %rsp
and $~63, %rsp # align rsp to 64 bytes and $~63, %rsp # align rsp to 64 bytes
vmovdqu (arg2), %xmm6 # xmm6 = HashKey vmovdqu (arg3), %xmm6 # xmm6 = HashKey
vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
...@@ -1560,7 +1541,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2) ...@@ -1560,7 +1541,7 @@ ENTRY(aesni_gcm_precomp_avx_gen2)
vpand POLY(%rip), %xmm2, %xmm2 vpand POLY(%rip), %xmm2, %xmm2
vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
####################################################################### #######################################################################
vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
...@@ -1577,6 +1558,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen2) ...@@ -1577,6 +1558,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen2)
############################################################################### ###############################################################################
#void aesni_gcm_enc_avx_gen2( #void aesni_gcm_enc_avx_gen2(
# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
# gcm_context_data *data,
# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
# const u8 *in, /* Plaintext input */ # const u8 *in, /* Plaintext input */
# u64 plaintext_len, /* Length of data in Bytes for encryption. */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */
...@@ -1598,6 +1580,7 @@ ENDPROC(aesni_gcm_enc_avx_gen2) ...@@ -1598,6 +1580,7 @@ ENDPROC(aesni_gcm_enc_avx_gen2)
############################################################################### ###############################################################################
#void aesni_gcm_dec_avx_gen2( #void aesni_gcm_dec_avx_gen2(
# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
# gcm_context_data *data,
# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
# const u8 *in, /* Ciphertext input */ # const u8 *in, /* Ciphertext input */
# u64 plaintext_len, /* Length of data in Bytes for encryption. */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */
...@@ -1668,25 +1651,25 @@ ENDPROC(aesni_gcm_dec_avx_gen2) ...@@ -1668,25 +1651,25 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
vmovdqa \HK, \T5 vmovdqa \HK, \T5
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
vmovdqa \T5, HashKey_3(arg1) vmovdqu \T5, HashKey_3(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
vmovdqa \T5, HashKey_4(arg1) vmovdqu \T5, HashKey_4(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
vmovdqa \T5, HashKey_5(arg1) vmovdqu \T5, HashKey_5(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
vmovdqa \T5, HashKey_6(arg1) vmovdqu \T5, HashKey_6(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
vmovdqa \T5, HashKey_7(arg1) vmovdqu \T5, HashKey_7(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
vmovdqa \T5, HashKey_8(arg1) vmovdqu \T5, HashKey_8(arg2)
.endm .endm
...@@ -1696,15 +1679,15 @@ ENDPROC(aesni_gcm_dec_avx_gen2) ...@@ -1696,15 +1679,15 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
## num_initial_blocks = b mod 4# ## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered ## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as a pointer only, not modified ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
i = (8-\num_initial_blocks) i = (8-\num_initial_blocks)
j = 0 j = 0
setreg setreg
mov arg6, %r10 # r10 = AAD mov arg7, %r10 # r10 = AAD
mov arg7, %r12 # r12 = aadLen mov arg8, %r12 # r12 = aadLen
mov %r12, %r11 mov %r12, %r11
...@@ -1771,7 +1754,7 @@ _get_AAD_done\@: ...@@ -1771,7 +1754,7 @@ _get_AAD_done\@:
xor %r11d, %r11d xor %r11d, %r11d
# start AES for num_initial_blocks blocks # start AES for num_initial_blocks blocks
mov arg5, %rax # rax = *Y0 mov arg6, %rax # rax = *Y0
vmovdqu (%rax), \CTR # CTR = Y0 vmovdqu (%rax), \CTR # CTR = Y0
vpshufb SHUF_MASK(%rip), \CTR, \CTR vpshufb SHUF_MASK(%rip), \CTR, \CTR
...@@ -1824,9 +1807,9 @@ _get_AAD_done\@: ...@@ -1824,9 +1807,9 @@ _get_AAD_done\@:
i = (9-\num_initial_blocks) i = (9-\num_initial_blocks)
setreg setreg
.rep \num_initial_blocks .rep \num_initial_blocks
vmovdqu (arg3, %r11), \T1 vmovdqu (arg4, %r11), \T1
vpxor \T1, reg_i, reg_i vpxor \T1, reg_i, reg_i
vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
# num_initial_blocks blocks # num_initial_blocks blocks
add $16, %r11 add $16, %r11
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
...@@ -1928,58 +1911,58 @@ _get_AAD_done\@: ...@@ -1928,58 +1911,58 @@ _get_AAD_done\@:
vaesenclast \T_key, \XMM7, \XMM7 vaesenclast \T_key, \XMM7, \XMM7
vaesenclast \T_key, \XMM8, \XMM8 vaesenclast \T_key, \XMM8, \XMM8
vmovdqu (arg3, %r11), \T1 vmovdqu (arg4, %r11), \T1
vpxor \T1, \XMM1, \XMM1 vpxor \T1, \XMM1, \XMM1
vmovdqu \XMM1, (arg2 , %r11) vmovdqu \XMM1, (arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM1 vmovdqa \T1, \XMM1
.endif .endif
vmovdqu 16*1(arg3, %r11), \T1 vmovdqu 16*1(arg4, %r11), \T1
vpxor \T1, \XMM2, \XMM2 vpxor \T1, \XMM2, \XMM2
vmovdqu \XMM2, 16*1(arg2 , %r11) vmovdqu \XMM2, 16*1(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM2 vmovdqa \T1, \XMM2
.endif .endif
vmovdqu 16*2(arg3, %r11), \T1 vmovdqu 16*2(arg4, %r11), \T1
vpxor \T1, \XMM3, \XMM3 vpxor \T1, \XMM3, \XMM3
vmovdqu \XMM3, 16*2(arg2 , %r11) vmovdqu \XMM3, 16*2(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM3 vmovdqa \T1, \XMM3
.endif .endif
vmovdqu 16*3(arg3, %r11), \T1 vmovdqu 16*3(arg4, %r11), \T1
vpxor \T1, \XMM4, \XMM4 vpxor \T1, \XMM4, \XMM4
vmovdqu \XMM4, 16*3(arg2 , %r11) vmovdqu \XMM4, 16*3(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM4 vmovdqa \T1, \XMM4
.endif .endif
vmovdqu 16*4(arg3, %r11), \T1 vmovdqu 16*4(arg4, %r11), \T1
vpxor \T1, \XMM5, \XMM5 vpxor \T1, \XMM5, \XMM5
vmovdqu \XMM5, 16*4(arg2 , %r11) vmovdqu \XMM5, 16*4(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM5 vmovdqa \T1, \XMM5
.endif .endif
vmovdqu 16*5(arg3, %r11), \T1 vmovdqu 16*5(arg4, %r11), \T1
vpxor \T1, \XMM6, \XMM6 vpxor \T1, \XMM6, \XMM6
vmovdqu \XMM6, 16*5(arg2 , %r11) vmovdqu \XMM6, 16*5(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM6 vmovdqa \T1, \XMM6
.endif .endif
vmovdqu 16*6(arg3, %r11), \T1 vmovdqu 16*6(arg4, %r11), \T1
vpxor \T1, \XMM7, \XMM7 vpxor \T1, \XMM7, \XMM7
vmovdqu \XMM7, 16*6(arg2 , %r11) vmovdqu \XMM7, 16*6(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM7 vmovdqa \T1, \XMM7
.endif .endif
vmovdqu 16*7(arg3, %r11), \T1 vmovdqu 16*7(arg4, %r11), \T1
vpxor \T1, \XMM8, \XMM8 vpxor \T1, \XMM8, \XMM8
vmovdqu \XMM8, 16*7(arg2 , %r11) vmovdqu \XMM8, 16*7(arg3 , %r11)
.if \ENC_DEC == DEC .if \ENC_DEC == DEC
vmovdqa \T1, \XMM8 vmovdqa \T1, \XMM8
.endif .endif
...@@ -2008,7 +1991,7 @@ _initial_blocks_done\@: ...@@ -2008,7 +1991,7 @@ _initial_blocks_done\@:
# encrypt 8 blocks at a time # encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks # ghash the 8 previously encrypted ciphertext blocks
# arg1, arg2, arg3 are used as pointers only, not modified # arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value # r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
...@@ -2094,7 +2077,7 @@ _initial_blocks_done\@: ...@@ -2094,7 +2077,7 @@ _initial_blocks_done\@:
####################################################################### #######################################################################
vmovdqa HashKey_8(arg1), \T5 vmovdqu HashKey_8(arg2), \T5
vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
...@@ -2112,7 +2095,7 @@ _initial_blocks_done\@: ...@@ -2112,7 +2095,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8 vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP2(%rsp), \T1 vmovdqa TMP2(%rsp), \T1
vmovdqa HashKey_7(arg1), \T5 vmovdqu HashKey_7(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
...@@ -2138,7 +2121,7 @@ _initial_blocks_done\@: ...@@ -2138,7 +2121,7 @@ _initial_blocks_done\@:
####################################################################### #######################################################################
vmovdqa TMP3(%rsp), \T1 vmovdqa TMP3(%rsp), \T1
vmovdqa HashKey_6(arg1), \T5 vmovdqu HashKey_6(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
...@@ -2162,7 +2145,7 @@ _initial_blocks_done\@: ...@@ -2162,7 +2145,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8 vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP4(%rsp), \T1 vmovdqa TMP4(%rsp), \T1
vmovdqa HashKey_5(arg1), \T5 vmovdqu HashKey_5(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
...@@ -2187,7 +2170,7 @@ _initial_blocks_done\@: ...@@ -2187,7 +2170,7 @@ _initial_blocks_done\@:
vmovdqa TMP5(%rsp), \T1 vmovdqa TMP5(%rsp), \T1
vmovdqa HashKey_4(arg1), \T5 vmovdqu HashKey_4(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
...@@ -2211,7 +2194,7 @@ _initial_blocks_done\@: ...@@ -2211,7 +2194,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8 vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP6(%rsp), \T1 vmovdqa TMP6(%rsp), \T1
vmovdqa HashKey_3(arg1), \T5 vmovdqu HashKey_3(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
...@@ -2235,7 +2218,7 @@ _initial_blocks_done\@: ...@@ -2235,7 +2218,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8 vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP7(%rsp), \T1 vmovdqa TMP7(%rsp), \T1
vmovdqa HashKey_2(arg1), \T5 vmovdqu HashKey_2(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3 vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4 vpxor \T3, \T4, \T4
...@@ -2262,7 +2245,7 @@ _initial_blocks_done\@: ...@@ -2262,7 +2245,7 @@ _initial_blocks_done\@:
vaesenc \T5, \XMM8, \XMM8 vaesenc \T5, \XMM8, \XMM8
vmovdqa TMP8(%rsp), \T1 vmovdqa TMP8(%rsp), \T1
vmovdqa HashKey(arg1), \T5 vmovdqu HashKey(arg2), \T5
vpclmulqdq $0x00, \T5, \T1, \T3 vpclmulqdq $0x00, \T5, \T1, \T3
vpxor \T3, \T7, \T7 vpxor \T3, \T7, \T7
...@@ -2283,13 +2266,13 @@ _initial_blocks_done\@: ...@@ -2283,13 +2266,13 @@ _initial_blocks_done\@:
j = 1 j = 1
setreg setreg
.rep 8 .rep 8
vpxor 16*i(arg3, %r11), \T5, \T2 vpxor 16*i(arg4, %r11), \T5, \T2
.if \ENC_DEC == ENC .if \ENC_DEC == ENC
vaesenclast \T2, reg_j, reg_j vaesenclast \T2, reg_j, reg_j
.else .else
vaesenclast \T2, reg_j, \T3 vaesenclast \T2, reg_j, \T3
vmovdqu 16*i(arg3, %r11), reg_j vmovdqu 16*i(arg4, %r11), reg_j
vmovdqu \T3, 16*i(arg2, %r11) vmovdqu \T3, 16*i(arg3, %r11)
.endif .endif
i = (i+1) i = (i+1)
j = (j+1) j = (j+1)
...@@ -2315,14 +2298,14 @@ _initial_blocks_done\@: ...@@ -2315,14 +2298,14 @@ _initial_blocks_done\@:
vpxor \T2, \T7, \T7 # first phase of the reduction complete vpxor \T2, \T7, \T7 # first phase of the reduction complete
####################################################################### #######################################################################
.if \ENC_DEC == ENC .if \ENC_DEC == ENC
vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
.endif .endif
####################################################################### #######################################################################
...@@ -2359,7 +2342,7 @@ _initial_blocks_done\@: ...@@ -2359,7 +2342,7 @@ _initial_blocks_done\@:
## Karatsuba Method ## Karatsuba Method
vmovdqa HashKey_8(arg1), \T5 vmovdqu HashKey_8(arg2), \T5
vpshufd $0b01001110, \XMM1, \T2 vpshufd $0b01001110, \XMM1, \T2
vpshufd $0b01001110, \T5, \T3 vpshufd $0b01001110, \T5, \T3
...@@ -2373,7 +2356,7 @@ _initial_blocks_done\@: ...@@ -2373,7 +2356,7 @@ _initial_blocks_done\@:
###################### ######################
vmovdqa HashKey_7(arg1), \T5 vmovdqu HashKey_7(arg2), \T5
vpshufd $0b01001110, \XMM2, \T2 vpshufd $0b01001110, \XMM2, \T2
vpshufd $0b01001110, \T5, \T3 vpshufd $0b01001110, \T5, \T3
vpxor \XMM2, \T2, \T2 vpxor \XMM2, \T2, \T2
...@@ -2391,7 +2374,7 @@ _initial_blocks_done\@: ...@@ -2391,7 +2374,7 @@ _initial_blocks_done\@:
###################### ######################
vmovdqa HashKey_6(arg1), \T5 vmovdqu HashKey_6(arg2), \T5
vpshufd $0b01001110, \XMM3, \T2 vpshufd $0b01001110, \XMM3, \T2
vpshufd $0b01001110, \T5, \T3 vpshufd $0b01001110, \T5, \T3
vpxor \XMM3, \T2, \T2 vpxor \XMM3, \T2, \T2
...@@ -2409,7 +2392,7 @@ _initial_blocks_done\@: ...@@ -2409,7 +2392,7 @@ _initial_blocks_done\@:
###################### ######################
vmovdqa HashKey_5(arg1), \T5 vmovdqu HashKey_5(arg2), \T5
vpshufd $0b01001110, \XMM4, \T2 vpshufd $0b01001110, \XMM4, \T2
vpshufd $0b01001110, \T5, \T3 vpshufd $0b01001110, \T5, \T3
vpxor \XMM4, \T2, \T2 vpxor \XMM4, \T2, \T2
...@@ -2427,7 +2410,7 @@ _initial_blocks_done\@: ...@@ -2427,7 +2410,7 @@ _initial_blocks_done\@:
###################### ######################
vmovdqa HashKey_4(arg1), \T5 vmovdqu HashKey_4(arg2), \T5
vpshufd $0b01001110, \XMM5, \T2 vpshufd $0b01001110, \XMM5, \T2
vpshufd $0b01001110, \T5, \T3 vpshufd $0b01001110, \T5, \T3
vpxor \XMM5, \T2, \T2 vpxor \XMM5, \T2, \T2
...@@ -2445,7 +2428,7 @@ _initial_blocks_done\@: ...@@ -2445,7 +2428,7 @@ _initial_blocks_done\@:
###################### ######################
vmovdqa HashKey_3(arg1), \T5 vmovdqu HashKey_3(arg2), \T5
vpshufd $0b01001110, \XMM6, \T2 vpshufd $0b01001110, \XMM6, \T2
vpshufd $0b01001110, \T5, \T3 vpshufd $0b01001110, \T5, \T3
vpxor \XMM6, \T2, \T2 vpxor \XMM6, \T2, \T2
...@@ -2463,7 +2446,7 @@ _initial_blocks_done\@: ...@@ -2463,7 +2446,7 @@ _initial_blocks_done\@:
###################### ######################
vmovdqa HashKey_2(arg1), \T5 vmovdqu HashKey_2(arg2), \T5
vpshufd $0b01001110, \XMM7, \T2 vpshufd $0b01001110, \XMM7, \T2
vpshufd $0b01001110, \T5, \T3 vpshufd $0b01001110, \T5, \T3
vpxor \XMM7, \T2, \T2 vpxor \XMM7, \T2, \T2
...@@ -2481,7 +2464,7 @@ _initial_blocks_done\@: ...@@ -2481,7 +2464,7 @@ _initial_blocks_done\@:
###################### ######################
vmovdqa HashKey(arg1), \T5 vmovdqu HashKey(arg2), \T5
vpshufd $0b01001110, \XMM8, \T2 vpshufd $0b01001110, \XMM8, \T2
vpshufd $0b01001110, \T5, \T3 vpshufd $0b01001110, \T5, \T3
vpxor \XMM8, \T2, \T2 vpxor \XMM8, \T2, \T2
...@@ -2537,6 +2520,7 @@ _initial_blocks_done\@: ...@@ -2537,6 +2520,7 @@ _initial_blocks_done\@:
############################################################# #############################################################
#void aesni_gcm_precomp_avx_gen4 #void aesni_gcm_precomp_avx_gen4
# (gcm_data *my_ctx_data, # (gcm_data *my_ctx_data,
# gcm_context_data *data,
# u8 *hash_subkey)# /* H, the Hash sub key input. # u8 *hash_subkey)# /* H, the Hash sub key input.
# Data starts on a 16-byte boundary. */ # Data starts on a 16-byte boundary. */
############################################################# #############################################################
...@@ -2554,7 +2538,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4) ...@@ -2554,7 +2538,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4)
sub $VARIABLE_OFFSET, %rsp sub $VARIABLE_OFFSET, %rsp
and $~63, %rsp # align rsp to 64 bytes and $~63, %rsp # align rsp to 64 bytes
vmovdqu (arg2), %xmm6 # xmm6 = HashKey vmovdqu (arg3), %xmm6 # xmm6 = HashKey
vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
...@@ -2571,7 +2555,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4) ...@@ -2571,7 +2555,7 @@ ENTRY(aesni_gcm_precomp_avx_gen4)
vpand POLY(%rip), %xmm2, %xmm2 vpand POLY(%rip), %xmm2, %xmm2
vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
####################################################################### #######################################################################
vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
...@@ -2589,6 +2573,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen4) ...@@ -2589,6 +2573,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen4)
############################################################################### ###############################################################################
#void aesni_gcm_enc_avx_gen4( #void aesni_gcm_enc_avx_gen4(
# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
# gcm_context_data *data,
# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
# const u8 *in, /* Plaintext input */ # const u8 *in, /* Plaintext input */
# u64 plaintext_len, /* Length of data in Bytes for encryption. */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */
...@@ -2610,6 +2595,7 @@ ENDPROC(aesni_gcm_enc_avx_gen4) ...@@ -2610,6 +2595,7 @@ ENDPROC(aesni_gcm_enc_avx_gen4)
############################################################################### ###############################################################################
#void aesni_gcm_dec_avx_gen4( #void aesni_gcm_dec_avx_gen4(
# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
# gcm_context_data *data,
# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
# const u8 *in, /* Ciphertext input */ # const u8 *in, /* Ciphertext input */
# u64 plaintext_len, /* Length of data in Bytes for encryption. */ # u64 plaintext_len, /* Length of data in Bytes for encryption. */
......
...@@ -84,7 +84,7 @@ struct gcm_context_data { ...@@ -84,7 +84,7 @@ struct gcm_context_data {
u8 current_counter[GCM_BLOCK_LEN]; u8 current_counter[GCM_BLOCK_LEN];
u64 partial_block_len; u64 partial_block_len;
u64 unused; u64 unused;
u8 hash_keys[GCM_BLOCK_LEN * 8]; u8 hash_keys[GCM_BLOCK_LEN * 16];
}; };
asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
...@@ -187,14 +187,18 @@ asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, ...@@ -187,14 +187,18 @@ asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
* gcm_data *my_ctx_data, context data * gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/ */
asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey); asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data,
struct gcm_context_data *gdata,
u8 *hash_subkey);
asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out, asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv, const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len); u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out, asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv, const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len); u8 *auth_tag, unsigned long auth_tag_len);
...@@ -211,9 +215,9 @@ static void aesni_gcm_enc_avx(void *ctx, ...@@ -211,9 +215,9 @@ static void aesni_gcm_enc_avx(void *ctx,
plaintext_len, iv, hash_subkey, aad, plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len); aad_len, auth_tag, auth_tag_len);
} else { } else {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv,
aad_len, auth_tag, auth_tag_len); aad, aad_len, auth_tag, auth_tag_len);
} }
} }
...@@ -229,9 +233,9 @@ static void aesni_gcm_dec_avx(void *ctx, ...@@ -229,9 +233,9 @@ static void aesni_gcm_dec_avx(void *ctx,
ciphertext_len, iv, hash_subkey, aad, ciphertext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len); aad_len, auth_tag, auth_tag_len);
} else { } else {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv,
aad_len, auth_tag, auth_tag_len); aad, aad_len, auth_tag, auth_tag_len);
} }
} }
#endif #endif
...@@ -242,14 +246,18 @@ static void aesni_gcm_dec_avx(void *ctx, ...@@ -242,14 +246,18 @@ static void aesni_gcm_dec_avx(void *ctx,
* gcm_data *my_ctx_data, context data * gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/ */
asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey); asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data,
struct gcm_context_data *gdata,
u8 *hash_subkey);
asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out, asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv, const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len); u8 *auth_tag, unsigned long auth_tag_len);
asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out, asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv, const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len); u8 *auth_tag, unsigned long auth_tag_len);
...@@ -266,13 +274,13 @@ static void aesni_gcm_enc_avx2(void *ctx, ...@@ -266,13 +274,13 @@ static void aesni_gcm_enc_avx2(void *ctx,
plaintext_len, iv, hash_subkey, aad, plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len); aad_len, auth_tag, auth_tag_len);
} else if (plaintext_len < AVX_GEN4_OPTSIZE) { } else if (plaintext_len < AVX_GEN4_OPTSIZE) {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, aesni_gcm_enc_avx_gen2(ctx, data, out, in, plaintext_len, iv,
aad_len, auth_tag, auth_tag_len); aad, aad_len, auth_tag, auth_tag_len);
} else { } else {
aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey);
aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad, aesni_gcm_enc_avx_gen4(ctx, data, out, in, plaintext_len, iv,
aad_len, auth_tag, auth_tag_len); aad, aad_len, auth_tag, auth_tag_len);
} }
} }
...@@ -288,13 +296,13 @@ static void aesni_gcm_dec_avx2(void *ctx, ...@@ -288,13 +296,13 @@ static void aesni_gcm_dec_avx2(void *ctx,
ciphertext_len, iv, hash_subkey, ciphertext_len, iv, hash_subkey,
aad, aad_len, auth_tag, auth_tag_len); aad, aad_len, auth_tag, auth_tag_len);
} else if (ciphertext_len < AVX_GEN4_OPTSIZE) { } else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); aesni_gcm_precomp_avx_gen2(ctx, data, hash_subkey);
aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, aesni_gcm_dec_avx_gen2(ctx, data, out, in, ciphertext_len, iv,
aad_len, auth_tag, auth_tag_len); aad, aad_len, auth_tag, auth_tag_len);
} else { } else {
aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); aesni_gcm_precomp_avx_gen4(ctx, data, hash_subkey);
aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad, aesni_gcm_dec_avx_gen4(ctx, data, out, in, ciphertext_len, iv,
aad_len, auth_tag, auth_tag_len); aad, aad_len, auth_tag, auth_tag_len);
} }
} }
#endif #endif
......