Commit 543ea178 authored by Eric Biggers, committed by Herbert Xu

crypto: x86/aes-xts - optimize size of instructions operating on lengths

x86_64 has the "interesting" property that the instruction size is
generally a bit shorter for instructions that operate on the 32-bit (or
less) part of registers, or registers that are in the original set of 8.
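For example, here is a rough illustration of the encoding sizes involved. This is only a sketch: the label is purely illustrative, and the byte sequences are what a typical GNU assembler emits for these particular forms.

	.text
	.globl	encoding_size_demo		// hypothetical demo symbol
encoding_size_demo:
	// 64-bit forms: each needs a REX.W prefix, and TEST has no
	// sign-extended imm8 form for 32/64-bit operands, so the
	// immediate takes 4 bytes.
	mov	$-1, %rax			// 48 c7 c0 ff ff ff ff  (7 bytes)
	test	$15, %rcx			// 48 f7 c1 0f 00 00 00  (7 bytes)

	// The same operations on the 32-bit or 8-bit parts of the
	// registers encode more compactly:
	mov	$-1, %eax			// b8 ff ff ff ff        (5 bytes)
	test	$15, %cl			// f6 c1 0f              (3 bytes)

	// Registers outside the original set of 8 (%r8-%r15) need a REX
	// prefix even for 32-bit operations:
	mov	$-1, %r9d			// 41 b9 ff ff ff ff     (6 bytes)
	ret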

This patch adjusts the AES-XTS code to take advantage of that property
by changing the LEN parameter from size_t to unsigned int (which is all
that's needed and is what the non-AVX implementation uses) and using the
%eax register for KEYLEN.

This decreases the size of aes-xts-avx-x86_64.o by 1.2%.

Note that changing the kmovq to kmovd was going to be needed anyway to
make the AVX10/256 code really work on CPUs that don't support 512-bit
vectors (since the AVX10 spec says that 64-bit opmask instructions will
only be supported on processors that support 512-bit vectors).
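For reference, the 32-bit mask-building sequence the patch switches to looks like this in isolation. This is a hypothetical standalone helper, not the kernel code itself; it only needs the 32-bit bzhi/kmovd forms because the mask here is applied to xmm-width (16-byte) masked moves, so at most the low 16 mask bits can matter.

	.text
	// Hypothetical helper: build an opmask with the low "len" bits set.
	// Requires BMI2 (bzhi) and AVX-512BW or AVX10 (kmovd).
	.globl	build_partial_block_mask
build_partial_block_mask:		// %ecx = length of the partial block
	mov	$-1, %r9d
	bzhi	%ecx, %r9d, %r9d	// %r9d = (1 << len) - 1
	kmovd	%r9d, %k1		// opmask with the low "len" bits set
	ret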
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent e619723a
@@ -85,14 +85,16 @@
 	// advanced to point to 7th-from-last round key
 	.set	SRC,		%rsi	// Pointer to next source data
 	.set	DST,		%rdx	// Pointer to next destination data
-	.set	LEN,		%rcx	// Remaining length in bytes
+	.set	LEN,		%ecx	// Remaining length in bytes
+	.set	LEN8,		%cl
+	.set	LEN64,		%rcx
 	.set	TWEAK,		%r8	// Pointer to next tweak
 
-	// %r9 holds the AES key length in bytes.
-	.set	KEYLEN,		%r9d
-	.set	KEYLEN64,	%r9
+	// %rax holds the AES key length in bytes.
+	.set	KEYLEN,		%eax
+	.set	KEYLEN64,	%rax
 
-	// %rax and %r10-r11 are available as temporaries.
+	// %r9-r11 are available as temporaries.
 
 .macro	_define_Vi	i
 .if VL == 16
@@ -565,9 +567,9 @@
 	// subtracting 16 from LEN.  This is needed because ciphertext stealing
 	// decryption uses the last two tweaks in reverse order.  We'll handle
 	// the last full block and the partial block specially at the end.
-	lea		-16(LEN), %rax
-	test		$15, LEN
-	cmovnz		%rax, LEN
+	lea		-16(LEN), %eax
+	test		$15, LEN8
+	cmovnz		%eax, LEN
 .endif
 
 	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
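The three instructions in the hunk above are a branchless adjustment, roughly "if (len & 15) len -= 16;" in C terms, and they show where the shorter encodings come from: the TEST on the 8-bit alias is 3 bytes instead of 7, and the LEA/CMOV drop their REX.W prefixes. A standalone sketch of the same idiom (demo symbol is hypothetical, register roles as in the hunk):

	.text
	.globl	cts_len_adjust_demo		// hypothetical demo symbol
cts_len_adjust_demo:				// %ecx = remaining length
	lea	-16(%ecx), %eax			// len - 16, computed unconditionally
	test	$15, %cl			// partial final block?
	cmovnz	%eax, %ecx			// if so, hold back one full block
	mov	%ecx, %eax			// return adjusted length
	ret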
@@ -650,7 +652,7 @@
 	// Check for the uncommon case where the data length isn't a multiple of
 	// 4*VL.  Handle it out-of-line in order to optimize for the common
 	// case.  In the common case, just fall through to the ret.
-	test		$4*VL-1, LEN
+	test		$4*VL-1, LEN8
 	jnz		.Lhandle_remainder\@
 .Ldone\@:
 	// Store the next tweak back to *TWEAK to support continuation calls.
@@ -718,9 +720,9 @@
 
 .if USE_AVX10
 	// Create a mask that has the first LEN bits set.
-	mov		$-1, %rax
-	bzhi		LEN, %rax, %rax
-	kmovq		%rax, %k1
+	mov		$-1, %r9d
+	bzhi		LEN, %r9d, %r9d
+	kmovd		%r9d, %k1
 
 	// Swap the first LEN bytes of the en/decryption of the last full block
 	// with the partial block.  Note that to support in-place en/decryption,
@@ -730,23 +732,23 @@
 	vmovdqu8	16(SRC), %xmm0{%k1}
 	vmovdqu8	%xmm1, 16(DST){%k1}
 .else
-	lea		.Lcts_permute_table(%rip), %rax
+	lea		.Lcts_permute_table(%rip), %r9
 
 	// Load the src partial block, left-aligned.  Note that to support
 	// in-place en/decryption, this must happen before the store to the dst
 	// partial block.
-	vmovdqu		(SRC, LEN, 1), %xmm1
+	vmovdqu		(SRC, LEN64, 1), %xmm1
 
 	// Shift the first LEN bytes of the en/decryption of the last full block
 	// to the end of a register, then store it to DST+LEN.  This stores the
 	// dst partial block.  It also writes to the second part of the dst last
 	// full block, but that part is overwritten later.
-	vpshufb		(%rax, LEN, 1), %xmm0, %xmm2
-	vmovdqu		%xmm2, (DST, LEN, 1)
+	vpshufb		(%r9, LEN64, 1), %xmm0, %xmm2
+	vmovdqu		%xmm2, (DST, LEN64, 1)
 
 	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
-	sub		LEN, %rax
-	vmovdqu		32(%rax), %xmm3
+	sub		LEN64, %r9
+	vmovdqu		32(%r9), %xmm3
 
 	// Shift the src partial block to the beginning of its register.
 	vpshufb		%xmm3, %xmm1, %xmm1
@@ -795,7 +797,7 @@ SYM_FUNC_END(aes_xts_encrypt_iv)
 // instantiated from the above macro.  They all have the following prototype:
 //
 // void (*xts_asm_func)(const struct crypto_aes_ctx *key,
-//			const u8 *src, u8 *dst, size_t len,
+//			const u8 *src, u8 *dst, unsigned int len,
 //			u8 tweak[AES_BLOCK_SIZE]);
 //
 // |key| is the data key.  |tweak| contains the next tweak; the encryption of
...
@@ -899,7 +899,7 @@ static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
 typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
 				    u8 iv[AES_BLOCK_SIZE]);
 typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
-			       const u8 *src, u8 *dst, size_t len,
+			       const u8 *src, u8 *dst, unsigned int len,
 			       u8 tweak[AES_BLOCK_SIZE]);
 
 /* This handles cases where the source and/or destination span pages. */
@@ -1021,14 +1021,14 @@ static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 }
 
 static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
-			      const u8 *src, u8 *dst, size_t len,
+			      const u8 *src, u8 *dst, unsigned int len,
			      u8 tweak[AES_BLOCK_SIZE])
 {
 	aesni_xts_enc(key, dst, src, len, tweak);
 }
 
 static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
-			      const u8 *src, u8 *dst, size_t len,
+			      const u8 *src, u8 *dst, unsigned int len,
 			      u8 tweak[AES_BLOCK_SIZE])
 {
 	aesni_xts_dec(key, dst, src, len, tweak);
@@ -1185,12 +1185,12 @@ asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 
 #define DEFINE_XTS_ALG(suffix, driver_name, priority)			       \
 									       \
-asmlinkage void aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key,    \
-					 const u8 *src, u8 *dst, size_t len,   \
-					 u8 tweak[AES_BLOCK_SIZE]);	       \
-asmlinkage void aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key,    \
-					 const u8 *src, u8 *dst, size_t len,   \
-					 u8 tweak[AES_BLOCK_SIZE]);	       \
+asmlinkage void								       \
+aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src,     \
+			 u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+asmlinkage void								       \
+aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src,     \
+			 u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
 									       \
 static int xts_encrypt_##suffix(struct skcipher_request *req)		       \
 {									       \
...