Commit 4af78261, authored by Eric Biggers and committed by Herbert Xu

crypto: x86/chacha20 - add XChaCha20 support

Add an XChaCha20 implementation that is hooked up to the x86_64 SIMD
implementations of ChaCha20.  This can be used by Adiantum.

An SSSE3 implementation of single-block HChaCha20 is also added so that
XChaCha20 can use it rather than the generic implementation.  This
required refactoring the ChaCha permutation into its own function.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 0f961f9f
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
*/ */
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/frame.h>
.section .rodata.cst16.ROT8, "aM", @progbits, 16 .section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16 .align 16
...@@ -23,37 +24,24 @@ CTRINC: .octa 0x00000003000000020000000100000000 ...@@ -23,37 +24,24 @@ CTRINC: .octa 0x00000003000000020000000100000000
.text .text
ENTRY(chacha20_block_xor_ssse3) /*
# %rdi: Input state matrix, s * chacha20_permute - permute one block
# %rsi: up to 1 data block output, o *
# %rdx: up to 1 data block input, i * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
# %rcx: input/output length in bytes * function performs matrix operations on four words in parallel, but requires
* shuffling to rearrange the words after each round. 8/16-bit word rotation is
# This function encrypts one ChaCha20 block by loading the state matrix * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
# in four SSE registers. It performs matrix operation on four words in * rotation uses traditional shift+OR.
# parallel, but requires shuffling to rearrange the words after each *
# round. 8/16-bit word rotation is done with the slightly better * Clobbers: %ecx, %xmm4-%xmm7
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses */
# traditional shift+OR. chacha20_permute:
# x0..3 = s0..3
movdqa 0x00(%rdi),%xmm0
movdqa 0x10(%rdi),%xmm1
movdqa 0x20(%rdi),%xmm2
movdqa 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11
movdqa ROT8(%rip),%xmm4 movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5 movdqa ROT16(%rip),%xmm5
mov %rcx,%rax
mov $10,%ecx mov $10,%ecx
.Ldoubleround: .Ldoubleround:
# x0 += x1, x3 = rotl32(x3 ^ x0, 16) # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
paddd %xmm1,%xmm0 paddd %xmm1,%xmm0
pxor %xmm0,%xmm3 pxor %xmm0,%xmm3
...@@ -123,6 +111,29 @@ ENTRY(chacha20_block_xor_ssse3) ...@@ -123,6 +111,29 @@ ENTRY(chacha20_block_xor_ssse3)
dec %ecx dec %ecx
jnz .Ldoubleround jnz .Ldoubleround
ret
ENDPROC(chacha20_permute)
ENTRY(chacha20_block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: up to 1 data block output, o
# %rdx: up to 1 data block input, i
# %rcx: input/output length in bytes
FRAME_BEGIN
# x0..3 = s0..3
movdqa 0x00(%rdi),%xmm0
movdqa 0x10(%rdi),%xmm1
movdqa 0x20(%rdi),%xmm2
movdqa 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11
mov %rcx,%rax
call chacha20_permute
# o0 = i0 ^ (x0 + s0) # o0 = i0 ^ (x0 + s0)
paddd %xmm8,%xmm0 paddd %xmm8,%xmm0
cmp $0x10,%rax cmp $0x10,%rax
...@@ -156,6 +167,7 @@ ENTRY(chacha20_block_xor_ssse3) ...@@ -156,6 +167,7 @@ ENTRY(chacha20_block_xor_ssse3)
movdqu %xmm0,0x30(%rsi) movdqu %xmm0,0x30(%rsi)
.Ldone: .Ldone:
FRAME_END
ret ret
.Lxorpart: .Lxorpart:
...@@ -189,6 +201,25 @@ ENTRY(chacha20_block_xor_ssse3) ...@@ -189,6 +201,25 @@ ENTRY(chacha20_block_xor_ssse3)
ENDPROC(chacha20_block_xor_ssse3) ENDPROC(chacha20_block_xor_ssse3)
ENTRY(hchacha20_block_ssse3)
# Run the shared ChaCha20 permutation over one state matrix and store
# two of the four resulting rows (32 bytes in total) to the output buffer.
#
# %rdi: Input state matrix, s
# %rsi: output (8 32-bit words)
#
# Unlike chacha20_block_xor_ssse3, no copy of the input state is kept and
# nothing is added back after the permutation.
FRAME_BEGIN
# x0..3 = s0..3 (movdqa: the state must be 16-byte aligned)
movdqa 0x00(%rdi),%xmm0
movdqa 0x10(%rdi),%xmm1
movdqa 0x20(%rdi),%xmm2
movdqa 0x30(%rdi),%xmm3
# Permutes %xmm0-%xmm3 in place; clobbers %ecx and %xmm4-%xmm7.
call chacha20_permute
# Output words 0..3 come from row 0 and words 4..7 from row 3; rows 1 and 2
# are discarded.  movdqu: the output buffer may be unaligned.
movdqu %xmm0,0x00(%rsi)
movdqu %xmm3,0x10(%rsi)
FRAME_END
ret
ENDPROC(hchacha20_block_ssse3)
ENTRY(chacha20_4block_xor_ssse3) ENTRY(chacha20_4block_xor_ssse3)
# %rdi: Input state matrix, s # %rdi: Input state matrix, s
# %rsi: up to 4 data blocks output, o # %rsi: up to 4 data blocks output, o
......
...@@ -23,6 +23,7 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, ...@@ -23,6 +23,7 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len); unsigned int len);
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len); unsigned int len);
asmlinkage void hchacha20_block_ssse3(const u32 *state, u32 *out);
#ifdef CONFIG_AS_AVX2 #ifdef CONFIG_AS_AVX2
asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len); unsigned int len);
...@@ -121,10 +122,9 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, ...@@ -121,10 +122,9 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
} }
} }
static int chacha20_simd(struct skcipher_request *req) static int chacha20_simd_stream_xor(struct skcipher_request *req,
struct chacha_ctx *ctx, u8 *iv)
{ {
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
u32 *state, state_buf[16 + 2] __aligned(8); u32 *state, state_buf[16 + 2] __aligned(8);
struct skcipher_walk walk; struct skcipher_walk walk;
int err; int err;
...@@ -132,14 +132,9 @@ static int chacha20_simd(struct skcipher_request *req) ...@@ -132,14 +132,9 @@ static int chacha20_simd(struct skcipher_request *req)
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16); BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN); state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
return crypto_chacha_crypt(req);
err = skcipher_walk_virt(&walk, req, true); err = skcipher_walk_virt(&walk, req, true);
crypto_chacha_init(state, ctx, walk.iv); crypto_chacha_init(state, ctx, iv);
kernel_fpu_begin();
while (walk.nbytes > 0) { while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes; unsigned int nbytes = walk.nbytes;
...@@ -153,26 +148,85 @@ static int chacha20_simd(struct skcipher_request *req) ...@@ -153,26 +148,85 @@ static int chacha20_simd(struct skcipher_request *req)
err = skcipher_walk_done(&walk, walk.nbytes - nbytes); err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
} }
return err;
}
/*
 * skcipher ->encrypt/->decrypt handler for the "chacha20-simd" driver.
 *
 * The SIMD path is taken only when the request is longer than one ChaCha
 * block and irq_fpu_usable() reports that the FPU may be used; in every
 * other case the request is handed to crypto_chacha_crypt().
 *
 * Returns the result of whichever implementation processed the request.
 */
static int chacha20_simd(struct skcipher_request *req)
{
	struct chacha_ctx *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
	int ret;

	if (req->cryptlen > CHACHA_BLOCK_SIZE && irq_fpu_usable()) {
		/* The whole stream walk runs inside one FPU section. */
		kernel_fpu_begin();
		ret = chacha20_simd_stream_xor(req, ctx, req->iv);
		kernel_fpu_end();
		return ret;
	}

	return crypto_chacha_crypt(req);
}
/*
 * skcipher ->encrypt/->decrypt handler for the "xchacha20-simd" driver.
 *
 * Requests of at most CHACHA_BLOCK_SIZE bytes, and requests that arrive
 * while irq_fpu_usable() is false, are handed to crypto_xchacha_crypt().
 *
 * Otherwise:
 *   1. a 16-byte-aligned ChaCha state is initialised from the tfm context
 *      and req->iv;
 *   2. hchacha20_block_ssse3() turns that state into the 8-word key stored
 *      in subctx.key;
 *   3. a 16-byte IV is assembled from bytes 24..31 of req->iv followed by
 *      bytes 16..23 of req->iv;
 *   4. chacha20_simd_stream_xor() processes the request with the sub-context
 *      and that IV.
 *
 * Steps 2-4 run inside a single kernel_fpu_begin()/kernel_fpu_end() pair.
 *
 * Returns the result of whichever implementation processed the request.
 */
static int xchacha20_simd(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct chacha_ctx subctx;
	u32 *state, state_buf[16 + 2] __aligned(8);
	u8 real_iv[16];
	int err;

	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
		return crypto_xchacha_crypt(req);

	/*
	 * state_buf is only 8-byte aligned and carries two spare words so that
	 * PTR_ALIGN() can always find a 16-byte-aligned 16-word window in it.
	 */
	BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
	state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
	crypto_chacha_init(state, ctx, req->iv);

	kernel_fpu_begin();

	hchacha20_block_ssse3(state, subctx.key);

	/* IV for the inner stream: req->iv[24..31] then req->iv[16..23]. */
	memcpy(&real_iv[0], req->iv + 24, 8);
	memcpy(&real_iv[8], req->iv + 16, 8);
	err = chacha20_simd_stream_xor(req, &subctx, real_iv);

	kernel_fpu_end();

	return err;
}
static struct skcipher_alg alg = { static struct skcipher_alg algs[] = {
.base.cra_name = "chacha20", {
.base.cra_driver_name = "chacha20-simd", .base.cra_name = "chacha20",
.base.cra_priority = 300, .base.cra_driver_name = "chacha20-simd",
.base.cra_blocksize = 1, .base.cra_priority = 300,
.base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_blocksize = 1,
.base.cra_module = THIS_MODULE, .base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE, .min_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE, .max_keysize = CHACHA_KEY_SIZE,
.chunksize = CHACHA_BLOCK_SIZE, .ivsize = CHACHA_IV_SIZE,
.setkey = crypto_chacha20_setkey, .chunksize = CHACHA_BLOCK_SIZE,
.encrypt = chacha20_simd, .setkey = crypto_chacha20_setkey,
.decrypt = chacha20_simd, .encrypt = chacha20_simd,
.decrypt = chacha20_simd,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = xchacha20_simd,
.decrypt = xchacha20_simd,
},
}; };
static int __init chacha20_simd_mod_init(void) static int __init chacha20_simd_mod_init(void)
...@@ -190,12 +244,12 @@ static int __init chacha20_simd_mod_init(void) ...@@ -190,12 +244,12 @@ static int __init chacha20_simd_mod_init(void)
boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
#endif #endif
#endif #endif
return crypto_register_skcipher(&alg); return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
} }
/* Module exit: unregister every skcipher algorithm in the algs array. */
static void __exit chacha20_simd_mod_fini(void)
{
	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha20_simd_mod_init); module_init(chacha20_simd_mod_init);
...@@ -206,3 +260,5 @@ MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); ...@@ -206,3 +260,5 @@ MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated"); MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd"); MODULE_ALIAS_CRYPTO("chacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-simd");
...@@ -1468,19 +1468,13 @@ config CRYPTO_CHACHA20 ...@@ -1468,19 +1468,13 @@ config CRYPTO_CHACHA20
in some performance-sensitive scenarios. in some performance-sensitive scenarios.
config CRYPTO_CHACHA20_X86_64 config CRYPTO_CHACHA20_X86_64
tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)" tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)"
depends on X86 && 64BIT depends on X86 && 64BIT
select CRYPTO_BLKCIPHER select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20 select CRYPTO_CHACHA20
help help
ChaCha20 cipher algorithm, RFC7539. SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20
and XChaCha20 stream ciphers.
ChaCha20 is a 256-bit high-speed stream cipher designed by Daniel J.
Bernstein and further specified in RFC7539 for use in IETF protocols.
This is the x86_64 assembler implementation using SIMD instructions.
See also:
<http://cr.yp.to/chacha/chacha-20080128.pdf>
config CRYPTO_SEED config CRYPTO_SEED
tristate "SEED cipher algorithm" tristate "SEED cipher algorithm"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment