crypto: x86/chacha20 - Add a 8-block AVX-512VL variant

This variant is similar to the AVX2 version, but benefits from the AVX-512 rotate instructions and the additional registers, so it can operate without any data on the stack. It uses ymm registers only to avoid the massive core throttling on Skylake-X platforms. Nontheless does it bring a ~30% speed improvement compared to the AVX2 variant for random encryption lengths. The AVX2 version uses "rep movsb" for partial block XORing via the stack. With AVX-512, the new "vmovdqu8" can do this much more efficiently. The associated "kmov" instructions to work with dynamic masks is not part of the AVX-512VL instruction set, hence we depend on AVX-512BW as well. Given that the major AVX-512VL architectures provide AVX-512BW and this extension does not affect core clocking, this seems to be no problem at least for now. Signed-off-by: Martin Willi <martin@strongswan.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

crypto: x86/chacha20 - Add a 8-block AVX-512VL variant
This variant is similar to the AVX2 version, but benefits from the AVX-512 rotate instructions and the additional registers, so it can operate without any data on the stack. It uses ymm registers only to avoid the massive core throttling on Skylake-X platforms. Nontheless does it bring a ~30% speed improvement compared to the AVX2 variant for random encryption lengths. The AVX2 version uses "rep movsb" for partial block XORing via the stack. With AVX-512, the new "vmovdqu8" can do this much more efficiently. The associated "kmov" instructions to work with dynamic masks is not part of the AVX-512VL instruction set, hence we depend on AVX-512BW as well. Given that the major AVX-512VL architectures provide AVX-512BW and this extension does not affect core clocking, this seems to be no problem at least for now. Signed-off-by: Martin Willi <martin@strongswan.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
cee7a36e · Martin Willi · Herbert Xu · 059c2a4d · cee7a36e · cee7a36e
Commit cee7a36e authored Nov 20, 2018 by Martin Willi Committed by Herbert Xu Nov 29, 2018
3 changed files
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y
 avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
 avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
 				$(comma)4)$(comma)%ymm2,yes,no)
+avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
 sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
 sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)

@@ -103,6 +104,10 @@ ifeq ($(avx2_supported),yes)
 	morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
 endif

+ifeq ($(avx512_supported),yes)
+	chacha20-x86_64-y += chacha20-avx512vl-x86_64.o
+endif
+
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o

--- a/arch/x86/crypto/chacha20-avx512vl-x86_64.S
+++ b/arch/x86/crypto/chacha20-avx512vl-x86_64.S
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -31,6 +31,11 @@ asmlinkage void chacha20_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 					 unsigned int len);
 static bool chacha20_use_avx2;
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha20_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					     unsigned int len);
+static bool chacha20_use_avx512vl;
+#endif
 #endif

 static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
@@ -43,6 +48,22 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			    unsigned int bytes)
 {
 #ifdef CONFIG_AS_AVX2
+#ifdef CONFIG_AS_AVX512
+	if (chacha20_use_avx512vl) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha20_8block_xor_avx512vl(state, dst, src, bytes);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state[12] += 8;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
+			chacha20_8block_xor_avx512vl(state, dst, src, bytes);
+			state[12] += chacha20_advance(bytes, 8);
+			return;
+		}
+	}
+#endif
 	if (chacha20_use_avx2) {
 		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
 			chacha20_8block_xor_avx2(state, dst, src, bytes);
@@ -149,6 +170,11 @@ static int __init chacha20_simd_mod_init(void)
 	chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
 			    boot_cpu_has(X86_FEATURE_AVX2) &&
 			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifdef CONFIG_AS_AVX512
+	chacha20_use_avx512vl = chacha20_use_avx2 &&
+				boot_cpu_has(X86_FEATURE_AVX512VL) &&
+				boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
+#endif
 #endif
 	return crypto_register_skcipher(&alg);
 }