crypto: arm/blake2b - add NEON-accelerated BLAKE2b

Add a NEON-accelerated implementation of BLAKE2b. On Cortex-A7 (which these days is the most common ARM processor that doesn't have the ARMv8 Crypto Extensions), this is over twice as fast as SHA-256, and slightly faster than SHA-1. It is also almost three times as fast as the generic implementation of BLAKE2b: Algorithm Cycles per byte (on 4096-byte messages) =================== ======================================= blake2b-256-neon 14.0 sha1-neon 16.3 blake2s-256-arm 18.8 sha1-asm 20.8 blake2s-256-generic 26.0 sha256-neon 28.9 sha256-asm 32.0 blake2b-256-generic 38.9 This implementation isn't directly based on any other implementation, but it borrows some ideas from previous NEON code I've written as well as from chacha-neon-core.S. At least on Cortex-A7, it is faster than the other NEON implementations of BLAKE2b I'm aware of (the implementation in the BLAKE2 official repository using intrinsics, and Andrew Moon's implementation which can be found in SUPERCOP). It does only one block at a time, so it performs well on short messages too. NEON-accelerated BLAKE2b is useful because there is interest in using BLAKE2b-256 for dm-verity on low-end Android devices (specifically, devices that lack the ARMv8 Crypto Extensions) to replace SHA-1. On these devices, the performance cost of upgrading to SHA-256 may be unacceptable, whereas BLAKE2b-256 would actually improve performance. Although BLAKE2b is intended for 64-bit platforms (unlike BLAKE2s which is intended for 32-bit platforms), on 32-bit ARM processors with NEON, BLAKE2b is actually faster than BLAKE2s. This is because NEON supports 64-bit operations, and because BLAKE2s's block size is too small for NEON to be helpful for it. The best I've been able to do with BLAKE2s on Cortex-A7 is 18.8 cpb with an optimized scalar implementation. (I didn't try BLAKE2sp and BLAKE3, which in theory would be faster, but they're more complex as they require running multiple hashes at once. Note that BLAKE2b already uses all the NEON bandwidth on the Cortex-A7, so I expect that any speedup from BLAKE2sp or BLAKE3 would come only from the smaller number of rounds, not from the extra parallelism.) For now this BLAKE2b implementation is only wired up to the shash API, since there is no library API for BLAKE2b yet. However, I've tried to keep things consistent with BLAKE2s, e.g. by defining blake2b_compress_arch() which is analogous to blake2s_compress_arch() and could be exported for use by the library API later if needed. Acked-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Eric Biggers <ebiggers@google.com> Tested-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

crypto: arm/blake2b - add NEON-accelerated BLAKE2b
Add a NEON-accelerated implementation of BLAKE2b. On Cortex-A7 (which these days is the most common ARM processor that doesn't have the ARMv8 Crypto Extensions), this is over twice as fast as SHA-256, and slightly faster than SHA-1. It is also almost three times as fast as the generic implementation of BLAKE2b: Algorithm Cycles per byte (on 4096-byte messages) =================== ======================================= blake2b-256-neon 14.0 sha1-neon 16.3 blake2s-256-arm 18.8 sha1-asm 20.8 blake2s-256-generic 26.0 sha256-neon 28.9 sha256-asm 32.0 blake2b-256-generic 38.9 This implementation isn't directly based on any other implementation, but it borrows some ideas from previous NEON code I've written as well as from chacha-neon-core.S. At least on Cortex-A7, it is faster than the other NEON implementations of BLAKE2b I'm aware of (the implementation in the BLAKE2 official repository using intrinsics, and Andrew Moon's implementation which can be found in SUPERCOP). It does only one block at a time, so it performs well on short messages too. NEON-accelerated BLAKE2b is useful because there is interest in using BLAKE2b-256 for dm-verity on low-end Android devices (specifically, devices that lack the ARMv8 Crypto Extensions) to replace SHA-1. On these devices, the performance cost of upgrading to SHA-256 may be unacceptable, whereas BLAKE2b-256 would actually improve performance. Although BLAKE2b is intended for 64-bit platforms (unlike BLAKE2s which is intended for 32-bit platforms), on 32-bit ARM processors with NEON, BLAKE2b is actually faster than BLAKE2s. This is because NEON supports 64-bit operations, and because BLAKE2s's block size is too small for NEON to be helpful for it. The best I've been able to do with BLAKE2s on Cortex-A7 is 18.8 cpb with an optimized scalar implementation. (I didn't try BLAKE2sp and BLAKE3, which in theory would be faster, but they're more complex as they require running multiple hashes at once. Note that BLAKE2b already uses all the NEON bandwidth on the Cortex-A7, so I expect that any speedup from BLAKE2sp or BLAKE3 would come only from the smaller number of rounds, not from the extra parallelism.) For now this BLAKE2b implementation is only wired up to the shash API, since there is no library API for BLAKE2b yet. However, I've tried to keep things consistent with BLAKE2s, e.g. by defining blake2b_compress_arch() which is analogous to blake2s_compress_arch() and could be exported for use by the library API later if needed. Acked-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Eric Biggers <ebiggers@google.com> Tested-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
1862eb00 · Eric Biggers · Herbert Xu · 0cdc438e · 1862eb00 · 1862eb00
Commit 1862eb00 authored Dec 23, 2020 by Eric Biggers Committed by Herbert Xu Jan 03, 2021
4 changed files
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -71,6 +71,16 @@ config CRYPTO_BLAKE2S_ARM
 	  slower than the NEON implementation of BLAKE2b.  (There is no NEON
 	  implementation of BLAKE2s, since NEON doesn't really help with it.)

+config CRYPTO_BLAKE2B_NEON
+	tristate "BLAKE2b digest algorithm (ARM NEON)"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_BLAKE2B
+	help
+	  BLAKE2b digest algorithm optimized with ARM NEON instructions.
+	  On ARM processors that have NEON support but not the ARMv8
+	  Crypto Extensions, typically this BLAKE2b implementation is
+	  much faster than SHA-2 and slightly faster than SHA-1.
+
 config CRYPTO_AES_ARM
 	tristate "Scalar AES cipher for ARM"
 	select CRYPTO_ALGAPI

--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
 obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o
+obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
 obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
@@ -31,6 +32,7 @@ sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
 sha512-arm-y	:= sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
 blake2s-arm-y   := blake2s-core.o blake2s-glue.o
+blake2b-neon-y  := blake2b-neon-core.o blake2b-neon-glue.o
 sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o

--- a/arch/arm/crypto/blake2b-neon-core.S
+++ b/arch/arm/crypto/blake2b-neon-core.S
--- a/arch/arm/crypto/blake2b-neon-glue.c
+++ b/arch/arm/crypto/blake2b-neon-glue.c
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * BLAKE2b digest algorithm, NEON accelerated
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <crypto/internal/blake2b.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+
+#include <linux/module.h>
+#include <linux/sizes.h>
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void blake2b_compress_neon(struct blake2b_state *state,
+				      const u8 *block, size_t nblocks, u32 inc);
+
+static void blake2b_compress_arch(struct blake2b_state *state,
+				  const u8 *block, size_t nblocks, u32 inc)
+{
+	if (!crypto_simd_usable()) {
+		blake2b_compress_generic(state, block, nblocks, inc);
+		return;
+	}
+
+	do {
+		const size_t blocks = min_t(size_t, nblocks,
+					    SZ_4K / BLAKE2B_BLOCK_SIZE);
+
+		kernel_neon_begin();
+		blake2b_compress_neon(state, block, blocks, inc);
+		kernel_neon_end();
+
+		nblocks -= blocks;
+		block += blocks * BLAKE2B_BLOCK_SIZE;
+	} while (nblocks);
+}
+
+static int crypto_blake2b_update_neon(struct shash_desc *desc,
+				      const u8 *in, unsigned int inlen)
+{
+	return crypto_blake2b_update(desc, in, inlen, blake2b_compress_arch);
+}
+
+static int crypto_blake2b_final_neon(struct shash_desc *desc, u8 *out)
+{
+	return crypto_blake2b_final(desc, out, blake2b_compress_arch);
+}
+
+#define BLAKE2B_ALG(name, driver_name, digest_size)			\
+	{								\
+		.base.cra_name		= name,				\
+		.base.cra_driver_name	= driver_name,			\
+		.base.cra_priority	= 200,				\
+		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,	\
+		.base.cra_blocksize	= BLAKE2B_BLOCK_SIZE,		\
+		.base.cra_ctxsize	= sizeof(struct blake2b_tfm_ctx), \
+		.base.cra_module	= THIS_MODULE,			\
+		.digestsize		= digest_size,			\
+		.setkey			= crypto_blake2b_setkey,	\
+		.init			= crypto_blake2b_init,		\
+		.update			= crypto_blake2b_update_neon,	\
+		.final			= crypto_blake2b_final_neon,	\
+		.descsize		= sizeof(struct blake2b_state),	\
+	}
+
+static struct shash_alg blake2b_neon_algs[] = {
+	BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE),
+	BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE),
+	BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE),
+	BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE),
+};
+
+static int __init blake2b_neon_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	return crypto_register_shashes(blake2b_neon_algs,
+				       ARRAY_SIZE(blake2b_neon_algs));
+}
+
+static void __exit blake2b_neon_mod_exit(void)
+{
+	return crypto_unregister_shashes(blake2b_neon_algs,
+					 ARRAY_SIZE(blake2b_neon_algs));
+}
+
+module_init(blake2b_neon_mod_init);
+module_exit(blake2b_neon_mod_exit);
+
+MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("blake2b-160");
+MODULE_ALIAS_CRYPTO("blake2b-160-neon");
+MODULE_ALIAS_CRYPTO("blake2b-256");
+MODULE_ALIAS_CRYPTO("blake2b-256-neon");
+MODULE_ALIAS_CRYPTO("blake2b-384");
+MODULE_ALIAS_CRYPTO("blake2b-384-neon");
+MODULE_ALIAS_CRYPTO("blake2b-512");
+MODULE_ALIAS_CRYPTO("blake2b-512-neon");