Commit 8e492eff authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64/ghash-ce - replace NEON yield check with block limit

Checking the TIF_NEED_RESCHED flag is disproportionately costly on cores
with fast crypto instructions and comparatively slow memory accesses.

For an algorithm such as GHASH, which executes at ~1 cycle per byte on
cores that implement 64-bit polynomial multiplication, there is really
no need to check the TIF_NEED_RESCHED flag particularly often, so we
can remove the NEON yield check from the assembler routines.

However, unlike the AEAD or skcipher APIs, the shash/ahash APIs take
arbitrary input lengths, and so there needs to be some sanity check
to ensure that we don't hog the CPU for excessive amounts of time.

So let's simply cap the maximum input size that is processed in one go
to 64 KB.
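
With GHASH's 16 byte block size, that cap works out to 4096 blocks per
call into the NEON routine; at ~1 cycle per byte, each uninterruptible
stretch is bounded to roughly 64K cycles, i.e. a few tens of
microseconds at typical clock rates.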
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 8418cf54
arch/arm64/crypto/ghash-ce-core.S
@@ -213,31 +213,23 @@
 	.endm

 	.macro		__pmull_ghash, pn
-	frame_push	5
-
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-
-0:	ld1		{SHASH.2d}, [x22]
-	ld1		{XL.2d}, [x20]
+	ld1		{SHASH.2d}, [x3]
+	ld1		{XL.2d}, [x1]
 	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
 	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

 	__pmull_pre_\pn

 	/* do the head block first, if supplied */
-	cbz		x23, 1f
-	ld1		{T1.2d}, [x23]
-	mov		x23, xzr
-	b		2f
+	cbz		x4, 0f
+	ld1		{T1.2d}, [x4]
+	mov		x4, xzr
+	b		1f

-1:	ld1		{T1.2d}, [x21], #16
-	sub		w19, w19, #1
+0:	ld1		{T1.2d}, [x2], #16
+	sub		w0, w0, #1

-2:	/* multiply XL by SHASH in GF(2^128) */
+1:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

 	ext		T2.16b, XL.16b, XL.16b, #8
@@ -259,18 +251,9 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
 	eor		T2.16b, T2.16b, XH.16b
 	eor		XL.16b, XL.16b, T2.16b

-	cbz		w19, 3f
-
-	if_will_cond_yield_neon
-	st1		{XL.2d}, [x20]
-	do_cond_yield_neon
-	b		0b
-	endif_yield_neon
-
-	b		1b
+	cbnz		w0, 0b

-3:	st1		{XL.2d}, [x20]
-	frame_pop
-	ret
+	st1		{XL.2d}, [x1]
+	ret

 	.endm
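With the yield check gone from the assembler, bounding scheduling
latency becomes entirely the caller's job: the C glue must never pass
more than a bounded number of blocks into the routine per kernel-mode
NEON section. A minimal sketch of that calling pattern follows; the
names do_one_chunk and pmull_ghash_update are illustrative stand-ins,
not the driver's exact symbols:

#include <linux/types.h>
#include <linux/linkage.h>
#include <asm/neon.h>	/* kernel_neon_begin()/kernel_neon_end() */

/* illustrative prototype for the asm routine patched above:
 * x0 = block count, x1 = digest, x2 = src, x3 = key, x4 = head block */
asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
				   const u64 *shash, const char *head);

static void do_one_chunk(int blocks, u64 dg[], const char *src,
			 const u64 *shash, const char *head)
{
	kernel_neon_begin();	/* claims the FP/SIMD regs; disables preemption */
	pmull_ghash_update(blocks, dg, src, shash, head);
	kernel_neon_end();	/* pending reschedules can be acted on again */
}

Since the asm loop now runs to completion unconditionally, the caller
sets the preemption granularity by choosing how many blocks to pass per
call, which is exactly what the MAX_BLOCKS cap below enforces.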
arch/arm64/crypto/ghash-ce-glue.c
@@ -113,6 +113,9 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src,
 	}
 }

+/* avoid hogging the CPU for too long */
+#define MAX_BLOCKS	(SZ_64K / GHASH_BLOCK_SIZE)
+
 static int ghash_update(struct shash_desc *desc, const u8 *src,
 			unsigned int len)
 {
@@ -136,11 +139,16 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
 		blocks = len / GHASH_BLOCK_SIZE;
 		len %= GHASH_BLOCK_SIZE;

-		ghash_do_update(blocks, ctx->digest, src, key,
-				partial ? ctx->buf : NULL);
-
-		src += blocks * GHASH_BLOCK_SIZE;
-		partial = 0;
+		do {
+			int chunk = min(blocks, MAX_BLOCKS);
+
+			ghash_do_update(chunk, ctx->digest, src, key,
+					partial ? ctx->buf : NULL);
+
+			blocks -= chunk;
+			src += chunk * GHASH_BLOCK_SIZE;
+			partial = 0;
+		} while (unlikely(blocks > 0));
 	}
 	if (len)
 		memcpy(ctx->buf + partial, src, len);
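To put numbers on the loop: a hypothetical 1 MiB update runs the
do/while body 16 times, once per 64 KB chunk, and because the
kernel-mode NEON section is opened and closed inside ghash_do_update(),
a pending reschedule is honoured between chunks. The unlikely() hint
reflects that typical shash users pass far less than 64 KB per call, so
the body usually executes exactly once.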