Commit f10dc56c authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64 - revert NEON yield for fast AEAD implementations

As it turns out, checking the TIF_NEED_RESCHED flag after each
iteration results in a significant performance regression (~10%)
when running fast algorithms (i.e., ones that use special instructions
and operate in the < 4 cycles per byte range) on in-order cores with
comparatively slow memory accesses such as the Cortex-A53.

Given the speed of these ciphers, and the fact that the page based
nature of the AEAD scatterwalk API guarantees that the core NEON
transform is never invoked with more than a single page's worth of
input, we can estimate the worst case duration of any resulting
scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages,
processing a page's worth of input at 4 cycles per byte results in
a delay of ~250 us, which is a reasonable upper bound.
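
As a rough cross-check of that figure, using the same assumptions stated
above (64k pages, 4 cycles per byte, a 1 GHz clock):

    65536 bytes * 4 cycles/byte = 262144 cycles
    262144 cycles / 1 GHz      ~= 262 us

which is of the same order as the ~250 us upper bound.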

So let's remove the yield checks from the fused AES-CCM and AES-GCM
routines entirely.

This reverts commit 7b67ae4d and
partially reverts commit 7c50136a.

Fixes: 7c50136a ("crypto: arm64/aes-ghash - yield NEON after every ...")
Fixes: 7b67ae4d ("crypto: arm64/aes-ccm - yield NEON after every ...")
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 46d8c4b2
@@ -19,33 +19,24 @@
  *			 u32 *macp, u8 const rk[], u32 rounds);
  */
 ENTRY(ce_aes_ccm_auth_data)
-	frame_push	7
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-
-	ldr	w25, [x22]			/* leftover from prev round? */
+	ldr	w8, [x3]			/* leftover from prev round? */
 	ld1	{v0.16b}, [x0]			/* load mac */
-	cbz	w25, 1f
-	sub	w25, w25, #16
+	cbz	w8, 1f
+	sub	w8, w8, #16
 	eor	v1.16b, v1.16b, v1.16b
-0:	ldrb	w7, [x20], #1			/* get 1 byte of input */
-	subs	w21, w21, #1
-	add	w25, w25, #1
+0:	ldrb	w7, [x1], #1			/* get 1 byte of input */
+	subs	w2, w2, #1
+	add	w8, w8, #1
 	ins	v1.b[0], w7
 	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */
 	beq	8f				/* out of input? */
-	cbnz	w25, 0b
+	cbnz	w8, 0b
 	eor	v0.16b, v0.16b, v1.16b
-1:	ld1	{v3.4s}, [x23]			/* load first round key */
-	prfm	pldl1strm, [x20]
-	cmp	w24, #12			/* which key size? */
-	add	x6, x23, #16
-	sub	w7, w24, #2			/* modified # of rounds */
+1:	ld1	{v3.4s}, [x4]			/* load first round key */
+	prfm	pldl1strm, [x1]
+	cmp	w5, #12				/* which key size? */
+	add	x6, x4, #16
+	sub	w7, w5, #2			/* modified # of rounds */
 	bmi	2f
 	bne	5f
 	mov	v5.16b, v3.16b
@@ -64,43 +55,33 @@ ENTRY(ce_aes_ccm_auth_data)
 	ld1	{v5.4s}, [x6], #16		/* load next round key */
 	bpl	3b
 	aese	v0.16b, v4.16b
-	subs	w21, w21, #16			/* last data? */
+	subs	w2, w2, #16			/* last data? */
 	eor	v0.16b, v0.16b, v5.16b		/* final round */
 	bmi	6f
-	ld1	{v1.16b}, [x20], #16		/* load next input block */
+	ld1	{v1.16b}, [x1], #16		/* load next input block */
 	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
-	beq	6f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x19]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x19]			/* reload mac */
-	endif_yield_neon
-
-	b	1b
-6:	st1	{v0.16b}, [x19]			/* store mac */
+	bne	1b
+6:	st1	{v0.16b}, [x0]			/* store mac */
 	beq	10f
-	adds	w21, w21, #16
+	adds	w2, w2, #16
 	beq	10f
-	mov	w25, w21
-7:	ldrb	w7, [x20], #1
+	mov	w8, w2
+7:	ldrb	w7, [x1], #1
 	umov	w6, v0.b[0]
 	eor	w6, w6, w7
-	strb	w6, [x19], #1
-	subs	w21, w21, #1
+	strb	w6, [x0], #1
+	subs	w2, w2, #1
 	beq	10f
 	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
 	b	7b
-8:	mov	w7, w25
-	add	w25, w25, #16
+8:	mov	w7, w8
+	add	w8, w8, #16
 9:	ext	v1.16b, v1.16b, v1.16b, #1
 	adds	w7, w7, #1
 	bne	9b
 	eor	v0.16b, v0.16b, v1.16b
-	st1	{v0.16b}, [x19]
-10:	str	w25, [x22]
-
-	frame_pop
+	st1	{v0.16b}, [x0]
+10:	str	w8, [x3]
 	ret
 ENDPROC(ce_aes_ccm_auth_data)
@@ -145,29 +126,19 @@ ENTRY(ce_aes_ccm_final)
 ENDPROC(ce_aes_ccm_final)
 
 	.macro	aes_ccm_do_crypt,enc
-	frame_push	8
-
-	mov	x19, x0
-	mov	x20, x1
-	mov	x21, x2
-	mov	x22, x3
-	mov	x23, x4
-	mov	x24, x5
-	mov	x25, x6
-
-	ldr	x26, [x25, #8]			/* load lower ctr */
-	ld1	{v0.16b}, [x24]			/* load mac */
-CPU_LE(	rev	x26, x26 )			/* keep swabbed ctr in reg */
+	ldr	x8, [x6, #8]			/* load lower ctr */
+	ld1	{v0.16b}, [x5]			/* load mac */
+CPU_LE(	rev	x8, x8 )			/* keep swabbed ctr in reg */
 0:	/* outer loop */
-	ld1	{v1.8b}, [x25]			/* load upper ctr */
-	prfm	pldl1strm, [x20]
-	add	x26, x26, #1
-	rev	x9, x26
-	cmp	w23, #12			/* which key size? */
-	sub	w7, w23, #2			/* get modified # of rounds */
+	ld1	{v1.8b}, [x6]			/* load upper ctr */
+	prfm	pldl1strm, [x1]
+	add	x8, x8, #1
+	rev	x9, x8
+	cmp	w4, #12				/* which key size? */
+	sub	w7, w4, #2			/* get modified # of rounds */
 	ins	v1.d[1], x9			/* no carry in lower ctr */
-	ld1	{v3.4s}, [x22]			/* load first round key */
-	add	x10, x22, #16
+	ld1	{v3.4s}, [x3]			/* load first round key */
+	add	x10, x3, #16
 	bmi	1f
 	bne	4f
 	mov	v5.16b, v3.16b
@@ -194,9 +165,9 @@ CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */
 	bpl	2b
 	aese	v0.16b, v4.16b
 	aese	v1.16b, v4.16b
-	subs	w21, w21, #16
-	bmi	7f				/* partial block? */
-	ld1	{v2.16b}, [x20], #16		/* load next input block */
+	subs	w2, w2, #16
+	bmi	6f				/* partial block? */
+	ld1	{v2.16b}, [x1], #16		/* load next input block */
 	.if	\enc == 1
 	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
 	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
@@ -205,29 +176,18 @@ CPU_LE( rev x26, x26 ) /* keep swabbed ctr in reg */
 	eor	v1.16b, v2.16b, v5.16b		/* final round enc */
 	.endif
 	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
-	st1	{v1.16b}, [x19], #16		/* write output block */
-	beq	5f
-
-	if_will_cond_yield_neon
-	st1	{v0.16b}, [x24]			/* store mac */
-	do_cond_yield_neon
-	ld1	{v0.16b}, [x24]			/* reload mac */
-	endif_yield_neon
-
-	b	0b
-5:
-CPU_LE(	rev	x26, x26 )
-	st1	{v0.16b}, [x24]			/* store mac */
-	str	x26, [x25, #8]			/* store lsb end of ctr (BE) */
-
-6:	frame_pop
-	ret
+	st1	{v1.16b}, [x0], #16		/* write output block */
+	bne	0b
+CPU_LE(	rev	x8, x8 )
+	st1	{v0.16b}, [x5]			/* store mac */
+	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
+5:	ret
 
-7:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
+6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
 	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
-	st1	{v0.16b}, [x24]			/* store mac */
-	add	w21, w21, #16			/* process partial tail block */
-8:	ldrb	w9, [x20], #1			/* get 1 byte of input */
+	st1	{v0.16b}, [x5]			/* store mac */
+	add	w2, w2, #16			/* process partial tail block */
+7:	ldrb	w9, [x1], #1			/* get 1 byte of input */
 	umov	w6, v1.b[0]			/* get top crypted ctr byte */
 	umov	w7, v0.b[0]			/* get top mac byte */
 	.if	\enc == 1
@@ -237,13 +197,13 @@ CPU_LE( rev x26, x26 )
 	eor	w9, w9, w6
 	eor	w7, w7, w9
 	.endif
-	strb	w9, [x19], #1			/* store out byte */
-	strb	w7, [x24], #1			/* store mac byte */
-	subs	w21, w21, #1
-	beq	6b
+	strb	w9, [x0], #1			/* store out byte */
+	strb	w7, [x5], #1			/* store mac byte */
+	subs	w2, w2, #1
+	beq	5b
 	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
 	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */
-	b	8b
+	b	7b
 	.endm
 
 	/*
@@ -322,55 +322,41 @@ ENDPROC(pmull_ghash_update_p8)
 	.endm
 
 	.macro		pmull_gcm_do_crypt, enc
-	frame_push	10
+	ld1		{SHASH.2d}, [x4]
+	ld1		{XL.2d}, [x1]
+	ldr		x8, [x5, #8]			// load lower counter
 
-	mov		x19, x0
-	mov		x20, x1
-	mov		x21, x2
-	mov		x22, x3
-	mov		x23, x4
-	mov		x24, x5
-	mov		x25, x6
-	mov		x26, x7
-	.if		\enc == 1
-	ldr		x27, [sp, #96]			// first stacked arg
-	.endif
-
-	ldr		x28, [x24, #8]			// load lower counter
-CPU_LE(	rev		x28, x28	)
-
-0:	mov		x0, x25
-	load_round_keys	w26, x0
+	load_round_keys	w7, x6
 
-	ld1		{SHASH.2d}, [x23]
-	ld1		{XL.2d}, [x20]
 	movi		MASK.16b, #0xe1
 	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+CPU_LE(	rev		x8, x8		)
 	shl		MASK.2d, MASK.2d, #57
 	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	.if		\enc == 1
-	ld1		{KS.16b}, [x27]
+	ldr		x10, [sp]
+	ld1		{KS.16b}, [x10]
 	.endif
 
-1:	ld1		{CTR.8b}, [x24]			// load upper counter
-	ld1		{INP.16b}, [x22], #16
-	rev		x9, x28
-	add		x28, x28, #1
-	sub		w19, w19, #1
+0:	ld1		{CTR.8b}, [x5]			// load upper counter
+	ld1		{INP.16b}, [x3], #16
+	rev		x9, x8
+	add		x8, x8, #1
+	sub		w0, w0, #1
 	ins		CTR.d[1], x9			// set lower counter
 
 	.if		\enc == 1
 	eor		INP.16b, INP.16b, KS.16b	// encrypt input
-	st1		{INP.16b}, [x21], #16
+	st1		{INP.16b}, [x2], #16
 	.endif
 
 	rev64		T1.16b, INP.16b
 
-	cmp		w26, #12
-	b.ge		4f				// AES-192/256?
+	cmp		w7, #12
+	b.ge		2f				// AES-192/256?
 
-2:	enc_round	CTR, v21
+1:	enc_round	CTR, v21
 
 	ext		T2.16b, XL.16b, XL.16b, #8
 	ext		IN1.16b, T1.16b, T1.16b, #8
@@ -425,39 +411,27 @@ CPU_LE( rev x28, x28 )
 
 	.if		\enc == 0
 	eor		INP.16b, INP.16b, KS.16b
-	st1		{INP.16b}, [x21], #16
+	st1		{INP.16b}, [x2], #16
 	.endif
 
-	cbz		w19, 3f
+	cbnz		w0, 0b
 
-	if_will_cond_yield_neon
-	st1		{XL.2d}, [x20]
-	.if		\enc == 1
-	st1		{KS.16b}, [x27]
-	.endif
-	do_cond_yield_neon
-	b		0b
-	endif_yield_neon
-
-	b		1b
-
-3:	st1		{XL.2d}, [x20]
+CPU_LE(	rev		x8, x8		)
+	st1		{XL.2d}, [x1]
+	str		x8, [x5, #8]			// store lower counter
+
 	.if		\enc == 1
-	st1		{KS.16b}, [x27]
+	st1		{KS.16b}, [x10]
 	.endif
 
-CPU_LE(	rev		x28, x28	)
-	str		x28, [x24, #8]			// store lower counter
-
-	frame_pop
 	ret
 
-4:	b.eq		5f				// AES-192?
+2:	b.eq		3f				// AES-192?
 	enc_round	CTR, v17
 	enc_round	CTR, v18
-5:	enc_round	CTR, v19
+3:	enc_round	CTR, v19
 	enc_round	CTR, v20
-	b		2b
+	b		1b
 	.endm
 
 	/*