Commit 7edc86cb authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64/sha3-ce - yield NEON after every block of input

Avoid excessive scheduling delays under a preemptible kernel by
conditionally yielding the NEON after every block of input.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 5b3da651
...@@ -41,9 +41,16 @@ ...@@ -41,9 +41,16 @@
*/ */
.text .text
ENTRY(sha3_ce_transform) ENTRY(sha3_ce_transform)
/* load state */ frame_push 4
add x8, x0, #32
ld1 { v0.1d- v3.1d}, [x0] mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
0: /* load state */
add x8, x19, #32
ld1 { v0.1d- v3.1d}, [x19]
ld1 { v4.1d- v7.1d}, [x8], #32 ld1 { v4.1d- v7.1d}, [x8], #32
ld1 { v8.1d-v11.1d}, [x8], #32 ld1 { v8.1d-v11.1d}, [x8], #32
ld1 {v12.1d-v15.1d}, [x8], #32 ld1 {v12.1d-v15.1d}, [x8], #32
...@@ -51,13 +58,13 @@ ENTRY(sha3_ce_transform) ...@@ -51,13 +58,13 @@ ENTRY(sha3_ce_transform)
ld1 {v20.1d-v23.1d}, [x8], #32 ld1 {v20.1d-v23.1d}, [x8], #32
ld1 {v24.1d}, [x8] ld1 {v24.1d}, [x8]
0: sub w2, w2, #1 1: sub w21, w21, #1
mov w8, #24 mov w8, #24
adr_l x9, .Lsha3_rcon adr_l x9, .Lsha3_rcon
/* load input */ /* load input */
ld1 {v25.8b-v28.8b}, [x1], #32 ld1 {v25.8b-v28.8b}, [x20], #32
ld1 {v29.8b-v31.8b}, [x1], #24 ld1 {v29.8b-v31.8b}, [x20], #24
eor v0.8b, v0.8b, v25.8b eor v0.8b, v0.8b, v25.8b
eor v1.8b, v1.8b, v26.8b eor v1.8b, v1.8b, v26.8b
eor v2.8b, v2.8b, v27.8b eor v2.8b, v2.8b, v27.8b
...@@ -66,10 +73,10 @@ ENTRY(sha3_ce_transform) ...@@ -66,10 +73,10 @@ ENTRY(sha3_ce_transform)
eor v5.8b, v5.8b, v30.8b eor v5.8b, v5.8b, v30.8b
eor v6.8b, v6.8b, v31.8b eor v6.8b, v6.8b, v31.8b
tbnz x3, #6, 2f // SHA3-512 tbnz x22, #6, 3f // SHA3-512
ld1 {v25.8b-v28.8b}, [x1], #32 ld1 {v25.8b-v28.8b}, [x20], #32
ld1 {v29.8b-v30.8b}, [x1], #16 ld1 {v29.8b-v30.8b}, [x20], #16
eor v7.8b, v7.8b, v25.8b eor v7.8b, v7.8b, v25.8b
eor v8.8b, v8.8b, v26.8b eor v8.8b, v8.8b, v26.8b
eor v9.8b, v9.8b, v27.8b eor v9.8b, v9.8b, v27.8b
...@@ -77,34 +84,34 @@ ENTRY(sha3_ce_transform) ...@@ -77,34 +84,34 @@ ENTRY(sha3_ce_transform)
eor v11.8b, v11.8b, v29.8b eor v11.8b, v11.8b, v29.8b
eor v12.8b, v12.8b, v30.8b eor v12.8b, v12.8b, v30.8b
tbnz x3, #4, 1f // SHA3-384 or SHA3-224 tbnz x22, #4, 2f // SHA3-384 or SHA3-224
// SHA3-256 // SHA3-256
ld1 {v25.8b-v28.8b}, [x1], #32 ld1 {v25.8b-v28.8b}, [x20], #32
eor v13.8b, v13.8b, v25.8b eor v13.8b, v13.8b, v25.8b
eor v14.8b, v14.8b, v26.8b eor v14.8b, v14.8b, v26.8b
eor v15.8b, v15.8b, v27.8b eor v15.8b, v15.8b, v27.8b
eor v16.8b, v16.8b, v28.8b eor v16.8b, v16.8b, v28.8b
b 3f b 4f
1: tbz x3, #2, 3f // bit 2 cleared? SHA-384 2: tbz x22, #2, 4f // bit 2 cleared? SHA-384
// SHA3-224 // SHA3-224
ld1 {v25.8b-v28.8b}, [x1], #32 ld1 {v25.8b-v28.8b}, [x20], #32
ld1 {v29.8b}, [x1], #8 ld1 {v29.8b}, [x20], #8
eor v13.8b, v13.8b, v25.8b eor v13.8b, v13.8b, v25.8b
eor v14.8b, v14.8b, v26.8b eor v14.8b, v14.8b, v26.8b
eor v15.8b, v15.8b, v27.8b eor v15.8b, v15.8b, v27.8b
eor v16.8b, v16.8b, v28.8b eor v16.8b, v16.8b, v28.8b
eor v17.8b, v17.8b, v29.8b eor v17.8b, v17.8b, v29.8b
b 3f b 4f
// SHA3-512 // SHA3-512
2: ld1 {v25.8b-v26.8b}, [x1], #16 3: ld1 {v25.8b-v26.8b}, [x20], #16
eor v7.8b, v7.8b, v25.8b eor v7.8b, v7.8b, v25.8b
eor v8.8b, v8.8b, v26.8b eor v8.8b, v8.8b, v26.8b
3: sub w8, w8, #1 4: sub w8, w8, #1
eor3 v29.16b, v4.16b, v9.16b, v14.16b eor3 v29.16b, v4.16b, v9.16b, v14.16b
eor3 v26.16b, v1.16b, v6.16b, v11.16b eor3 v26.16b, v1.16b, v6.16b, v11.16b
...@@ -183,17 +190,33 @@ ENTRY(sha3_ce_transform) ...@@ -183,17 +190,33 @@ ENTRY(sha3_ce_transform)
eor v0.16b, v0.16b, v31.16b eor v0.16b, v0.16b, v31.16b
cbnz w8, 3b cbnz w8, 4b
cbnz w2, 0b cbz w21, 5f
if_will_cond_yield_neon
add x8, x19, #32
st1 { v0.1d- v3.1d}, [x19]
st1 { v4.1d- v7.1d}, [x8], #32
st1 { v8.1d-v11.1d}, [x8], #32
st1 {v12.1d-v15.1d}, [x8], #32
st1 {v16.1d-v19.1d}, [x8], #32
st1 {v20.1d-v23.1d}, [x8], #32
st1 {v24.1d}, [x8]
do_cond_yield_neon
b 0b
endif_yield_neon
b 1b
/* save state */ /* save state */
st1 { v0.1d- v3.1d}, [x0], #32 5: st1 { v0.1d- v3.1d}, [x19], #32
st1 { v4.1d- v7.1d}, [x0], #32 st1 { v4.1d- v7.1d}, [x19], #32
st1 { v8.1d-v11.1d}, [x0], #32 st1 { v8.1d-v11.1d}, [x19], #32
st1 {v12.1d-v15.1d}, [x0], #32 st1 {v12.1d-v15.1d}, [x19], #32
st1 {v16.1d-v19.1d}, [x0], #32 st1 {v16.1d-v19.1d}, [x19], #32
st1 {v20.1d-v23.1d}, [x0], #32 st1 {v20.1d-v23.1d}, [x19], #32
st1 {v24.1d}, [x0] st1 {v24.1d}, [x19]
frame_pop
ret ret
ENDPROC(sha3_ce_transform) ENDPROC(sha3_ce_transform)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment