Commit c4fc6328 authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64/chacha - simplify tail block handling

Based on lessons learnt from optimizing the 32-bit version of this driver,
we can simplify the arm64 version considerably by reordering the final two
stores when the byte count is not a multiple of 64. This removes the need
to use permutation instructions to calculate the elements that are
clobbered by the final overlapping store, given that the store of the
penultimate block now follows it, and that store already carries the
correct values for those elements.
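
The idea can be shown with a small C sketch (illustrative only, not part
of the patch; the helper name and layout are made up). The final,
overlapping block is written first, ending exactly at the end of the
output, and the penultimate full block is written afterwards, so any bytes
the overlapping store clobbered are simply rewritten with correct data:

	/* sketch: XOR keystream 'ks' into 'dst'; assumes len > 64 */
	#include <stddef.h>
	#include <stdint.h>

	static void xor_tail_sketch(uint8_t *dst, const uint8_t *src,
				    const uint8_t *ks, size_t len)
	{
		size_t last = len - 64;			/* final, overlapping block */
		size_t penult = (len / 64 - 1) * 64;	/* penultimate full block */
		size_t i;

		/* overlapping store first: full 64 bytes ending at dst + len */
		for (i = 0; i < 64; i++)
			dst[last + i] = src[last + i] ^ ks[last + i];

		/* penultimate block second: rewrites the overlapped bytes */
		for (i = 0; i < 64; i++)
			dst[penult + i] = src[penult + i] ^ ks[penult + i];
	}

Roughly speaking, indexing the keystream at 'last + i' for the final block
is what the tbl/.Lpermute lookup in the NEON code still does; what goes
away is the extra shuffling needed to reconstruct the penultimate block's
clobbered elements.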

While at it, simplify the overlapping loads as well, by calculating the
address of the final overlapping load upfront, and switching to this
address for every load that would otherwise extend past the end of the
source buffer.
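
Again as a rough C sketch (not the patch itself; names are illustrative):
the address of the final, overlapping 64-byte load is computed once, and
any load that would run past the end of the source is redirected to that
address, possibly more than once:

	/* sketch: gather four 64-byte blocks from 'src'; assumes len >= 64 */
	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	static void load_blocks_sketch(uint8_t blocks[4][64],
				       const uint8_t *src, size_t len)
	{
		const uint8_t *last = src + len - 64;	/* final overlapping load */
		const uint8_t *p = src;
		size_t i;

		for (i = 0; i < 4; i++) {
			/* if fewer than 64 bytes remain at p, reread the last
			   64 bytes of the buffer instead of overrunning it */
			if ((size_t)(p - src) + 64 > len)
				p = last;
			memcpy(blocks[i], p, 64);
			p += 64;
		}
	}

This mirrors the csel pattern in the assembly below: x3 holds the
precomputed end-of-buffer load address, and each subs/csel pair either
keeps the advancing input pointer or swaps in x3 once the remaining
length goes negative.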

There is no impact on performance, but the resulting code is substantially
smaller and easier to follow.

Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 9c0cef23
@@ -195,7 +195,6 @@ SYM_FUNC_START(chacha_4block_xor_neon)
 	adr_l	x10, .Lpermute
 	and	x5, x4, #63
 	add	x10, x10, x5
-	add	x11, x10, #64
 	//
 	// This function encrypts four consecutive ChaCha blocks by loading
@@ -645,11 +644,11 @@ CPU_BE( rev a15, a15 )
 	zip2	v31.4s, v14.4s, v15.4s
 	eor	a15, a15, w9
-	mov	x3, #64
+	add	x3, x2, x4
+	sub	x3, x3, #128		// start of last block
 	subs	x5, x4, #128
-	add	x6, x5, x2
-	csel	x3, x3, xzr, ge
-	csel	x2, x2, x6, ge
+	csel	x2, x2, x3, ge
 	// interleave 64-bit words in state n, n+2
 	zip1	v0.2d, v16.2d, v18.2d
@@ -658,13 +657,10 @@ CPU_BE( rev a15, a15 )
 	zip1	v8.2d, v17.2d, v19.2d
 	zip2	v12.2d, v17.2d, v19.2d
 	stp	a2, a3, [x1, #-56]
-	ld1	{v16.16b-v19.16b}, [x2], x3
 	subs	x6, x4, #192
-	ccmp	x3, xzr, #4, lt
-	add	x7, x6, x2
-	csel	x3, x3, xzr, eq
-	csel	x2, x2, x7, eq
+	ld1	{v16.16b-v19.16b}, [x2], #64
+	csel	x2, x2, x3, ge
 	zip1	v1.2d, v20.2d, v22.2d
 	zip2	v5.2d, v20.2d, v22.2d
@@ -672,13 +668,10 @@ CPU_BE( rev a15, a15 )
 	zip1	v9.2d, v21.2d, v23.2d
 	zip2	v13.2d, v21.2d, v23.2d
 	stp	a6, a7, [x1, #-40]
-	ld1	{v20.16b-v23.16b}, [x2], x3
 	subs	x7, x4, #256
-	ccmp	x3, xzr, #4, lt
-	add	x8, x7, x2
-	csel	x3, x3, xzr, eq
-	csel	x2, x2, x8, eq
+	ld1	{v20.16b-v23.16b}, [x2], #64
+	csel	x2, x2, x3, ge
 	zip1	v2.2d, v24.2d, v26.2d
 	zip2	v6.2d, v24.2d, v26.2d
@@ -686,12 +679,10 @@ CPU_BE( rev a15, a15 )
 	zip1	v10.2d, v25.2d, v27.2d
 	zip2	v14.2d, v25.2d, v27.2d
 	stp	a10, a11, [x1, #-24]
-	ld1	{v24.16b-v27.16b}, [x2], x3
 	subs	x8, x4, #320
-	ccmp	x3, xzr, #4, lt
-	add	x9, x8, x2
-	csel	x2, x2, x9, eq
+	ld1	{v24.16b-v27.16b}, [x2], #64
+	csel	x2, x2, x3, ge
 	zip1	v3.2d, v28.2d, v30.2d
 	zip2	v7.2d, v28.2d, v30.2d
@@ -699,151 +690,105 @@ CPU_BE( rev a15, a15 )
 	zip1	v11.2d, v29.2d, v31.2d
 	zip2	v15.2d, v29.2d, v31.2d
 	stp	a14, a15, [x1, #-8]
+	tbnz	x5, #63, .Lt128
 	ld1	{v28.16b-v31.16b}, [x2]
 	// xor with corresponding input, write to output
-	tbnz	x5, #63, 0f
 	eor	v16.16b, v16.16b, v0.16b
 	eor	v17.16b, v17.16b, v1.16b
 	eor	v18.16b, v18.16b, v2.16b
 	eor	v19.16b, v19.16b, v3.16b
-	st1	{v16.16b-v19.16b}, [x1], #64
-	cbz	x5, .Lout
-	tbnz	x6, #63, 1f
+	tbnz	x6, #63, .Lt192
 	eor	v20.16b, v20.16b, v4.16b
 	eor	v21.16b, v21.16b, v5.16b
 	eor	v22.16b, v22.16b, v6.16b
 	eor	v23.16b, v23.16b, v7.16b
-	st1	{v20.16b-v23.16b}, [x1], #64
-	cbz	x6, .Lout
-	tbnz	x7, #63, 2f
+	st1	{v16.16b-v19.16b}, [x1], #64
+	tbnz	x7, #63, .Lt256
 	eor	v24.16b, v24.16b, v8.16b
 	eor	v25.16b, v25.16b, v9.16b
 	eor	v26.16b, v26.16b, v10.16b
 	eor	v27.16b, v27.16b, v11.16b
-	st1	{v24.16b-v27.16b}, [x1], #64
-	cbz	x7, .Lout
-	tbnz	x8, #63, 3f
+	st1	{v20.16b-v23.16b}, [x1], #64
+	tbnz	x8, #63, .Lt320
 	eor	v28.16b, v28.16b, v12.16b
 	eor	v29.16b, v29.16b, v13.16b
 	eor	v30.16b, v30.16b, v14.16b
 	eor	v31.16b, v31.16b, v15.16b
+	st1	{v24.16b-v27.16b}, [x1], #64
 	st1	{v28.16b-v31.16b}, [x1]
 .Lout:	frame_pop
 	ret
-	// fewer than 128 bytes of in/output
-0:	ld1	{v8.16b}, [x10]
-	ld1	{v9.16b}, [x11]
-	movi	v10.16b, #16
-	sub	x2, x1, #64
-	add	x1, x1, x5
-	ld1	{v16.16b-v19.16b}, [x2]
-	tbl	v4.16b, {v0.16b-v3.16b}, v8.16b
-	tbx	v20.16b, {v16.16b-v19.16b}, v9.16b
-	add	v8.16b, v8.16b, v10.16b
-	add	v9.16b, v9.16b, v10.16b
-	tbl	v5.16b, {v0.16b-v3.16b}, v8.16b
-	tbx	v21.16b, {v16.16b-v19.16b}, v9.16b
-	add	v8.16b, v8.16b, v10.16b
-	add	v9.16b, v9.16b, v10.16b
-	tbl	v6.16b, {v0.16b-v3.16b}, v8.16b
-	tbx	v22.16b, {v16.16b-v19.16b}, v9.16b
-	add	v8.16b, v8.16b, v10.16b
-	add	v9.16b, v9.16b, v10.16b
-	tbl	v7.16b, {v0.16b-v3.16b}, v8.16b
-	tbx	v23.16b, {v16.16b-v19.16b}, v9.16b
-	eor	v20.16b, v20.16b, v4.16b
-	eor	v21.16b, v21.16b, v5.16b
-	eor	v22.16b, v22.16b, v6.16b
-	eor	v23.16b, v23.16b, v7.16b
-	st1	{v20.16b-v23.16b}, [x1]
-	b	.Lout
 	// fewer than 192 bytes of in/output
-1:	ld1	{v8.16b}, [x10]
-	ld1	{v9.16b}, [x11]
-	movi	v10.16b, #16
-	add	x1, x1, x6
-	tbl	v0.16b, {v4.16b-v7.16b}, v8.16b
-	tbx	v20.16b, {v16.16b-v19.16b}, v9.16b
-	add	v8.16b, v8.16b, v10.16b
-	add	v9.16b, v9.16b, v10.16b
-	tbl	v1.16b, {v4.16b-v7.16b}, v8.16b
-	tbx	v21.16b, {v16.16b-v19.16b}, v9.16b
-	add	v8.16b, v8.16b, v10.16b
-	add	v9.16b, v9.16b, v10.16b
-	tbl	v2.16b, {v4.16b-v7.16b}, v8.16b
-	tbx	v22.16b, {v16.16b-v19.16b}, v9.16b
-	add	v8.16b, v8.16b, v10.16b
-	add	v9.16b, v9.16b, v10.16b
-	tbl	v3.16b, {v4.16b-v7.16b}, v8.16b
-	tbx	v23.16b, {v16.16b-v19.16b}, v9.16b
-	eor	v20.16b, v20.16b, v0.16b
-	eor	v21.16b, v21.16b, v1.16b
-	eor	v22.16b, v22.16b, v2.16b
-	eor	v23.16b, v23.16b, v3.16b
-	st1	{v20.16b-v23.16b}, [x1]
+.Lt192:	cbz	x5, 1f			// exactly 128 bytes?
+	ld1	{v28.16b-v31.16b}, [x10]
+	add	x5, x5, x1
+	tbl	v28.16b, {v4.16b-v7.16b}, v28.16b
+	tbl	v29.16b, {v4.16b-v7.16b}, v29.16b
+	tbl	v30.16b, {v4.16b-v7.16b}, v30.16b
+	tbl	v31.16b, {v4.16b-v7.16b}, v31.16b
+0:	eor	v20.16b, v20.16b, v28.16b
+	eor	v21.16b, v21.16b, v29.16b
+	eor	v22.16b, v22.16b, v30.16b
+	eor	v23.16b, v23.16b, v31.16b
+	st1	{v20.16b-v23.16b}, [x5]		// overlapping stores
+1:	st1	{v16.16b-v19.16b}, [x1]
 	b	.Lout
+	// fewer than 128 bytes of in/output
+.Lt128:	ld1	{v28.16b-v31.16b}, [x10]
+	add	x5, x5, x1
+	sub	x1, x1, #64
+	tbl	v28.16b, {v0.16b-v3.16b}, v28.16b
+	tbl	v29.16b, {v0.16b-v3.16b}, v29.16b
+	tbl	v30.16b, {v0.16b-v3.16b}, v30.16b
+	tbl	v31.16b, {v0.16b-v3.16b}, v31.16b
+	ld1	{v16.16b-v19.16b}, [x1]		// reload first output block
+	b	0b
 	// fewer than 256 bytes of in/output
-2:	ld1	{v4.16b}, [x10]
-	ld1	{v5.16b}, [x11]
-	movi	v6.16b, #16
-	add	x1, x1, x7
+.Lt256:	cbz	x6, 2f			// exactly 192 bytes?
+	ld1	{v4.16b-v7.16b}, [x10]
+	add	x6, x6, x1
 	tbl	v0.16b, {v8.16b-v11.16b}, v4.16b
-	tbx	v24.16b, {v20.16b-v23.16b}, v5.16b
-	add	v4.16b, v4.16b, v6.16b
-	add	v5.16b, v5.16b, v6.16b
-	tbl	v1.16b, {v8.16b-v11.16b}, v4.16b
-	tbx	v25.16b, {v20.16b-v23.16b}, v5.16b
-	add	v4.16b, v4.16b, v6.16b
-	add	v5.16b, v5.16b, v6.16b
-	tbl	v2.16b, {v8.16b-v11.16b}, v4.16b
-	tbx	v26.16b, {v20.16b-v23.16b}, v5.16b
-	add	v4.16b, v4.16b, v6.16b
-	add	v5.16b, v5.16b, v6.16b
-	tbl	v3.16b, {v8.16b-v11.16b}, v4.16b
-	tbx	v27.16b, {v20.16b-v23.16b}, v5.16b
-	eor	v24.16b, v24.16b, v0.16b
-	eor	v25.16b, v25.16b, v1.16b
-	eor	v26.16b, v26.16b, v2.16b
-	eor	v27.16b, v27.16b, v3.16b
-	st1	{v24.16b-v27.16b}, [x1]
+	tbl	v1.16b, {v8.16b-v11.16b}, v5.16b
+	tbl	v2.16b, {v8.16b-v11.16b}, v6.16b
+	tbl	v3.16b, {v8.16b-v11.16b}, v7.16b
+	eor	v28.16b, v28.16b, v0.16b
+	eor	v29.16b, v29.16b, v1.16b
+	eor	v30.16b, v30.16b, v2.16b
+	eor	v31.16b, v31.16b, v3.16b
+	st1	{v28.16b-v31.16b}, [x6]		// overlapping stores
+2:	st1	{v20.16b-v23.16b}, [x1]
 	b	.Lout
 	// fewer than 320 bytes of in/output
-3:	ld1	{v4.16b}, [x10]
-	ld1	{v5.16b}, [x11]
-	movi	v6.16b, #16
-	add	x1, x1, x8
+.Lt320:	cbz	x7, 3f			// exactly 256 bytes?
+	ld1	{v4.16b-v7.16b}, [x10]
+	add	x7, x7, x1
 	tbl	v0.16b, {v12.16b-v15.16b}, v4.16b
-	tbx	v28.16b, {v24.16b-v27.16b}, v5.16b
-	add	v4.16b, v4.16b, v6.16b
-	add	v5.16b, v5.16b, v6.16b
-	tbl	v1.16b, {v12.16b-v15.16b}, v4.16b
-	tbx	v29.16b, {v24.16b-v27.16b}, v5.16b
-	add	v4.16b, v4.16b, v6.16b
-	add	v5.16b, v5.16b, v6.16b
-	tbl	v2.16b, {v12.16b-v15.16b}, v4.16b
-	tbx	v30.16b, {v24.16b-v27.16b}, v5.16b
-	add	v4.16b, v4.16b, v6.16b
-	add	v5.16b, v5.16b, v6.16b
-	tbl	v3.16b, {v12.16b-v15.16b}, v4.16b
-	tbx	v31.16b, {v24.16b-v27.16b}, v5.16b
+	tbl	v1.16b, {v12.16b-v15.16b}, v5.16b
+	tbl	v2.16b, {v12.16b-v15.16b}, v6.16b
+	tbl	v3.16b, {v12.16b-v15.16b}, v7.16b
 	eor	v28.16b, v28.16b, v0.16b
 	eor	v29.16b, v29.16b, v1.16b
 	eor	v30.16b, v30.16b, v2.16b
 	eor	v31.16b, v31.16b, v3.16b
-	st1	{v28.16b-v31.16b}, [x1]
+	st1	{v28.16b-v31.16b}, [x7]		// overlapping stores
+3:	st1	{v24.16b-v27.16b}, [x1]
 	b	.Lout
 SYM_FUNC_END(chacha_4block_xor_neon)
@@ -851,7 +796,7 @@ SYM_FUNC_END(chacha_4block_xor_neon)
 	.align	L1_CACHE_SHIFT
 .Lpermute:
 	.set	.Li, 0
-	.rept	192
+	.rept	128
 	.byte	(.Li - 64)
 	.set	.Li, .Li + 1
 	.endr