Commit 4b6d196c authored by Eric Biggers, committed by Herbert Xu

crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian

The change to encrypt a fifth ChaCha block using scalar instructions
caused the chacha20-neon, xchacha20-neon, and xchacha12-neon self-tests
to start failing on big endian arm64 kernels.  The bug is that the
keystream block produced in 32-bit scalar registers is directly XOR'd
with the data words, which are loaded and stored in native endianness.
Thus in big endian mode the data bytes end up XOR'd with the wrong
bytes.  Fix it by byte-swapping the keystream words in big endian mode.

Fixes: 2fe55987 ("crypto: arm64/chacha - use combined SIMD/ALU routine for more speed")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent c6431650
...@@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon) ...@@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon)
add v3.4s, v3.4s, v19.4s add v3.4s, v3.4s, v19.4s
add a2, a2, w8 add a2, a2, w8
add a3, a3, w9 add a3, a3, w9
CPU_BE( rev a0, a0 )
CPU_BE( rev a1, a1 )
CPU_BE( rev a2, a2 )
CPU_BE( rev a3, a3 )
ld4r {v24.4s-v27.4s}, [x0], #16 ld4r {v24.4s-v27.4s}, [x0], #16
ld4r {v28.4s-v31.4s}, [x0] ld4r {v28.4s-v31.4s}, [x0]
...@@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon) ...@@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon)
add v7.4s, v7.4s, v23.4s add v7.4s, v7.4s, v23.4s
add a6, a6, w8 add a6, a6, w8
add a7, a7, w9 add a7, a7, w9
CPU_BE( rev a4, a4 )
CPU_BE( rev a5, a5 )
CPU_BE( rev a6, a6 )
CPU_BE( rev a7, a7 )
// x8[0-3] += s2[0] // x8[0-3] += s2[0]
// x9[0-3] += s2[1] // x9[0-3] += s2[1]
...@@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon) ...@@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon)
add v11.4s, v11.4s, v27.4s add v11.4s, v11.4s, v27.4s
add a10, a10, w8 add a10, a10, w8
add a11, a11, w9 add a11, a11, w9
CPU_BE( rev a8, a8 )
CPU_BE( rev a9, a9 )
CPU_BE( rev a10, a10 )
CPU_BE( rev a11, a11 )
// x12[0-3] += s3[0] // x12[0-3] += s3[0]
// x13[0-3] += s3[1] // x13[0-3] += s3[1]
...@@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon) ...@@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon)
add v15.4s, v15.4s, v31.4s add v15.4s, v15.4s, v31.4s
add a14, a14, w8 add a14, a14, w8
add a15, a15, w9 add a15, a15, w9
CPU_BE( rev a12, a12 )
CPU_BE( rev a13, a13 )
CPU_BE( rev a14, a14 )
CPU_BE( rev a15, a15 )
// interleave 32-bit words in state n, n+1 // interleave 32-bit words in state n, n+1
ldp w6, w7, [x2], #64 ldp w6, w7, [x2], #64
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment