Commit b7b73cd5 authored by Eric Biggers, committed by Herbert Xu

crypto: x86/salsa20 - remove x86 salsa20 implementations

The x86 assembly implementations of Salsa20 use the frame base pointer
register (%ebp or %rbp), which breaks frame pointer convention and
breaks stack traces when unwinding from an interrupt in the crypto code.
Recent (v4.10+) kernels will warn about this, e.g.

WARNING: kernel stack regs at 00000000a8291e69 in syzkaller047086:4677 has bad 'bp' value 000000001077994c
[...]

But after looking into it, I believe there's very little reason to still
retain the x86 Salsa20 code.  First, these are *not* vectorized
(SSE2/SSSE3/AVX2) implementations, which would be needed to get anywhere
close to the best Salsa20 performance on any remotely modern x86
processor; they're just regular x86 assembly.  Second, it's still
unclear whether anyone is actually using the kernel's Salsa20 at all,
especially now that ChaCha20 is supported as well, with much more
efficient SSSE3 and AVX2 implementations.  Finally, in benchmarks I did
on both Intel and AMD processors with both gcc 8.1.0 and gcc 4.9.4, the
x86_64 salsa20-asm is actually slightly *slower* than salsa20-generic
(~3% slower on Skylake, ~10% slower on Zen), while the i686 salsa20-asm
is only slightly faster than salsa20-generic (~15% faster on Skylake,
~20% faster on Zen).  The gcc version made little difference.
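
For reference, the whole operation these files hand-schedule is the
standard Salsa20 double round (a column round followed by a row round),
which salsa20-generic already implements in portable C.  A minimal C
sketch, illustrative only and not taken verbatim from either
implementation (ROTL32 and the function names are just placeholders):

  #include <stdint.h>

  #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

  /* One Salsa20 quarter round on four words of the 4x4 state. */
  static void quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
  {
          *b ^= ROTL32(*a + *d, 7);
          *c ^= ROTL32(*b + *a, 9);
          *d ^= ROTL32(*c + *b, 13);
          *a ^= ROTL32(*d + *c, 18);
  }

  /* One double round; the removed .S files unroll 20 rounds (10 of
   * these), spilling the state to the stack (i586) or keeping it in
   * registers (x86_64), but perform the same adds, rotates, and xors. */
  static void doubleround(uint32_t x[16])
  {
          /* column round */
          quarterround(&x[0],  &x[4],  &x[8],  &x[12]);
          quarterround(&x[5],  &x[9],  &x[13], &x[1]);
          quarterround(&x[10], &x[14], &x[2],  &x[6]);
          quarterround(&x[15], &x[3],  &x[7],  &x[11]);
          /* row round */
          quarterround(&x[0],  &x[1],  &x[2],  &x[3]);
          quarterround(&x[5],  &x[6],  &x[7],  &x[4]);
          quarterround(&x[10], &x[11], &x[8],  &x[9]);
          quarterround(&x[15], &x[12], &x[13], &x[14]);
  }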

So, the x86_64 salsa20-asm is pretty clearly useless.  That leaves just
the i686 salsa20-asm, which based on my tests provides a 15-20% speed
boost.  But that's without updating the code to not use %ebp.  And given
the maintenance cost, the small speed difference vs. salsa20-generic,
the fact that few people still use i686 kernels, the doubt that anyone
is even using the kernel's Salsa20 at all, and the fact that an SSE2
implementation would almost certainly be much faster on any remotely
modern x86 processor yet no one has cared enough to add one, I don't
think it's worthwhile to keep.

Thus, just remove both the x86_64 and i686 salsa20-asm implementations.
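
Note that the "salsa20" algorithm itself remains available:
CONFIG_CRYPTO_SALSA20 stays, and requests are now always served by
salsa20-generic.  As a purely illustrative sketch (a hypothetical test
module, not part of this patch), an in-kernel user would still allocate
the transform the same way:

  #include <crypto/skcipher.h>
  #include <linux/err.h>
  #include <linux/kernel.h>
  #include <linux/module.h>

  static int __init salsa20_check_init(void)	/* hypothetical module */
  {
          struct crypto_skcipher *tfm;

          tfm = crypto_alloc_skcipher("salsa20", 0, 0);
          if (IS_ERR(tfm))
                  return PTR_ERR(tfm);

          /* With salsa20-asm gone, this reports "salsa20-generic". */
          pr_info("salsa20 driver: %s\n",
                  crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm)));

          crypto_free_skcipher(tfm);
          return 0;
  }

  static void __exit salsa20_check_exit(void)
  {
  }

  module_init(salsa20_check_init);
  module_exit(salsa20_check_exit);
  MODULE_LICENSE("GPL");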

Reported-by: syzbot+ffa3a158337bbc01ff09@syzkaller.appspotmail.com
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 0b3a830b
arch/x86/crypto/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
@@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
@@ -71,7 +69,6 @@ endif
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
@@ -80,7 +77,6 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
arch/x86/crypto/salsa20-i586-asm_32.S deleted
# Derived from:
# salsa20_pm.s version 20051229
# D. J. Bernstein
# Public domain.
#include <linux/linkage.h>
.text
# enter salsa20_encrypt_bytes
ENTRY(salsa20_encrypt_bytes)
mov %esp,%eax
and $31,%eax
add $256,%eax
sub %eax,%esp
# eax_stack = eax
movl %eax,80(%esp)
# ebx_stack = ebx
movl %ebx,84(%esp)
# esi_stack = esi
movl %esi,88(%esp)
# edi_stack = edi
movl %edi,92(%esp)
# ebp_stack = ebp
movl %ebp,96(%esp)
# x = arg1
movl 4(%esp,%eax),%edx
# m = arg2
movl 8(%esp,%eax),%esi
# out = arg3
movl 12(%esp,%eax),%edi
# bytes = arg4
movl 16(%esp,%eax),%ebx
# bytes -= 0
sub $0,%ebx
# goto done if unsigned<=
jbe ._done
._start:
# in0 = *(uint32 *) (x + 0)
movl 0(%edx),%eax
# in1 = *(uint32 *) (x + 4)
movl 4(%edx),%ecx
# in2 = *(uint32 *) (x + 8)
movl 8(%edx),%ebp
# j0 = in0
movl %eax,164(%esp)
# in3 = *(uint32 *) (x + 12)
movl 12(%edx),%eax
# j1 = in1
movl %ecx,168(%esp)
# in4 = *(uint32 *) (x + 16)
movl 16(%edx),%ecx
# j2 = in2
movl %ebp,172(%esp)
# in5 = *(uint32 *) (x + 20)
movl 20(%edx),%ebp
# j3 = in3
movl %eax,176(%esp)
# in6 = *(uint32 *) (x + 24)
movl 24(%edx),%eax
# j4 = in4
movl %ecx,180(%esp)
# in7 = *(uint32 *) (x + 28)
movl 28(%edx),%ecx
# j5 = in5
movl %ebp,184(%esp)
# in8 = *(uint32 *) (x + 32)
movl 32(%edx),%ebp
# j6 = in6
movl %eax,188(%esp)
# in9 = *(uint32 *) (x + 36)
movl 36(%edx),%eax
# j7 = in7
movl %ecx,192(%esp)
# in10 = *(uint32 *) (x + 40)
movl 40(%edx),%ecx
# j8 = in8
movl %ebp,196(%esp)
# in11 = *(uint32 *) (x + 44)
movl 44(%edx),%ebp
# j9 = in9
movl %eax,200(%esp)
# in12 = *(uint32 *) (x + 48)
movl 48(%edx),%eax
# j10 = in10
movl %ecx,204(%esp)
# in13 = *(uint32 *) (x + 52)
movl 52(%edx),%ecx
# j11 = in11
movl %ebp,208(%esp)
# in14 = *(uint32 *) (x + 56)
movl 56(%edx),%ebp
# j12 = in12
movl %eax,212(%esp)
# in15 = *(uint32 *) (x + 60)
movl 60(%edx),%eax
# j13 = in13
movl %ecx,216(%esp)
# j14 = in14
movl %ebp,220(%esp)
# j15 = in15
movl %eax,224(%esp)
# x_backup = x
movl %edx,64(%esp)
._bytesatleast1:
# bytes - 64
cmp $64,%ebx
# goto nocopy if unsigned>=
jae ._nocopy
# ctarget = out
movl %edi,228(%esp)
# out = &tmp
leal 0(%esp),%edi
# i = bytes
mov %ebx,%ecx
# while (i) { *out++ = *m++; --i }
rep movsb
# out = &tmp
leal 0(%esp),%edi
# m = &tmp
leal 0(%esp),%esi
._nocopy:
# out_backup = out
movl %edi,72(%esp)
# m_backup = m
movl %esi,68(%esp)
# bytes_backup = bytes
movl %ebx,76(%esp)
# in0 = j0
movl 164(%esp),%eax
# in1 = j1
movl 168(%esp),%ecx
# in2 = j2
movl 172(%esp),%edx
# in3 = j3
movl 176(%esp),%ebx
# x0 = in0
movl %eax,100(%esp)
# x1 = in1
movl %ecx,104(%esp)
# x2 = in2
movl %edx,108(%esp)
# x3 = in3
movl %ebx,112(%esp)
# in4 = j4
movl 180(%esp),%eax
# in5 = j5
movl 184(%esp),%ecx
# in6 = j6
movl 188(%esp),%edx
# in7 = j7
movl 192(%esp),%ebx
# x4 = in4
movl %eax,116(%esp)
# x5 = in5
movl %ecx,120(%esp)
# x6 = in6
movl %edx,124(%esp)
# x7 = in7
movl %ebx,128(%esp)
# in8 = j8
movl 196(%esp),%eax
# in9 = j9
movl 200(%esp),%ecx
# in10 = j10
movl 204(%esp),%edx
# in11 = j11
movl 208(%esp),%ebx
# x8 = in8
movl %eax,132(%esp)
# x9 = in9
movl %ecx,136(%esp)
# x10 = in10
movl %edx,140(%esp)
# x11 = in11
movl %ebx,144(%esp)
# in12 = j12
movl 212(%esp),%eax
# in13 = j13
movl 216(%esp),%ecx
# in14 = j14
movl 220(%esp),%edx
# in15 = j15
movl 224(%esp),%ebx
# x12 = in12
movl %eax,148(%esp)
# x13 = in13
movl %ecx,152(%esp)
# x14 = in14
movl %edx,156(%esp)
# x15 = in15
movl %ebx,160(%esp)
# i = 20
mov $20,%ebp
# p = x0
movl 100(%esp),%eax
# s = x5
movl 120(%esp),%ecx
# t = x10
movl 140(%esp),%edx
# w = x15
movl 160(%esp),%ebx
._mainloop:
# x0 = p
movl %eax,100(%esp)
# x10 = t
movl %edx,140(%esp)
# p += x12
addl 148(%esp),%eax
# x5 = s
movl %ecx,120(%esp)
# t += x6
addl 124(%esp),%edx
# x15 = w
movl %ebx,160(%esp)
# r = x1
movl 104(%esp),%esi
# r += s
add %ecx,%esi
# v = x11
movl 144(%esp),%edi
# v += w
add %ebx,%edi
# p <<<= 7
rol $7,%eax
# p ^= x4
xorl 116(%esp),%eax
# t <<<= 7
rol $7,%edx
# t ^= x14
xorl 156(%esp),%edx
# r <<<= 7
rol $7,%esi
# r ^= x9
xorl 136(%esp),%esi
# v <<<= 7
rol $7,%edi
# v ^= x3
xorl 112(%esp),%edi
# x4 = p
movl %eax,116(%esp)
# x14 = t
movl %edx,156(%esp)
# p += x0
addl 100(%esp),%eax
# x9 = r
movl %esi,136(%esp)
# t += x10
addl 140(%esp),%edx
# x3 = v
movl %edi,112(%esp)
# p <<<= 9
rol $9,%eax
# p ^= x8
xorl 132(%esp),%eax
# t <<<= 9
rol $9,%edx
# t ^= x2
xorl 108(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 9
rol $9,%ecx
# s ^= x13
xorl 152(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 9
rol $9,%ebx
# w ^= x7
xorl 128(%esp),%ebx
# x8 = p
movl %eax,132(%esp)
# x2 = t
movl %edx,108(%esp)
# p += x4
addl 116(%esp),%eax
# x13 = s
movl %ecx,152(%esp)
# t += x14
addl 156(%esp),%edx
# x7 = w
movl %ebx,128(%esp)
# p <<<= 13
rol $13,%eax
# p ^= x12
xorl 148(%esp),%eax
# t <<<= 13
rol $13,%edx
# t ^= x6
xorl 124(%esp),%edx
# r += s
add %ecx,%esi
# r <<<= 13
rol $13,%esi
# r ^= x1
xorl 104(%esp),%esi
# v += w
add %ebx,%edi
# v <<<= 13
rol $13,%edi
# v ^= x11
xorl 144(%esp),%edi
# x12 = p
movl %eax,148(%esp)
# x6 = t
movl %edx,124(%esp)
# p += x8
addl 132(%esp),%eax
# x1 = r
movl %esi,104(%esp)
# t += x2
addl 108(%esp),%edx
# x11 = v
movl %edi,144(%esp)
# p <<<= 18
rol $18,%eax
# p ^= x0
xorl 100(%esp),%eax
# t <<<= 18
rol $18,%edx
# t ^= x10
xorl 140(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 18
rol $18,%ecx
# s ^= x5
xorl 120(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 18
rol $18,%ebx
# w ^= x15
xorl 160(%esp),%ebx
# x0 = p
movl %eax,100(%esp)
# x10 = t
movl %edx,140(%esp)
# p += x3
addl 112(%esp),%eax
# p <<<= 7
rol $7,%eax
# x5 = s
movl %ecx,120(%esp)
# t += x9
addl 136(%esp),%edx
# x15 = w
movl %ebx,160(%esp)
# r = x4
movl 116(%esp),%esi
# r += s
add %ecx,%esi
# v = x14
movl 156(%esp),%edi
# v += w
add %ebx,%edi
# p ^= x1
xorl 104(%esp),%eax
# t <<<= 7
rol $7,%edx
# t ^= x11
xorl 144(%esp),%edx
# r <<<= 7
rol $7,%esi
# r ^= x6
xorl 124(%esp),%esi
# v <<<= 7
rol $7,%edi
# v ^= x12
xorl 148(%esp),%edi
# x1 = p
movl %eax,104(%esp)
# x11 = t
movl %edx,144(%esp)
# p += x0
addl 100(%esp),%eax
# x6 = r
movl %esi,124(%esp)
# t += x10
addl 140(%esp),%edx
# x12 = v
movl %edi,148(%esp)
# p <<<= 9
rol $9,%eax
# p ^= x2
xorl 108(%esp),%eax
# t <<<= 9
rol $9,%edx
# t ^= x8
xorl 132(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 9
rol $9,%ecx
# s ^= x7
xorl 128(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 9
rol $9,%ebx
# w ^= x13
xorl 152(%esp),%ebx
# x2 = p
movl %eax,108(%esp)
# x8 = t
movl %edx,132(%esp)
# p += x1
addl 104(%esp),%eax
# x7 = s
movl %ecx,128(%esp)
# t += x11
addl 144(%esp),%edx
# x13 = w
movl %ebx,152(%esp)
# p <<<= 13
rol $13,%eax
# p ^= x3
xorl 112(%esp),%eax
# t <<<= 13
rol $13,%edx
# t ^= x9
xorl 136(%esp),%edx
# r += s
add %ecx,%esi
# r <<<= 13
rol $13,%esi
# r ^= x4
xorl 116(%esp),%esi
# v += w
add %ebx,%edi
# v <<<= 13
rol $13,%edi
# v ^= x14
xorl 156(%esp),%edi
# x3 = p
movl %eax,112(%esp)
# x9 = t
movl %edx,136(%esp)
# p += x2
addl 108(%esp),%eax
# x4 = r
movl %esi,116(%esp)
# t += x8
addl 132(%esp),%edx
# x14 = v
movl %edi,156(%esp)
# p <<<= 18
rol $18,%eax
# p ^= x0
xorl 100(%esp),%eax
# t <<<= 18
rol $18,%edx
# t ^= x10
xorl 140(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 18
rol $18,%ecx
# s ^= x5
xorl 120(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 18
rol $18,%ebx
# w ^= x15
xorl 160(%esp),%ebx
# x0 = p
movl %eax,100(%esp)
# x10 = t
movl %edx,140(%esp)
# p += x12
addl 148(%esp),%eax
# x5 = s
movl %ecx,120(%esp)
# t += x6
addl 124(%esp),%edx
# x15 = w
movl %ebx,160(%esp)
# r = x1
movl 104(%esp),%esi
# r += s
add %ecx,%esi
# v = x11
movl 144(%esp),%edi
# v += w
add %ebx,%edi
# p <<<= 7
rol $7,%eax
# p ^= x4
xorl 116(%esp),%eax
# t <<<= 7
rol $7,%edx
# t ^= x14
xorl 156(%esp),%edx
# r <<<= 7
rol $7,%esi
# r ^= x9
xorl 136(%esp),%esi
# v <<<= 7
rol $7,%edi
# v ^= x3
xorl 112(%esp),%edi
# x4 = p
movl %eax,116(%esp)
# x14 = t
movl %edx,156(%esp)
# p += x0
addl 100(%esp),%eax
# x9 = r
movl %esi,136(%esp)
# t += x10
addl 140(%esp),%edx
# x3 = v
movl %edi,112(%esp)
# p <<<= 9
rol $9,%eax
# p ^= x8
xorl 132(%esp),%eax
# t <<<= 9
rol $9,%edx
# t ^= x2
xorl 108(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 9
rol $9,%ecx
# s ^= x13
xorl 152(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 9
rol $9,%ebx
# w ^= x7
xorl 128(%esp),%ebx
# x8 = p
movl %eax,132(%esp)
# x2 = t
movl %edx,108(%esp)
# p += x4
addl 116(%esp),%eax
# x13 = s
movl %ecx,152(%esp)
# t += x14
addl 156(%esp),%edx
# x7 = w
movl %ebx,128(%esp)
# p <<<= 13
rol $13,%eax
# p ^= x12
xorl 148(%esp),%eax
# t <<<= 13
rol $13,%edx
# t ^= x6
xorl 124(%esp),%edx
# r += s
add %ecx,%esi
# r <<<= 13
rol $13,%esi
# r ^= x1
xorl 104(%esp),%esi
# v += w
add %ebx,%edi
# v <<<= 13
rol $13,%edi
# v ^= x11
xorl 144(%esp),%edi
# x12 = p
movl %eax,148(%esp)
# x6 = t
movl %edx,124(%esp)
# p += x8
addl 132(%esp),%eax
# x1 = r
movl %esi,104(%esp)
# t += x2
addl 108(%esp),%edx
# x11 = v
movl %edi,144(%esp)
# p <<<= 18
rol $18,%eax
# p ^= x0
xorl 100(%esp),%eax
# t <<<= 18
rol $18,%edx
# t ^= x10
xorl 140(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 18
rol $18,%ecx
# s ^= x5
xorl 120(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 18
rol $18,%ebx
# w ^= x15
xorl 160(%esp),%ebx
# x0 = p
movl %eax,100(%esp)
# x10 = t
movl %edx,140(%esp)
# p += x3
addl 112(%esp),%eax
# p <<<= 7
rol $7,%eax
# x5 = s
movl %ecx,120(%esp)
# t += x9
addl 136(%esp),%edx
# x15 = w
movl %ebx,160(%esp)
# r = x4
movl 116(%esp),%esi
# r += s
add %ecx,%esi
# v = x14
movl 156(%esp),%edi
# v += w
add %ebx,%edi
# p ^= x1
xorl 104(%esp),%eax
# t <<<= 7
rol $7,%edx
# t ^= x11
xorl 144(%esp),%edx
# r <<<= 7
rol $7,%esi
# r ^= x6
xorl 124(%esp),%esi
# v <<<= 7
rol $7,%edi
# v ^= x12
xorl 148(%esp),%edi
# x1 = p
movl %eax,104(%esp)
# x11 = t
movl %edx,144(%esp)
# p += x0
addl 100(%esp),%eax
# x6 = r
movl %esi,124(%esp)
# t += x10
addl 140(%esp),%edx
# x12 = v
movl %edi,148(%esp)
# p <<<= 9
rol $9,%eax
# p ^= x2
xorl 108(%esp),%eax
# t <<<= 9
rol $9,%edx
# t ^= x8
xorl 132(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 9
rol $9,%ecx
# s ^= x7
xorl 128(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 9
rol $9,%ebx
# w ^= x13
xorl 152(%esp),%ebx
# x2 = p
movl %eax,108(%esp)
# x8 = t
movl %edx,132(%esp)
# p += x1
addl 104(%esp),%eax
# x7 = s
movl %ecx,128(%esp)
# t += x11
addl 144(%esp),%edx
# x13 = w
movl %ebx,152(%esp)
# p <<<= 13
rol $13,%eax
# p ^= x3
xorl 112(%esp),%eax
# t <<<= 13
rol $13,%edx
# t ^= x9
xorl 136(%esp),%edx
# r += s
add %ecx,%esi
# r <<<= 13
rol $13,%esi
# r ^= x4
xorl 116(%esp),%esi
# v += w
add %ebx,%edi
# v <<<= 13
rol $13,%edi
# v ^= x14
xorl 156(%esp),%edi
# x3 = p
movl %eax,112(%esp)
# x9 = t
movl %edx,136(%esp)
# p += x2
addl 108(%esp),%eax
# x4 = r
movl %esi,116(%esp)
# t += x8
addl 132(%esp),%edx
# x14 = v
movl %edi,156(%esp)
# p <<<= 18
rol $18,%eax
# p ^= x0
xorl 100(%esp),%eax
# t <<<= 18
rol $18,%edx
# t ^= x10
xorl 140(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 18
rol $18,%ecx
# s ^= x5
xorl 120(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 18
rol $18,%ebx
# w ^= x15
xorl 160(%esp),%ebx
# i -= 4
sub $4,%ebp
# goto mainloop if unsigned >
ja ._mainloop
# x0 = p
movl %eax,100(%esp)
# x5 = s
movl %ecx,120(%esp)
# x10 = t
movl %edx,140(%esp)
# x15 = w
movl %ebx,160(%esp)
# out = out_backup
movl 72(%esp),%edi
# m = m_backup
movl 68(%esp),%esi
# in0 = x0
movl 100(%esp),%eax
# in1 = x1
movl 104(%esp),%ecx
# in0 += j0
addl 164(%esp),%eax
# in1 += j1
addl 168(%esp),%ecx
# in0 ^= *(uint32 *) (m + 0)
xorl 0(%esi),%eax
# in1 ^= *(uint32 *) (m + 4)
xorl 4(%esi),%ecx
# *(uint32 *) (out + 0) = in0
movl %eax,0(%edi)
# *(uint32 *) (out + 4) = in1
movl %ecx,4(%edi)
# in2 = x2
movl 108(%esp),%eax
# in3 = x3
movl 112(%esp),%ecx
# in2 += j2
addl 172(%esp),%eax
# in3 += j3
addl 176(%esp),%ecx
# in2 ^= *(uint32 *) (m + 8)
xorl 8(%esi),%eax
# in3 ^= *(uint32 *) (m + 12)
xorl 12(%esi),%ecx
# *(uint32 *) (out + 8) = in2
movl %eax,8(%edi)
# *(uint32 *) (out + 12) = in3
movl %ecx,12(%edi)
# in4 = x4
movl 116(%esp),%eax
# in5 = x5
movl 120(%esp),%ecx
# in4 += j4
addl 180(%esp),%eax
# in5 += j5
addl 184(%esp),%ecx
# in4 ^= *(uint32 *) (m + 16)
xorl 16(%esi),%eax
# in5 ^= *(uint32 *) (m + 20)
xorl 20(%esi),%ecx
# *(uint32 *) (out + 16) = in4
movl %eax,16(%edi)
# *(uint32 *) (out + 20) = in5
movl %ecx,20(%edi)
# in6 = x6
movl 124(%esp),%eax
# in7 = x7
movl 128(%esp),%ecx
# in6 += j6
addl 188(%esp),%eax
# in7 += j7
addl 192(%esp),%ecx
# in6 ^= *(uint32 *) (m + 24)
xorl 24(%esi),%eax
# in7 ^= *(uint32 *) (m + 28)
xorl 28(%esi),%ecx
# *(uint32 *) (out + 24) = in6
movl %eax,24(%edi)
# *(uint32 *) (out + 28) = in7
movl %ecx,28(%edi)
# in8 = x8
movl 132(%esp),%eax
# in9 = x9
movl 136(%esp),%ecx
# in8 += j8
addl 196(%esp),%eax
# in9 += j9
addl 200(%esp),%ecx
# in8 ^= *(uint32 *) (m + 32)
xorl 32(%esi),%eax
# in9 ^= *(uint32 *) (m + 36)
xorl 36(%esi),%ecx
# *(uint32 *) (out + 32) = in8
movl %eax,32(%edi)
# *(uint32 *) (out + 36) = in9
movl %ecx,36(%edi)
# in10 = x10
movl 140(%esp),%eax
# in11 = x11
movl 144(%esp),%ecx
# in10 += j10
addl 204(%esp),%eax
# in11 += j11
addl 208(%esp),%ecx
# in10 ^= *(uint32 *) (m + 40)
xorl 40(%esi),%eax
# in11 ^= *(uint32 *) (m + 44)
xorl 44(%esi),%ecx
# *(uint32 *) (out + 40) = in10
movl %eax,40(%edi)
# *(uint32 *) (out + 44) = in11
movl %ecx,44(%edi)
# in12 = x12
movl 148(%esp),%eax
# in13 = x13
movl 152(%esp),%ecx
# in12 += j12
addl 212(%esp),%eax
# in13 += j13
addl 216(%esp),%ecx
# in12 ^= *(uint32 *) (m + 48)
xorl 48(%esi),%eax
# in13 ^= *(uint32 *) (m + 52)
xorl 52(%esi),%ecx
# *(uint32 *) (out + 48) = in12
movl %eax,48(%edi)
# *(uint32 *) (out + 52) = in13
movl %ecx,52(%edi)
# in14 = x14
movl 156(%esp),%eax
# in15 = x15
movl 160(%esp),%ecx
# in14 += j14
addl 220(%esp),%eax
# in15 += j15
addl 224(%esp),%ecx
# in14 ^= *(uint32 *) (m + 56)
xorl 56(%esi),%eax
# in15 ^= *(uint32 *) (m + 60)
xorl 60(%esi),%ecx
# *(uint32 *) (out + 56) = in14
movl %eax,56(%edi)
# *(uint32 *) (out + 60) = in15
movl %ecx,60(%edi)
# bytes = bytes_backup
movl 76(%esp),%ebx
# in8 = j8
movl 196(%esp),%eax
# in9 = j9
movl 200(%esp),%ecx
# in8 += 1
add $1,%eax
# in9 += 0 + carry
adc $0,%ecx
# j8 = in8
movl %eax,196(%esp)
# j9 = in9
movl %ecx,200(%esp)
# bytes - 64
cmp $64,%ebx
# goto bytesatleast65 if unsigned>
ja ._bytesatleast65
# goto bytesatleast64 if unsigned>=
jae ._bytesatleast64
# m = out
mov %edi,%esi
# out = ctarget
movl 228(%esp),%edi
# i = bytes
mov %ebx,%ecx
# while (i) { *out++ = *m++; --i }
rep movsb
._bytesatleast64:
# x = x_backup
movl 64(%esp),%eax
# in8 = j8
movl 196(%esp),%ecx
# in9 = j9
movl 200(%esp),%edx
# *(uint32 *) (x + 32) = in8
movl %ecx,32(%eax)
# *(uint32 *) (x + 36) = in9
movl %edx,36(%eax)
._done:
# eax = eax_stack
movl 80(%esp),%eax
# ebx = ebx_stack
movl 84(%esp),%ebx
# esi = esi_stack
movl 88(%esp),%esi
# edi = edi_stack
movl 92(%esp),%edi
# ebp = ebp_stack
movl 96(%esp),%ebp
# leave
add %eax,%esp
ret
._bytesatleast65:
# bytes -= 64
sub $64,%ebx
# out += 64
add $64,%edi
# m += 64
add $64,%esi
# goto bytesatleast1
jmp ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)
arch/x86/crypto/salsa20-x86_64-asm_64.S deleted
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
# enter salsa20_encrypt_bytes
ENTRY(salsa20_encrypt_bytes)
mov %rsp,%r11
and $31,%r11
add $256,%r11
sub %r11,%rsp
# x = arg1
mov %rdi,%r8
# m = arg2
mov %rsi,%rsi
# out = arg3
mov %rdx,%rdi
# bytes = arg4
mov %rcx,%rdx
# unsigned>? bytes - 0
cmp $0,%rdx
# comment:fp stack unchanged by jump
# goto done if !unsigned>
jbe ._done
# comment:fp stack unchanged by fallthrough
# start:
._start:
# r11_stack = r11
movq %r11,0(%rsp)
# r12_stack = r12
movq %r12,8(%rsp)
# r13_stack = r13
movq %r13,16(%rsp)
# r14_stack = r14
movq %r14,24(%rsp)
# r15_stack = r15
movq %r15,32(%rsp)
# rbx_stack = rbx
movq %rbx,40(%rsp)
# rbp_stack = rbp
movq %rbp,48(%rsp)
# in0 = *(uint64 *) (x + 0)
movq 0(%r8),%rcx
# in2 = *(uint64 *) (x + 8)
movq 8(%r8),%r9
# in4 = *(uint64 *) (x + 16)
movq 16(%r8),%rax
# in6 = *(uint64 *) (x + 24)
movq 24(%r8),%r10
# in8 = *(uint64 *) (x + 32)
movq 32(%r8),%r11
# in10 = *(uint64 *) (x + 40)
movq 40(%r8),%r12
# in12 = *(uint64 *) (x + 48)
movq 48(%r8),%r13
# in14 = *(uint64 *) (x + 56)
movq 56(%r8),%r14
# j0 = in0
movq %rcx,56(%rsp)
# j2 = in2
movq %r9,64(%rsp)
# j4 = in4
movq %rax,72(%rsp)
# j6 = in6
movq %r10,80(%rsp)
# j8 = in8
movq %r11,88(%rsp)
# j10 = in10
movq %r12,96(%rsp)
# j12 = in12
movq %r13,104(%rsp)
# j14 = in14
movq %r14,112(%rsp)
# x_backup = x
movq %r8,120(%rsp)
# bytesatleast1:
._bytesatleast1:
# unsigned<? bytes - 64
cmp $64,%rdx
# comment:fp stack unchanged by jump
# goto nocopy if !unsigned<
jae ._nocopy
# ctarget = out
movq %rdi,128(%rsp)
# out = &tmp
leaq 192(%rsp),%rdi
# i = bytes
mov %rdx,%rcx
# while (i) { *out++ = *m++; --i }
rep movsb
# out = &tmp
leaq 192(%rsp),%rdi
# m = &tmp
leaq 192(%rsp),%rsi
# comment:fp stack unchanged by fallthrough
# nocopy:
._nocopy:
# out_backup = out
movq %rdi,136(%rsp)
# m_backup = m
movq %rsi,144(%rsp)
# bytes_backup = bytes
movq %rdx,152(%rsp)
# x1 = j0
movq 56(%rsp),%rdi
# x0 = x1
mov %rdi,%rdx
# (uint64) x1 >>= 32
shr $32,%rdi
# x3 = j2
movq 64(%rsp),%rsi
# x2 = x3
mov %rsi,%rcx
# (uint64) x3 >>= 32
shr $32,%rsi
# x5 = j4
movq 72(%rsp),%r8
# x4 = x5
mov %r8,%r9
# (uint64) x5 >>= 32
shr $32,%r8
# x5_stack = x5
movq %r8,160(%rsp)
# x7 = j6
movq 80(%rsp),%r8
# x6 = x7
mov %r8,%rax
# (uint64) x7 >>= 32
shr $32,%r8
# x9 = j8
movq 88(%rsp),%r10
# x8 = x9
mov %r10,%r11
# (uint64) x9 >>= 32
shr $32,%r10
# x11 = j10
movq 96(%rsp),%r12
# x10 = x11
mov %r12,%r13
# x10_stack = x10
movq %r13,168(%rsp)
# (uint64) x11 >>= 32
shr $32,%r12
# x13 = j12
movq 104(%rsp),%r13
# x12 = x13
mov %r13,%r14
# (uint64) x13 >>= 32
shr $32,%r13
# x15 = j14
movq 112(%rsp),%r15
# x14 = x15
mov %r15,%rbx
# (uint64) x15 >>= 32
shr $32,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# i = 20
mov $20,%r15
# mainloop:
._mainloop:
# i_backup = i
movq %r15,184(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x12 + x0
lea (%r14,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x4 ^= a
xor %rbp,%r9
# b = x1 + x5
lea (%rdi,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x9 ^= b
xor %rbp,%r10
# a = x0 + x4
lea (%rdx,%r9),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x8 ^= a
xor %rbp,%r11
# b = x5 + x9
lea (%r15,%r10),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x13 ^= b
xor %rbp,%r13
# a = x4 + x8
lea (%r9,%r11),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x12 ^= a
xor %rbp,%r14
# b = x9 + x13
lea (%r10,%r13),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x1 ^= b
xor %rbp,%rdi
# a = x8 + x12
lea (%r11,%r14),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x13 + x1
lea (%r13,%rdi),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x6 + x10
lea (%rax,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x14 ^= c
xor %r15,%rbx
# c = x10 + x14
lea (%rbp,%rbx),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x2 ^= c
xor %r15,%rcx
# c = x14 + x2
lea (%rbx,%rcx),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x6 ^= c
xor %r15,%rax
# c = x2 + x6
lea (%rcx,%rax),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x11 + x15
lea (%r12,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x3 ^= d
xor %rbp,%rsi
# d = x15 + x3
lea (%r15,%rsi),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x7 ^= d
xor %rbp,%r8
# d = x3 + x7
lea (%rsi,%r8),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x11 ^= d
xor %rbp,%r12
# d = x7 + x11
lea (%r8,%r12),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x3 + x0
lea (%rsi,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x1 ^= a
xor %rbp,%rdi
# b = x4 + x5
lea (%r9,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x6 ^= b
xor %rbp,%rax
# a = x0 + x1
lea (%rdx,%rdi),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x2 ^= a
xor %rbp,%rcx
# b = x5 + x6
lea (%r15,%rax),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x7 ^= b
xor %rbp,%r8
# a = x1 + x2
lea (%rdi,%rcx),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x3 ^= a
xor %rbp,%rsi
# b = x6 + x7
lea (%rax,%r8),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x4 ^= b
xor %rbp,%r9
# a = x2 + x3
lea (%rcx,%rsi),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x7 + x4
lea (%r8,%r9),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x9 + x10
lea (%r10,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x11 ^= c
xor %r15,%r12
# c = x10 + x11
lea (%rbp,%r12),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x8 ^= c
xor %r15,%r11
# c = x11 + x8
lea (%r12,%r11),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x9 ^= c
xor %r15,%r10
# c = x8 + x9
lea (%r11,%r10),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x14 + x15
lea (%rbx,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x12 ^= d
xor %rbp,%r14
# d = x15 + x12
lea (%r15,%r14),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x13 ^= d
xor %rbp,%r13
# d = x12 + x13
lea (%r14,%r13),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x14 ^= d
xor %rbp,%rbx
# d = x13 + x14
lea (%r13,%rbx),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x12 + x0
lea (%r14,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x4 ^= a
xor %rbp,%r9
# b = x1 + x5
lea (%rdi,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x9 ^= b
xor %rbp,%r10
# a = x0 + x4
lea (%rdx,%r9),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x8 ^= a
xor %rbp,%r11
# b = x5 + x9
lea (%r15,%r10),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x13 ^= b
xor %rbp,%r13
# a = x4 + x8
lea (%r9,%r11),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x12 ^= a
xor %rbp,%r14
# b = x9 + x13
lea (%r10,%r13),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x1 ^= b
xor %rbp,%rdi
# a = x8 + x12
lea (%r11,%r14),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x13 + x1
lea (%r13,%rdi),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x6 + x10
lea (%rax,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x14 ^= c
xor %r15,%rbx
# c = x10 + x14
lea (%rbp,%rbx),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x2 ^= c
xor %r15,%rcx
# c = x14 + x2
lea (%rbx,%rcx),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x6 ^= c
xor %r15,%rax
# c = x2 + x6
lea (%rcx,%rax),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x11 + x15
lea (%r12,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x3 ^= d
xor %rbp,%rsi
# d = x15 + x3
lea (%r15,%rsi),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x7 ^= d
xor %rbp,%r8
# d = x3 + x7
lea (%rsi,%r8),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x11 ^= d
xor %rbp,%r12
# d = x7 + x11
lea (%r8,%r12),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# x5 = x5_stack
movq 160(%rsp),%r15
# a = x3 + x0
lea (%rsi,%rdx),%rbp
# (uint32) a <<<= 7
rol $7,%ebp
# x1 ^= a
xor %rbp,%rdi
# b = x4 + x5
lea (%r9,%r15),%rbp
# (uint32) b <<<= 7
rol $7,%ebp
# x6 ^= b
xor %rbp,%rax
# a = x0 + x1
lea (%rdx,%rdi),%rbp
# (uint32) a <<<= 9
rol $9,%ebp
# x2 ^= a
xor %rbp,%rcx
# b = x5 + x6
lea (%r15,%rax),%rbp
# (uint32) b <<<= 9
rol $9,%ebp
# x7 ^= b
xor %rbp,%r8
# a = x1 + x2
lea (%rdi,%rcx),%rbp
# (uint32) a <<<= 13
rol $13,%ebp
# x3 ^= a
xor %rbp,%rsi
# b = x6 + x7
lea (%rax,%r8),%rbp
# (uint32) b <<<= 13
rol $13,%ebp
# x4 ^= b
xor %rbp,%r9
# a = x2 + x3
lea (%rcx,%rsi),%rbp
# (uint32) a <<<= 18
rol $18,%ebp
# x0 ^= a
xor %rbp,%rdx
# b = x7 + x4
lea (%r8,%r9),%rbp
# (uint32) b <<<= 18
rol $18,%ebp
# x5 ^= b
xor %rbp,%r15
# x10 = x10_stack
movq 168(%rsp),%rbp
# x5_stack = x5
movq %r15,160(%rsp)
# c = x9 + x10
lea (%r10,%rbp),%r15
# (uint32) c <<<= 7
rol $7,%r15d
# x11 ^= c
xor %r15,%r12
# c = x10 + x11
lea (%rbp,%r12),%r15
# (uint32) c <<<= 9
rol $9,%r15d
# x8 ^= c
xor %r15,%r11
# c = x11 + x8
lea (%r12,%r11),%r15
# (uint32) c <<<= 13
rol $13,%r15d
# x9 ^= c
xor %r15,%r10
# c = x8 + x9
lea (%r11,%r10),%r15
# (uint32) c <<<= 18
rol $18,%r15d
# x10 ^= c
xor %r15,%rbp
# x15 = x15_stack
movq 176(%rsp),%r15
# x10_stack = x10
movq %rbp,168(%rsp)
# d = x14 + x15
lea (%rbx,%r15),%rbp
# (uint32) d <<<= 7
rol $7,%ebp
# x12 ^= d
xor %rbp,%r14
# d = x15 + x12
lea (%r15,%r14),%rbp
# (uint32) d <<<= 9
rol $9,%ebp
# x13 ^= d
xor %rbp,%r13
# d = x12 + x13
lea (%r14,%r13),%rbp
# (uint32) d <<<= 13
rol $13,%ebp
# x14 ^= d
xor %rbp,%rbx
# d = x13 + x14
lea (%r13,%rbx),%rbp
# (uint32) d <<<= 18
rol $18,%ebp
# x15 ^= d
xor %rbp,%r15
# x15_stack = x15
movq %r15,176(%rsp)
# i = i_backup
movq 184(%rsp),%r15
# unsigned>? i -= 4
sub $4,%r15
# comment:fp stack unchanged by jump
# goto mainloop if unsigned>
ja ._mainloop
# (uint32) x2 += j2
addl 64(%rsp),%ecx
# x3 <<= 32
shl $32,%rsi
# x3 += j2
addq 64(%rsp),%rsi
# (uint64) x3 >>= 32
shr $32,%rsi
# x3 <<= 32
shl $32,%rsi
# x2 += x3
add %rsi,%rcx
# (uint32) x6 += j6
addl 80(%rsp),%eax
# x7 <<= 32
shl $32,%r8
# x7 += j6
addq 80(%rsp),%r8
# (uint64) x7 >>= 32
shr $32,%r8
# x7 <<= 32
shl $32,%r8
# x6 += x7
add %r8,%rax
# (uint32) x8 += j8
addl 88(%rsp),%r11d
# x9 <<= 32
shl $32,%r10
# x9 += j8
addq 88(%rsp),%r10
# (uint64) x9 >>= 32
shr $32,%r10
# x9 <<= 32
shl $32,%r10
# x8 += x9
add %r10,%r11
# (uint32) x12 += j12
addl 104(%rsp),%r14d
# x13 <<= 32
shl $32,%r13
# x13 += j12
addq 104(%rsp),%r13
# (uint64) x13 >>= 32
shr $32,%r13
# x13 <<= 32
shl $32,%r13
# x12 += x13
add %r13,%r14
# (uint32) x0 += j0
addl 56(%rsp),%edx
# x1 <<= 32
shl $32,%rdi
# x1 += j0
addq 56(%rsp),%rdi
# (uint64) x1 >>= 32
shr $32,%rdi
# x1 <<= 32
shl $32,%rdi
# x0 += x1
add %rdi,%rdx
# x5 = x5_stack
movq 160(%rsp),%rdi
# (uint32) x4 += j4
addl 72(%rsp),%r9d
# x5 <<= 32
shl $32,%rdi
# x5 += j4
addq 72(%rsp),%rdi
# (uint64) x5 >>= 32
shr $32,%rdi
# x5 <<= 32
shl $32,%rdi
# x4 += x5
add %rdi,%r9
# x10 = x10_stack
movq 168(%rsp),%r8
# (uint32) x10 += j10
addl 96(%rsp),%r8d
# x11 <<= 32
shl $32,%r12
# x11 += j10
addq 96(%rsp),%r12
# (uint64) x11 >>= 32
shr $32,%r12
# x11 <<= 32
shl $32,%r12
# x10 += x11
add %r12,%r8
# x15 = x15_stack
movq 176(%rsp),%rdi
# (uint32) x14 += j14
addl 112(%rsp),%ebx
# x15 <<= 32
shl $32,%rdi
# x15 += j14
addq 112(%rsp),%rdi
# (uint64) x15 >>= 32
shr $32,%rdi
# x15 <<= 32
shl $32,%rdi
# x14 += x15
add %rdi,%rbx
# out = out_backup
movq 136(%rsp),%rdi
# m = m_backup
movq 144(%rsp),%rsi
# x0 ^= *(uint64 *) (m + 0)
xorq 0(%rsi),%rdx
# *(uint64 *) (out + 0) = x0
movq %rdx,0(%rdi)
# x2 ^= *(uint64 *) (m + 8)
xorq 8(%rsi),%rcx
# *(uint64 *) (out + 8) = x2
movq %rcx,8(%rdi)
# x4 ^= *(uint64 *) (m + 16)
xorq 16(%rsi),%r9
# *(uint64 *) (out + 16) = x4
movq %r9,16(%rdi)
# x6 ^= *(uint64 *) (m + 24)
xorq 24(%rsi),%rax
# *(uint64 *) (out + 24) = x6
movq %rax,24(%rdi)
# x8 ^= *(uint64 *) (m + 32)
xorq 32(%rsi),%r11
# *(uint64 *) (out + 32) = x8
movq %r11,32(%rdi)
# x10 ^= *(uint64 *) (m + 40)
xorq 40(%rsi),%r8
# *(uint64 *) (out + 40) = x10
movq %r8,40(%rdi)
# x12 ^= *(uint64 *) (m + 48)
xorq 48(%rsi),%r14
# *(uint64 *) (out + 48) = x12
movq %r14,48(%rdi)
# x14 ^= *(uint64 *) (m + 56)
xorq 56(%rsi),%rbx
# *(uint64 *) (out + 56) = x14
movq %rbx,56(%rdi)
# bytes = bytes_backup
movq 152(%rsp),%rdx
# in8 = j8
movq 88(%rsp),%rcx
# in8 += 1
add $1,%rcx
# j8 = in8
movq %rcx,88(%rsp)
# unsigned>? unsigned<? bytes - 64
cmp $64,%rdx
# comment:fp stack unchanged by jump
# goto bytesatleast65 if unsigned>
ja ._bytesatleast65
# comment:fp stack unchanged by jump
# goto bytesatleast64 if !unsigned<
jae ._bytesatleast64
# m = out
mov %rdi,%rsi
# out = ctarget
movq 128(%rsp),%rdi
# i = bytes
mov %rdx,%rcx
# while (i) { *out++ = *m++; --i }
rep movsb
# comment:fp stack unchanged by fallthrough
# bytesatleast64:
._bytesatleast64:
# x = x_backup
movq 120(%rsp),%rdi
# in8 = j8
movq 88(%rsp),%rsi
# *(uint64 *) (x + 32) = in8
movq %rsi,32(%rdi)
# r11 = r11_stack
movq 0(%rsp),%r11
# r12 = r12_stack
movq 8(%rsp),%r12
# r13 = r13_stack
movq 16(%rsp),%r13
# r14 = r14_stack
movq 24(%rsp),%r14
# r15 = r15_stack
movq 32(%rsp),%r15
# rbx = rbx_stack
movq 40(%rsp),%rbx
# rbp = rbp_stack
movq 48(%rsp),%rbp
# comment:fp stack unchanged by fallthrough
# done:
._done:
# leave
add %r11,%rsp
mov %rdi,%rax
mov %rsi,%rdx
ret
# bytesatleast65:
._bytesatleast65:
# bytes -= 64
sub $64,%rdx
# out += 64
add $64,%rdi
# m += 64
add $64,%rsi
# comment:fp stack unchanged by jump
# goto bytesatleast1
jmp ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)
arch/x86/crypto/salsa20_glue.c deleted
/*
* Glue code for optimized assembly version of Salsa20.
*
* Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com>
*
* The assembly codes are public domain assembly codes written by Daniel. J.
* Bernstein <djb@cr.yp.to>. The codes are modified to include indentation
* and to remove extraneous comments and functions that are not needed.
* - i586 version, renamed as salsa20-i586-asm_32.S
* available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
* - x86-64 version, renamed as salsa20-x86_64-asm_64.S
* available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
*
* Also modified to set up the initial state using the generic C code rather
* than in assembly.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#include <asm/unaligned.h>
#include <crypto/internal/skcipher.h>
#include <crypto/salsa20.h>
#include <linux/module.h>
asmlinkage void salsa20_encrypt_bytes(u32 state[16], const u8 *src, u8 *dst,
				      u32 bytes);

static int salsa20_asm_crypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct skcipher_walk walk;
	u32 state[16];
	int err;

	err = skcipher_walk_virt(&walk, req, true);

	crypto_salsa20_init(state, ctx, walk.iv);

	while (walk.nbytes > 0) {
		unsigned int nbytes = walk.nbytes;

		if (nbytes < walk.total)
			nbytes = round_down(nbytes, walk.stride);

		salsa20_encrypt_bytes(state, walk.src.virt.addr,
				      walk.dst.virt.addr, nbytes);
		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
	}

	return err;
}
static struct skcipher_alg alg = {
	.base.cra_name = "salsa20",
	.base.cra_driver_name = "salsa20-asm",
	.base.cra_priority = 200,
	.base.cra_blocksize = 1,
	.base.cra_ctxsize = sizeof(struct salsa20_ctx),
	.base.cra_module = THIS_MODULE,

	.min_keysize = SALSA20_MIN_KEY_SIZE,
	.max_keysize = SALSA20_MAX_KEY_SIZE,
	.ivsize = SALSA20_IV_SIZE,
	.chunksize = SALSA20_BLOCK_SIZE,
	.setkey = crypto_salsa20_setkey,
	.encrypt = salsa20_asm_crypt,
	.decrypt = salsa20_asm_crypt,
};
static int __init init(void)
{
	return crypto_register_skcipher(&alg);
}

static void __exit fini(void)
{
	crypto_unregister_skcipher(&alg);
}

module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
MODULE_ALIAS_CRYPTO("salsa20");
MODULE_ALIAS_CRYPTO("salsa20-asm");
crypto/Kconfig
@@ -1436,34 +1436,6 @@ config CRYPTO_SALSA20
	  The Salsa20 stream cipher algorithm is designed by Daniel J.
	  Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>

config CRYPTO_SALSA20_586
	tristate "Salsa20 stream cipher algorithm (i586)"
	depends on (X86 || UML_X86) && !64BIT
	select CRYPTO_BLKCIPHER
	select CRYPTO_SALSA20
	help
	  Salsa20 stream cipher algorithm.

	  Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
	  Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>

	  The Salsa20 stream cipher algorithm is designed by Daniel J.
	  Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>

config CRYPTO_SALSA20_X86_64
	tristate "Salsa20 stream cipher algorithm (x86_64)"
	depends on (X86 || UML_X86) && 64BIT
	select CRYPTO_BLKCIPHER
	select CRYPTO_SALSA20
	help
	  Salsa20 stream cipher algorithm.

	  Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
	  Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>

	  The Salsa20 stream cipher algorithm is designed by Daniel J.
	  Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>

config CRYPTO_CHACHA20
	tristate "ChaCha20 cipher algorithm"
	select CRYPTO_BLKCIPHER