Commit 1ed55eac authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu:

 - Added aesni/avx/x86_64 implementations for camellia.

 - Optimised AVX code for cast5/serpent/twofish/cast6.

 - Fixed vmac bug with unaligned input.

 - Allow compression algorithms in FIPS mode.

 - Optimised crc32c implementation for Intel.

 - Misc fixes.

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (32 commits)
  crypto: caam - Updated SEC-4.0 device tree binding for ERA information.
  crypto: testmgr - remove superfluous initializers for xts(aes)
  crypto: testmgr - allow compression algs in fips mode
  crypto: testmgr - add larger crc32c test vector to test FPU path in crc32c_intel
  crypto: testmgr - clean alg_test_null entries in alg_test_descs[]
  crypto: testmgr - remove fips_allowed flag from camellia-aesni null-tests
  crypto: cast5/cast6 - move lookup tables to shared module
  padata: use __this_cpu_read per-cpu helper
  crypto: s5p-sss - Fix compilation error
  crypto: picoxcell - Add terminating entry for platform_device_id table
  crypto: omap-aes - select BLKCIPHER2
  crypto: camellia - add AES-NI/AVX/x86_64 assembler implementation of camellia cipher
  crypto: camellia-x86_64 - share common functions and move structures and function definitions to header file
  crypto: tcrypt - add async speed test for camellia cipher
  crypto: tegra-aes - fix error-valued pointer dereference
  crypto: tegra - fix missing unlock on error case
  crypto: cast5/avx - avoid using temporary stack buffers
  crypto: serpent/avx - avoid using temporary stack buffers
  crypto: twofish/avx - avoid using temporary stack buffers
  crypto: cast6/avx - avoid using temporary stack buffers
  ...
parents 08242bc2 a2c0911c
......@@ -54,7 +54,8 @@ PROPERTIES
- compatible
Usage: required
Value type: <string>
Definition: Must include "fsl,sec-v4.0"
Definition: Must include "fsl,sec-v4.0". Also includes SEC
ERA versions (optional) with which the device is compatible.
- #address-cells
Usage: required
......@@ -106,7 +107,7 @@ PROPERTIES
EXAMPLE
crypto@300000 {
compatible = "fsl,sec-v4.0";
compatible = "fsl,sec-v4.0", "fsl,sec-era-v2.0";
#address-cells = <1>;
#size-cells = <1>;
reg = <0x300000 0x10000>;
......
......@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
......@@ -34,6 +35,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
camellia_aesni_avx_glue.o
cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
......@@ -47,3 +50,5 @@ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
This diff is collapsed.
This diff is collapsed.
......@@ -32,53 +32,24 @@
#include <crypto/algapi.h>
#include <crypto/lrw.h>
#include <crypto/xts.h>
#include <asm/crypto/camellia.h>
#include <asm/crypto/glue_helper.h>
#define CAMELLIA_MIN_KEY_SIZE 16
#define CAMELLIA_MAX_KEY_SIZE 32
#define CAMELLIA_BLOCK_SIZE 16
#define CAMELLIA_TABLE_BYTE_LEN 272
struct camellia_ctx {
u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
u32 key_length;
};
/* regular block cipher functions */
asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
EXPORT_SYMBOL_GPL(__camellia_enc_blk);
asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src);
EXPORT_SYMBOL_GPL(camellia_dec_blk);
/* 2-way parallel cipher functions */
asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);
asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src);
static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk(ctx, dst, src, false);
}
static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk(ctx, dst, src, true);
}
static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk_2way(ctx, dst, src, false);
}
static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk_2way(ctx, dst, src, true);
}
EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);
static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
......@@ -1275,8 +1246,7 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey)
camellia_setup256(kk, subkey);
}
static int __camellia_setkey(struct camellia_ctx *cctx,
const unsigned char *key,
int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
unsigned int key_len, u32 *flags)
{
if (key_len != 16 && key_len != 24 && key_len != 32) {
......@@ -1300,6 +1270,7 @@ static int __camellia_setkey(struct camellia_ctx *cctx,
return 0;
}
EXPORT_SYMBOL_GPL(__camellia_setkey);
static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
......@@ -1308,7 +1279,7 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
&tfm->crt_flags);
}
static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
{
u128 iv = *src;
......@@ -1316,22 +1287,23 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
u128_xor(&dst[1], &dst[1], &iv);
}
EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;
if (dst != src)
*dst = *src;
u128_to_be128(&ctrblk, iv);
u128_inc(iv);
le128_to_be128(&ctrblk, iv);
le128_inc(iv);
camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
}
EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblks[2];
......@@ -1340,13 +1312,14 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
dst[1] = src[1];
}
u128_to_be128(&ctrblks[0], iv);
u128_inc(iv);
u128_to_be128(&ctrblks[1], iv);
u128_inc(iv);
le128_to_be128(&ctrblks[0], iv);
le128_inc(iv);
le128_to_be128(&ctrblks[1], iv);
le128_inc(iv);
camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
}
EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);
static const struct common_glue_ctx camellia_enc = {
.num_funcs = 2,
......@@ -1464,12 +1437,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
camellia_dec_blk(ctx, srcdst, srcdst);
}
struct camellia_lrw_ctx {
struct lrw_table_ctx lrw_table;
struct camellia_ctx camellia_ctx;
};
static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
......@@ -1484,6 +1452,7 @@ static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
return lrw_init_table(&ctx->lrw_table,
key + keylen - CAMELLIA_BLOCK_SIZE);
}
EXPORT_SYMBOL_GPL(lrw_camellia_setkey);
static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
......@@ -1519,19 +1488,15 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
return lrw_crypt(desc, dst, src, nbytes, &req);
}
static void lrw_exit_tfm(struct crypto_tfm *tfm)
void lrw_camellia_exit_tfm(struct crypto_tfm *tfm)
{
struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
lrw_free_table(&ctx->lrw_table);
}
EXPORT_SYMBOL_GPL(lrw_camellia_exit_tfm);
struct camellia_xts_ctx {
struct camellia_ctx tweak_ctx;
struct camellia_ctx crypt_ctx;
};
static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm);
......@@ -1555,6 +1520,7 @@ static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
flags);
}
EXPORT_SYMBOL_GPL(xts_camellia_setkey);
static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
struct scatterlist *src, unsigned int nbytes)
......@@ -1679,7 +1645,7 @@ static struct crypto_alg camellia_algs[6] = { {
.cra_alignmask = 0,
.cra_type = &crypto_blkcipher_type,
.cra_module = THIS_MODULE,
.cra_exit = lrw_exit_tfm,
.cra_exit = lrw_camellia_exit_tfm,
.cra_u = {
.blkcipher = {
.min_keysize = CAMELLIA_MIN_KEY_SIZE +
......
This diff is collapsed.
......@@ -37,29 +37,14 @@
#define CAST5_PARALLEL_BLOCKS 16
asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src);
static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst,
const u8 *src)
{
__cast5_enc_blk_16way(ctx, dst, src, false);
}
static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
const u8 *src)
{
__cast5_enc_blk_16way(ctx, dst, src, true);
}
static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
const u8 *src)
{
cast5_dec_blk_16way(ctx, dst, src);
}
asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
__be64 *iv);
static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
{
......@@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
const unsigned int bsize = CAST5_BLOCK_SIZE;
unsigned int nbytes;
void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
int err;
fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
err = blkcipher_walk_virt(desc, walk);
desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
......@@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
/* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
do {
if (enc)
cast5_enc_blk_xway(ctx, wdst, wsrc);
else
cast5_dec_blk_xway(ctx, wdst, wsrc);
fn(ctx, wdst, wsrc);
wsrc += bsize * CAST5_PARALLEL_BLOCKS;
wdst += bsize * CAST5_PARALLEL_BLOCKS;
......@@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
goto done;
}
fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
/* Handle leftovers */
do {
if (enc)
__cast5_encrypt(ctx, wdst, wsrc);
else
__cast5_decrypt(ctx, wdst, wsrc);
fn(ctx, wdst, wsrc);
wsrc += bsize;
wdst += bsize;
......@@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
u64 last_iv;
int i;
/* Start of the last block. */
src += nbytes / bsize - 1;
......@@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
src -= CAST5_PARALLEL_BLOCKS - 1;
dst -= CAST5_PARALLEL_BLOCKS - 1;
for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
ivs[i] = src[i];
cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
*(dst + (i + 1)) ^= *(ivs + i);
cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
nbytes -= bsize;
if (nbytes < bsize)
......@@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
unsigned int nbytes = walk->nbytes;
u64 *src = (u64 *)walk->src.virt.addr;
u64 *dst = (u64 *)walk->dst.virt.addr;
u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
__be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
int i;
/* Process multi-block batch */
if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
do {
/* create ctrblks for parallel encrypt */
for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];
ctrblocks[i] = cpu_to_be64(ctrblk++);
}
cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
(u8 *)ctrblocks);
cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
(__be64 *)walk->iv);
src += CAST5_PARALLEL_BLOCKS;
dst += CAST5_PARALLEL_BLOCKS;
......@@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
/* Handle leftovers */
do {
u64 ctrblk;
if (dst != src)
*dst = *src;
ctrblocks[0] = cpu_to_be64(ctrblk++);
ctrblk = *(u64 *)walk->iv;
be64_add_cpu((__be64 *)walk->iv, 1);
__cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
*dst ^= ctrblocks[0];
__cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
*dst ^= ctrblk;
src += 1;
dst += 1;
......@@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
} while (nbytes >= bsize);
done:
*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
return nbytes;
}
......
......@@ -23,22 +23,24 @@
*
*/
#include "glue_helper-asm-avx.S"
.file "cast6-avx-x86_64-asm_64.S"
.extern cast6_s1
.extern cast6_s2
.extern cast6_s3
.extern cast6_s4
.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4
/* structure of crypto context */
#define km 0
#define kr (12*4*4)
/* s-boxes */
#define s1 cast6_s1
#define s2 cast6_s2
#define s3 cast6_s3
#define s4 cast6_s4
#define s1 cast_s1
#define s2 cast_s2
#define s3 cast_s3
#define s4 cast_s4
/**********************************************************************
8-way AVX cast6
......@@ -205,11 +207,7 @@
vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3;
#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
vmovdqu (0*4*4)(in), x0; \
vmovdqu (1*4*4)(in), x1; \
vmovdqu (2*4*4)(in), x2; \
vmovdqu (3*4*4)(in), x3; \
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \
vpshufb rmask, x2, x2; \
......@@ -217,39 +215,21 @@
\
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \
vpshufb rmask, x2, x2; \
vpshufb rmask, x3, x3; \
vmovdqu x0, (0*4*4)(out); \
vmovdqu x1, (1*4*4)(out); \
vmovdqu x2, (2*4*4)(out); \
vmovdqu x3, (3*4*4)(out);
#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpshufb rmask, x0, x0; \
vpshufb rmask, x1, x1; \
vpshufb rmask, x2, x2; \
vpshufb rmask, x3, x3; \
vpxor (0*4*4)(out), x0, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor (1*4*4)(out), x1, x1; \
vmovdqu x1, (1*4*4)(out); \
vpxor (2*4*4)(out), x2, x2; \
vmovdqu x2, (2*4*4)(out); \
vpxor (3*4*4)(out), x3, x3; \
vmovdqu x3, (3*4*4)(out);
vpshufb rmask, x3, x3;
.data
.align 16
.Lbswap_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
......@@ -269,31 +249,26 @@
.text
.align 16
.global __cast6_enc_blk_8way
.type __cast6_enc_blk_8way,@function;
.align 8
.type __cast6_enc_blk8,@function;
__cast6_enc_blk_8way:
__cast6_enc_blk8:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* %rcx: bool, if true: xor output
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
*/
pushq %rbp;
pushq %rbx;
pushq %rcx;
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
leaq (4*4*4)(%rdx), %rax;
inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
movq %rsi, %r11;
inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
preload_rkr(0, dummy, none);
Q(0);
......@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
QBAR(10);
QBAR(11);
popq %rcx;
popq %rbx;
popq %rbp;
vmovdqa .Lbswap_mask, RKM;
leaq (4*4*4)(%r11), %rax;
testb %cl, %cl;
jnz __enc_xor8;
outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
ret;
__enc_xor8:
outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
.align 8
.type __cast6_dec_blk8,@function;
ret;
.align 16
.global cast6_dec_blk_8way
.type cast6_dec_blk_8way,@function;
cast6_dec_blk_8way:
__cast6_dec_blk8:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
*/
pushq %rbp;
......@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
leaq (4*4*4)(%rdx), %rax;
inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
movq %rsi, %r11;
inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
Q(11);
......@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
popq %rbp;
vmovdqa .Lbswap_mask, RKM;
leaq (4*4*4)(%r11), %rax;
outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
ret;
.align 8
.global cast6_ecb_enc_8way
.type cast6_ecb_enc_8way,@function;
cast6_ecb_enc_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
movq %rsi, %r11;
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __cast6_enc_blk8;
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
.align 8
.global cast6_ecb_dec_8way
.type cast6_ecb_dec_8way,@function;
cast6_ecb_dec_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
movq %rsi, %r11;
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __cast6_dec_blk8;
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
.align 8
.global cast6_cbc_dec_8way
.type cast6_cbc_dec_8way,@function;
cast6_cbc_dec_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
pushq %r12;
movq %rsi, %r11;
movq %rdx, %r12;
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __cast6_dec_blk8;
store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
popq %r12;
ret;
.align 8
.global cast6_ctr_8way
.type cast6_ctr_8way,@function;
cast6_ctr_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/
pushq %r12;
movq %rsi, %r11;
movq %rdx, %r12;
load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
RD2, RX, RKR, RKM);
call __cast6_enc_blk8;
store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
popq %r12;
ret;
......@@ -40,79 +40,34 @@
#define CAST6_PARALLEL_BLOCKS 8
asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst,
asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src);
static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst,
const u8 *src)
{
__cast6_enc_blk_8way(ctx, dst, src, false);
}
static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst,
const u8 *src)
{
__cast6_enc_blk_8way(ctx, dst, src, true);
}
static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst,
const u8 *src)
{
cast6_dec_blk_8way(ctx, dst, src);
}
static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
{
u128 ivs[CAST6_PARALLEL_BLOCKS - 1];
unsigned int j;
for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
ivs[j] = src[j];
cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}
asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
le128 *iv);
static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;
u128_to_be128(&ctrblk, iv);
u128_inc(iv);
le128_to_be128(&ctrblk, iv);
le128_inc(iv);
__cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, src, (u128 *)&ctrblk);
}
static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
{
be128 ctrblks[CAST6_PARALLEL_BLOCKS];
unsigned int i;
for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];
u128_to_be128(&ctrblks[i], iv);
u128_inc(iv);
}
cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
}
static const struct common_glue_ctx cast6_enc = {
.num_funcs = 2,
.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
......@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = {
.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) }
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
......@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = {
.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
......@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = {
.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) }
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
}, {
.num_blocks = 1,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
......@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst);
cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
return;
}
......@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst);
cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
return;
}
......
......@@ -32,6 +32,8 @@
#include <asm/cpufeature.h>
#include <asm/cpu_device_id.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#define CHKSUM_BLOCK_SIZE 1
#define CHKSUM_DIGEST_SIZE 4
......@@ -44,6 +46,31 @@
#define REX_PRE
#endif
#ifdef CONFIG_X86_64
/*
* use carryless multiply version of crc32c when buffer
* size is >= 512 (when eager fpu is enabled) or
* >= 1024 (when eager fpu is disabled) to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512
#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024
asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
unsigned int crc_init);
static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU;
#if defined(X86_FEATURE_EAGER_FPU)
#define set_pcl_breakeven_point() \
do { \
if (!use_eager_fpu()) \
crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \
} while (0)
#else
#define set_pcl_breakeven_point() \
(crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU)
#endif
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
......@@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
return 0;
}
#ifdef CONFIG_X86_64
static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
u32 *crcp = shash_desc_ctx(desc);
/*
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
kernel_fpu_begin();
*crcp = crc_pcl(data, len, *crcp);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
kernel_fpu_begin();
*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
}
static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
}
static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
out);
}
#endif /* CONFIG_X86_64 */
static struct shash_alg alg = {
.setkey = crc32c_intel_setkey,
.init = crc32c_intel_init,
......@@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void)
{
if (!x86_match_cpu(crc32c_cpu_id))
return -ENODEV;
#ifdef CONFIG_X86_64
if (cpu_has_pclmulqdq) {
alg.update = crc32c_pcl_intel_update;
alg.finup = crc32c_pcl_intel_finup;
alg.digest = crc32c_pcl_intel_digest;
set_pcl_breakeven_point();
}
#endif
return crypto_register_shash(&alg);
}
......
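The glue code above only switches the shash update/finup/digest handlers to the PCLMULQDQ-accelerated crc_pcl() path when the CPU advertises the instruction and the buffer is past the break-even length; callers keep using the ordinary kernel shash interface either way. A hedged usage sketch (not part of this patch; error handling abbreviated, module name invented) of driving that interface from a throwaway module:

#include <linux/module.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <crypto/hash.h>

static int __init crc32c_demo_init(void)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	u8 digest[4];
	u32 val;
	static const u8 buf[] = "123456789";
	int err;

	/* resolves to the highest-priority "crc32c" provider, e.g. crc32c-intel */
	tfm = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;

	err = crypto_shash_digest(desc, buf, sizeof(buf) - 1, digest);
	if (!err) {
		memcpy(&val, digest, sizeof(val));
		pr_info("crc32c(\"123456789\") = %08x\n", val);
	}

	kfree(desc);
	crypto_free_shash(tfm);
	return err;
}

static void __exit crc32c_demo_exit(void)
{
}

module_init(crc32c_demo_init);
module_exit(crc32c_demo_exit);
MODULE_LICENSE("GPL");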
This diff is collapsed.
/*
* Shared glue code for 128bit block ciphers, AVX assembler macros
*
* Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
vmovdqu (0*16)(src), x0; \
vmovdqu (1*16)(src), x1; \
vmovdqu (2*16)(src), x2; \
vmovdqu (3*16)(src), x3; \
vmovdqu (4*16)(src), x4; \
vmovdqu (5*16)(src), x5; \
vmovdqu (6*16)(src), x6; \
vmovdqu (7*16)(src), x7;
#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
vmovdqu x0, (0*16)(dst); \
vmovdqu x1, (1*16)(dst); \
vmovdqu x2, (2*16)(dst); \
vmovdqu x3, (3*16)(dst); \
vmovdqu x4, (4*16)(dst); \
vmovdqu x5, (5*16)(dst); \
vmovdqu x6, (6*16)(dst); \
vmovdqu x7, (7*16)(dst);
#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
vpxor (0*16)(src), x1, x1; \
vpxor (1*16)(src), x2, x2; \
vpxor (2*16)(src), x3, x3; \
vpxor (3*16)(src), x4, x4; \
vpxor (4*16)(src), x5, x5; \
vpxor (5*16)(src), x6, x6; \
vpxor (6*16)(src), x7, x7; \
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
vpsubq minus_one, x, x; \
vpslldq $8, tmp, tmp; \
vpsubq tmp, x, x;
#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
vpcmpeqd t0, t0, t0; \
vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
vmovdqa bswap, t1; \
\
/* load IV and byteswap */ \
vmovdqu (iv), x7; \
vpshufb t1, x7, x0; \
\
/* construct IVs */ \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x1; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x2; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x3; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x4; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x5; \
inc_le128(x7, t0, t2); \
vpshufb t1, x7, x6; \
inc_le128(x7, t0, t2); \
vmovdqa x7, t2; \
vpshufb t1, x7, x7; \
inc_le128(t2, t0, t1); \
vmovdqu t2, (iv);
#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
vpxor (0*16)(src), x0, x0; \
vpxor (1*16)(src), x1, x1; \
vpxor (2*16)(src), x2, x2; \
vpxor (3*16)(src), x3, x3; \
vpxor (4*16)(src), x4, x4; \
vpxor (5*16)(src), x5, x5; \
vpxor (6*16)(src), x6, x6; \
vpxor (7*16)(src), x7, x7; \
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
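The inc_le128 macro above performs a full 128-bit increment of the counter held in an XMM register: vpsubq against a constant whose low qword is -1 adds one to the low half, while the vpcmpeqq/vpslldq pair detects the low qword wrapping and propagates the carry into the high qword. A minimal C sketch of the same operation (an illustration, not code from the patch):

#include <stdint.h>
#include <stdio.h>

/* 128-bit little-endian counter, split into two 64-bit halves */
struct ctr128 {
	uint64_t lo;	/* least-significant qword */
	uint64_t hi;	/* most-significant qword */
};

/* equivalent of inc_le128: add one, carrying from the low into the high qword */
static void ctr128_inc(struct ctr128 *c)
{
	if (++c->lo == 0)
		c->hi++;
}

int main(void)
{
	struct ctr128 c = { .lo = UINT64_MAX, .hi = 0 };

	ctr128_inc(&c);	/* low half wraps to zero, carry bumps the high half */
	printf("hi=%llu lo=%llu\n",
	       (unsigned long long)c.hi, (unsigned long long)c.lo);
	return 0;
}

load_ctr_8way then passes each successive counter value through the .Lbswap128_mask shuffle so the cipher sees big-endian counter blocks, which is why the glue code now hands the IV over as a little-endian le128.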
......@@ -221,16 +221,16 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
u8 *src = (u8 *)walk->src.virt.addr;
u8 *dst = (u8 *)walk->dst.virt.addr;
unsigned int nbytes = walk->nbytes;
u128 ctrblk;
le128 ctrblk;
u128 tmp;
be128_to_u128(&ctrblk, (be128 *)walk->iv);
be128_to_le128(&ctrblk, (be128 *)walk->iv);
memcpy(&tmp, src, nbytes);
fn_ctr(ctx, &tmp, &tmp, &ctrblk);
memcpy(dst, &tmp, nbytes);
u128_to_be128((be128 *)walk->iv, &ctrblk);
le128_to_be128((be128 *)walk->iv, &ctrblk);
}
EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
......@@ -243,11 +243,11 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
unsigned int nbytes = walk->nbytes;
u128 *src = (u128 *)walk->src.virt.addr;
u128 *dst = (u128 *)walk->dst.virt.addr;
u128 ctrblk;
le128 ctrblk;
unsigned int num_blocks, func_bytes;
unsigned int i;
be128_to_u128(&ctrblk, (be128 *)walk->iv);
be128_to_le128(&ctrblk, (be128 *)walk->iv);
/* Process multi-block batch */
for (i = 0; i < gctx->num_funcs; i++) {
......@@ -269,7 +269,7 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
}
done:
u128_to_be128((be128 *)walk->iv, &ctrblk);
le128_to_be128((be128 *)walk->iv, &ctrblk);
return nbytes;
}
......
......@@ -24,7 +24,16 @@
*
*/
#include "glue_helper-asm-avx.S"
.file "serpent-avx-x86_64-asm_64.S"
.data
.align 16
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.text
#define CTX %rdi
......@@ -550,51 +559,27 @@
vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3;
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
vmovdqu (0*4*4)(in), x0; \
vmovdqu (1*4*4)(in), x1; \
vmovdqu (2*4*4)(in), x2; \
vmovdqu (3*4*4)(in), x3; \
\
#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vmovdqu x0, (0*4*4)(out); \
vmovdqu x1, (1*4*4)(out); \
vmovdqu x2, (2*4*4)(out); \
vmovdqu x3, (3*4*4)(out);
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpxor (0*4*4)(out), x0, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor (1*4*4)(out), x1, x1; \
vmovdqu x1, (1*4*4)(out); \
vpxor (2*4*4)(out), x2, x2; \
vmovdqu x2, (2*4*4)(out); \
vpxor (3*4*4)(out), x3, x3; \
vmovdqu x3, (3*4*4)(out);
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
.align 8
.global __serpent_enc_blk_8way_avx
.type __serpent_enc_blk_8way_avx,@function;
.type __serpent_enc_blk8_avx,@function;
__serpent_enc_blk_8way_avx:
__serpent_enc_blk8_avx:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* %rcx: bool, if true: xor output
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
*/
vpcmpeqd RNOT, RNOT, RNOT;
leaq (4*4*4)(%rdx), %rax;
read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
K2(RA, RB, RC, RD, RE, 0);
S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
......@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx:
S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
leaq (4*4*4)(%rsi), %rax;
testb %cl, %cl;
jnz __enc_xor8;
write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
ret;
__enc_xor8:
xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
ret;
.align 8
.global serpent_dec_blk_8way_avx
.type serpent_dec_blk_8way_avx,@function;
.type __serpent_dec_blk8_avx,@function;
serpent_dec_blk_8way_avx:
__serpent_dec_blk8_avx:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
* output:
* RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
*/
vpcmpeqd RNOT, RNOT, RNOT;
leaq (4*4*4)(%rdx), %rax;
read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
K2(RA, RB, RC, RD, RE, 32);
SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
......@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx:
SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
leaq (4*4*4)(%rsi), %rax;
write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
ret;
.align 8
.global serpent_ecb_enc_8way_avx
.type serpent_ecb_enc_8way_avx,@function;
serpent_ecb_enc_8way_avx:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __serpent_enc_blk8_avx;
store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
.align 8
.global serpent_ecb_dec_8way_avx
.type serpent_ecb_dec_8way_avx,@function;
serpent_ecb_dec_8way_avx:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __serpent_dec_blk8_avx;
store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
ret;
.align 8
.global serpent_cbc_dec_8way_avx
.type serpent_cbc_dec_8way_avx,@function;
serpent_cbc_dec_8way_avx:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __serpent_dec_blk8_avx;
store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
ret;
.align 8
.global serpent_ctr_8way_avx
.type serpent_ctr_8way_avx,@function;
serpent_ctr_8way_avx:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/
load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
RD2, RK0, RK1, RK2);
call __serpent_enc_blk8_avx;
store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
......@@ -42,55 +42,24 @@
#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>
static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
{
u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
unsigned int j;
for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
ivs[j] = src[j];
serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}
static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;
u128_to_be128(&ctrblk, iv);
u128_inc(iv);
le128_to_be128(&ctrblk, iv);
le128_inc(iv);
__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, src, (u128 *)&ctrblk);
}
static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
{
be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
unsigned int i;
for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];
u128_to_be128(&ctrblks[i], iv);
u128_inc(iv);
}
serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
}
static const struct common_glue_ctx serpent_enc = {
.num_funcs = 2,
.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
......@@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
......@@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
......@@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
}, {
.num_blocks = 1,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
......@@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
return;
}
......@@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
return;
}
......
......@@ -59,19 +59,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}
static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;
u128_to_be128(&ctrblk, iv);
u128_inc(iv);
le128_to_be128(&ctrblk, iv);
le128_inc(iv);
__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, src, (u128 *)&ctrblk);
}
static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
le128 *iv)
{
be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
unsigned int i;
......@@ -80,8 +80,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
if (dst != src)
dst[i] = src[i];
u128_to_be128(&ctrblks[i], iv);
u128_inc(iv);
le128_to_be128(&ctrblks[i], iv);
le128_inc(iv);
}
serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
......
......@@ -23,7 +23,16 @@
*
*/
#include "glue_helper-asm-avx.S"
.file "twofish-avx-x86_64-asm_64.S"
.data
.align 16
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.text
/* structure of crypto context */
......@@ -217,69 +226,45 @@
vpunpcklqdq x3, t2, x2; \
vpunpckhqdq x3, t2, x3;
#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
vpxor (0*4*4)(in), wkey, x0; \
vpxor (1*4*4)(in), wkey, x1; \
vpxor (2*4*4)(in), wkey, x2; \
vpxor (3*4*4)(in), wkey, x3; \
\
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
vpxor x0, wkey, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor x1, wkey, x1; \
vmovdqu x1, (1*4*4)(out); \
vpxor x2, wkey, x2; \
vmovdqu x2, (2*4*4)(out); \
vpxor x3, wkey, x3; \
vmovdqu x3, (3*4*4)(out);
\
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
\
vpxor x0, wkey, x0; \
vpxor (0*4*4)(out), x0, x0; \
vmovdqu x0, (0*4*4)(out); \
vpxor x1, wkey, x1; \
vpxor (1*4*4)(out), x1, x1; \
vmovdqu x1, (1*4*4)(out); \
vpxor x2, wkey, x2; \
vpxor (2*4*4)(out), x2, x2; \
vmovdqu x2, (2*4*4)(out); \
vpxor x3, wkey, x3; \
vpxor (3*4*4)(out), x3, x3; \
vmovdqu x3, (3*4*4)(out);
vpxor x3, wkey, x3;
.align 8
.global __twofish_enc_blk_8way
.type __twofish_enc_blk_8way,@function;
.type __twofish_enc_blk8,@function;
__twofish_enc_blk_8way:
__twofish_enc_blk8:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* %rcx: bool, if true: xor output
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
* output:
* RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
*/
vmovdqu w(CTX), RK1;
pushq %rbp;
pushq %rbx;
pushq %rcx;
vmovdqu w(CTX), RK1;
leaq (4*4*4)(%rdx), %rax;
inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
preload_rgi(RA1);
rotate_1l(RD1);
inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
rotate_1l(RD2);
movq %rsi, %r11;
encrypt_cycle(0);
encrypt_cycle(1);
encrypt_cycle(2);
......@@ -295,47 +280,33 @@ __twofish_enc_blk_8way:
popq %rbx;
popq %rbp;
leaq (4*4*4)(%r11), %rax;
testb %cl, %cl;
jnz __enc_xor8;
outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
ret;
__enc_xor8:
outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
ret;
.align 8
.global twofish_dec_blk_8way
.type twofish_dec_blk_8way,@function;
.type __twofish_dec_blk8,@function;
twofish_dec_blk_8way:
__twofish_dec_blk8:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
*/
vmovdqu (w+4*4)(CTX), RK1;
pushq %rbp;
pushq %rbx;
vmovdqu (w+4*4)(CTX), RK1;
leaq (4*4*4)(%rdx), %rax;
inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
preload_rgi(RC1);
rotate_1l(RA1);
inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
rotate_1l(RA2);
movq %rsi, %r11;
decrypt_cycle(7);
decrypt_cycle(6);
decrypt_cycle(5);
......@@ -350,8 +321,103 @@ twofish_dec_blk_8way:
popq %rbx;
popq %rbp;
leaq (4*4*4)(%r11), %rax;
outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
ret;
.align 8
.global twofish_ecb_enc_8way
.type twofish_ecb_enc_8way,@function;
twofish_ecb_enc_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
movq %rsi, %r11;
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
call __twofish_enc_blk8;
store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
ret;
.align 8
.global twofish_ecb_dec_8way
.type twofish_ecb_dec_8way,@function;
twofish_ecb_dec_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
movq %rsi, %r11;
load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
call __twofish_dec_blk8;
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
ret;
.align 8
.global twofish_cbc_dec_8way
.type twofish_cbc_dec_8way,@function;
twofish_cbc_dec_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
*/
pushq %r12;
movq %rsi, %r11;
movq %rdx, %r12;
load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
call __twofish_dec_blk8;
store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
popq %r12;
ret;
.align 8
.global twofish_ctr_8way
.type twofish_ctr_8way,@function;
twofish_ctr_8way:
/* input:
* %rdi: ctx, CTX
* %rsi: dst
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/
pushq %r12;
movq %rsi, %r11;
movq %rdx, %r12;
load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
RD2, RX0, RX1, RY0);
call __twofish_enc_blk8;
store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
popq %r12;
ret;
......@@ -45,66 +45,23 @@
#define TWOFISH_PARALLEL_BLOCKS 8
static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src)
{
__twofish_enc_blk_3way(ctx, dst, src, false);
}
/* 8-way parallel cipher functions */
asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst,
asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst,
const u8 *src)
{
__twofish_enc_blk_8way(ctx, dst, src, false);
}
static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
const u8 *src)
{
__twofish_enc_blk_8way(ctx, dst, src, true);
}
asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);
static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst,
static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
const u8 *src)
{
twofish_dec_blk_8way(ctx, dst, src);
}
static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
{
u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
unsigned int j;
for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
ivs[j] = src[j];
twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
__twofish_enc_blk_3way(ctx, dst, src, false);
}
static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
{
be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
unsigned int i;
for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
if (dst != src)
dst[i] = src[i];
u128_to_be128(&ctrblks[i], iv);
u128_inc(iv);
}
twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
}
static const struct common_glue_ctx twofish_enc = {
.num_funcs = 3,
......@@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = {
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
......@@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = {
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) }
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
......@@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = {
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) }
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
}, {
.num_blocks = 3,
.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
......@@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = {
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) }
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
}, {
.num_blocks = 3,
.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
......@@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst);
twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
return;
}
......@@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst);
twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
return;
}
......
......@@ -62,15 +62,15 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
}
EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
be128 ctrblk;
if (dst != src)
*dst = *src;
u128_to_be128(&ctrblk, iv);
u128_inc(iv);
le128_to_be128(&ctrblk, iv);
le128_inc(iv);
twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
u128_xor(dst, dst, (u128 *)&ctrblk);
......@@ -78,7 +78,7 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
u128 *iv)
le128 *iv)
{
be128 ctrblks[3];
......@@ -88,12 +88,12 @@ void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
dst[2] = src[2];
}
u128_to_be128(&ctrblks[0], iv);
u128_inc(iv);
u128_to_be128(&ctrblks[1], iv);
u128_inc(iv);
u128_to_be128(&ctrblks[2], iv);
u128_inc(iv);
le128_to_be128(&ctrblks[0], iv);
le128_inc(iv);
le128_to_be128(&ctrblks[1], iv);
le128_inc(iv);
le128_to_be128(&ctrblks[2], iv);
le128_inc(iv);
twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
}
......
#ifndef ASM_X86_CAMELLIA_H
#define ASM_X86_CAMELLIA_H
#include <linux/kernel.h>
#include <linux/crypto.h>
#define CAMELLIA_MIN_KEY_SIZE 16
#define CAMELLIA_MAX_KEY_SIZE 32
#define CAMELLIA_BLOCK_SIZE 16
#define CAMELLIA_TABLE_BYTE_LEN 272
#define CAMELLIA_PARALLEL_BLOCKS 2
struct camellia_ctx {
u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
u32 key_length;
};
struct camellia_lrw_ctx {
struct lrw_table_ctx lrw_table;
struct camellia_ctx camellia_ctx;
};
struct camellia_xts_ctx {
struct camellia_ctx tweak_ctx;
struct camellia_ctx crypt_ctx;
};
extern int __camellia_setkey(struct camellia_ctx *cctx,
const unsigned char *key,
unsigned int key_len, u32 *flags);
extern int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen);
extern void lrw_camellia_exit_tfm(struct crypto_tfm *tfm);
extern int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen);
/* regular block cipher functions */
asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src);
/* 2-way parallel cipher functions */
asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src);
static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk(ctx, dst, src, false);
}
static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk(ctx, dst, src, true);
}
static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk_2way(ctx, dst, src, false);
}
static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk_2way(ctx, dst, src, true);
}
/* glue helpers */
extern void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src);
extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
le128 *iv);
extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
le128 *iv);
#endif /* ASM_X86_CAMELLIA_H */
......@@ -13,7 +13,7 @@
typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
u128 *iv);
le128 *iv);
#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
......@@ -71,23 +71,29 @@ static inline void glue_fpu_end(bool fpu_enabled)
kernel_fpu_end();
}
static inline void u128_to_be128(be128 *dst, const u128 *src)
static inline void le128_to_be128(be128 *dst, const le128 *src)
{
dst->a = cpu_to_be64(src->a);
dst->b = cpu_to_be64(src->b);
dst->a = cpu_to_be64(le64_to_cpu(src->a));
dst->b = cpu_to_be64(le64_to_cpu(src->b));
}
static inline void be128_to_u128(u128 *dst, const be128 *src)
static inline void be128_to_le128(le128 *dst, const be128 *src)
{
dst->a = be64_to_cpu(src->a);
dst->b = be64_to_cpu(src->b);
dst->a = cpu_to_le64(be64_to_cpu(src->a));
dst->b = cpu_to_le64(be64_to_cpu(src->b));
}
static inline void u128_inc(u128 *i)
static inline void le128_inc(le128 *i)
{
i->b++;
if (!i->b)
i->a++;
u64 a = le64_to_cpu(i->a);
u64 b = le64_to_cpu(i->b);
b++;
if (!b)
a++;
i->a = cpu_to_le64(a);
i->b = cpu_to_le64(b);
}
extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
......
......@@ -6,27 +6,14 @@
#define SERPENT_PARALLEL_BLOCKS 8
asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
const u8 *src);
static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
const u8 *src)
{
__serpent_enc_blk_8way_avx(ctx, dst, src, false);
}
static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
const u8 *src)
{
__serpent_enc_blk_8way_avx(ctx, dst, src, true);
}
static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
const u8 *src)
{
serpent_dec_blk_8way_avx(ctx, dst, src);
}
asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
const u8 *src, le128 *iv);
#endif
......@@ -31,9 +31,9 @@ asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
/* helpers from twofish_x86_64-3way module */
extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
u128 *iv);
le128 *iv);
extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
u128 *iv);
le128 *iv);
extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen);
......
......@@ -324,9 +324,19 @@ config CRYPTO_CRC32C
by iSCSI for header and data digests and by others.
See Castagnoli93. Module will be crc32c.
config CRYPTO_CRC32C_X86_64
bool
depends on X86 && 64BIT
select CRYPTO_HASH
help
In Intel processor with SSE4.2 supported, the processor will
support CRC32C calculation using hardware accelerated CRC32
instruction optimized with PCLMULQDQ instruction when available.
config CRYPTO_CRC32C_INTEL
tristate "CRC32c INTEL hardware acceleration"
depends on X86
select CRYPTO_CRC32C_X86_64 if 64BIT
select CRYPTO_HASH
help
In Intel processor with SSE4.2 supported, the processor will
......@@ -793,6 +803,28 @@ config CRYPTO_CAMELLIA_X86_64
See also:
<https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX)"
depends on X86 && 64BIT
depends on CRYPTO
select CRYPTO_ALGAPI
select CRYPTO_CRYPTD
select CRYPTO_ABLK_HELPER_X86
select CRYPTO_GLUE_HELPER_X86
select CRYPTO_CAMELLIA_X86_64
select CRYPTO_LRW
select CRYPTO_XTS
help
Camellia cipher algorithm module (x86_64/AES-NI/AVX).
Camellia is a symmetric key block cipher developed jointly
at NTT and Mitsubishi Electric Corporation.
The Camellia specifies three key sizes: 128, 192 and 256 bits.
See also:
<https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
config CRYPTO_CAMELLIA_SPARC64
tristate "Camellia cipher algorithm (SPARC64)"
depends on SPARC64
......@@ -809,9 +841,16 @@ config CRYPTO_CAMELLIA_SPARC64
See also:
<https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
config CRYPTO_CAST_COMMON
tristate
help
Common parts of the CAST cipher algorithms shared by the
generic c and the assembler implementations.
config CRYPTO_CAST5
tristate "CAST5 (CAST-128) cipher algorithm"
select CRYPTO_ALGAPI
select CRYPTO_CAST_COMMON
help
The CAST5 encryption algorithm (synonymous with CAST-128) is
described in RFC2144.
......@@ -822,6 +861,7 @@ config CRYPTO_CAST5_AVX_X86_64
select CRYPTO_ALGAPI
select CRYPTO_CRYPTD
select CRYPTO_ABLK_HELPER_X86
select CRYPTO_CAST_COMMON
select CRYPTO_CAST5
help
The CAST5 encryption algorithm (synonymous with CAST-128) is
......@@ -833,6 +873,7 @@ config CRYPTO_CAST5_AVX_X86_64
config CRYPTO_CAST6
tristate "CAST6 (CAST-256) cipher algorithm"
select CRYPTO_ALGAPI
select CRYPTO_CAST_COMMON
help
The CAST6 encryption algorithm (synonymous with CAST-256) is
described in RFC2612.
......@@ -844,6 +885,7 @@ config CRYPTO_CAST6_AVX_X86_64
select CRYPTO_CRYPTD
select CRYPTO_ABLK_HELPER_X86
select CRYPTO_GLUE_HELPER_X86
select CRYPTO_CAST_COMMON
select CRYPTO_CAST6
select CRYPTO_LRW
select CRYPTO_XTS
......
......@@ -68,6 +68,7 @@ obj-$(CONFIG_CRYPTO_TWOFISH_COMMON) += twofish_common.o
obj-$(CONFIG_CRYPTO_SERPENT) += serpent_generic.o
obj-$(CONFIG_CRYPTO_AES) += aes_generic.o
obj-$(CONFIG_CRYPTO_CAMELLIA) += camellia_generic.o
obj-$(CONFIG_CRYPTO_CAST_COMMON) += cast_common.o
obj-$(CONFIG_CRYPTO_CAST5) += cast5_generic.o
obj-$(CONFIG_CRYPTO_CAST6) += cast6_generic.o
obj-$(CONFIG_CRYPTO_ARC4) += arc4.o
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -971,11 +971,13 @@ static int do_test(int m)
case 3:
ret += tcrypt_test("ecb(des)");
ret += tcrypt_test("cbc(des)");
ret += tcrypt_test("ctr(des)");
break;
case 4:
ret += tcrypt_test("ecb(des3_ede)");
ret += tcrypt_test("cbc(des3_ede)");
ret += tcrypt_test("ctr(des3_ede)");
break;
case 5:
......@@ -1479,6 +1481,10 @@ static int do_test(int m)
test_hash_speed("ghash-generic", sec, hash_speed_template_16);
if (mode > 300 && mode < 400) break;
case 319:
test_hash_speed("crc32c", sec, generic_hash_speed_template);
if (mode > 300 && mode < 400) break;
case 399:
break;
......@@ -1722,6 +1728,29 @@ static int do_test(int m)
speed_template_32_64);
break;
case 508:
test_acipher_speed("ecb(camellia)", ENCRYPT, sec, NULL, 0,
speed_template_16_32);
test_acipher_speed("ecb(camellia)", DECRYPT, sec, NULL, 0,
speed_template_16_32);
test_acipher_speed("cbc(camellia)", ENCRYPT, sec, NULL, 0,
speed_template_16_32);
test_acipher_speed("cbc(camellia)", DECRYPT, sec, NULL, 0,
speed_template_16_32);
test_acipher_speed("ctr(camellia)", ENCRYPT, sec, NULL, 0,
speed_template_16_32);
test_acipher_speed("ctr(camellia)", DECRYPT, sec, NULL, 0,
speed_template_16_32);
test_acipher_speed("lrw(camellia)", ENCRYPT, sec, NULL, 0,
speed_template_32_48);
test_acipher_speed("lrw(camellia)", DECRYPT, sec, NULL, 0,
speed_template_32_48);
test_acipher_speed("xts(camellia)", ENCRYPT, sec, NULL, 0,
speed_template_32_64);
test_acipher_speed("xts(camellia)", DECRYPT, sec, NULL, 0,
speed_template_32_64);
break;
case 1000:
test_available();
break;
......
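A hedged usage note (not part of the patch): tcrypt runs its tests at module load time, so the new camellia async speed tests above would typically be kicked off with something like "modprobe tcrypt mode=508 sec=1", with the results reported in the kernel log; tcrypt intentionally fails its init so it never stays loaded after the run.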
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -254,6 +254,7 @@ config CRYPTO_DEV_OMAP_AES
tristate "Support for OMAP AES hw engine"
depends on ARCH_OMAP2 || ARCH_OMAP3
select CRYPTO_AES
select CRYPTO_BLKCIPHER2
help
OMAP processors have AES module accelerator. Select this if you
want to use the OMAP module for AES algorithms.
......
......@@ -1863,6 +1863,7 @@ static int __devexit spacc_remove(struct platform_device *pdev)
static const struct platform_device_id spacc_id_table[] = {
{ "picochip,spacc-ipsec", },
{ "picochip,spacc-l2", },
{ }
};
static struct platform_driver spacc_driver = {
......
......@@ -30,7 +30,7 @@
#include <crypto/ctr.h>
#include <plat/cpu.h>
#include <plat/dma.h>
#include <mach/dma.h>
#define _SBF(s, v) ((v) << (s))
#define _BIT(b) _SBF(b, 1)
......
......@@ -936,8 +936,7 @@ static int sg_to_link_tbl(struct scatterlist *sg, int sg_count,
sg_count--;
link_tbl_ptr--;
}
link_tbl_ptr->len = cpu_to_be16(be16_to_cpu(link_tbl_ptr->len)
+ cryptlen);
be16_add_cpu(&link_tbl_ptr->len, cryptlen);
/* tag end of link table */
link_tbl_ptr->j_extent = DESC_PTR_LNKTBL_RETURN;
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#ifndef _CRYPTO_CAST_COMMON_H
#define _CRYPTO_CAST_COMMON_H
extern const u32 cast_s1[256];
extern const u32 cast_s2[256];
extern const u32 cast_s3[256];
extern const u32 cast_s4[256];
#endif
This diff is collapsed.
This diff is collapsed.