Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a450ad5..759b1a9 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -8,22 +8,21 @@
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
+avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
-obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
-obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -37,14 +36,9 @@
obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
-obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o
-obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o
-obj-$(CONFIG_CRYPTO_MORUS640_GLUE) += morus640_glue.o
-obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
-
-obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
-obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
@@ -60,32 +54,22 @@
ifeq ($(avx2_supported),yes)
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
- obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/
- obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/
- obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/
-
- obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o
endif
-aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
-aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
+chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
-aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o
-aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
-morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
-morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
+nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
@@ -100,13 +84,17 @@
ifeq ($(avx2_supported),yes)
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
- chacha20-x86_64-y += chacha20-avx2-x86_64.o
+ chacha-x86_64-y += chacha-avx2-x86_64.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
- morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
+ nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
endif
-aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
+ifeq ($(avx512_supported),yes)
+ chacha-x86_64-y += chacha-avx512vl-x86_64.o
+endif
+
+aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 5f7e43d..4434607 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* AES-NI + SSE2 implementation of AEGIS-128
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 2a356b9..46d2271 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -1,18 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* The AEGIS-128 Authenticated-Encryption Algorithm
* Glue for AES-NI + SSE2 implementation
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
*/
-#include <crypto/cryptd.h>
#include <crypto/internal/aead.h>
+#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/module.h>
@@ -119,31 +115,20 @@
}
static void crypto_aegis128_aesni_process_crypt(
- struct aegis_state *state, struct aead_request *req,
+ struct aegis_state *state, struct skcipher_walk *walk,
const struct aegis_crypt_ops *ops)
{
- struct skcipher_walk walk;
- u8 *src, *dst;
- unsigned int chunksize, base;
+ while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
+ ops->crypt_blocks(state,
+ round_down(walk->nbytes, AEGIS128_BLOCK_SIZE),
+ walk->src.virt.addr, walk->dst.virt.addr);
+ skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
+ }
- ops->skcipher_walk_init(&walk, req, false);
-
- while (walk.nbytes) {
- src = walk.src.virt.addr;
- dst = walk.dst.virt.addr;
- chunksize = walk.nbytes;
-
- ops->crypt_blocks(state, chunksize, src, dst);
-
- base = chunksize & ~(AEGIS128_BLOCK_SIZE - 1);
- src += base;
- dst += base;
- chunksize &= AEGIS128_BLOCK_SIZE - 1;
-
- if (chunksize > 0)
- ops->crypt_tail(state, chunksize, src, dst);
-
- skcipher_walk_done(&walk, 0);
+ if (walk->nbytes) {
+ ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
+ walk->dst.virt.addr);
+ skcipher_walk_done(walk, 0);
}
}
@@ -186,13 +171,16 @@
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
+ struct skcipher_walk walk;
struct aegis_state state;
+ ops->skcipher_walk_init(&walk, req, true);
+
kernel_fpu_begin();
crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
- crypto_aegis128_aesni_process_crypt(&state, req, ops);
+ crypto_aegis128_aesni_process_crypt(&state, &walk, ops);
crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
@@ -250,131 +238,35 @@
{
}
-static int cryptd_aegis128_aesni_setkey(struct crypto_aead *aead,
- const u8 *key, unsigned int keylen)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
+static struct aead_alg crypto_aegis128_aesni_alg = {
+ .setkey = crypto_aegis128_aesni_setkey,
+ .setauthsize = crypto_aegis128_aesni_setauthsize,
+ .encrypt = crypto_aegis128_aesni_encrypt,
+ .decrypt = crypto_aegis128_aesni_decrypt,
+ .init = crypto_aegis128_aesni_init_tfm,
+ .exit = crypto_aegis128_aesni_exit_tfm,
- return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
-}
+ .ivsize = AEGIS128_NONCE_SIZE,
+ .maxauthsize = AEGIS128_MAX_AUTH_SIZE,
+ .chunksize = AEGIS128_BLOCK_SIZE,
-static int cryptd_aegis128_aesni_setauthsize(struct crypto_aead *aead,
- unsigned int authsize)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
+ .base = {
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aegis_ctx) +
+ __alignof__(struct aegis_ctx),
+ .cra_alignmask = 0,
+ .cra_priority = 400,
- return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
-}
+ .cra_name = "__aegis128",
+ .cra_driver_name = "__aegis128-aesni",
-static int cryptd_aegis128_aesni_encrypt(struct aead_request *req)
-{
- struct crypto_aead *aead = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- aead = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- aead = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, aead);
-
- return crypto_aead_encrypt(req);
-}
-
-static int cryptd_aegis128_aesni_decrypt(struct aead_request *req)
-{
- struct crypto_aead *aead = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- aead = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- aead = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, aead);
-
- return crypto_aead_decrypt(req);
-}
-
-static int cryptd_aegis128_aesni_init_tfm(struct crypto_aead *aead)
-{
- struct cryptd_aead *cryptd_tfm;
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_tfm = cryptd_alloc_aead("__aegis128-aesni", CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
-
- *ctx = cryptd_tfm;
- crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
- return 0;
-}
-
-static void cryptd_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_free_aead(*ctx);
-}
-
-static struct aead_alg crypto_aegis128_aesni_alg[] = {
- {
- .setkey = crypto_aegis128_aesni_setkey,
- .setauthsize = crypto_aegis128_aesni_setauthsize,
- .encrypt = crypto_aegis128_aesni_encrypt,
- .decrypt = crypto_aegis128_aesni_decrypt,
- .init = crypto_aegis128_aesni_init_tfm,
- .exit = crypto_aegis128_aesni_exit_tfm,
-
- .ivsize = AEGIS128_NONCE_SIZE,
- .maxauthsize = AEGIS128_MAX_AUTH_SIZE,
- .chunksize = AEGIS128_BLOCK_SIZE,
-
- .base = {
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct aegis_ctx) +
- __alignof__(struct aegis_ctx),
- .cra_alignmask = 0,
-
- .cra_name = "__aegis128",
- .cra_driver_name = "__aegis128-aesni",
-
- .cra_module = THIS_MODULE,
- }
- }, {
- .setkey = cryptd_aegis128_aesni_setkey,
- .setauthsize = cryptd_aegis128_aesni_setauthsize,
- .encrypt = cryptd_aegis128_aesni_encrypt,
- .decrypt = cryptd_aegis128_aesni_decrypt,
- .init = cryptd_aegis128_aesni_init_tfm,
- .exit = cryptd_aegis128_aesni_exit_tfm,
-
- .ivsize = AEGIS128_NONCE_SIZE,
- .maxauthsize = AEGIS128_MAX_AUTH_SIZE,
- .chunksize = AEGIS128_BLOCK_SIZE,
-
- .base = {
- .cra_flags = CRYPTO_ALG_ASYNC,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct cryptd_aead *),
- .cra_alignmask = 0,
-
- .cra_priority = 400,
-
- .cra_name = "aegis128",
- .cra_driver_name = "aegis128-aesni",
-
- .cra_module = THIS_MODULE,
- }
+ .cra_module = THIS_MODULE,
}
};
+static struct simd_aead_alg *simd_alg;
+
static int __init crypto_aegis128_aesni_module_init(void)
{
if (!boot_cpu_has(X86_FEATURE_XMM2) ||
@@ -382,14 +274,13 @@
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
- return crypto_register_aeads(crypto_aegis128_aesni_alg,
- ARRAY_SIZE(crypto_aegis128_aesni_alg));
+ return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1,
+ &simd_alg);
}
static void __exit crypto_aegis128_aesni_module_exit(void)
{
- crypto_unregister_aeads(crypto_aegis128_aesni_alg,
- ARRAY_SIZE(crypto_aegis128_aesni_alg));
+ simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg);
}
module_init(crypto_aegis128_aesni_module_init);
diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S
deleted file mode 100644
index 491dd61..0000000
--- a/arch/x86/crypto/aegis128l-aesni-asm.S
+++ /dev/null
@@ -1,826 +0,0 @@
-/*
- * AES-NI + SSE2 implementation of AEGIS-128L
- *
- * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define STATE0 %xmm0
-#define STATE1 %xmm1
-#define STATE2 %xmm2
-#define STATE3 %xmm3
-#define STATE4 %xmm4
-#define STATE5 %xmm5
-#define STATE6 %xmm6
-#define STATE7 %xmm7
-#define MSG0 %xmm8
-#define MSG1 %xmm9
-#define T0 %xmm10
-#define T1 %xmm11
-#define T2 %xmm12
-#define T3 %xmm13
-
-#define STATEP %rdi
-#define LEN %rsi
-#define SRC %rdx
-#define DST %rcx
-
-.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32
-.align 16
-.Laegis128l_const_0:
- .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
- .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
-.Laegis128l_const_1:
- .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
- .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
-
-.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16
-.align 16
-.Laegis128l_counter0:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-.Laegis128l_counter1:
- .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
- .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
-
-.text
-
-/*
- * __load_partial: internal ABI
- * input:
- * LEN - bytes
- * SRC - src
- * output:
- * MSG0 - first message block
- * MSG1 - second message block
- * changed:
- * T0
- * %r8
- * %r9
- */
-__load_partial:
- xor %r9d, %r9d
- pxor MSG0, MSG0
- pxor MSG1, MSG1
-
- mov LEN, %r8
- and $0x1, %r8
- jz .Lld_partial_1
-
- mov LEN, %r8
- and $0x1E, %r8
- add SRC, %r8
- mov (%r8), %r9b
-
-.Lld_partial_1:
- mov LEN, %r8
- and $0x2, %r8
- jz .Lld_partial_2
-
- mov LEN, %r8
- and $0x1C, %r8
- add SRC, %r8
- shl $0x10, %r9
- mov (%r8), %r9w
-
-.Lld_partial_2:
- mov LEN, %r8
- and $0x4, %r8
- jz .Lld_partial_4
-
- mov LEN, %r8
- and $0x18, %r8
- add SRC, %r8
- shl $32, %r9
- mov (%r8), %r8d
- xor %r8, %r9
-
-.Lld_partial_4:
- movq %r9, MSG0
-
- mov LEN, %r8
- and $0x8, %r8
- jz .Lld_partial_8
-
- mov LEN, %r8
- and $0x10, %r8
- add SRC, %r8
- pslldq $8, MSG0
- movq (%r8), T0
- pxor T0, MSG0
-
-.Lld_partial_8:
- mov LEN, %r8
- and $0x10, %r8
- jz .Lld_partial_16
-
- movdqa MSG0, MSG1
- movdqu (SRC), MSG0
-
-.Lld_partial_16:
- ret
-ENDPROC(__load_partial)
-
-/*
- * __store_partial: internal ABI
- * input:
- * LEN - bytes
- * DST - dst
- * output:
- * T0 - first message block
- * T1 - second message block
- * changed:
- * %r8
- * %r9
- * %r10
- */
-__store_partial:
- mov LEN, %r8
- mov DST, %r9
-
- cmp $16, %r8
- jl .Lst_partial_16
-
- movdqu T0, (%r9)
- movdqa T1, T0
-
- sub $16, %r8
- add $16, %r9
-
-.Lst_partial_16:
- movq T0, %r10
-
- cmp $8, %r8
- jl .Lst_partial_8
-
- mov %r10, (%r9)
- psrldq $8, T0
- movq T0, %r10
-
- sub $8, %r8
- add $8, %r9
-
-.Lst_partial_8:
- cmp $4, %r8
- jl .Lst_partial_4
-
- mov %r10d, (%r9)
- shr $32, %r10
-
- sub $4, %r8
- add $4, %r9
-
-.Lst_partial_4:
- cmp $2, %r8
- jl .Lst_partial_2
-
- mov %r10w, (%r9)
- shr $0x10, %r10
-
- sub $2, %r8
- add $2, %r9
-
-.Lst_partial_2:
- cmp $1, %r8
- jl .Lst_partial_1
-
- mov %r10b, (%r9)
-
-.Lst_partial_1:
- ret
-ENDPROC(__store_partial)
-
-.macro update
- movdqa STATE7, T0
- aesenc STATE0, STATE7
- aesenc STATE1, STATE0
- aesenc STATE2, STATE1
- aesenc STATE3, STATE2
- aesenc STATE4, STATE3
- aesenc STATE5, STATE4
- aesenc STATE6, STATE5
- aesenc T0, STATE6
-.endm
-
-.macro update0
- update
- pxor MSG0, STATE7
- pxor MSG1, STATE3
-.endm
-
-.macro update1
- update
- pxor MSG0, STATE6
- pxor MSG1, STATE2
-.endm
-
-.macro update2
- update
- pxor MSG0, STATE5
- pxor MSG1, STATE1
-.endm
-
-.macro update3
- update
- pxor MSG0, STATE4
- pxor MSG1, STATE0
-.endm
-
-.macro update4
- update
- pxor MSG0, STATE3
- pxor MSG1, STATE7
-.endm
-
-.macro update5
- update
- pxor MSG0, STATE2
- pxor MSG1, STATE6
-.endm
-
-.macro update6
- update
- pxor MSG0, STATE1
- pxor MSG1, STATE5
-.endm
-
-.macro update7
- update
- pxor MSG0, STATE0
- pxor MSG1, STATE4
-.endm
-
-.macro state_load
- movdqu 0x00(STATEP), STATE0
- movdqu 0x10(STATEP), STATE1
- movdqu 0x20(STATEP), STATE2
- movdqu 0x30(STATEP), STATE3
- movdqu 0x40(STATEP), STATE4
- movdqu 0x50(STATEP), STATE5
- movdqu 0x60(STATEP), STATE6
- movdqu 0x70(STATEP), STATE7
-.endm
-
-.macro state_store s0 s1 s2 s3 s4 s5 s6 s7
- movdqu \s7, 0x00(STATEP)
- movdqu \s0, 0x10(STATEP)
- movdqu \s1, 0x20(STATEP)
- movdqu \s2, 0x30(STATEP)
- movdqu \s3, 0x40(STATEP)
- movdqu \s4, 0x50(STATEP)
- movdqu \s5, 0x60(STATEP)
- movdqu \s6, 0x70(STATEP)
-.endm
-
-.macro state_store0
- state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
-.endm
-
-.macro state_store1
- state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
-.endm
-
-.macro state_store2
- state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
-.endm
-
-.macro state_store3
- state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
-.endm
-
-.macro state_store4
- state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
-.endm
-
-.macro state_store5
- state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
-.endm
-
-.macro state_store6
- state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
-.endm
-
-.macro state_store7
- state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
-.endm
-
-/*
- * void crypto_aegis128l_aesni_init(void *state, const void *key, const void *iv);
- */
-ENTRY(crypto_aegis128l_aesni_init)
- FRAME_BEGIN
-
- /* load key: */
- movdqa (%rsi), MSG1
- movdqa MSG1, STATE0
- movdqa MSG1, STATE4
- movdqa MSG1, STATE5
- movdqa MSG1, STATE6
- movdqa MSG1, STATE7
-
- /* load IV: */
- movdqu (%rdx), MSG0
- pxor MSG0, STATE0
- pxor MSG0, STATE4
-
- /* load the constants: */
- movdqa .Laegis128l_const_0, STATE2
- movdqa .Laegis128l_const_1, STATE1
- movdqa STATE1, STATE3
- pxor STATE2, STATE5
- pxor STATE1, STATE6
- pxor STATE2, STATE7
-
- /* update 10 times with IV and KEY: */
- update0
- update1
- update2
- update3
- update4
- update5
- update6
- update7
- update0
- update1
-
- state_store1
-
- FRAME_END
- ret
-ENDPROC(crypto_aegis128l_aesni_init)
-
-.macro ad_block a i
- movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
- movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
- update\i
- sub $0x20, LEN
- cmp $0x20, LEN
- jl .Lad_out_\i
-.endm
-
-/*
- * void crypto_aegis128l_aesni_ad(void *state, unsigned int length,
- * const void *data);
- */
-ENTRY(crypto_aegis128l_aesni_ad)
- FRAME_BEGIN
-
- cmp $0x20, LEN
- jb .Lad_out
-
- state_load
-
- mov SRC, %r8
- and $0xf, %r8
- jnz .Lad_u_loop
-
-.align 8
-.Lad_a_loop:
- ad_block a 0
- ad_block a 1
- ad_block a 2
- ad_block a 3
- ad_block a 4
- ad_block a 5
- ad_block a 6
- ad_block a 7
-
- add $0x100, SRC
- jmp .Lad_a_loop
-
-.align 8
-.Lad_u_loop:
- ad_block u 0
- ad_block u 1
- ad_block u 2
- ad_block u 3
- ad_block u 4
- ad_block u 5
- ad_block u 6
- ad_block u 7
-
- add $0x100, SRC
- jmp .Lad_u_loop
-
-.Lad_out_0:
- state_store0
- FRAME_END
- ret
-
-.Lad_out_1:
- state_store1
- FRAME_END
- ret
-
-.Lad_out_2:
- state_store2
- FRAME_END
- ret
-
-.Lad_out_3:
- state_store3
- FRAME_END
- ret
-
-.Lad_out_4:
- state_store4
- FRAME_END
- ret
-
-.Lad_out_5:
- state_store5
- FRAME_END
- ret
-
-.Lad_out_6:
- state_store6
- FRAME_END
- ret
-
-.Lad_out_7:
- state_store7
- FRAME_END
- ret
-
-.Lad_out:
- FRAME_END
- ret
-ENDPROC(crypto_aegis128l_aesni_ad)
-
-.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7
- pxor \s1, \m0
- pxor \s6, \m0
- movdqa \s2, T3
- pand \s3, T3
- pxor T3, \m0
-
- pxor \s2, \m1
- pxor \s5, \m1
- movdqa \s6, T3
- pand \s7, T3
- pxor T3, \m1
-.endm
-
-.macro crypt0 m0 m1
- crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7
-.endm
-
-.macro crypt1 m0 m1
- crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6
-.endm
-
-.macro crypt2 m0 m1
- crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
-.endm
-
-.macro crypt3 m0 m1
- crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4
-.endm
-
-.macro crypt4 m0 m1
- crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3
-.endm
-
-.macro crypt5 m0 m1
- crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2
-.endm
-
-.macro crypt6 m0 m1
- crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1
-.endm
-
-.macro crypt7 m0 m1
- crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0
-.endm
-
-.macro encrypt_block a i
- movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
- movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
- movdqa MSG0, T0
- movdqa MSG1, T1
- crypt\i T0, T1
- movdq\a T0, (\i * 0x20 + 0x00)(DST)
- movdq\a T1, (\i * 0x20 + 0x10)(DST)
-
- update\i
-
- sub $0x20, LEN
- cmp $0x20, LEN
- jl .Lenc_out_\i
-.endm
-
-.macro decrypt_block a i
- movdq\a (\i * 0x20 + 0x00)(SRC), MSG0
- movdq\a (\i * 0x20 + 0x10)(SRC), MSG1
- crypt\i MSG0, MSG1
- movdq\a MSG0, (\i * 0x20 + 0x00)(DST)
- movdq\a MSG1, (\i * 0x20 + 0x10)(DST)
-
- update\i
-
- sub $0x20, LEN
- cmp $0x20, LEN
- jl .Ldec_out_\i
-.endm
-
-/*
- * void crypto_aegis128l_aesni_enc(void *state, unsigned int length,
- * const void *src, void *dst);
- */
-ENTRY(crypto_aegis128l_aesni_enc)
- FRAME_BEGIN
-
- cmp $0x20, LEN
- jb .Lenc_out
-
- state_load
-
- mov SRC, %r8
- or DST, %r8
- and $0xf, %r8
- jnz .Lenc_u_loop
-
-.align 8
-.Lenc_a_loop:
- encrypt_block a 0
- encrypt_block a 1
- encrypt_block a 2
- encrypt_block a 3
- encrypt_block a 4
- encrypt_block a 5
- encrypt_block a 6
- encrypt_block a 7
-
- add $0x100, SRC
- add $0x100, DST
- jmp .Lenc_a_loop
-
-.align 8
-.Lenc_u_loop:
- encrypt_block u 0
- encrypt_block u 1
- encrypt_block u 2
- encrypt_block u 3
- encrypt_block u 4
- encrypt_block u 5
- encrypt_block u 6
- encrypt_block u 7
-
- add $0x100, SRC
- add $0x100, DST
- jmp .Lenc_u_loop
-
-.Lenc_out_0:
- state_store0
- FRAME_END
- ret
-
-.Lenc_out_1:
- state_store1
- FRAME_END
- ret
-
-.Lenc_out_2:
- state_store2
- FRAME_END
- ret
-
-.Lenc_out_3:
- state_store3
- FRAME_END
- ret
-
-.Lenc_out_4:
- state_store4
- FRAME_END
- ret
-
-.Lenc_out_5:
- state_store5
- FRAME_END
- ret
-
-.Lenc_out_6:
- state_store6
- FRAME_END
- ret
-
-.Lenc_out_7:
- state_store7
- FRAME_END
- ret
-
-.Lenc_out:
- FRAME_END
- ret
-ENDPROC(crypto_aegis128l_aesni_enc)
-
-/*
- * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length,
- * const void *src, void *dst);
- */
-ENTRY(crypto_aegis128l_aesni_enc_tail)
- FRAME_BEGIN
-
- state_load
-
- /* encrypt message: */
- call __load_partial
-
- movdqa MSG0, T0
- movdqa MSG1, T1
- crypt0 T0, T1
-
- call __store_partial
-
- update0
-
- state_store0
-
- FRAME_END
- ret
-ENDPROC(crypto_aegis128l_aesni_enc_tail)
-
-/*
- * void crypto_aegis128l_aesni_dec(void *state, unsigned int length,
- * const void *src, void *dst);
- */
-ENTRY(crypto_aegis128l_aesni_dec)
- FRAME_BEGIN
-
- cmp $0x20, LEN
- jb .Ldec_out
-
- state_load
-
- mov SRC, %r8
- or DST, %r8
- and $0xF, %r8
- jnz .Ldec_u_loop
-
-.align 8
-.Ldec_a_loop:
- decrypt_block a 0
- decrypt_block a 1
- decrypt_block a 2
- decrypt_block a 3
- decrypt_block a 4
- decrypt_block a 5
- decrypt_block a 6
- decrypt_block a 7
-
- add $0x100, SRC
- add $0x100, DST
- jmp .Ldec_a_loop
-
-.align 8
-.Ldec_u_loop:
- decrypt_block u 0
- decrypt_block u 1
- decrypt_block u 2
- decrypt_block u 3
- decrypt_block u 4
- decrypt_block u 5
- decrypt_block u 6
- decrypt_block u 7
-
- add $0x100, SRC
- add $0x100, DST
- jmp .Ldec_u_loop
-
-.Ldec_out_0:
- state_store0
- FRAME_END
- ret
-
-.Ldec_out_1:
- state_store1
- FRAME_END
- ret
-
-.Ldec_out_2:
- state_store2
- FRAME_END
- ret
-
-.Ldec_out_3:
- state_store3
- FRAME_END
- ret
-
-.Ldec_out_4:
- state_store4
- FRAME_END
- ret
-
-.Ldec_out_5:
- state_store5
- FRAME_END
- ret
-
-.Ldec_out_6:
- state_store6
- FRAME_END
- ret
-
-.Ldec_out_7:
- state_store7
- FRAME_END
- ret
-
-.Ldec_out:
- FRAME_END
- ret
-ENDPROC(crypto_aegis128l_aesni_dec)
-
-/*
- * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length,
- * const void *src, void *dst);
- */
-ENTRY(crypto_aegis128l_aesni_dec_tail)
- FRAME_BEGIN
-
- state_load
-
- /* decrypt message: */
- call __load_partial
-
- crypt0 MSG0, MSG1
-
- movdqa MSG0, T0
- movdqa MSG1, T1
- call __store_partial
-
- /* mask with byte count: */
- movq LEN, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- movdqa T0, T1
- movdqa .Laegis128l_counter0, T2
- movdqa .Laegis128l_counter1, T3
- pcmpgtb T2, T0
- pcmpgtb T3, T1
- pand T0, MSG0
- pand T1, MSG1
-
- update0
-
- state_store0
-
- FRAME_END
- ret
-ENDPROC(crypto_aegis128l_aesni_dec_tail)
-
-/*
- * void crypto_aegis128l_aesni_final(void *state, void *tag_xor,
- * u64 assoclen, u64 cryptlen);
- */
-ENTRY(crypto_aegis128l_aesni_final)
- FRAME_BEGIN
-
- state_load
-
- /* prepare length block: */
- movq %rdx, MSG0
- movq %rcx, T0
- pslldq $8, T0
- pxor T0, MSG0
- psllq $3, MSG0 /* multiply by 8 (to get bit count) */
-
- pxor STATE2, MSG0
- movdqa MSG0, MSG1
-
- /* update state: */
- update0
- update1
- update2
- update3
- update4
- update5
- update6
-
- /* xor tag: */
- movdqu (%rsi), T0
-
- pxor STATE1, T0
- pxor STATE2, T0
- pxor STATE3, T0
- pxor STATE4, T0
- pxor STATE5, T0
- pxor STATE6, T0
- pxor STATE7, T0
-
- movdqu T0, (%rsi)
-
- FRAME_END
- ret
-ENDPROC(crypto_aegis128l_aesni_final)
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c
deleted file mode 100644
index dbe8bb9..0000000
--- a/arch/x86/crypto/aegis128l-aesni-glue.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * The AEGIS-128L Authenticated-Encryption Algorithm
- * Glue for AES-NI + SSE2 implementation
- *
- * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-
-#include <crypto/cryptd.h>
-#include <crypto/internal/aead.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/scatterwalk.h>
-#include <linux/module.h>
-#include <asm/fpu/api.h>
-#include <asm/cpu_device_id.h>
-
-#define AEGIS128L_BLOCK_ALIGN 16
-#define AEGIS128L_BLOCK_SIZE 32
-#define AEGIS128L_NONCE_SIZE 16
-#define AEGIS128L_STATE_BLOCKS 8
-#define AEGIS128L_KEY_SIZE 16
-#define AEGIS128L_MIN_AUTH_SIZE 8
-#define AEGIS128L_MAX_AUTH_SIZE 16
-
-asmlinkage void crypto_aegis128l_aesni_init(void *state, void *key, void *iv);
-
-asmlinkage void crypto_aegis128l_aesni_ad(
- void *state, unsigned int length, const void *data);
-
-asmlinkage void crypto_aegis128l_aesni_enc(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128l_aesni_dec(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128l_aesni_enc_tail(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128l_aesni_dec_tail(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128l_aesni_final(
- void *state, void *tag_xor, unsigned int cryptlen,
- unsigned int assoclen);
-
-struct aegis_block {
- u8 bytes[AEGIS128L_BLOCK_SIZE] __aligned(AEGIS128L_BLOCK_ALIGN);
-};
-
-struct aegis_state {
- struct aegis_block blocks[AEGIS128L_STATE_BLOCKS];
-};
-
-struct aegis_ctx {
- struct aegis_block key;
-};
-
-struct aegis_crypt_ops {
- int (*skcipher_walk_init)(struct skcipher_walk *walk,
- struct aead_request *req, bool atomic);
-
- void (*crypt_blocks)(void *state, unsigned int length, const void *src,
- void *dst);
- void (*crypt_tail)(void *state, unsigned int length, const void *src,
- void *dst);
-};
-
-static void crypto_aegis128l_aesni_process_ad(
- struct aegis_state *state, struct scatterlist *sg_src,
- unsigned int assoclen)
-{
- struct scatter_walk walk;
- struct aegis_block buf;
- unsigned int pos = 0;
-
- scatterwalk_start(&walk, sg_src);
- while (assoclen != 0) {
- unsigned int size = scatterwalk_clamp(&walk, assoclen);
- unsigned int left = size;
- void *mapped = scatterwalk_map(&walk);
- const u8 *src = (const u8 *)mapped;
-
- if (pos + size >= AEGIS128L_BLOCK_SIZE) {
- if (pos > 0) {
- unsigned int fill = AEGIS128L_BLOCK_SIZE - pos;
- memcpy(buf.bytes + pos, src, fill);
- crypto_aegis128l_aesni_ad(state,
- AEGIS128L_BLOCK_SIZE,
- buf.bytes);
- pos = 0;
- left -= fill;
- src += fill;
- }
-
- crypto_aegis128l_aesni_ad(state, left, src);
-
- src += left & ~(AEGIS128L_BLOCK_SIZE - 1);
- left &= AEGIS128L_BLOCK_SIZE - 1;
- }
-
- memcpy(buf.bytes + pos, src, left);
- pos += left;
- assoclen -= size;
-
- scatterwalk_unmap(mapped);
- scatterwalk_advance(&walk, size);
- scatterwalk_done(&walk, 0, assoclen);
- }
-
- if (pos > 0) {
- memset(buf.bytes + pos, 0, AEGIS128L_BLOCK_SIZE - pos);
- crypto_aegis128l_aesni_ad(state, AEGIS128L_BLOCK_SIZE, buf.bytes);
- }
-}
-
-static void crypto_aegis128l_aesni_process_crypt(
- struct aegis_state *state, struct aead_request *req,
- const struct aegis_crypt_ops *ops)
-{
- struct skcipher_walk walk;
- u8 *src, *dst;
- unsigned int chunksize, base;
-
- ops->skcipher_walk_init(&walk, req, false);
-
- while (walk.nbytes) {
- src = walk.src.virt.addr;
- dst = walk.dst.virt.addr;
- chunksize = walk.nbytes;
-
- ops->crypt_blocks(state, chunksize, src, dst);
-
- base = chunksize & ~(AEGIS128L_BLOCK_SIZE - 1);
- src += base;
- dst += base;
- chunksize &= AEGIS128L_BLOCK_SIZE - 1;
-
- if (chunksize > 0)
- ops->crypt_tail(state, chunksize, src, dst);
-
- skcipher_walk_done(&walk, 0);
- }
-}
-
-static struct aegis_ctx *crypto_aegis128l_aesni_ctx(struct crypto_aead *aead)
-{
- u8 *ctx = crypto_aead_ctx(aead);
- ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx));
- return (void *)ctx;
-}
-
-static int crypto_aegis128l_aesni_setkey(struct crypto_aead *aead,
- const u8 *key, unsigned int keylen)
-{
- struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(aead);
-
- if (keylen != AEGIS128L_KEY_SIZE) {
- crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
-
- memcpy(ctx->key.bytes, key, AEGIS128L_KEY_SIZE);
-
- return 0;
-}
-
-static int crypto_aegis128l_aesni_setauthsize(struct crypto_aead *tfm,
- unsigned int authsize)
-{
- if (authsize > AEGIS128L_MAX_AUTH_SIZE)
- return -EINVAL;
- if (authsize < AEGIS128L_MIN_AUTH_SIZE)
- return -EINVAL;
- return 0;
-}
-
-static void crypto_aegis128l_aesni_crypt(struct aead_request *req,
- struct aegis_block *tag_xor,
- unsigned int cryptlen,
- const struct aegis_crypt_ops *ops)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm);
- struct aegis_state state;
-
- kernel_fpu_begin();
-
- crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv);
- crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen);
- crypto_aegis128l_aesni_process_crypt(&state, req, ops);
- crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
-
- kernel_fpu_end();
-}
-
-static int crypto_aegis128l_aesni_encrypt(struct aead_request *req)
-{
- static const struct aegis_crypt_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_encrypt,
- .crypt_blocks = crypto_aegis128l_aesni_enc,
- .crypt_tail = crypto_aegis128l_aesni_enc_tail,
- };
-
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aegis_block tag = {};
- unsigned int authsize = crypto_aead_authsize(tfm);
- unsigned int cryptlen = req->cryptlen;
-
- crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS);
-
- scatterwalk_map_and_copy(tag.bytes, req->dst,
- req->assoclen + cryptlen, authsize, 1);
- return 0;
-}
-
-static int crypto_aegis128l_aesni_decrypt(struct aead_request *req)
-{
- static const struct aegis_block zeros = {};
-
- static const struct aegis_crypt_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_decrypt,
- .crypt_blocks = crypto_aegis128l_aesni_dec,
- .crypt_tail = crypto_aegis128l_aesni_dec_tail,
- };
-
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aegis_block tag;
- unsigned int authsize = crypto_aead_authsize(tfm);
- unsigned int cryptlen = req->cryptlen - authsize;
-
- scatterwalk_map_and_copy(tag.bytes, req->src,
- req->assoclen + cryptlen, authsize, 0);
-
- crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS);
-
- return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
-}
-
-static int crypto_aegis128l_aesni_init_tfm(struct crypto_aead *aead)
-{
- return 0;
-}
-
-static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead)
-{
-}
-
-static int cryptd_aegis128l_aesni_setkey(struct crypto_aead *aead,
- const u8 *key, unsigned int keylen)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
-}
-
-static int cryptd_aegis128l_aesni_setauthsize(struct crypto_aead *aead,
- unsigned int authsize)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
-}
-
-static int cryptd_aegis128l_aesni_encrypt(struct aead_request *req)
-{
- struct crypto_aead *aead = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- aead = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- aead = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, aead);
-
- return crypto_aead_encrypt(req);
-}
-
-static int cryptd_aegis128l_aesni_decrypt(struct aead_request *req)
-{
- struct crypto_aead *aead = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- aead = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- aead = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, aead);
-
- return crypto_aead_decrypt(req);
-}
-
-static int cryptd_aegis128l_aesni_init_tfm(struct crypto_aead *aead)
-{
- struct cryptd_aead *cryptd_tfm;
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_tfm = cryptd_alloc_aead("__aegis128l-aesni", CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
-
- *ctx = cryptd_tfm;
- crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
- return 0;
-}
-
-static void cryptd_aegis128l_aesni_exit_tfm(struct crypto_aead *aead)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_free_aead(*ctx);
-}
-
-static struct aead_alg crypto_aegis128l_aesni_alg[] = {
- {
- .setkey = crypto_aegis128l_aesni_setkey,
- .setauthsize = crypto_aegis128l_aesni_setauthsize,
- .encrypt = crypto_aegis128l_aesni_encrypt,
- .decrypt = crypto_aegis128l_aesni_decrypt,
- .init = crypto_aegis128l_aesni_init_tfm,
- .exit = crypto_aegis128l_aesni_exit_tfm,
-
- .ivsize = AEGIS128L_NONCE_SIZE,
- .maxauthsize = AEGIS128L_MAX_AUTH_SIZE,
- .chunksize = AEGIS128L_BLOCK_SIZE,
-
- .base = {
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct aegis_ctx) +
- __alignof__(struct aegis_ctx),
- .cra_alignmask = 0,
-
- .cra_name = "__aegis128l",
- .cra_driver_name = "__aegis128l-aesni",
-
- .cra_module = THIS_MODULE,
- }
- }, {
- .setkey = cryptd_aegis128l_aesni_setkey,
- .setauthsize = cryptd_aegis128l_aesni_setauthsize,
- .encrypt = cryptd_aegis128l_aesni_encrypt,
- .decrypt = cryptd_aegis128l_aesni_decrypt,
- .init = cryptd_aegis128l_aesni_init_tfm,
- .exit = cryptd_aegis128l_aesni_exit_tfm,
-
- .ivsize = AEGIS128L_NONCE_SIZE,
- .maxauthsize = AEGIS128L_MAX_AUTH_SIZE,
- .chunksize = AEGIS128L_BLOCK_SIZE,
-
- .base = {
- .cra_flags = CRYPTO_ALG_ASYNC,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct cryptd_aead *),
- .cra_alignmask = 0,
-
- .cra_priority = 400,
-
- .cra_name = "aegis128l",
- .cra_driver_name = "aegis128l-aesni",
-
- .cra_module = THIS_MODULE,
- }
- }
-};
-
-static int __init crypto_aegis128l_aesni_module_init(void)
-{
- if (!boot_cpu_has(X86_FEATURE_XMM2) ||
- !boot_cpu_has(X86_FEATURE_AES) ||
- !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
- return -ENODEV;
-
- return crypto_register_aeads(crypto_aegis128l_aesni_alg,
- ARRAY_SIZE(crypto_aegis128l_aesni_alg));
-}
-
-static void __exit crypto_aegis128l_aesni_module_exit(void)
-{
- crypto_unregister_aeads(crypto_aegis128l_aesni_alg,
- ARRAY_SIZE(crypto_aegis128l_aesni_alg));
-}
-
-module_init(crypto_aegis128l_aesni_module_init);
-module_exit(crypto_aegis128l_aesni_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
-MODULE_DESCRIPTION("AEGIS-128L AEAD algorithm -- AESNI+SSE2 implementation");
-MODULE_ALIAS_CRYPTO("aegis128l");
-MODULE_ALIAS_CRYPTO("aegis128l-aesni");
diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S
deleted file mode 100644
index 8870c7c..0000000
--- a/arch/x86/crypto/aegis256-aesni-asm.S
+++ /dev/null
@@ -1,703 +0,0 @@
-/*
- * AES-NI + SSE2 implementation of AEGIS-128L
- *
- * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define STATE0 %xmm0
-#define STATE1 %xmm1
-#define STATE2 %xmm2
-#define STATE3 %xmm3
-#define STATE4 %xmm4
-#define STATE5 %xmm5
-#define MSG %xmm6
-#define T0 %xmm7
-#define T1 %xmm8
-#define T2 %xmm9
-#define T3 %xmm10
-
-#define STATEP %rdi
-#define LEN %rsi
-#define SRC %rdx
-#define DST %rcx
-
-.section .rodata.cst16.aegis256_const, "aM", @progbits, 32
-.align 16
-.Laegis256_const_0:
- .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
- .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
-.Laegis256_const_1:
- .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
- .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
-
-.section .rodata.cst16.aegis256_counter, "aM", @progbits, 16
-.align 16
-.Laegis256_counter:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-
-.text
-
-/*
- * __load_partial: internal ABI
- * input:
- * LEN - bytes
- * SRC - src
- * output:
- * MSG - message block
- * changed:
- * T0
- * %r8
- * %r9
- */
-__load_partial:
- xor %r9d, %r9d
- pxor MSG, MSG
-
- mov LEN, %r8
- and $0x1, %r8
- jz .Lld_partial_1
-
- mov LEN, %r8
- and $0x1E, %r8
- add SRC, %r8
- mov (%r8), %r9b
-
-.Lld_partial_1:
- mov LEN, %r8
- and $0x2, %r8
- jz .Lld_partial_2
-
- mov LEN, %r8
- and $0x1C, %r8
- add SRC, %r8
- shl $0x10, %r9
- mov (%r8), %r9w
-
-.Lld_partial_2:
- mov LEN, %r8
- and $0x4, %r8
- jz .Lld_partial_4
-
- mov LEN, %r8
- and $0x18, %r8
- add SRC, %r8
- shl $32, %r9
- mov (%r8), %r8d
- xor %r8, %r9
-
-.Lld_partial_4:
- movq %r9, MSG
-
- mov LEN, %r8
- and $0x8, %r8
- jz .Lld_partial_8
-
- mov LEN, %r8
- and $0x10, %r8
- add SRC, %r8
- pslldq $8, MSG
- movq (%r8), T0
- pxor T0, MSG
-
-.Lld_partial_8:
- ret
-ENDPROC(__load_partial)
-
-/*
- * __store_partial: internal ABI
- * input:
- * LEN - bytes
- * DST - dst
- * output:
- * T0 - message block
- * changed:
- * %r8
- * %r9
- * %r10
- */
-__store_partial:
- mov LEN, %r8
- mov DST, %r9
-
- movq T0, %r10
-
- cmp $8, %r8
- jl .Lst_partial_8
-
- mov %r10, (%r9)
- psrldq $8, T0
- movq T0, %r10
-
- sub $8, %r8
- add $8, %r9
-
-.Lst_partial_8:
- cmp $4, %r8
- jl .Lst_partial_4
-
- mov %r10d, (%r9)
- shr $32, %r10
-
- sub $4, %r8
- add $4, %r9
-
-.Lst_partial_4:
- cmp $2, %r8
- jl .Lst_partial_2
-
- mov %r10w, (%r9)
- shr $0x10, %r10
-
- sub $2, %r8
- add $2, %r9
-
-.Lst_partial_2:
- cmp $1, %r8
- jl .Lst_partial_1
-
- mov %r10b, (%r9)
-
-.Lst_partial_1:
- ret
-ENDPROC(__store_partial)
-
-.macro update
- movdqa STATE5, T0
- aesenc STATE0, STATE5
- aesenc STATE1, STATE0
- aesenc STATE2, STATE1
- aesenc STATE3, STATE2
- aesenc STATE4, STATE3
- aesenc T0, STATE4
-.endm
-
-.macro update0 m
- update
- pxor \m, STATE5
-.endm
-
-.macro update1 m
- update
- pxor \m, STATE4
-.endm
-
-.macro update2 m
- update
- pxor \m, STATE3
-.endm
-
-.macro update3 m
- update
- pxor \m, STATE2
-.endm
-
-.macro update4 m
- update
- pxor \m, STATE1
-.endm
-
-.macro update5 m
- update
- pxor \m, STATE0
-.endm
-
-.macro state_load
- movdqu 0x00(STATEP), STATE0
- movdqu 0x10(STATEP), STATE1
- movdqu 0x20(STATEP), STATE2
- movdqu 0x30(STATEP), STATE3
- movdqu 0x40(STATEP), STATE4
- movdqu 0x50(STATEP), STATE5
-.endm
-
-.macro state_store s0 s1 s2 s3 s4 s5
- movdqu \s5, 0x00(STATEP)
- movdqu \s0, 0x10(STATEP)
- movdqu \s1, 0x20(STATEP)
- movdqu \s2, 0x30(STATEP)
- movdqu \s3, 0x40(STATEP)
- movdqu \s4, 0x50(STATEP)
-.endm
-
-.macro state_store0
- state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
-.endm
-
-.macro state_store1
- state_store STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
-.endm
-
-.macro state_store2
- state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
-.endm
-
-.macro state_store3
- state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
-.endm
-
-.macro state_store4
- state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
-.endm
-
-.macro state_store5
- state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
-.endm
-
-/*
- * void crypto_aegis256_aesni_init(void *state, const void *key, const void *iv);
- */
-ENTRY(crypto_aegis256_aesni_init)
- FRAME_BEGIN
-
- /* load key: */
- movdqa 0x00(%rsi), MSG
- movdqa 0x10(%rsi), T1
- movdqa MSG, STATE4
- movdqa T1, STATE5
-
- /* load IV: */
- movdqu 0x00(%rdx), T2
- movdqu 0x10(%rdx), T3
- pxor MSG, T2
- pxor T1, T3
- movdqa T2, STATE0
- movdqa T3, STATE1
-
- /* load the constants: */
- movdqa .Laegis256_const_0, STATE3
- movdqa .Laegis256_const_1, STATE2
- pxor STATE3, STATE4
- pxor STATE2, STATE5
-
- /* update 10 times with IV and KEY: */
- update0 MSG
- update1 T1
- update2 T2
- update3 T3
- update4 MSG
- update5 T1
- update0 T2
- update1 T3
- update2 MSG
- update3 T1
- update4 T2
- update5 T3
- update0 MSG
- update1 T1
- update2 T2
- update3 T3
-
- state_store3
-
- FRAME_END
- ret
-ENDPROC(crypto_aegis256_aesni_init)
-
-.macro ad_block a i
- movdq\a (\i * 0x10)(SRC), MSG
- update\i MSG
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_\i
-.endm
-
-/*
- * void crypto_aegis256_aesni_ad(void *state, unsigned int length,
- * const void *data);
- */
-ENTRY(crypto_aegis256_aesni_ad)
- FRAME_BEGIN
-
- cmp $0x10, LEN
- jb .Lad_out
-
- state_load
-
- mov SRC, %r8
- and $0xf, %r8
- jnz .Lad_u_loop
-
-.align 8
-.Lad_a_loop:
- ad_block a 0
- ad_block a 1
- ad_block a 2
- ad_block a 3
- ad_block a 4
- ad_block a 5
-
- add $0x60, SRC
- jmp .Lad_a_loop
-
-.align 8
-.Lad_u_loop:
- ad_block u 0
- ad_block u 1
- ad_block u 2
- ad_block u 3
- ad_block u 4
- ad_block u 5
-
- add $0x60, SRC
- jmp .Lad_u_loop
-
-.Lad_out_0:
- state_store0
- FRAME_END
- ret
-
-.Lad_out_1:
- state_store1
- FRAME_END
- ret
-
-.Lad_out_2:
- state_store2
- FRAME_END
- ret
-
-.Lad_out_3:
- state_store3
- FRAME_END
- ret
-
-.Lad_out_4:
- state_store4
- FRAME_END
- ret
-
-.Lad_out_5:
- state_store5
- FRAME_END
- ret
-
-.Lad_out:
- FRAME_END
- ret
-ENDPROC(crypto_aegis256_aesni_ad)
-
-.macro crypt m s0 s1 s2 s3 s4 s5
- pxor \s1, \m
- pxor \s4, \m
- pxor \s5, \m
- movdqa \s2, T3
- pand \s3, T3
- pxor T3, \m
-.endm
-
-.macro crypt0 m
- crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5
-.endm
-
-.macro crypt1 m
- crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4
-.endm
-
-.macro crypt2 m
- crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3
-.endm
-
-.macro crypt3 m
- crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2
-.endm
-
-.macro crypt4 m
- crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1
-.endm
-
-.macro crypt5 m
- crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0
-.endm
-
-.macro encrypt_block a i
- movdq\a (\i * 0x10)(SRC), MSG
- movdqa MSG, T0
- crypt\i T0
- movdq\a T0, (\i * 0x10)(DST)
-
- update\i MSG
-
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lenc_out_\i
-.endm
-
-.macro decrypt_block a i
- movdq\a (\i * 0x10)(SRC), MSG
- crypt\i MSG
- movdq\a MSG, (\i * 0x10)(DST)
-
- update\i MSG
-
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Ldec_out_\i
-.endm
-
-/*
- * void crypto_aegis256_aesni_enc(void *state, unsigned int length,
- * const void *src, void *dst);
- */
-ENTRY(crypto_aegis256_aesni_enc)
- FRAME_BEGIN
-
- cmp $0x10, LEN
- jb .Lenc_out
-
- state_load
-
- mov SRC, %r8
- or DST, %r8
- and $0xf, %r8
- jnz .Lenc_u_loop
-
-.align 8
-.Lenc_a_loop:
- encrypt_block a 0
- encrypt_block a 1
- encrypt_block a 2
- encrypt_block a 3
- encrypt_block a 4
- encrypt_block a 5
-
- add $0x60, SRC
- add $0x60, DST
- jmp .Lenc_a_loop
-
-.align 8
-.Lenc_u_loop:
- encrypt_block u 0
- encrypt_block u 1
- encrypt_block u 2
- encrypt_block u 3
- encrypt_block u 4
- encrypt_block u 5
-
- add $0x60, SRC
- add $0x60, DST
- jmp .Lenc_u_loop
-
-.Lenc_out_0:
- state_store0
- FRAME_END
- ret
-
-.Lenc_out_1:
- state_store1
- FRAME_END
- ret
-
-.Lenc_out_2:
- state_store2
- FRAME_END
- ret
-
-.Lenc_out_3:
- state_store3
- FRAME_END
- ret
-
-.Lenc_out_4:
- state_store4
- FRAME_END
- ret
-
-.Lenc_out_5:
- state_store5
- FRAME_END
- ret
-
-.Lenc_out:
- FRAME_END
- ret
-ENDPROC(crypto_aegis256_aesni_enc)
-
-/*
- * void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length,
- * const void *src, void *dst);
- */
-ENTRY(crypto_aegis256_aesni_enc_tail)
- FRAME_BEGIN
-
- state_load
-
- /* encrypt message: */
- call __load_partial
-
- movdqa MSG, T0
- crypt0 T0
-
- call __store_partial
-
- update0 MSG
-
- state_store0
-
- FRAME_END
- ret
-ENDPROC(crypto_aegis256_aesni_enc_tail)
-
-/*
- * void crypto_aegis256_aesni_dec(void *state, unsigned int length,
- * const void *src, void *dst);
- */
-ENTRY(crypto_aegis256_aesni_dec)
- FRAME_BEGIN
-
- cmp $0x10, LEN
- jb .Ldec_out
-
- state_load
-
- mov SRC, %r8
- or DST, %r8
- and $0xF, %r8
- jnz .Ldec_u_loop
-
-.align 8
-.Ldec_a_loop:
- decrypt_block a 0
- decrypt_block a 1
- decrypt_block a 2
- decrypt_block a 3
- decrypt_block a 4
- decrypt_block a 5
-
- add $0x60, SRC
- add $0x60, DST
- jmp .Ldec_a_loop
-
-.align 8
-.Ldec_u_loop:
- decrypt_block u 0
- decrypt_block u 1
- decrypt_block u 2
- decrypt_block u 3
- decrypt_block u 4
- decrypt_block u 5
-
- add $0x60, SRC
- add $0x60, DST
- jmp .Ldec_u_loop
-
-.Ldec_out_0:
- state_store0
- FRAME_END
- ret
-
-.Ldec_out_1:
- state_store1
- FRAME_END
- ret
-
-.Ldec_out_2:
- state_store2
- FRAME_END
- ret
-
-.Ldec_out_3:
- state_store3
- FRAME_END
- ret
-
-.Ldec_out_4:
- state_store4
- FRAME_END
- ret
-
-.Ldec_out_5:
- state_store5
- FRAME_END
- ret
-
-.Ldec_out:
- FRAME_END
- ret
-ENDPROC(crypto_aegis256_aesni_dec)
-
-/*
- * void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length,
- * const void *src, void *dst);
- */
-ENTRY(crypto_aegis256_aesni_dec_tail)
- FRAME_BEGIN
-
- state_load
-
- /* decrypt message: */
- call __load_partial
-
- crypt0 MSG
-
- movdqa MSG, T0
- call __store_partial
-
- /* mask with byte count: */
- movq LEN, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- movdqa .Laegis256_counter, T1
- pcmpgtb T1, T0
- pand T0, MSG
-
- update0 MSG
-
- state_store0
-
- FRAME_END
- ret
-ENDPROC(crypto_aegis256_aesni_dec_tail)
-
-/*
- * void crypto_aegis256_aesni_final(void *state, void *tag_xor,
- * u64 assoclen, u64 cryptlen);
- */
-ENTRY(crypto_aegis256_aesni_final)
- FRAME_BEGIN
-
- state_load
-
- /* prepare length block: */
- movq %rdx, MSG
- movq %rcx, T0
- pslldq $8, T0
- pxor T0, MSG
- psllq $3, MSG /* multiply by 8 (to get bit count) */
-
- pxor STATE3, MSG
-
- /* update state: */
- update0 MSG
- update1 MSG
- update2 MSG
- update3 MSG
- update4 MSG
- update5 MSG
- update0 MSG
-
- /* xor tag: */
- movdqu (%rsi), MSG
-
- pxor STATE0, MSG
- pxor STATE1, MSG
- pxor STATE2, MSG
- pxor STATE3, MSG
- pxor STATE4, MSG
- pxor STATE5, MSG
-
- movdqu MSG, (%rsi)
-
- FRAME_END
- ret
-ENDPROC(crypto_aegis256_aesni_final)
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c
deleted file mode 100644
index 8bebda2..0000000
--- a/arch/x86/crypto/aegis256-aesni-glue.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * The AEGIS-256 Authenticated-Encryption Algorithm
- * Glue for AES-NI + SSE2 implementation
- *
- * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-
-#include <crypto/cryptd.h>
-#include <crypto/internal/aead.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/scatterwalk.h>
-#include <linux/module.h>
-#include <asm/fpu/api.h>
-#include <asm/cpu_device_id.h>
-
-#define AEGIS256_BLOCK_ALIGN 16
-#define AEGIS256_BLOCK_SIZE 16
-#define AEGIS256_NONCE_SIZE 32
-#define AEGIS256_STATE_BLOCKS 6
-#define AEGIS256_KEY_SIZE 32
-#define AEGIS256_MIN_AUTH_SIZE 8
-#define AEGIS256_MAX_AUTH_SIZE 16
-
-asmlinkage void crypto_aegis256_aesni_init(void *state, void *key, void *iv);
-
-asmlinkage void crypto_aegis256_aesni_ad(
- void *state, unsigned int length, const void *data);
-
-asmlinkage void crypto_aegis256_aesni_enc(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis256_aesni_dec(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis256_aesni_enc_tail(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis256_aesni_dec_tail(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis256_aesni_final(
- void *state, void *tag_xor, unsigned int cryptlen,
- unsigned int assoclen);
-
-struct aegis_block {
- u8 bytes[AEGIS256_BLOCK_SIZE] __aligned(AEGIS256_BLOCK_ALIGN);
-};
-
-struct aegis_state {
- struct aegis_block blocks[AEGIS256_STATE_BLOCKS];
-};
-
-struct aegis_ctx {
- struct aegis_block key[AEGIS256_KEY_SIZE / AEGIS256_BLOCK_SIZE];
-};
-
-struct aegis_crypt_ops {
- int (*skcipher_walk_init)(struct skcipher_walk *walk,
- struct aead_request *req, bool atomic);
-
- void (*crypt_blocks)(void *state, unsigned int length, const void *src,
- void *dst);
- void (*crypt_tail)(void *state, unsigned int length, const void *src,
- void *dst);
-};
-
-static void crypto_aegis256_aesni_process_ad(
- struct aegis_state *state, struct scatterlist *sg_src,
- unsigned int assoclen)
-{
- struct scatter_walk walk;
- struct aegis_block buf;
- unsigned int pos = 0;
-
- scatterwalk_start(&walk, sg_src);
- while (assoclen != 0) {
- unsigned int size = scatterwalk_clamp(&walk, assoclen);
- unsigned int left = size;
- void *mapped = scatterwalk_map(&walk);
- const u8 *src = (const u8 *)mapped;
-
- if (pos + size >= AEGIS256_BLOCK_SIZE) {
- if (pos > 0) {
- unsigned int fill = AEGIS256_BLOCK_SIZE - pos;
- memcpy(buf.bytes + pos, src, fill);
- crypto_aegis256_aesni_ad(state,
- AEGIS256_BLOCK_SIZE,
- buf.bytes);
- pos = 0;
- left -= fill;
- src += fill;
- }
-
- crypto_aegis256_aesni_ad(state, left, src);
-
- src += left & ~(AEGIS256_BLOCK_SIZE - 1);
- left &= AEGIS256_BLOCK_SIZE - 1;
- }
-
- memcpy(buf.bytes + pos, src, left);
- pos += left;
- assoclen -= size;
-
- scatterwalk_unmap(mapped);
- scatterwalk_advance(&walk, size);
- scatterwalk_done(&walk, 0, assoclen);
- }
-
- if (pos > 0) {
- memset(buf.bytes + pos, 0, AEGIS256_BLOCK_SIZE - pos);
- crypto_aegis256_aesni_ad(state, AEGIS256_BLOCK_SIZE, buf.bytes);
- }
-}
-
-static void crypto_aegis256_aesni_process_crypt(
- struct aegis_state *state, struct aead_request *req,
- const struct aegis_crypt_ops *ops)
-{
- struct skcipher_walk walk;
- u8 *src, *dst;
- unsigned int chunksize, base;
-
- ops->skcipher_walk_init(&walk, req, false);
-
- while (walk.nbytes) {
- src = walk.src.virt.addr;
- dst = walk.dst.virt.addr;
- chunksize = walk.nbytes;
-
- ops->crypt_blocks(state, chunksize, src, dst);
-
- base = chunksize & ~(AEGIS256_BLOCK_SIZE - 1);
- src += base;
- dst += base;
- chunksize &= AEGIS256_BLOCK_SIZE - 1;
-
- if (chunksize > 0)
- ops->crypt_tail(state, chunksize, src, dst);
-
- skcipher_walk_done(&walk, 0);
- }
-}
-
-static struct aegis_ctx *crypto_aegis256_aesni_ctx(struct crypto_aead *aead)
-{
- u8 *ctx = crypto_aead_ctx(aead);
- ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx));
- return (void *)ctx;
-}
-
-static int crypto_aegis256_aesni_setkey(struct crypto_aead *aead, const u8 *key,
- unsigned int keylen)
-{
- struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(aead);
-
- if (keylen != AEGIS256_KEY_SIZE) {
- crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
-
- memcpy(ctx->key, key, AEGIS256_KEY_SIZE);
-
- return 0;
-}
-
-static int crypto_aegis256_aesni_setauthsize(struct crypto_aead *tfm,
- unsigned int authsize)
-{
- if (authsize > AEGIS256_MAX_AUTH_SIZE)
- return -EINVAL;
- if (authsize < AEGIS256_MIN_AUTH_SIZE)
- return -EINVAL;
- return 0;
-}
-
-static void crypto_aegis256_aesni_crypt(struct aead_request *req,
- struct aegis_block *tag_xor,
- unsigned int cryptlen,
- const struct aegis_crypt_ops *ops)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm);
- struct aegis_state state;
-
- kernel_fpu_begin();
-
- crypto_aegis256_aesni_init(&state, ctx->key, req->iv);
- crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen);
- crypto_aegis256_aesni_process_crypt(&state, req, ops);
- crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
-
- kernel_fpu_end();
-}
-
-static int crypto_aegis256_aesni_encrypt(struct aead_request *req)
-{
- static const struct aegis_crypt_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_encrypt,
- .crypt_blocks = crypto_aegis256_aesni_enc,
- .crypt_tail = crypto_aegis256_aesni_enc_tail,
- };
-
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aegis_block tag = {};
- unsigned int authsize = crypto_aead_authsize(tfm);
- unsigned int cryptlen = req->cryptlen;
-
- crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS);
-
- scatterwalk_map_and_copy(tag.bytes, req->dst,
- req->assoclen + cryptlen, authsize, 1);
- return 0;
-}
-
-static int crypto_aegis256_aesni_decrypt(struct aead_request *req)
-{
- static const struct aegis_block zeros = {};
-
- static const struct aegis_crypt_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_decrypt,
- .crypt_blocks = crypto_aegis256_aesni_dec,
- .crypt_tail = crypto_aegis256_aesni_dec_tail,
- };
-
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aegis_block tag;
- unsigned int authsize = crypto_aead_authsize(tfm);
- unsigned int cryptlen = req->cryptlen - authsize;
-
- scatterwalk_map_and_copy(tag.bytes, req->src,
- req->assoclen + cryptlen, authsize, 0);
-
- crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS);
-
- return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
-}
-
-static int crypto_aegis256_aesni_init_tfm(struct crypto_aead *aead)
-{
- return 0;
-}
-
-static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead)
-{
-}
-
-static int cryptd_aegis256_aesni_setkey(struct crypto_aead *aead,
- const u8 *key, unsigned int keylen)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
-}
-
-static int cryptd_aegis256_aesni_setauthsize(struct crypto_aead *aead,
- unsigned int authsize)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
-}
-
-static int cryptd_aegis256_aesni_encrypt(struct aead_request *req)
-{
- struct crypto_aead *aead = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- aead = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- aead = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, aead);
-
- return crypto_aead_encrypt(req);
-}
-
-static int cryptd_aegis256_aesni_decrypt(struct aead_request *req)
-{
- struct crypto_aead *aead = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- aead = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- aead = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, aead);
-
- return crypto_aead_decrypt(req);
-}
-
-static int cryptd_aegis256_aesni_init_tfm(struct crypto_aead *aead)
-{
- struct cryptd_aead *cryptd_tfm;
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_tfm = cryptd_alloc_aead("__aegis256-aesni", CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
-
- *ctx = cryptd_tfm;
- crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
- return 0;
-}
-
-static void cryptd_aegis256_aesni_exit_tfm(struct crypto_aead *aead)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_free_aead(*ctx);
-}
-
-static struct aead_alg crypto_aegis256_aesni_alg[] = {
- {
- .setkey = crypto_aegis256_aesni_setkey,
- .setauthsize = crypto_aegis256_aesni_setauthsize,
- .encrypt = crypto_aegis256_aesni_encrypt,
- .decrypt = crypto_aegis256_aesni_decrypt,
- .init = crypto_aegis256_aesni_init_tfm,
- .exit = crypto_aegis256_aesni_exit_tfm,
-
- .ivsize = AEGIS256_NONCE_SIZE,
- .maxauthsize = AEGIS256_MAX_AUTH_SIZE,
- .chunksize = AEGIS256_BLOCK_SIZE,
-
- .base = {
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct aegis_ctx) +
- __alignof__(struct aegis_ctx),
- .cra_alignmask = 0,
-
- .cra_name = "__aegis256",
- .cra_driver_name = "__aegis256-aesni",
-
- .cra_module = THIS_MODULE,
- }
- }, {
- .setkey = cryptd_aegis256_aesni_setkey,
- .setauthsize = cryptd_aegis256_aesni_setauthsize,
- .encrypt = cryptd_aegis256_aesni_encrypt,
- .decrypt = cryptd_aegis256_aesni_decrypt,
- .init = cryptd_aegis256_aesni_init_tfm,
- .exit = cryptd_aegis256_aesni_exit_tfm,
-
- .ivsize = AEGIS256_NONCE_SIZE,
- .maxauthsize = AEGIS256_MAX_AUTH_SIZE,
- .chunksize = AEGIS256_BLOCK_SIZE,
-
- .base = {
- .cra_flags = CRYPTO_ALG_ASYNC,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct cryptd_aead *),
- .cra_alignmask = 0,
-
- .cra_priority = 400,
-
- .cra_name = "aegis256",
- .cra_driver_name = "aegis256-aesni",
-
- .cra_module = THIS_MODULE,
- }
- }
-};
-
-static int __init crypto_aegis256_aesni_module_init(void)
-{
- if (!boot_cpu_has(X86_FEATURE_XMM2) ||
- !boot_cpu_has(X86_FEATURE_AES) ||
- !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
- return -ENODEV;
-
- return crypto_register_aeads(crypto_aegis256_aesni_alg,
- ARRAY_SIZE(crypto_aegis256_aesni_alg));
-}
-
-static void __exit crypto_aegis256_aesni_module_exit(void)
-{
- crypto_unregister_aeads(crypto_aegis256_aesni_alg,
- ARRAY_SIZE(crypto_aegis256_aesni_alg));
-}
-
-module_init(crypto_aegis256_aesni_module_init);
-module_exit(crypto_aegis256_aesni_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
-MODULE_DESCRIPTION("AEGIS-256 AEAD algorithm -- AESNI+SSE2 implementation");
-MODULE_ALIAS_CRYPTO("aegis256");
-MODULE_ALIAS_CRYPTO("aegis256-aesni");
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
deleted file mode 100644
index 2849dbc..0000000
--- a/arch/x86/crypto/aes-i586-asm_32.S
+++ /dev/null
@@ -1,362 +0,0 @@
-// -------------------------------------------------------------------------
-// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK.
-// All rights reserved.
-//
-// LICENSE TERMS
-//
-// The free distribution and use of this software in both source and binary
-// form is allowed (with or without changes) provided that:
-//
-// 1. distributions of this source code include the above copyright
-// notice, this list of conditions and the following disclaimer//
-//
-// 2. distributions in binary form include the above copyright
-// notice, this list of conditions and the following disclaimer
-// in the documentation and/or other associated materials//
-//
-// 3. the copyright holder's name is not used to endorse products
-// built using this software without specific written permission.
-//
-//
-// ALTERNATIVELY, provided that this notice is retained in full, this product
-// may be distributed under the terms of the GNU General Public License (GPL),
-// in which case the provisions of the GPL apply INSTEAD OF those given above.
-//
-// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
-// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
-
-// DISCLAIMER
-//
-// This software is provided 'as is' with no explicit or implied warranties
-// in respect of its properties including, but not limited to, correctness
-// and fitness for purpose.
-// -------------------------------------------------------------------------
-// Issue Date: 29/07/2002
-
-.file "aes-i586-asm.S"
-.text
-
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
-
-#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words)
-
-/* offsets to parameters with one register pushed onto stack */
-#define ctx 8
-#define out_blk 12
-#define in_blk 16
-
-/* offsets in crypto_aes_ctx structure */
-#define klen (480)
-#define ekey (0)
-#define dkey (240)
-
-// register mapping for encrypt and decrypt subroutines
-
-#define r0 eax
-#define r1 ebx
-#define r2 ecx
-#define r3 edx
-#define r4 esi
-#define r5 edi
-
-#define eaxl al
-#define eaxh ah
-#define ebxl bl
-#define ebxh bh
-#define ecxl cl
-#define ecxh ch
-#define edxl dl
-#define edxh dh
-
-#define _h(reg) reg##h
-#define h(reg) _h(reg)
-
-#define _l(reg) reg##l
-#define l(reg) _l(reg)
-
-// This macro takes a 32-bit word representing a column and uses
-// each of its four bytes to index into four tables of 256 32-bit
-// words to obtain values that are then xored into the appropriate
-// output registers r0, r1, r4 or r5.
-
-// Parameters:
-// table table base address
-// %1 out_state[0]
-// %2 out_state[1]
-// %3 out_state[2]
-// %4 out_state[3]
-// idx input register for the round (destroyed)
-// tmp scratch register for the round
-// sched key schedule
-
-#define do_col(table, a1,a2,a3,a4, idx, tmp) \
- movzx %l(idx),%tmp; \
- xor table(,%tmp,4),%a1; \
- movzx %h(idx),%tmp; \
- shr $16,%idx; \
- xor table+tlen(,%tmp,4),%a2; \
- movzx %l(idx),%tmp; \
- movzx %h(idx),%idx; \
- xor table+2*tlen(,%tmp,4),%a3; \
- xor table+3*tlen(,%idx,4),%a4;
-
-// initialise output registers from the key schedule
-// NB1: original value of a3 is in idx on exit
-// NB2: original values of a1,a2,a4 aren't used
-#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
- mov 0 sched,%a1; \
- movzx %l(idx),%tmp; \
- mov 12 sched,%a2; \
- xor table(,%tmp,4),%a1; \
- mov 4 sched,%a4; \
- movzx %h(idx),%tmp; \
- shr $16,%idx; \
- xor table+tlen(,%tmp,4),%a2; \
- movzx %l(idx),%tmp; \
- movzx %h(idx),%idx; \
- xor table+3*tlen(,%idx,4),%a4; \
- mov %a3,%idx; \
- mov 8 sched,%a3; \
- xor table+2*tlen(,%tmp,4),%a3;
-
-// initialise output registers from the key schedule
-// NB1: original value of a3 is in idx on exit
-// NB2: original values of a1,a2,a4 aren't used
-#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
- mov 0 sched,%a1; \
- movzx %l(idx),%tmp; \
- mov 4 sched,%a2; \
- xor table(,%tmp,4),%a1; \
- mov 12 sched,%a4; \
- movzx %h(idx),%tmp; \
- shr $16,%idx; \
- xor table+tlen(,%tmp,4),%a2; \
- movzx %l(idx),%tmp; \
- movzx %h(idx),%idx; \
- xor table+3*tlen(,%idx,4),%a4; \
- mov %a3,%idx; \
- mov 8 sched,%a3; \
- xor table+2*tlen(,%tmp,4),%a3;
-
-
-// original Gladman had conditional saves to MMX regs.
-#define save(a1, a2) \
- mov %a2,4*a1(%esp)
-
-#define restore(a1, a2) \
- mov 4*a2(%esp),%a1
-
-// These macros perform a forward encryption cycle. They are entered with
-// the first previous round column values in r0,r1,r4,r5 and
-// exit with the final values in the same registers, using stack
-// for temporary storage.
-
-// round column values
-// on entry: r0,r1,r4,r5
-// on exit: r2,r1,r4,r5
-#define fwd_rnd1(arg, table) \
- save (0,r1); \
- save (1,r5); \
- \
- /* compute new column values */ \
- do_fcol(table, r2,r5,r4,r1, r0,r3, arg); /* idx=r0 */ \
- do_col (table, r4,r1,r2,r5, r0,r3); /* idx=r4 */ \
- restore(r0,0); \
- do_col (table, r1,r2,r5,r4, r0,r3); /* idx=r1 */ \
- restore(r0,1); \
- do_col (table, r5,r4,r1,r2, r0,r3); /* idx=r5 */
-
-// round column values
-// on entry: r2,r1,r4,r5
-// on exit: r0,r1,r4,r5
-#define fwd_rnd2(arg, table) \
- save (0,r1); \
- save (1,r5); \
- \
- /* compute new column values */ \
- do_fcol(table, r0,r5,r4,r1, r2,r3, arg); /* idx=r2 */ \
- do_col (table, r4,r1,r0,r5, r2,r3); /* idx=r4 */ \
- restore(r2,0); \
- do_col (table, r1,r0,r5,r4, r2,r3); /* idx=r1 */ \
- restore(r2,1); \
- do_col (table, r5,r4,r1,r0, r2,r3); /* idx=r5 */
-
-// These macros performs an inverse encryption cycle. They are entered with
-// the first previous round column values in r0,r1,r4,r5 and
-// exit with the final values in the same registers, using stack
-// for temporary storage
-
-// round column values
-// on entry: r0,r1,r4,r5
-// on exit: r2,r1,r4,r5
-#define inv_rnd1(arg, table) \
- save (0,r1); \
- save (1,r5); \
- \
- /* compute new column values */ \
- do_icol(table, r2,r1,r4,r5, r0,r3, arg); /* idx=r0 */ \
- do_col (table, r4,r5,r2,r1, r0,r3); /* idx=r4 */ \
- restore(r0,0); \
- do_col (table, r1,r4,r5,r2, r0,r3); /* idx=r1 */ \
- restore(r0,1); \
- do_col (table, r5,r2,r1,r4, r0,r3); /* idx=r5 */
-
-// round column values
-// on entry: r2,r1,r4,r5
-// on exit: r0,r1,r4,r5
-#define inv_rnd2(arg, table) \
- save (0,r1); \
- save (1,r5); \
- \
- /* compute new column values */ \
- do_icol(table, r0,r1,r4,r5, r2,r3, arg); /* idx=r2 */ \
- do_col (table, r4,r5,r0,r1, r2,r3); /* idx=r4 */ \
- restore(r2,0); \
- do_col (table, r1,r4,r5,r0, r2,r3); /* idx=r1 */ \
- restore(r2,1); \
- do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */
-
-// AES (Rijndael) Encryption Subroutine
-/* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
-
-.extern crypto_ft_tab
-.extern crypto_fl_tab
-
-ENTRY(aes_enc_blk)
- push %ebp
- mov ctx(%esp),%ebp
-
-// CAUTION: the order and the values used in these assigns
-// rely on the register mappings
-
-1: push %ebx
- mov in_blk+4(%esp),%r2
- push %esi
- mov klen(%ebp),%r3 // key size
- push %edi
-#if ekey != 0
- lea ekey(%ebp),%ebp // key pointer
-#endif
-
-// input four columns and xor in first round key
-
- mov (%r2),%r0
- mov 4(%r2),%r1
- mov 8(%r2),%r4
- mov 12(%r2),%r5
- xor (%ebp),%r0
- xor 4(%ebp),%r1
- xor 8(%ebp),%r4
- xor 12(%ebp),%r5
-
- sub $8,%esp // space for register saves on stack
- add $16,%ebp // increment to next round key
- cmp $24,%r3
- jb 4f // 10 rounds for 128-bit key
- lea 32(%ebp),%ebp
- je 3f // 12 rounds for 192-bit key
- lea 32(%ebp),%ebp
-
-2: fwd_rnd1( -64(%ebp), crypto_ft_tab) // 14 rounds for 256-bit key
- fwd_rnd2( -48(%ebp), crypto_ft_tab)
-3: fwd_rnd1( -32(%ebp), crypto_ft_tab) // 12 rounds for 192-bit key
- fwd_rnd2( -16(%ebp), crypto_ft_tab)
-4: fwd_rnd1( (%ebp), crypto_ft_tab) // 10 rounds for 128-bit key
- fwd_rnd2( +16(%ebp), crypto_ft_tab)
- fwd_rnd1( +32(%ebp), crypto_ft_tab)
- fwd_rnd2( +48(%ebp), crypto_ft_tab)
- fwd_rnd1( +64(%ebp), crypto_ft_tab)
- fwd_rnd2( +80(%ebp), crypto_ft_tab)
- fwd_rnd1( +96(%ebp), crypto_ft_tab)
- fwd_rnd2(+112(%ebp), crypto_ft_tab)
- fwd_rnd1(+128(%ebp), crypto_ft_tab)
- fwd_rnd2(+144(%ebp), crypto_fl_tab) // last round uses a different table
-
-// move final values to the output array. CAUTION: the
-// order of these assigns rely on the register mappings
-
- add $8,%esp
- mov out_blk+12(%esp),%ebp
- mov %r5,12(%ebp)
- pop %edi
- mov %r4,8(%ebp)
- pop %esi
- mov %r1,4(%ebp)
- pop %ebx
- mov %r0,(%ebp)
- pop %ebp
- ret
-ENDPROC(aes_enc_blk)
-
-// AES (Rijndael) Decryption Subroutine
-/* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */
-
-.extern crypto_it_tab
-.extern crypto_il_tab
-
-ENTRY(aes_dec_blk)
- push %ebp
- mov ctx(%esp),%ebp
-
-// CAUTION: the order and the values used in these assigns
-// rely on the register mappings
-
-1: push %ebx
- mov in_blk+4(%esp),%r2
- push %esi
- mov klen(%ebp),%r3 // key size
- push %edi
-#if dkey != 0
- lea dkey(%ebp),%ebp // key pointer
-#endif
-
-// input four columns and xor in first round key
-
- mov (%r2),%r0
- mov 4(%r2),%r1
- mov 8(%r2),%r4
- mov 12(%r2),%r5
- xor (%ebp),%r0
- xor 4(%ebp),%r1
- xor 8(%ebp),%r4
- xor 12(%ebp),%r5
-
- sub $8,%esp // space for register saves on stack
- add $16,%ebp // increment to next round key
- cmp $24,%r3
- jb 4f // 10 rounds for 128-bit key
- lea 32(%ebp),%ebp
- je 3f // 12 rounds for 192-bit key
- lea 32(%ebp),%ebp
-
-2: inv_rnd1( -64(%ebp), crypto_it_tab) // 14 rounds for 256-bit key
- inv_rnd2( -48(%ebp), crypto_it_tab)
-3: inv_rnd1( -32(%ebp), crypto_it_tab) // 12 rounds for 192-bit key
- inv_rnd2( -16(%ebp), crypto_it_tab)
-4: inv_rnd1( (%ebp), crypto_it_tab) // 10 rounds for 128-bit key
- inv_rnd2( +16(%ebp), crypto_it_tab)
- inv_rnd1( +32(%ebp), crypto_it_tab)
- inv_rnd2( +48(%ebp), crypto_it_tab)
- inv_rnd1( +64(%ebp), crypto_it_tab)
- inv_rnd2( +80(%ebp), crypto_it_tab)
- inv_rnd1( +96(%ebp), crypto_it_tab)
- inv_rnd2(+112(%ebp), crypto_it_tab)
- inv_rnd1(+128(%ebp), crypto_it_tab)
- inv_rnd2(+144(%ebp), crypto_il_tab) // last round uses a different table
-
-// move final values to the output array. CAUTION: the
-// order of these assigns rely on the register mappings
-
- add $8,%esp
- mov out_blk+12(%esp),%ebp
- mov %r5,12(%ebp)
- pop %edi
- mov %r4,8(%ebp)
- pop %esi
- mov %r1,4(%ebp)
- pop %ebx
- mov %r0,(%ebp)
- pop %ebp
- ret
-ENDPROC(aes_dec_blk)
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
deleted file mode 100644
index 8739cf7..0000000
--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ /dev/null
@@ -1,185 +0,0 @@
-/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64
- *
- * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de>
- *
- * License:
- * This code can be distributed under the terms of the GNU General Public
- * License (GPL) Version 2 provided that the above header down to and
- * including this sentence is retained in full.
- */
-
-.extern crypto_ft_tab
-.extern crypto_it_tab
-.extern crypto_fl_tab
-.extern crypto_il_tab
-
-.text
-
-#include <linux/linkage.h>
-#include <asm/asm-offsets.h>
-
-#define R1 %rax
-#define R1E %eax
-#define R1X %ax
-#define R1H %ah
-#define R1L %al
-#define R2 %rbx
-#define R2E %ebx
-#define R2X %bx
-#define R2H %bh
-#define R2L %bl
-#define R3 %rcx
-#define R3E %ecx
-#define R3X %cx
-#define R3H %ch
-#define R3L %cl
-#define R4 %rdx
-#define R4E %edx
-#define R4X %dx
-#define R4H %dh
-#define R4L %dl
-#define R5 %rsi
-#define R5E %esi
-#define R6 %rdi
-#define R6E %edi
-#define R7 %r9 /* don't use %rbp; it breaks stack traces */
-#define R7E %r9d
-#define R8 %r8
-#define R10 %r10
-#define R11 %r11
-
-#define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \
- ENTRY(FUNC); \
- movq r1,r2; \
- leaq KEY+48(r8),r9; \
- movq r10,r11; \
- movl (r7),r5 ## E; \
- movl 4(r7),r1 ## E; \
- movl 8(r7),r6 ## E; \
- movl 12(r7),r7 ## E; \
- movl 480(r8),r10 ## E; \
- xorl -48(r9),r5 ## E; \
- xorl -44(r9),r1 ## E; \
- xorl -40(r9),r6 ## E; \
- xorl -36(r9),r7 ## E; \
- cmpl $24,r10 ## E; \
- jb B128; \
- leaq 32(r9),r9; \
- je B192; \
- leaq 32(r9),r9;
-
-#define epilogue(FUNC,r1,r2,r5,r6,r7,r8,r9) \
- movq r1,r2; \
- movl r5 ## E,(r9); \
- movl r6 ## E,4(r9); \
- movl r7 ## E,8(r9); \
- movl r8 ## E,12(r9); \
- ret; \
- ENDPROC(FUNC);
-
-#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
- movzbl r2 ## H,r5 ## E; \
- movzbl r2 ## L,r6 ## E; \
- movl TAB+1024(,r5,4),r5 ## E;\
- movw r4 ## X,r2 ## X; \
- movl TAB(,r6,4),r6 ## E; \
- roll $16,r2 ## E; \
- shrl $16,r4 ## E; \
- movzbl r4 ## L,r7 ## E; \
- movzbl r4 ## H,r4 ## E; \
- xorl OFFSET(r8),ra ## E; \
- xorl OFFSET+4(r8),rb ## E; \
- xorl TAB+3072(,r4,4),r5 ## E;\
- xorl TAB+2048(,r7,4),r6 ## E;\
- movzbl r1 ## L,r7 ## E; \
- movzbl r1 ## H,r4 ## E; \
- movl TAB+1024(,r4,4),r4 ## E;\
- movw r3 ## X,r1 ## X; \
- roll $16,r1 ## E; \
- shrl $16,r3 ## E; \
- xorl TAB(,r7,4),r5 ## E; \
- movzbl r3 ## L,r7 ## E; \
- movzbl r3 ## H,r3 ## E; \
- xorl TAB+3072(,r3,4),r4 ## E;\
- xorl TAB+2048(,r7,4),r5 ## E;\
- movzbl r1 ## L,r7 ## E; \
- movzbl r1 ## H,r3 ## E; \
- shrl $16,r1 ## E; \
- xorl TAB+3072(,r3,4),r6 ## E;\
- movl TAB+2048(,r7,4),r3 ## E;\
- movzbl r1 ## L,r7 ## E; \
- movzbl r1 ## H,r1 ## E; \
- xorl TAB+1024(,r1,4),r6 ## E;\
- xorl TAB(,r7,4),r3 ## E; \
- movzbl r2 ## H,r1 ## E; \
- movzbl r2 ## L,r7 ## E; \
- shrl $16,r2 ## E; \
- xorl TAB+3072(,r1,4),r3 ## E;\
- xorl TAB+2048(,r7,4),r4 ## E;\
- movzbl r2 ## H,r1 ## E; \
- movzbl r2 ## L,r2 ## E; \
- xorl OFFSET+8(r8),rc ## E; \
- xorl OFFSET+12(r8),rd ## E; \
- xorl TAB+1024(,r1,4),r3 ## E;\
- xorl TAB(,r2,4),r4 ## E;
-
-#define move_regs(r1,r2,r3,r4) \
- movl r3 ## E,r1 ## E; \
- movl r4 ## E,r2 ## E;
-
-#define entry(FUNC,KEY,B128,B192) \
- prologue(FUNC,KEY,B128,B192,R2,R8,R1,R3,R4,R6,R10,R5,R11)
-
-#define return(FUNC) epilogue(FUNC,R8,R2,R5,R6,R3,R4,R11)
-
-#define encrypt_round(TAB,OFFSET) \
- round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
- move_regs(R1,R2,R5,R6)
-
-#define encrypt_final(TAB,OFFSET) \
- round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
-
-#define decrypt_round(TAB,OFFSET) \
- round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
- move_regs(R1,R2,R5,R6)
-
-#define decrypt_final(TAB,OFFSET) \
- round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
-
-/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
-
- entry(aes_enc_blk,0,.Le128,.Le192)
- encrypt_round(crypto_ft_tab,-96)
- encrypt_round(crypto_ft_tab,-80)
-.Le192: encrypt_round(crypto_ft_tab,-64)
- encrypt_round(crypto_ft_tab,-48)
-.Le128: encrypt_round(crypto_ft_tab,-32)
- encrypt_round(crypto_ft_tab,-16)
- encrypt_round(crypto_ft_tab, 0)
- encrypt_round(crypto_ft_tab, 16)
- encrypt_round(crypto_ft_tab, 32)
- encrypt_round(crypto_ft_tab, 48)
- encrypt_round(crypto_ft_tab, 64)
- encrypt_round(crypto_ft_tab, 80)
- encrypt_round(crypto_ft_tab, 96)
- encrypt_final(crypto_fl_tab,112)
- return(aes_enc_blk)
-
-/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
-
- entry(aes_dec_blk,240,.Ld128,.Ld192)
- decrypt_round(crypto_it_tab,-96)
- decrypt_round(crypto_it_tab,-80)
-.Ld192: decrypt_round(crypto_it_tab,-64)
- decrypt_round(crypto_it_tab,-48)
-.Ld128: decrypt_round(crypto_it_tab,-32)
- decrypt_round(crypto_it_tab,-16)
- decrypt_round(crypto_it_tab, 0)
- decrypt_round(crypto_it_tab, 16)
- decrypt_round(crypto_it_tab, 32)
- decrypt_round(crypto_it_tab, 48)
- decrypt_round(crypto_it_tab, 64)
- decrypt_round(crypto_it_tab, 80)
- decrypt_round(crypto_it_tab, 96)
- decrypt_final(crypto_il_tab,112)
- return(aes_dec_blk)
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
index e26984f..7b7dc05 100644
--- a/arch/x86/crypto/aes_glue.c
+++ b/arch/x86/crypto/aes_glue.c
@@ -1,70 +1 @@
-/*
- * Glue Code for the asm optimized version of the AES Cipher Algorithm
- *
- */
-
-#include <linux/module.h>
-#include <crypto/aes.h>
-#include <asm/crypto/aes.h>
-
-asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
-asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
-
-void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
-{
- aes_enc_blk(ctx, dst, src);
-}
-EXPORT_SYMBOL_GPL(crypto_aes_encrypt_x86);
-
-void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
-{
- aes_dec_blk(ctx, dst, src);
-}
-EXPORT_SYMBOL_GPL(crypto_aes_decrypt_x86);
-
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
- aes_enc_blk(crypto_tfm_ctx(tfm), dst, src);
-}
-
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
- aes_dec_blk(crypto_tfm_ctx(tfm), dst, src);
-}
-
-static struct crypto_alg aes_alg = {
- .cra_name = "aes",
- .cra_driver_name = "aes-asm",
- .cra_priority = 200,
- .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx),
- .cra_module = THIS_MODULE,
- .cra_u = {
- .cipher = {
- .cia_min_keysize = AES_MIN_KEY_SIZE,
- .cia_max_keysize = AES_MAX_KEY_SIZE,
- .cia_setkey = crypto_aes_set_key,
- .cia_encrypt = aes_encrypt,
- .cia_decrypt = aes_decrypt
- }
- }
-};
-
-static int __init aes_init(void)
-{
- return crypto_register_alg(&aes_alg);
-}
-
-static void __exit aes_fini(void)
-{
- crypto_unregister_alg(&aes_alg);
-}
-
-module_init(aes_init);
-module_exit(aes_fini);
-
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("aes");
-MODULE_ALIAS_CRYPTO("aes-asm");
+// SPDX-License-Identifier: GPL-2.0-only
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index cb2deb6..e40bdf0 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Implement AES algorithm in Intel AES-NI instructions.
*
@@ -22,11 +23,6 @@
*
* Ported x86_64 version to x86:
* Author: Mathias Krause <minipli@googlemail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 1985ea0..91c039a 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -182,43 +182,30 @@
.text
-##define the fields of the gcm aes context
-#{
-# u8 expanded_keys[16*11] store expanded keys
-# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
-# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
-# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
-# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
-# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
-# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
-# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
-# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
-# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
-#} gcm_ctx#
+#define AadHash 16*0
+#define AadLen 16*1
+#define InLen (16*1)+8
+#define PBlockEncKey 16*2
+#define OrigIV 16*3
+#define CurCount 16*4
+#define PBlockLen 16*5
-HashKey = 16*11 # store HashKey <<1 mod poly here
-HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
-HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
-HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
-HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
-HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
-HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
-HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
-HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
-HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
-HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
-HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
-HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
-HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
-HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
-HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+HashKey = 16*6 # store HashKey <<1 mod poly here
+HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
+HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
+HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
+HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
+HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
+HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
+HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
+HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
+HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
#define arg1 %rdi
#define arg2 %rsi
@@ -229,6 +216,8 @@
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)
+#define arg10 STACK_OFFSET+8*4(%r14)
+#define keysize 2*15*16(arg1)
i = 0
j = 0
@@ -267,19 +256,636 @@
# Utility Macros
################################
-# Encryption of a single block
-.macro ENCRYPT_SINGLE_BLOCK XMM0
- vpxor (arg1), \XMM0, \XMM0
- i = 1
- setreg
-.rep 9
- vaesenc 16*i(arg1), \XMM0, \XMM0
- i = (i+1)
- setreg
-.endr
- vaesenclast 16*10(arg1), \XMM0, \XMM0
+.macro FUNC_SAVE
+ #the number of pushes must equal STACK_OFFSET
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, %r14
+
+
+
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
.endm
+.macro FUNC_RESTORE
+ mov %r14, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+.endm
+
+# Encryption of a single block
+.macro ENCRYPT_SINGLE_BLOCK REP XMM0
+ vpxor (arg1), \XMM0, \XMM0
+ i = 1
+ setreg
+.rep \REP
+ vaesenc 16*i(arg1), \XMM0, \XMM0
+ i = (i+1)
+ setreg
+.endr
+ vaesenclast 16*i(arg1), \XMM0, \XMM0
+.endm
+
+# combined for GCM encrypt and decrypt functions
+# clobbering all xmm registers
+# clobbering r10, r11, r12, r13, r14, r15
+.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
+ vmovdqu AadHash(arg2), %xmm8
+ vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
+ add arg5, InLen(arg2)
+
+ # initialize the data pointer offset as zero
+ xor %r11d, %r11d
+
+ PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
+ sub %r11, arg5
+
+ mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
+ and $-16, %r13 # r13 = r13 - (r13 mod 16)
+
+ mov %r13, %r12
+ shr $4, %r12
+ and $7, %r12
+ jz _initial_num_blocks_is_0\@
+
+ cmp $7, %r12
+ je _initial_num_blocks_is_7\@
+ cmp $6, %r12
+ je _initial_num_blocks_is_6\@
+ cmp $5, %r12
+ je _initial_num_blocks_is_5\@
+ cmp $4, %r12
+ je _initial_num_blocks_is_4\@
+ cmp $3, %r12
+ je _initial_num_blocks_is_3\@
+ cmp $2, %r12
+ je _initial_num_blocks_is_2\@
+
+ jmp _initial_num_blocks_is_1\@
+
+_initial_num_blocks_is_7\@:
+ \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*7, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_6\@:
+ \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*6, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_5\@:
+ \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*5, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_4\@:
+ \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*4, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_3\@:
+ \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*3, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_2\@:
+ \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*2, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_1\@:
+ \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*1, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_0\@:
+ \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+
+
+_initial_blocks_encrypted\@:
+ cmp $0, %r13
+ je _zero_cipher_left\@
+
+ sub $128, %r13
+ je _eight_cipher_left\@
+
+
+
+
+ vmovd %xmm9, %r15d
+ and $255, %r15d
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+_encrypt_by_8_new\@:
+ cmp $(255-8), %r15d
+ jg _encrypt_by_8\@
+
+
+
+ add $8, %r15b
+ \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ jmp _eight_cipher_left\@
+
+_encrypt_by_8\@:
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $8, %r15b
+ \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+
+
+_eight_cipher_left\@:
+ \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+
+
+_zero_cipher_left\@:
+ vmovdqu %xmm14, AadHash(arg2)
+ vmovdqu %xmm9, CurCount(arg2)
+
+ # check for 0 length
+ mov arg5, %r13
+ and $15, %r13 # r13 = (arg5 mod 16)
+
+ je _multiple_of_16_bytes\@
+
+ # handle the last <16 Byte block separately
+
+ mov %r13, PBlockLen(arg2)
+
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vmovdqu %xmm9, CurCount(arg2)
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+ ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
+ vmovdqu %xmm9, PBlockEncKey(arg2)
+
+ cmp $16, arg5
+ jge _large_enough_update\@
+
+ lea (arg4,%r11,1), %r10
+ mov %r13, %r12
+
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm1
+
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12 # adjust the shuffle mask pointer to be
+ # able to shift 16-r13 bytes (r13 is the
+ # number of bytes in plaintext mod 16)
+
+ jmp _final_ghash_mul\@
+
+_large_enough_update\@:
+ sub $16, %r11
+ add %r13, %r11
+
+ # receive the last <16 Byte block
+ vmovdqu (arg4, %r11, 1), %xmm1
+
+ sub %r13, %r11
+ add $16, %r11
+
+ lea SHIFT_MASK+16(%rip), %r12
+ # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+ # (r13 is the number of bytes in plaintext mod 16)
+ sub %r13, %r12
+ # get the appropriate shuffle mask
+ vmovdqu (%r12), %xmm2
+ # shift right 16-r13 bytes
+ vpshufb %xmm2, %xmm1, %xmm1
+
+_final_ghash_mul\@:
+ .if \ENC_DEC == DEC
+ vmovdqa %xmm1, %xmm2
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
+ # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm2, %xmm2
+ vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm14, %xmm14
+
+ vmovdqu %xmm14, AadHash(arg2)
+ .else
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
+ # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ vpxor %xmm9, %xmm14, %xmm14
+
+ vmovdqu %xmm14, AadHash(arg2)
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
+ .endif
+
+
+ #############################
+ # output r13 Bytes
+ vmovq %xmm9, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left\@
+
+ mov %rax, (arg3 , %r11)
+ add $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ vmovq %xmm9, %rax
+ sub $8, %r13
+
+_less_than_8_bytes_left\@:
+ movb %al, (arg3 , %r11)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left\@
+ #############################
+
+_multiple_of_16_bytes\@:
+.endm
+
+
+# GCM_COMPLETE Finishes update of tag of last partial block
+# Output: Authorization Tag (AUTH_TAG)
+# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
+.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
+ vmovdqu AadHash(arg2), %xmm14
+ vmovdqu HashKey(arg2), %xmm13
+
+ mov PBlockLen(arg2), %r12
+ cmp $0, %r12
+ je _partial_done\@
+
+ #GHASH computation for the last <16 Byte block
+ \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+
+_partial_done\@:
+ mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
+ shl $3, %r12 # convert into number of bits
+ vmovd %r12d, %xmm15 # len(A) in xmm15
+
+ mov InLen(arg2), %r12
+ shl $3, %r12 # len(C) in bits (*128)
+ vmovq %r12, %xmm1
+ vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
+ vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
+
+ vpxor %xmm15, %xmm14, %xmm14
+ \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
+ vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
+
+ vmovdqu OrigIV(arg2), %xmm9
+
+ ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
+
+ vpxor %xmm14, %xmm9, %xmm9
+
+
+
+_return_T\@:
+ mov \AUTH_TAG, %r10 # r10 = authTag
+ mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
+
+ cmp $16, %r11
+ je _T_16\@
+
+ cmp $8, %r11
+ jl _T_4\@
+
+_T_8\@:
+ vmovq %xmm9, %rax
+ mov %rax, (%r10)
+ add $8, %r10
+ sub $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_4\@:
+ vmovd %xmm9, %eax
+ mov %eax, (%r10)
+ add $4, %r10
+ sub $4, %r11
+ vpsrldq $4, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_123\@:
+ vmovd %xmm9, %eax
+ cmp $2, %r11
+ jl _T_1\@
+ mov %ax, (%r10)
+ cmp $2, %r11
+ je _return_T_done\@
+ add $2, %r10
+ sar $16, %eax
+_T_1\@:
+ mov %al, (%r10)
+ jmp _return_T_done\@
+
+_T_16\@:
+ vmovdqu %xmm9, (%r10)
+
+_return_T_done\@:
+.endm
+
+.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
+
+ mov \AAD, %r10 # r10 = AAD
+ mov \AADLEN, %r12 # r12 = aadLen
+
+
+ mov %r12, %r11
+
+ vpxor \T8, \T8, \T8
+ vpxor \T7, \T7, \T7
+ cmp $16, %r11
+ jl _get_AAD_rest8\@
+_get_AAD_blocks\@:
+ vmovdqu (%r10), \T7
+ vpshufb SHUF_MASK(%rip), \T7, \T7
+ vpxor \T7, \T8, \T8
+ \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
+ add $16, %r10
+ sub $16, %r12
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\@
+ vmovdqu \T8, \T7
+ cmp $0, %r11
+ je _get_AAD_done\@
+
+ vpxor \T7, \T7, \T7
+
+ /* read the last <16B of AAD. since we have at least 4B of
+ data right after the AAD (the ICV, and maybe some CT), we can
+ read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\@:
+ cmp $4, %r11
+ jle _get_AAD_rest4\@
+ movq (%r10), \T1
+ add $8, %r10
+ sub $8, %r11
+ vpslldq $8, \T1, \T1
+ vpsrldq $8, \T7, \T7
+ vpxor \T1, \T7, \T7
+ jmp _get_AAD_rest8\@
+_get_AAD_rest4\@:
+ cmp $0, %r11
+ jle _get_AAD_rest0\@
+ mov (%r10), %eax
+ movq %rax, \T1
+ add $4, %r10
+ sub $4, %r11
+ vpslldq $12, \T1, \T1
+ vpsrldq $4, \T7, \T7
+ vpxor \T1, \T7, \T7
+_get_AAD_rest0\@:
+ /* finalize: shift out the extra bytes we read, and align
+ left. since pslldq can only shift by an immediate, we use
+ vpshufb and an array of shuffle masks */
+ movq %r12, %r11
+ salq $4, %r11
+ vmovdqu aad_shift_arr(%r11), \T1
+ vpshufb \T1, \T7, \T7
+_get_AAD_rest_final\@:
+ vpshufb SHUF_MASK(%rip), \T7, \T7
+ vpxor \T8, \T7, \T7
+ \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
+
+_get_AAD_done\@:
+ vmovdqu \T7, AadHash(arg2)
+.endm
+
+.macro INIT GHASH_MUL PRECOMPUTE
+ mov arg6, %r11
+ mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
+ xor %r11d, %r11d
+ mov %r11, InLen(arg2) # ctx_data.in_length = 0
+
+ mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
+ mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
+ mov arg3, %rax
+ movdqu (%rax), %xmm0
+ movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
+
+ vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
+ movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
+
+ vmovdqu (arg4), %xmm6 # xmm6 = HashKey
+
+ vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
+ ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
+ vmovdqa %xmm6, %xmm2
+ vpsllq $1, %xmm6, %xmm6
+ vpsrlq $63, %xmm2, %xmm2
+ vmovdqa %xmm2, %xmm1
+ vpslldq $8, %xmm2, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpor %xmm2, %xmm6, %xmm6
+ #reduction
+ vpshufd $0b00100100, %xmm1, %xmm2
+ vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
+ vpand POLY(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
+ #######################################################################
+ vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
+
+ CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
+
+ \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+.endm
+
+
+# Reads DLEN bytes starting at DPTR and stores in XMMDst
+# where 0 < DLEN < 16
+# Clobbers %rax, DLEN
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
+ vpxor \XMMDst, \XMMDst, \XMMDst
+
+ cmp $8, \DLEN
+ jl _read_lt8_\@
+ mov (\DPTR), %rax
+ vpinsrq $0, %rax, \XMMDst, \XMMDst
+ sub $8, \DLEN
+ jz _done_read_partial_block_\@
+ xor %eax, %eax
+_read_next_byte_\@:
+ shl $8, %rax
+ mov 7(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_\@
+ vpinsrq $1, %rax, \XMMDst, \XMMDst
+ jmp _done_read_partial_block_\@
+_read_lt8_\@:
+ xor %eax, %eax
+_read_next_byte_lt8_\@:
+ shl $8, %rax
+ mov -1(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_lt8_\@
+ vpinsrq $0, %rax, \XMMDst, \XMMDst
+_done_read_partial_block_\@:
+.endm
+
+# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
+# between update calls.
+# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
+# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
+# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
+.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
+ AAD_HASH ENC_DEC
+ mov PBlockLen(arg2), %r13
+ cmp $0, %r13
+ je _partial_block_done_\@ # Leave Macro if no partial blocks
+ # Read in input data without over reading
+ cmp $16, \PLAIN_CYPH_LEN
+ jl _fewer_than_16_bytes_\@
+ vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
+ jmp _data_read_\@
+
+_fewer_than_16_bytes_\@:
+ lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
+ mov \PLAIN_CYPH_LEN, %r12
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm1
+
+ mov PBlockLen(arg2), %r13
+
+_data_read_\@: # Finished reading in data
+
+ vmovdqu PBlockEncKey(arg2), %xmm9
+ vmovdqu HashKey(arg2), %xmm13
+
+ lea SHIFT_MASK(%rip), %r12
+
+ # adjust the shuffle mask pointer to be able to shift r13 bytes
+ # r16-r13 is the number of bytes in plaintext mod 16)
+ add %r13, %r12
+ vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
+
+.if \ENC_DEC == DEC
+ vmovdqa %xmm1, %xmm3
+ pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
+
+ mov \PLAIN_CYPH_LEN, %r10
+ add %r13, %r10
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+ sub $16, %r10
+ # Determine if if partial block is not being filled and
+ # shift mask accordingly
+ jge _no_extra_mask_1_\@
+ sub %r10, %r12
+_no_extra_mask_1_\@:
+
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
+
+ vpand %xmm1, %xmm3, %xmm3
+ vmovdqa SHUF_MASK(%rip), %xmm10
+ vpshufb %xmm10, %xmm3, %xmm3
+ vpshufb %xmm2, %xmm3, %xmm3
+ vpxor %xmm3, \AAD_HASH, \AAD_HASH
+
+ cmp $0, %r10
+ jl _partial_incomplete_1_\@
+
+ # GHASH computation for the last <16 Byte block
+ \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ xor %eax,%eax
+
+ mov %rax, PBlockLen(arg2)
+ jmp _dec_done_\@
+_partial_incomplete_1_\@:
+ add \PLAIN_CYPH_LEN, PBlockLen(arg2)
+_dec_done_\@:
+ vmovdqu \AAD_HASH, AadHash(arg2)
+.else
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+
+ mov \PLAIN_CYPH_LEN, %r10
+ add %r13, %r10
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+ sub $16, %r10
+ # Determine if if partial block is not being filled and
+ # shift mask accordingly
+ jge _no_extra_mask_2_\@
+ sub %r10, %r12
+_no_extra_mask_2_\@:
+
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9
+
+ vmovdqa SHUF_MASK(%rip), %xmm1
+ vpshufb %xmm1, %xmm9, %xmm9
+ vpshufb %xmm2, %xmm9, %xmm9
+ vpxor %xmm9, \AAD_HASH, \AAD_HASH
+
+ cmp $0, %r10
+ jl _partial_incomplete_2_\@
+
+ # GHASH computation for the last <16 Byte block
+ \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ xor %eax,%eax
+
+ mov %rax, PBlockLen(arg2)
+ jmp _encode_done_\@
+_partial_incomplete_2_\@:
+ add \PLAIN_CYPH_LEN, PBlockLen(arg2)
+_encode_done_\@:
+ vmovdqu \AAD_HASH, AadHash(arg2)
+
+ vmovdqa SHUF_MASK(%rip), %xmm10
+ # shuffle xmm9 back to output as ciphertext
+ vpshufb %xmm10, %xmm9, %xmm9
+ vpshufb %xmm2, %xmm9, %xmm9
+.endif
+ # output encrypted Bytes
+ cmp $0, %r10
+ jl _partial_fill_\@
+ mov %r13, %r12
+ mov $16, %r13
+ # Set r13 to be the number of bytes to write out
+ sub %r12, %r13
+ jmp _count_set_\@
+_partial_fill_\@:
+ mov \PLAIN_CYPH_LEN, %r13
+_count_set_\@:
+ vmovdqa %xmm9, %xmm0
+ vmovq %xmm0, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left_\@
+
+ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+ add $8, \DATA_OFFSET
+ psrldq $8, %xmm0
+ vmovq %xmm0, %rax
+ sub $8, %r13
+_less_than_8_bytes_left_\@:
+ movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+ add $1, \DATA_OFFSET
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left_\@
+_partial_block_done_\@:
+.endm # PARTIAL_BLOCK
+
#ifdef CONFIG_AS_AVX
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
@@ -341,49 +947,49 @@
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_k(arg1)
+ vmovdqu \T1, HashKey_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
- vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
+ vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_2_k(arg1)
+ vmovdqu \T1, HashKey_2_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
- vmovdqa \T5, HashKey_3(arg1)
+ vmovdqu \T5, HashKey_3(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_3_k(arg1)
+ vmovdqu \T1, HashKey_3_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
- vmovdqa \T5, HashKey_4(arg1)
+ vmovdqu \T5, HashKey_4(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_4_k(arg1)
+ vmovdqu \T1, HashKey_4_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
- vmovdqa \T5, HashKey_5(arg1)
+ vmovdqu \T5, HashKey_5(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_5_k(arg1)
+ vmovdqu \T1, HashKey_5_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
- vmovdqa \T5, HashKey_6(arg1)
+ vmovdqu \T5, HashKey_6(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_6_k(arg1)
+ vmovdqu \T1, HashKey_6_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
- vmovdqa \T5, HashKey_7(arg1)
+ vmovdqu \T5, HashKey_7(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_7_k(arg1)
+ vmovdqu \T1, HashKey_7_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
- vmovdqa \T5, HashKey_8(arg1)
+ vmovdqu \T5, HashKey_8(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_8_k(arg1)
+ vmovdqu \T1, HashKey_8_k(arg2)
.endm
@@ -392,84 +998,15 @@
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+## arg1, arg3, arg4, r14 are used as a pointer only, not modified
-.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
+.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
i = (8-\num_initial_blocks)
- j = 0
setreg
-
- mov arg6, %r10 # r10 = AAD
- mov arg7, %r12 # r12 = aadLen
-
-
- mov %r12, %r11
-
- vpxor reg_j, reg_j, reg_j
- vpxor reg_i, reg_i, reg_i
- cmp $16, %r11
- jl _get_AAD_rest8\@
-_get_AAD_blocks\@:
- vmovdqu (%r10), reg_i
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_i, reg_j, reg_j
- GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
- add $16, %r10
- sub $16, %r12
- sub $16, %r11
- cmp $16, %r11
- jge _get_AAD_blocks\@
- vmovdqu reg_j, reg_i
- cmp $0, %r11
- je _get_AAD_done\@
-
- vpxor reg_i, reg_i, reg_i
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some CT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
- cmp $4, %r11
- jle _get_AAD_rest4\@
- movq (%r10), \T1
- add $8, %r10
- sub $8, %r11
- vpslldq $8, \T1, \T1
- vpsrldq $8, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
- jmp _get_AAD_rest8\@
-_get_AAD_rest4\@:
- cmp $0, %r11
- jle _get_AAD_rest0\@
- mov (%r10), %eax
- movq %rax, \T1
- add $4, %r10
- sub $4, %r11
- vpslldq $12, \T1, \T1
- vpsrldq $4, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
-_get_AAD_rest0\@:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- movdqu aad_shift_arr(%r11), \T1
- vpshufb \T1, reg_i, reg_i
-_get_AAD_rest_final\@:
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_j, reg_i, reg_i
- GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
-
-_get_AAD_done\@:
- # initialize the data pointer offset as zero
- xor %r11d, %r11d
+ vmovdqu AadHash(arg2), reg_i
# start AES for num_initial_blocks blocks
- mov arg5, %rax # rax = *Y0
- vmovdqu (%rax), \CTR # CTR = Y0
- vpshufb SHUF_MASK(%rip), \CTR, \CTR
-
+ vmovdqu CurCount(arg2), \CTR
i = (9-\num_initial_blocks)
setreg
@@ -490,10 +1027,10 @@
setreg
.endr
- j = 1
- setreg
-.rep 9
- vmovdqa 16*j(arg1), \T_key
+ j = 1
+ setreg
+.rep \REP
+ vmovdqa 16*j(arg1), \T_key
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
@@ -502,12 +1039,11 @@
setreg
.endr
- j = (j+1)
- setreg
+ j = (j+1)
+ setreg
.endr
-
- vmovdqa 16*10(arg1), \T_key
+ vmovdqa 16*j(arg1), \T_key
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
@@ -519,9 +1055,9 @@
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
- vmovdqu (arg3, %r11), \T1
+ vmovdqu (arg4, %r11), \T1
vpxor \T1, reg_i, reg_i
- vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
+ vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
add $16, %r11
.if \ENC_DEC == DEC
vmovdqa \T1, reg_i
@@ -595,9 +1131,9 @@
vpxor \T_key, \XMM7, \XMM7
vpxor \T_key, \XMM8, \XMM8
- i = 1
- setreg
-.rep 9 # do 9 rounds
+ i = 1
+ setreg
+.rep \REP # do REP rounds
vmovdqa 16*i(arg1), \T_key
vaesenc \T_key, \XMM1, \XMM1
vaesenc \T_key, \XMM2, \XMM2
@@ -607,11 +1143,10 @@
vaesenc \T_key, \XMM6, \XMM6
vaesenc \T_key, \XMM7, \XMM7
vaesenc \T_key, \XMM8, \XMM8
- i = (i+1)
- setreg
+ i = (i+1)
+ setreg
.endr
-
vmovdqa 16*i(arg1), \T_key
vaesenclast \T_key, \XMM1, \XMM1
vaesenclast \T_key, \XMM2, \XMM2
@@ -622,58 +1157,58 @@
vaesenclast \T_key, \XMM7, \XMM7
vaesenclast \T_key, \XMM8, \XMM8
- vmovdqu (arg3, %r11), \T1
+ vmovdqu (arg4, %r11), \T1
vpxor \T1, \XMM1, \XMM1
- vmovdqu \XMM1, (arg2 , %r11)
+ vmovdqu \XMM1, (arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM1
.endif
- vmovdqu 16*1(arg3, %r11), \T1
+ vmovdqu 16*1(arg4, %r11), \T1
vpxor \T1, \XMM2, \XMM2
- vmovdqu \XMM2, 16*1(arg2 , %r11)
+ vmovdqu \XMM2, 16*1(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM2
.endif
- vmovdqu 16*2(arg3, %r11), \T1
+ vmovdqu 16*2(arg4, %r11), \T1
vpxor \T1, \XMM3, \XMM3
- vmovdqu \XMM3, 16*2(arg2 , %r11)
+ vmovdqu \XMM3, 16*2(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM3
.endif
- vmovdqu 16*3(arg3, %r11), \T1
+ vmovdqu 16*3(arg4, %r11), \T1
vpxor \T1, \XMM4, \XMM4
- vmovdqu \XMM4, 16*3(arg2 , %r11)
+ vmovdqu \XMM4, 16*3(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM4
.endif
- vmovdqu 16*4(arg3, %r11), \T1
+ vmovdqu 16*4(arg4, %r11), \T1
vpxor \T1, \XMM5, \XMM5
- vmovdqu \XMM5, 16*4(arg2 , %r11)
+ vmovdqu \XMM5, 16*4(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM5
.endif
- vmovdqu 16*5(arg3, %r11), \T1
+ vmovdqu 16*5(arg4, %r11), \T1
vpxor \T1, \XMM6, \XMM6
- vmovdqu \XMM6, 16*5(arg2 , %r11)
+ vmovdqu \XMM6, 16*5(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM6
.endif
- vmovdqu 16*6(arg3, %r11), \T1
+ vmovdqu 16*6(arg4, %r11), \T1
vpxor \T1, \XMM7, \XMM7
- vmovdqu \XMM7, 16*6(arg2 , %r11)
+ vmovdqu \XMM7, 16*6(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM7
.endif
- vmovdqu 16*7(arg3, %r11), \T1
+ vmovdqu 16*7(arg4, %r11), \T1
vpxor \T1, \XMM8, \XMM8
- vmovdqu \XMM8, 16*7(arg2 , %r11)
+ vmovdqu \XMM8, 16*7(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM8
.endif
@@ -698,9 +1233,9 @@
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3 are used as pointers only, not modified
+# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
vmovdqa \XMM1, \T2
vmovdqa \XMM2, TMP2(%rsp)
@@ -784,14 +1319,14 @@
#######################################################################
- vmovdqa HashKey_8(arg1), \T5
+ vmovdqu HashKey_8(arg2), \T5
vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
vpshufd $0b01001110, \T2, \T6
vpxor \T2, \T6, \T6
- vmovdqa HashKey_8_k(arg1), \T5
+ vmovdqu HashKey_8_k(arg2), \T5
vpclmulqdq $0x00, \T5, \T6, \T6
vmovdqu 16*3(arg1), \T1
@@ -805,7 +1340,7 @@
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP2(%rsp), \T1
- vmovdqa HashKey_7(arg1), \T5
+ vmovdqu HashKey_7(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -813,7 +1348,7 @@
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_7_k(arg1), \T5
+ vmovdqu HashKey_7_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -830,7 +1365,7 @@
#######################################################################
vmovdqa TMP3(%rsp), \T1
- vmovdqa HashKey_6(arg1), \T5
+ vmovdqu HashKey_6(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -838,7 +1373,7 @@
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_6_k(arg1), \T5
+ vmovdqu HashKey_6_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -853,7 +1388,7 @@
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP4(%rsp), \T1
- vmovdqa HashKey_5(arg1), \T5
+ vmovdqu HashKey_5(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -861,7 +1396,7 @@
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_5_k(arg1), \T5
+ vmovdqu HashKey_5_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -877,7 +1412,7 @@
vmovdqa TMP5(%rsp), \T1
- vmovdqa HashKey_4(arg1), \T5
+ vmovdqu HashKey_4(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -885,7 +1420,7 @@
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_4_k(arg1), \T5
+ vmovdqu HashKey_4_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -900,7 +1435,7 @@
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP6(%rsp), \T1
- vmovdqa HashKey_3(arg1), \T5
+ vmovdqu HashKey_3(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -908,7 +1443,7 @@
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_3_k(arg1), \T5
+ vmovdqu HashKey_3_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -924,7 +1459,7 @@
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP7(%rsp), \T1
- vmovdqa HashKey_2(arg1), \T5
+ vmovdqu HashKey_2(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -932,7 +1467,7 @@
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_2_k(arg1), \T5
+ vmovdqu HashKey_2_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -949,7 +1484,7 @@
vaesenc \T5, \XMM8, \XMM8
vmovdqa TMP8(%rsp), \T1
- vmovdqa HashKey(arg1), \T5
+ vmovdqu HashKey(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -957,7 +1492,7 @@
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_k(arg1), \T5
+ vmovdqu HashKey_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -966,17 +1501,35 @@
vmovdqu 16*10(arg1), \T5
+ i = 11
+ setreg
+.rep (\REP-9)
+
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqu 16*i(arg1), \T5
+ i = i + 1
+ setreg
+.endr
+
i = 0
j = 1
setreg
.rep 8
- vpxor 16*i(arg3, %r11), \T5, \T2
+ vpxor 16*i(arg4, %r11), \T5, \T2
.if \ENC_DEC == ENC
vaesenclast \T2, reg_j, reg_j
.else
vaesenclast \T2, reg_j, \T3
- vmovdqu 16*i(arg3, %r11), reg_j
- vmovdqu \T3, 16*i(arg2, %r11)
+ vmovdqu 16*i(arg4, %r11), reg_j
+ vmovdqu \T3, 16*i(arg3, %r11)
.endif
i = (i+1)
j = (j+1)
@@ -1008,14 +1561,14 @@
vpxor \T2, \T7, \T7 # first phase of the reduction complete
#######################################################################
.if \ENC_DEC == ENC
- vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
.endif
#######################################################################
@@ -1056,25 +1609,25 @@
vpshufd $0b01001110, \XMM1, \T2
vpxor \XMM1, \T2, \T2
- vmovdqa HashKey_8(arg1), \T5
+ vmovdqu HashKey_8(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM1, \T6
vpclmulqdq $0x00, \T5, \XMM1, \T7
- vmovdqa HashKey_8_k(arg1), \T3
+ vmovdqu HashKey_8_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \XMM1
######################
vpshufd $0b01001110, \XMM2, \T2
vpxor \XMM2, \T2, \T2
- vmovdqa HashKey_7(arg1), \T5
+ vmovdqu HashKey_7(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM2, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM2, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_7_k(arg1), \T3
+ vmovdqu HashKey_7_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1082,14 +1635,14 @@
vpshufd $0b01001110, \XMM3, \T2
vpxor \XMM3, \T2, \T2
- vmovdqa HashKey_6(arg1), \T5
+ vmovdqu HashKey_6(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM3, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM3, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_6_k(arg1), \T3
+ vmovdqu HashKey_6_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1097,14 +1650,14 @@
vpshufd $0b01001110, \XMM4, \T2
vpxor \XMM4, \T2, \T2
- vmovdqa HashKey_5(arg1), \T5
+ vmovdqu HashKey_5(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM4, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM4, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_5_k(arg1), \T3
+ vmovdqu HashKey_5_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1112,14 +1665,14 @@
vpshufd $0b01001110, \XMM5, \T2
vpxor \XMM5, \T2, \T2
- vmovdqa HashKey_4(arg1), \T5
+ vmovdqu HashKey_4(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM5, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM5, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_4_k(arg1), \T3
+ vmovdqu HashKey_4_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1127,14 +1680,14 @@
vpshufd $0b01001110, \XMM6, \T2
vpxor \XMM6, \T2, \T2
- vmovdqa HashKey_3(arg1), \T5
+ vmovdqu HashKey_3(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM6, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM6, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_3_k(arg1), \T3
+ vmovdqu HashKey_3_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1142,14 +1695,14 @@
vpshufd $0b01001110, \XMM7, \T2
vpxor \XMM7, \T2, \T2
- vmovdqa HashKey_2(arg1), \T5
+ vmovdqu HashKey_2(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM7, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM7, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_2_k(arg1), \T3
+ vmovdqu HashKey_2_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1157,14 +1710,14 @@
vpshufd $0b01001110, \XMM8, \T2
vpxor \XMM8, \T2, \T2
- vmovdqa HashKey(arg1), \T5
+ vmovdqu HashKey(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM8, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM8, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_k(arg1), \T3
+ vmovdqu HashKey_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1210,413 +1763,112 @@
.endm
-
-# combined for GCM encrypt and decrypt functions
-# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r14, r15
-.macro GCM_ENC_DEC_AVX ENC_DEC
-
- #the number of pushes must equal STACK_OFFSET
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov %rsp, %r14
-
-
-
-
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp # align rsp to 64 bytes
-
-
- vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
-
- mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
- and $-16, %r13 # r13 = r13 - (r13 mod 16)
-
- mov %r13, %r12
- shr $4, %r12
- and $7, %r12
- jz _initial_num_blocks_is_0\@
-
- cmp $7, %r12
- je _initial_num_blocks_is_7\@
- cmp $6, %r12
- je _initial_num_blocks_is_6\@
- cmp $5, %r12
- je _initial_num_blocks_is_5\@
- cmp $4, %r12
- je _initial_num_blocks_is_4\@
- cmp $3, %r12
- je _initial_num_blocks_is_3\@
- cmp $2, %r12
- je _initial_num_blocks_is_2\@
-
- jmp _initial_num_blocks_is_1\@
-
-_initial_num_blocks_is_7\@:
- INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*7, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_6\@:
- INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*6, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_5\@:
- INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*5, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_4\@:
- INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*4, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_3\@:
- INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*3, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_2\@:
- INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*2, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_1\@:
- INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*1, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_0\@:
- INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-
-
-_initial_blocks_encrypted\@:
- cmp $0, %r13
- je _zero_cipher_left\@
-
- sub $128, %r13
- je _eight_cipher_left\@
-
-
-
-
- vmovd %xmm9, %r15d
- and $255, %r15d
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-_encrypt_by_8_new\@:
- cmp $(255-8), %r15d
- jg _encrypt_by_8\@
-
-
-
- add $8, %r15b
- GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
- add $128, %r11
- sub $128, %r13
- jne _encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- jmp _eight_cipher_left\@
-
-_encrypt_by_8\@:
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $8, %r15b
- GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $128, %r11
- sub $128, %r13
- jne _encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-
-
-_eight_cipher_left\@:
- GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
-
-
-_zero_cipher_left\@:
- cmp $16, arg4
- jl _only_less_than_16\@
-
- mov arg4, %r13
- and $15, %r13 # r13 = (arg4 mod 16)
-
- je _multiple_of_16_bytes\@
-
- # handle the last <16 Byte block seperately
-
-
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
-
- sub $16, %r11
- add %r13, %r11
- vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
-
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12 # adjust the shuffle mask pointer to be
- # able to shift 16-r13 bytes (r13 is the
- # number of bytes in plaintext mod 16)
- vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
- vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
- jmp _final_ghash_mul\@
-
-_only_less_than_16\@:
- # check for 0 length
- mov arg4, %r13
- and $15, %r13 # r13 = (arg4 mod 16)
-
- je _multiple_of_16_bytes\@
-
- # handle the last <16 Byte block seperately
-
-
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
-
-
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12 # adjust the shuffle mask pointer to be
- # able to shift 16-r13 bytes (r13 is the
- # number of bytes in plaintext mod 16)
-
-_get_last_16_byte_loop\@:
- movb (arg3, %r11), %al
- movb %al, TMP1 (%rsp , %r11)
- add $1, %r11
- cmp %r13, %r11
- jne _get_last_16_byte_loop\@
-
- vmovdqu TMP1(%rsp), %xmm1
-
- sub $16, %r11
-
-_final_ghash_mul\@:
- .if \ENC_DEC == DEC
- vmovdqa %xmm1, %xmm2
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
- # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm2, %xmm2
- vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm14, %xmm14
- #GHASH computation for the last <16 Byte block
- GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- sub %r13, %r11
- add $16, %r11
- .else
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
- # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- vpxor %xmm9, %xmm14, %xmm14
- #GHASH computation for the last <16 Byte block
- GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- sub %r13, %r11
- add $16, %r11
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
- .endif
-
-
- #############################
- # output r13 Bytes
- vmovq %xmm9, %rax
- cmp $8, %r13
- jle _less_than_8_bytes_left\@
-
- mov %rax, (arg2 , %r11)
- add $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- vmovq %xmm9, %rax
- sub $8, %r13
-
-_less_than_8_bytes_left\@:
- movb %al, (arg2 , %r11)
- add $1, %r11
- shr $8, %rax
- sub $1, %r13
- jne _less_than_8_bytes_left\@
- #############################
-
-_multiple_of_16_bytes\@:
- mov arg7, %r12 # r12 = aadLen (number of bytes)
- shl $3, %r12 # convert into number of bits
- vmovd %r12d, %xmm15 # len(A) in xmm15
-
- shl $3, arg4 # len(C) in bits (*128)
- vmovq arg4, %xmm1
- vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
- vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
-
- vpxor %xmm15, %xmm14, %xmm14
- GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
- vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
-
- mov arg5, %rax # rax = *Y0
- vmovdqu (%rax), %xmm9 # xmm9 = Y0
-
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
-
- vpxor %xmm14, %xmm9, %xmm9
-
-
-
-_return_T\@:
- mov arg8, %r10 # r10 = authTag
- mov arg9, %r11 # r11 = auth_tag_len
-
- cmp $16, %r11
- je _T_16\@
-
- cmp $8, %r11
- jl _T_4\@
-
-_T_8\@:
- vmovq %xmm9, %rax
- mov %rax, (%r10)
- add $8, %r10
- sub $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- cmp $0, %r11
- je _return_T_done\@
-_T_4\@:
- vmovd %xmm9, %eax
- mov %eax, (%r10)
- add $4, %r10
- sub $4, %r11
- vpsrldq $4, %xmm9, %xmm9
- cmp $0, %r11
- je _return_T_done\@
-_T_123\@:
- vmovd %xmm9, %eax
- cmp $2, %r11
- jl _T_1\@
- mov %ax, (%r10)
- cmp $2, %r11
- je _return_T_done\@
- add $2, %r10
- sar $16, %eax
-_T_1\@:
- mov %al, (%r10)
- jmp _return_T_done\@
-
-_T_16\@:
- vmovdqu %xmm9, (%r10)
-
-_return_T_done\@:
- mov %r14, %rsp
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
-.endm
-
-
#############################################################
#void aesni_gcm_precomp_avx_gen2
# (gcm_data *my_ctx_data,
-# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+# gcm_context_data *data,
+# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
-ENTRY(aesni_gcm_precomp_avx_gen2)
- #the number of pushes must equal STACK_OFFSET
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov %rsp, %r14
-
-
-
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp # align rsp to 64 bytes
-
- vmovdqu (arg2), %xmm6 # xmm6 = HashKey
-
- vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
- ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
- vmovdqa %xmm6, %xmm2
- vpsllq $1, %xmm6, %xmm6
- vpsrlq $63, %xmm2, %xmm2
- vmovdqa %xmm2, %xmm1
- vpslldq $8, %xmm2, %xmm2
- vpsrldq $8, %xmm1, %xmm1
- vpor %xmm2, %xmm6, %xmm6
- #reduction
- vpshufd $0b00100100, %xmm1, %xmm2
- vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
- vpand POLY(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
- #######################################################################
- vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
-
-
- PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
-
- mov %r14, %rsp
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
+ENTRY(aesni_gcm_init_avx_gen2)
+ FUNC_SAVE
+ INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
+ FUNC_RESTORE
ret
-ENDPROC(aesni_gcm_precomp_avx_gen2)
+ENDPROC(aesni_gcm_init_avx_gen2)
###############################################################################
-#void aesni_gcm_enc_avx_gen2(
+#void aesni_gcm_enc_update_avx_gen2(
# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
# const u8 *in, /* Plaintext input */
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
-# (from Security Association) concatenated with 8 byte
-# Initialisation Vector (from IPSec ESP Payload)
-# concatenated with 0x00000001. 16-byte aligned pointer. */
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-# u8 *auth_tag, /* Authenticated Tag output. */
-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
-# Valid values are 16 (most likely), 12 or 8. */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
-ENTRY(aesni_gcm_enc_avx_gen2)
- GCM_ENC_DEC_AVX ENC
- ret
-ENDPROC(aesni_gcm_enc_avx_gen2)
+ENTRY(aesni_gcm_enc_update_avx_gen2)
+ FUNC_SAVE
+ mov keysize, %eax
+ cmp $32, %eax
+ je key_256_enc_update
+ cmp $16, %eax
+ je key_128_enc_update
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
+ FUNC_RESTORE
+ ret
+key_128_enc_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
+ FUNC_RESTORE
+ ret
+key_256_enc_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_enc_update_avx_gen2)
###############################################################################
-#void aesni_gcm_dec_avx_gen2(
+#void aesni_gcm_dec_update_avx_gen2(
# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
# const u8 *in, /* Ciphertext input */
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
-# (from Security Association) concatenated with 8 byte
-# Initialisation Vector (from IPSec ESP Payload)
-# concatenated with 0x00000001. 16-byte aligned pointer. */
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
+###############################################################################
+ENTRY(aesni_gcm_dec_update_avx_gen2)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_dec_update
+ cmp $16, %eax
+ je key_128_dec_update
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
+ FUNC_RESTORE
+ ret
+key_128_dec_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
+ FUNC_RESTORE
+ ret
+key_256_dec_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_dec_update_avx_gen2)
+
+###############################################################################
+#void aesni_gcm_finalize_avx_gen2(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
# u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
# Valid values are 16 (most likely), 12 or 8. */
###############################################################################
-ENTRY(aesni_gcm_dec_avx_gen2)
- GCM_ENC_DEC_AVX DEC
- ret
-ENDPROC(aesni_gcm_dec_avx_gen2)
+ENTRY(aesni_gcm_finalize_avx_gen2)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_finalize
+ cmp $16, %eax
+ je key_128_finalize
+ # must be 192
+ GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_128_finalize:
+ GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_256_finalize:
+ GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_finalize_avx_gen2)
+
#endif /* CONFIG_AS_AVX */
#ifdef CONFIG_AS_AVX2
@@ -1670,113 +1922,42 @@
# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
vmovdqa \HK, \T5
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
- vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
+ vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
- vmovdqa \T5, HashKey_3(arg1)
+ vmovdqu \T5, HashKey_3(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
- vmovdqa \T5, HashKey_4(arg1)
+ vmovdqu \T5, HashKey_4(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
- vmovdqa \T5, HashKey_5(arg1)
+ vmovdqu \T5, HashKey_5(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
- vmovdqa \T5, HashKey_6(arg1)
+ vmovdqu \T5, HashKey_6(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
- vmovdqa \T5, HashKey_7(arg1)
+ vmovdqu \T5, HashKey_7(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
- vmovdqa \T5, HashKey_8(arg1)
+ vmovdqu \T5, HashKey_8(arg2)
.endm
-
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+## arg1, arg3, arg4, r14 are used as a pointer only, not modified
-.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
+.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
i = (8-\num_initial_blocks)
- j = 0
setreg
-
- mov arg6, %r10 # r10 = AAD
- mov arg7, %r12 # r12 = aadLen
-
-
- mov %r12, %r11
-
- vpxor reg_j, reg_j, reg_j
- vpxor reg_i, reg_i, reg_i
-
- cmp $16, %r11
- jl _get_AAD_rest8\@
-_get_AAD_blocks\@:
- vmovdqu (%r10), reg_i
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_i, reg_j, reg_j
- GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
- add $16, %r10
- sub $16, %r12
- sub $16, %r11
- cmp $16, %r11
- jge _get_AAD_blocks\@
- vmovdqu reg_j, reg_i
- cmp $0, %r11
- je _get_AAD_done\@
-
- vpxor reg_i, reg_i, reg_i
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some CT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
- cmp $4, %r11
- jle _get_AAD_rest4\@
- movq (%r10), \T1
- add $8, %r10
- sub $8, %r11
- vpslldq $8, \T1, \T1
- vpsrldq $8, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
- jmp _get_AAD_rest8\@
-_get_AAD_rest4\@:
- cmp $0, %r11
- jle _get_AAD_rest0\@
- mov (%r10), %eax
- movq %rax, \T1
- add $4, %r10
- sub $4, %r11
- vpslldq $12, \T1, \T1
- vpsrldq $4, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
-_get_AAD_rest0\@:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- movdqu aad_shift_arr(%r11), \T1
- vpshufb \T1, reg_i, reg_i
-_get_AAD_rest_final\@:
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_j, reg_i, reg_i
- GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
-
-_get_AAD_done\@:
- # initialize the data pointer offset as zero
- xor %r11d, %r11d
+ vmovdqu AadHash(arg2), reg_i
# start AES for num_initial_blocks blocks
- mov arg5, %rax # rax = *Y0
- vmovdqu (%rax), \CTR # CTR = Y0
- vpshufb SHUF_MASK(%rip), \CTR, \CTR
-
+ vmovdqu CurCount(arg2), \CTR
i = (9-\num_initial_blocks)
setreg
@@ -1799,7 +1980,7 @@
j = 1
setreg
-.rep 9
+.rep \REP
vmovdqa 16*j(arg1), \T_key
i = (9-\num_initial_blocks)
setreg
@@ -1814,7 +1995,7 @@
.endr
- vmovdqa 16*10(arg1), \T_key
+ vmovdqa 16*j(arg1), \T_key
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
@@ -1826,9 +2007,9 @@
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
- vmovdqu (arg3, %r11), \T1
+ vmovdqu (arg4, %r11), \T1
vpxor \T1, reg_i, reg_i
- vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
+ vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
# num_initial_blocks blocks
add $16, %r11
.if \ENC_DEC == DEC
@@ -1905,7 +2086,7 @@
i = 1
setreg
-.rep 9 # do 9 rounds
+.rep \REP # do REP rounds
vmovdqa 16*i(arg1), \T_key
vaesenc \T_key, \XMM1, \XMM1
vaesenc \T_key, \XMM2, \XMM2
@@ -1930,58 +2111,58 @@
vaesenclast \T_key, \XMM7, \XMM7
vaesenclast \T_key, \XMM8, \XMM8
- vmovdqu (arg3, %r11), \T1
+ vmovdqu (arg4, %r11), \T1
vpxor \T1, \XMM1, \XMM1
- vmovdqu \XMM1, (arg2 , %r11)
+ vmovdqu \XMM1, (arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM1
.endif
- vmovdqu 16*1(arg3, %r11), \T1
+ vmovdqu 16*1(arg4, %r11), \T1
vpxor \T1, \XMM2, \XMM2
- vmovdqu \XMM2, 16*1(arg2 , %r11)
+ vmovdqu \XMM2, 16*1(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM2
.endif
- vmovdqu 16*2(arg3, %r11), \T1
+ vmovdqu 16*2(arg4, %r11), \T1
vpxor \T1, \XMM3, \XMM3
- vmovdqu \XMM3, 16*2(arg2 , %r11)
+ vmovdqu \XMM3, 16*2(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM3
.endif
- vmovdqu 16*3(arg3, %r11), \T1
+ vmovdqu 16*3(arg4, %r11), \T1
vpxor \T1, \XMM4, \XMM4
- vmovdqu \XMM4, 16*3(arg2 , %r11)
+ vmovdqu \XMM4, 16*3(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM4
.endif
- vmovdqu 16*4(arg3, %r11), \T1
+ vmovdqu 16*4(arg4, %r11), \T1
vpxor \T1, \XMM5, \XMM5
- vmovdqu \XMM5, 16*4(arg2 , %r11)
+ vmovdqu \XMM5, 16*4(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM5
.endif
- vmovdqu 16*5(arg3, %r11), \T1
+ vmovdqu 16*5(arg4, %r11), \T1
vpxor \T1, \XMM6, \XMM6
- vmovdqu \XMM6, 16*5(arg2 , %r11)
+ vmovdqu \XMM6, 16*5(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM6
.endif
- vmovdqu 16*6(arg3, %r11), \T1
+ vmovdqu 16*6(arg4, %r11), \T1
vpxor \T1, \XMM7, \XMM7
- vmovdqu \XMM7, 16*6(arg2 , %r11)
+ vmovdqu \XMM7, 16*6(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM7
.endif
- vmovdqu 16*7(arg3, %r11), \T1
+ vmovdqu 16*7(arg4, %r11), \T1
vpxor \T1, \XMM8, \XMM8
- vmovdqu \XMM8, 16*7(arg2 , %r11)
+ vmovdqu \XMM8, 16*7(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM8
.endif
@@ -2010,9 +2191,9 @@
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3 are used as pointers only, not modified
+# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
vmovdqa \XMM1, \T2
vmovdqa \XMM2, TMP2(%rsp)
@@ -2096,7 +2277,7 @@
#######################################################################
- vmovdqa HashKey_8(arg1), \T5
+ vmovdqu HashKey_8(arg2), \T5
vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
@@ -2114,7 +2295,7 @@
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP2(%rsp), \T1
- vmovdqa HashKey_7(arg1), \T5
+ vmovdqu HashKey_7(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2140,7 +2321,7 @@
#######################################################################
vmovdqa TMP3(%rsp), \T1
- vmovdqa HashKey_6(arg1), \T5
+ vmovdqu HashKey_6(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2164,7 +2345,7 @@
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP4(%rsp), \T1
- vmovdqa HashKey_5(arg1), \T5
+ vmovdqu HashKey_5(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2189,7 +2370,7 @@
vmovdqa TMP5(%rsp), \T1
- vmovdqa HashKey_4(arg1), \T5
+ vmovdqu HashKey_4(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2213,7 +2394,7 @@
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP6(%rsp), \T1
- vmovdqa HashKey_3(arg1), \T5
+ vmovdqu HashKey_3(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2237,7 +2418,7 @@
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP7(%rsp), \T1
- vmovdqa HashKey_2(arg1), \T5
+ vmovdqu HashKey_2(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2264,7 +2445,7 @@
vaesenc \T5, \XMM8, \XMM8
vmovdqa TMP8(%rsp), \T1
- vmovdqa HashKey(arg1), \T5
+ vmovdqu HashKey(arg2), \T5
vpclmulqdq $0x00, \T5, \T1, \T3
vpxor \T3, \T7, \T7
@@ -2281,17 +2462,34 @@
vmovdqu 16*10(arg1), \T5
+ i = 11
+ setreg
+.rep (\REP-9)
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqu 16*i(arg1), \T5
+ i = i + 1
+ setreg
+.endr
+
i = 0
j = 1
setreg
.rep 8
- vpxor 16*i(arg3, %r11), \T5, \T2
+ vpxor 16*i(arg4, %r11), \T5, \T2
.if \ENC_DEC == ENC
vaesenclast \T2, reg_j, reg_j
.else
vaesenclast \T2, reg_j, \T3
- vmovdqu 16*i(arg3, %r11), reg_j
- vmovdqu \T3, 16*i(arg2, %r11)
+ vmovdqu 16*i(arg4, %r11), reg_j
+ vmovdqu \T3, 16*i(arg3, %r11)
.endif
i = (i+1)
j = (j+1)
@@ -2317,14 +2515,14 @@
vpxor \T2, \T7, \T7 # first phase of the reduction complete
#######################################################################
.if \ENC_DEC == ENC
- vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
.endif
#######################################################################
@@ -2361,7 +2559,7 @@
## Karatsuba Method
- vmovdqa HashKey_8(arg1), \T5
+ vmovdqu HashKey_8(arg2), \T5
vpshufd $0b01001110, \XMM1, \T2
vpshufd $0b01001110, \T5, \T3
@@ -2375,7 +2573,7 @@
######################
- vmovdqa HashKey_7(arg1), \T5
+ vmovdqu HashKey_7(arg2), \T5
vpshufd $0b01001110, \XMM2, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM2, \T2, \T2
@@ -2393,7 +2591,7 @@
######################
- vmovdqa HashKey_6(arg1), \T5
+ vmovdqu HashKey_6(arg2), \T5
vpshufd $0b01001110, \XMM3, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM3, \T2, \T2
@@ -2411,7 +2609,7 @@
######################
- vmovdqa HashKey_5(arg1), \T5
+ vmovdqu HashKey_5(arg2), \T5
vpshufd $0b01001110, \XMM4, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM4, \T2, \T2
@@ -2429,7 +2627,7 @@
######################
- vmovdqa HashKey_4(arg1), \T5
+ vmovdqu HashKey_4(arg2), \T5
vpshufd $0b01001110, \XMM5, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM5, \T2, \T2
@@ -2447,7 +2645,7 @@
######################
- vmovdqa HashKey_3(arg1), \T5
+ vmovdqu HashKey_3(arg2), \T5
vpshufd $0b01001110, \XMM6, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM6, \T2, \T2
@@ -2465,7 +2663,7 @@
######################
- vmovdqa HashKey_2(arg1), \T5
+ vmovdqu HashKey_2(arg2), \T5
vpshufd $0b01001110, \XMM7, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM7, \T2, \T2
@@ -2483,7 +2681,7 @@
######################
- vmovdqa HashKey(arg1), \T5
+ vmovdqu HashKey(arg2), \T5
vpshufd $0b01001110, \XMM8, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM8, \T2, \T2
@@ -2536,411 +2734,110 @@
-# combined for GCM encrypt and decrypt functions
-# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r14, r15
-.macro GCM_ENC_DEC_AVX2 ENC_DEC
-
- #the number of pushes must equal STACK_OFFSET
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov %rsp, %r14
-
-
-
-
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp # align rsp to 64 bytes
-
-
- vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
-
- mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
- and $-16, %r13 # r13 = r13 - (r13 mod 16)
-
- mov %r13, %r12
- shr $4, %r12
- and $7, %r12
- jz _initial_num_blocks_is_0\@
-
- cmp $7, %r12
- je _initial_num_blocks_is_7\@
- cmp $6, %r12
- je _initial_num_blocks_is_6\@
- cmp $5, %r12
- je _initial_num_blocks_is_5\@
- cmp $4, %r12
- je _initial_num_blocks_is_4\@
- cmp $3, %r12
- je _initial_num_blocks_is_3\@
- cmp $2, %r12
- je _initial_num_blocks_is_2\@
-
- jmp _initial_num_blocks_is_1\@
-
-_initial_num_blocks_is_7\@:
- INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*7, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_6\@:
- INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*6, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_5\@:
- INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*5, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_4\@:
- INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*4, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_3\@:
- INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*3, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_2\@:
- INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*2, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_1\@:
- INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*1, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_0\@:
- INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-
-
-_initial_blocks_encrypted\@:
- cmp $0, %r13
- je _zero_cipher_left\@
-
- sub $128, %r13
- je _eight_cipher_left\@
-
-
-
-
- vmovd %xmm9, %r15d
- and $255, %r15d
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-_encrypt_by_8_new\@:
- cmp $(255-8), %r15d
- jg _encrypt_by_8\@
-
-
-
- add $8, %r15b
- GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
- add $128, %r11
- sub $128, %r13
- jne _encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- jmp _eight_cipher_left\@
-
-_encrypt_by_8\@:
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $8, %r15b
- GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $128, %r11
- sub $128, %r13
- jne _encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-
-
-_eight_cipher_left\@:
- GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
-
-
-_zero_cipher_left\@:
- cmp $16, arg4
- jl _only_less_than_16\@
-
- mov arg4, %r13
- and $15, %r13 # r13 = (arg4 mod 16)
-
- je _multiple_of_16_bytes\@
-
- # handle the last <16 Byte block seperately
-
-
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
-
- sub $16, %r11
- add %r13, %r11
- vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
-
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12 # adjust the shuffle mask pointer
- # to be able to shift 16-r13 bytes
- # (r13 is the number of bytes in plaintext mod 16)
- vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
- vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
- jmp _final_ghash_mul\@
-
-_only_less_than_16\@:
- # check for 0 length
- mov arg4, %r13
- and $15, %r13 # r13 = (arg4 mod 16)
-
- je _multiple_of_16_bytes\@
-
- # handle the last <16 Byte block seperately
-
-
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
-
-
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12 # adjust the shuffle mask pointer to be
- # able to shift 16-r13 bytes (r13 is the
- # number of bytes in plaintext mod 16)
-
-_get_last_16_byte_loop\@:
- movb (arg3, %r11), %al
- movb %al, TMP1 (%rsp , %r11)
- add $1, %r11
- cmp %r13, %r11
- jne _get_last_16_byte_loop\@
-
- vmovdqu TMP1(%rsp), %xmm1
-
- sub $16, %r11
-
-_final_ghash_mul\@:
- .if \ENC_DEC == DEC
- vmovdqa %xmm1, %xmm2
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm2, %xmm2
- vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm14, %xmm14
- #GHASH computation for the last <16 Byte block
- GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- sub %r13, %r11
- add $16, %r11
- .else
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- vpxor %xmm9, %xmm14, %xmm14
- #GHASH computation for the last <16 Byte block
- GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- sub %r13, %r11
- add $16, %r11
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
- .endif
-
-
- #############################
- # output r13 Bytes
- vmovq %xmm9, %rax
- cmp $8, %r13
- jle _less_than_8_bytes_left\@
-
- mov %rax, (arg2 , %r11)
- add $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- vmovq %xmm9, %rax
- sub $8, %r13
-
-_less_than_8_bytes_left\@:
- movb %al, (arg2 , %r11)
- add $1, %r11
- shr $8, %rax
- sub $1, %r13
- jne _less_than_8_bytes_left\@
- #############################
-
-_multiple_of_16_bytes\@:
- mov arg7, %r12 # r12 = aadLen (number of bytes)
- shl $3, %r12 # convert into number of bits
- vmovd %r12d, %xmm15 # len(A) in xmm15
-
- shl $3, arg4 # len(C) in bits (*128)
- vmovq arg4, %xmm1
- vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
- vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
-
- vpxor %xmm15, %xmm14, %xmm14
- GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
- vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
-
- mov arg5, %rax # rax = *Y0
- vmovdqu (%rax), %xmm9 # xmm9 = Y0
-
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
-
- vpxor %xmm14, %xmm9, %xmm9
-
-
-
-_return_T\@:
- mov arg8, %r10 # r10 = authTag
- mov arg9, %r11 # r11 = auth_tag_len
-
- cmp $16, %r11
- je _T_16\@
-
- cmp $8, %r11
- jl _T_4\@
-
-_T_8\@:
- vmovq %xmm9, %rax
- mov %rax, (%r10)
- add $8, %r10
- sub $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- cmp $0, %r11
- je _return_T_done\@
-_T_4\@:
- vmovd %xmm9, %eax
- mov %eax, (%r10)
- add $4, %r10
- sub $4, %r11
- vpsrldq $4, %xmm9, %xmm9
- cmp $0, %r11
- je _return_T_done\@
-_T_123\@:
- vmovd %xmm9, %eax
- cmp $2, %r11
- jl _T_1\@
- mov %ax, (%r10)
- cmp $2, %r11
- je _return_T_done\@
- add $2, %r10
- sar $16, %eax
-_T_1\@:
- mov %al, (%r10)
- jmp _return_T_done\@
-
-_T_16\@:
- vmovdqu %xmm9, (%r10)
-
-_return_T_done\@:
- mov %r14, %rsp
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
-.endm
-
-
#############################################################
-#void aesni_gcm_precomp_avx_gen4
+#void aesni_gcm_init_avx_gen4
# (gcm_data *my_ctx_data,
-# u8 *hash_subkey)# /* H, the Hash sub key input.
-# Data starts on a 16-byte boundary. */
-#############################################################
-ENTRY(aesni_gcm_precomp_avx_gen4)
- #the number of pushes must equal STACK_OFFSET
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov %rsp, %r14
-
-
-
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp # align rsp to 64 bytes
-
- vmovdqu (arg2), %xmm6 # xmm6 = HashKey
-
- vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
- ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
- vmovdqa %xmm6, %xmm2
- vpsllq $1, %xmm6, %xmm6
- vpsrlq $63, %xmm2, %xmm2
- vmovdqa %xmm2, %xmm1
- vpslldq $8, %xmm2, %xmm2
- vpsrldq $8, %xmm1, %xmm1
- vpor %xmm2, %xmm6, %xmm6
- #reduction
- vpshufd $0b00100100, %xmm1, %xmm2
- vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
- vpand POLY(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
- #######################################################################
- vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
-
-
- PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
-
- mov %r14, %rsp
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- ret
-ENDPROC(aesni_gcm_precomp_avx_gen4)
-
-
-###############################################################################
-#void aesni_gcm_enc_avx_gen4(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
-# const u8 *in, /* Plaintext input */
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
-# (from Security Association) concatenated with 8 byte
-# Initialisation Vector (from IPSec ESP Payload)
-# concatenated with 0x00000001. 16-byte aligned pointer. */
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-# u8 *auth_tag, /* Authenticated Tag output. */
-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
-# Valid values are 16 (most likely), 12 or 8. */
-###############################################################################
-ENTRY(aesni_gcm_enc_avx_gen4)
- GCM_ENC_DEC_AVX2 ENC
- ret
-ENDPROC(aesni_gcm_enc_avx_gen4)
-
-###############################################################################
-#void aesni_gcm_dec_avx_gen4(
-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
-# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
-# const u8 *in, /* Ciphertext input */
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
+# gcm_context_data *data,
# u8 *iv, /* Pre-counter block j0: 4 byte salt
# (from Security Association) concatenated with 8 byte
# Initialisation Vector (from IPSec ESP Payload)
# concatenated with 0x00000001. 16-byte aligned pointer. */
+# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+#############################################################
+ENTRY(aesni_gcm_init_avx_gen4)
+ FUNC_SAVE
+ INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_init_avx_gen4)
+
+###############################################################################
+#void aesni_gcm_enc_avx_gen4(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
+# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+# const u8 *in, /* Plaintext input */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
+###############################################################################
+ENTRY(aesni_gcm_enc_update_avx_gen4)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_enc_update4
+ cmp $16, %eax
+ je key_128_enc_update4
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
+ FUNC_RESTORE
+ ret
+key_128_enc_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
+ FUNC_RESTORE
+ ret
+key_256_enc_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_enc_update_avx_gen4)
+
+###############################################################################
+#void aesni_gcm_dec_update_avx_gen4(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
+# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+# const u8 *in, /* Ciphertext input */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
+###############################################################################
+ENTRY(aesni_gcm_dec_update_avx_gen4)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_dec_update4
+ cmp $16, %eax
+ je key_128_dec_update4
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
+ FUNC_RESTORE
+ ret
+key_128_dec_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
+ FUNC_RESTORE
+ ret
+key_256_dec_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_dec_update_avx_gen4)
+
+###############################################################################
+#void aesni_gcm_finalize_avx_gen4(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
# u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
-# Valid values are 16 (most likely), 12 or 8. */
+# Valid values are 16 (most likely), 12 or 8. */
###############################################################################
-ENTRY(aesni_gcm_dec_avx_gen4)
- GCM_ENC_DEC_AVX2 DEC
- ret
-ENDPROC(aesni_gcm_dec_avx_gen4)
+ENTRY(aesni_gcm_finalize_avx_gen4)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_finalize4
+ cmp $16, %eax
+ je key_128_finalize4
+ # must be 192
+ GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_128_finalize4:
+ GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_256_finalize4:
+ GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_finalize_avx_gen4)
#endif /* CONFIG_AS_AVX2 */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index e4b78f9..3e707e8 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Support for Intel AES-NI instructions. This file contains glue
* code, the real AES implementation is in intel-aes_asm.S.
@@ -12,11 +13,6 @@
* Tadeusz Struk (tadeusz.struk@intel.com)
* Aidan O'Mahony (aidan.o.mahony@intel.com)
* Copyright (c) 2010, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/hardirq.h>
@@ -25,14 +21,12 @@
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/aes.h>
-#include <crypto/cryptd.h>
#include <crypto/ctr.h>
#include <crypto/b128ops.h>
#include <crypto/gcm.h>
#include <crypto/xts.h>
#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
-#include <asm/crypto/aes.h>
+#include <asm/simd.h>
#include <crypto/scatterwalk.h>
#include <crypto/internal/aead.h>
#include <crypto/internal/simd.h>
@@ -84,7 +78,7 @@
u8 current_counter[GCM_BLOCK_LEN];
u64 partial_block_len;
u64 unused;
- u8 hash_keys[GCM_BLOCK_LEN * 8];
+ u8 hash_keys[GCM_BLOCK_LEN * 16];
};
asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
@@ -102,9 +96,6 @@
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
-int crypto_fpu_init(void);
-void crypto_fpu_exit(void);
-
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
@@ -178,6 +169,24 @@
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
+static const struct aesni_gcm_tfm_s {
+ void (*init)(void *ctx, struct gcm_context_data *gdata, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len);
+ void (*enc_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len);
+ void (*dec_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long ciphertext_len);
+ void (*finalize)(void *ctx, struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
+} *aesni_gcm_tfm;
+
+static const struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
+ .init = &aesni_gcm_init,
+ .enc_update = &aesni_gcm_enc_update,
+ .dec_update = &aesni_gcm_dec_update,
+ .finalize = &aesni_gcm_finalize,
+};
+
#ifdef CONFIG_AS_AVX
asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
@@ -186,136 +195,94 @@
asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
/*
- * asmlinkage void aesni_gcm_precomp_avx_gen2()
+ * asmlinkage void aesni_gcm_init_avx_gen2()
* gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/
-asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
+asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data,
+ struct gcm_context_data *gdata,
+ u8 *iv,
+ u8 *hash_subkey,
+ const u8 *aad,
+ unsigned long aad_len);
-asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in,
+ unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
-asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
-static void aesni_gcm_enc_avx(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long plaintext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len)
-{
- struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
- if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
- aesni_gcm_enc(ctx, data, out, in,
- plaintext_len, iv, hash_subkey, aad,
- aad_len, auth_tag, auth_tag_len);
- } else {
- aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
- aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- }
-}
+static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
+ .init = &aesni_gcm_init_avx_gen2,
+ .enc_update = &aesni_gcm_enc_update_avx_gen2,
+ .dec_update = &aesni_gcm_dec_update_avx_gen2,
+ .finalize = &aesni_gcm_finalize_avx_gen2,
+};
-static void aesni_gcm_dec_avx(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long ciphertext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len)
-{
- struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
- if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
- aesni_gcm_dec(ctx, data, out, in,
- ciphertext_len, iv, hash_subkey, aad,
- aad_len, auth_tag, auth_tag_len);
- } else {
- aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
- aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- }
-}
#endif
#ifdef CONFIG_AS_AVX2
/*
- * asmlinkage void aesni_gcm_precomp_avx_gen4()
+ * asmlinkage void aesni_gcm_init_avx_gen4()
* gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/
-asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
+asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data,
+ struct gcm_context_data *gdata,
+ u8 *iv,
+ u8 *hash_subkey,
+ const u8 *aad,
+ unsigned long aad_len);
-asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in,
+ unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
-asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
-static void aesni_gcm_enc_avx2(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long plaintext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len)
-{
- struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
- if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
- aesni_gcm_enc(ctx, data, out, in,
- plaintext_len, iv, hash_subkey, aad,
- aad_len, auth_tag, auth_tag_len);
- } else if (plaintext_len < AVX_GEN4_OPTSIZE) {
- aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
- aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- } else {
- aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
- aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- }
-}
+static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
+ .init = &aesni_gcm_init_avx_gen4,
+ .enc_update = &aesni_gcm_enc_update_avx_gen4,
+ .dec_update = &aesni_gcm_dec_update_avx_gen4,
+ .finalize = &aesni_gcm_finalize_avx_gen4,
+};
-static void aesni_gcm_dec_avx2(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long ciphertext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len)
-{
- struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
- if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
- aesni_gcm_dec(ctx, data, out, in,
- ciphertext_len, iv, hash_subkey,
- aad, aad_len, auth_tag, auth_tag_len);
- } else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
- aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
- aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- } else {
- aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
- aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- }
-}
#endif
-static void (*aesni_gcm_enc_tfm)(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long plaintext_len,
- u8 *iv, u8 *hash_subkey, const u8 *aad,
- unsigned long aad_len, u8 *auth_tag,
- unsigned long auth_tag_len);
-
-static void (*aesni_gcm_dec_tfm)(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long ciphertext_len,
- u8 *iv, u8 *hash_subkey, const u8 *aad,
- unsigned long aad_len, u8 *auth_tag,
- unsigned long auth_tag_len);
-
static inline struct
aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
{
@@ -360,8 +327,8 @@
return -EINVAL;
}
- if (!irq_fpu_usable())
- err = crypto_aes_expand_key(ctx, in_key, key_len);
+ if (!crypto_simd_usable())
+ err = aes_expandkey(ctx, in_key, key_len);
else {
kernel_fpu_begin();
err = aesni_set_key(ctx, in_key, key_len);
@@ -377,46 +344,32 @@
return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len);
}
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
- if (!irq_fpu_usable())
- crypto_aes_encrypt_x86(ctx, dst, src);
- else {
+ if (!crypto_simd_usable()) {
+ aes_encrypt(ctx, dst, src);
+ } else {
kernel_fpu_begin();
aesni_enc(ctx, dst, src);
kernel_fpu_end();
}
}
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+static void aesni_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
- if (!irq_fpu_usable())
- crypto_aes_decrypt_x86(ctx, dst, src);
- else {
+ if (!crypto_simd_usable()) {
+ aes_decrypt(ctx, dst, src);
+ } else {
kernel_fpu_begin();
aesni_dec(ctx, dst, src);
kernel_fpu_end();
}
}
-static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
-
- aesni_enc(ctx, dst, src);
-}
-
-static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
-
- aesni_dec(ctx, dst, src);
-}
-
static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int len)
{
@@ -656,7 +609,8 @@
return glue_xts_req_128bit(&aesni_enc_xts, req,
XTS_TWEAK_CAST(aesni_xts_tweak),
aes_ctx(ctx->raw_tweak_ctx),
- aes_ctx(ctx->raw_crypt_ctx));
+ aes_ctx(ctx->raw_crypt_ctx),
+ false);
}
static int xts_decrypt(struct skcipher_request *req)
@@ -667,55 +621,28 @@
return glue_xts_req_128bit(&aesni_dec_xts, req,
XTS_TWEAK_CAST(aesni_xts_tweak),
aes_ctx(ctx->raw_tweak_ctx),
- aes_ctx(ctx->raw_crypt_ctx));
-}
-
-static int rfc4106_init(struct crypto_aead *aead)
-{
- struct cryptd_aead *cryptd_tfm;
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni",
- CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
-
- *ctx = cryptd_tfm;
- crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
- return 0;
-}
-
-static void rfc4106_exit(struct crypto_aead *aead)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_free_aead(*ctx);
+ aes_ctx(ctx->raw_crypt_ctx),
+ true);
}
static int
rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
{
- struct crypto_cipher *tfm;
+ struct crypto_aes_ctx ctx;
int ret;
- tfm = crypto_alloc_cipher("aes", 0, 0);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- ret = crypto_cipher_setkey(tfm, key, key_len);
+ ret = aes_expandkey(&ctx, key, key_len);
if (ret)
- goto out_free_cipher;
+ return ret;
/* Clear the data in the hash sub key container to zero.*/
/* We want to cipher all zeros to create the hash sub key. */
memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
- crypto_cipher_encrypt_one(tfm, hash_subkey, hash_subkey);
+ aes_encrypt(&ctx, hash_subkey, hash_subkey);
-out_free_cipher:
- crypto_free_cipher(tfm);
- return ret;
+ memzero_explicit(&ctx, sizeof(ctx));
+ return 0;
}
static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
@@ -737,15 +664,8 @@
rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
}
-static int gcmaes_wrapper_set_key(struct crypto_aead *parent, const u8 *key,
- unsigned int key_len)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(parent);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- return crypto_aead_setkey(&cryptd_tfm->base, key, key_len);
-}
-
+/* This is the Integrity Check Value (aka the authentication tag) length and can
+ * be 8, 12 or 16 bytes long. */
static int common_rfc4106_set_authsize(struct crypto_aead *aead,
unsigned int authsize)
{
@@ -761,17 +681,6 @@
return 0;
}
-/* This is the Integrity Check Value (aka the authentication tag length and can
- * be 8, 12 or 16 bytes long. */
-static int gcmaes_wrapper_set_authsize(struct crypto_aead *parent,
- unsigned int authsize)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(parent);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
-}
-
static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
unsigned int authsize)
{
@@ -797,6 +706,7 @@
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ const struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
struct gcm_context_data data AESNI_ALIGN_ATTR;
struct scatter_walk dst_sg_walk = {};
unsigned long left = req->cryptlen;
@@ -814,6 +724,15 @@
if (!enc)
left -= auth_tag_len;
+#ifdef CONFIG_AS_AVX2
+ if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4)
+ gcm_tfm = &aesni_gcm_tfm_avx_gen2;
+#endif
+#ifdef CONFIG_AS_AVX
+ if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2)
+ gcm_tfm = &aesni_gcm_tfm_sse;
+#endif
+
/* Linearize assoc, if not already linear */
if (req->src->length >= assoclen && req->src->length &&
(!PageHighMem(sg_page(req->src)) ||
@@ -830,15 +749,18 @@
scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
}
- src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
- scatterwalk_start(&src_sg_walk, src_sg);
- if (req->src != req->dst) {
- dst_sg = scatterwalk_ffwd(dst_start, req->dst, req->assoclen);
- scatterwalk_start(&dst_sg_walk, dst_sg);
+ if (left) {
+ src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
+ scatterwalk_start(&src_sg_walk, src_sg);
+ if (req->src != req->dst) {
+ dst_sg = scatterwalk_ffwd(dst_start, req->dst,
+ req->assoclen);
+ scatterwalk_start(&dst_sg_walk, dst_sg);
+ }
}
kernel_fpu_begin();
- aesni_gcm_init(aes_ctx, &data, iv,
+ gcm_tfm->init(aes_ctx, &data, iv,
hash_subkey, assoc, assoclen);
if (req->src != req->dst) {
while (left) {
@@ -849,10 +771,10 @@
len = min(srclen, dstlen);
if (len) {
if (enc)
- aesni_gcm_enc_update(aes_ctx, &data,
+ gcm_tfm->enc_update(aes_ctx, &data,
dst, src, len);
else
- aesni_gcm_dec_update(aes_ctx, &data,
+ gcm_tfm->dec_update(aes_ctx, &data,
dst, src, len);
}
left -= len;
@@ -870,10 +792,10 @@
len = scatterwalk_clamp(&src_sg_walk, left);
if (len) {
if (enc)
- aesni_gcm_enc_update(aes_ctx, &data,
+ gcm_tfm->enc_update(aes_ctx, &data,
src, src, len);
else
- aesni_gcm_dec_update(aes_ctx, &data,
+ gcm_tfm->dec_update(aes_ctx, &data,
src, src, len);
}
left -= len;
@@ -882,7 +804,7 @@
scatterwalk_done(&src_sg_walk, 1, left);
}
}
- aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len);
+ gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len);
kernel_fpu_end();
if (!assocmem)
@@ -915,147 +837,15 @@
static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
- u8 one_entry_in_sg = 0;
- u8 *src, *dst, *assoc;
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- struct scatter_walk src_sg_walk;
- struct scatter_walk dst_sg_walk = {};
- struct gcm_context_data data AESNI_ALIGN_ATTR;
-
- if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
- aesni_gcm_enc_tfm == aesni_gcm_enc ||
- req->cryptlen < AVX_GEN2_OPTSIZE) {
- return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
- aes_ctx);
- }
- if (sg_is_last(req->src) &&
- (!PageHighMem(sg_page(req->src)) ||
- req->src->offset + req->src->length <= PAGE_SIZE) &&
- sg_is_last(req->dst) &&
- (!PageHighMem(sg_page(req->dst)) ||
- req->dst->offset + req->dst->length <= PAGE_SIZE)) {
- one_entry_in_sg = 1;
- scatterwalk_start(&src_sg_walk, req->src);
- assoc = scatterwalk_map(&src_sg_walk);
- src = assoc + req->assoclen;
- dst = src;
- if (unlikely(req->src != req->dst)) {
- scatterwalk_start(&dst_sg_walk, req->dst);
- dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
- }
- } else {
- /* Allocate memory for src, dst, assoc */
- assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
- GFP_ATOMIC);
- if (unlikely(!assoc))
- return -ENOMEM;
- scatterwalk_map_and_copy(assoc, req->src, 0,
- req->assoclen + req->cryptlen, 0);
- src = assoc + req->assoclen;
- dst = src;
- }
-
- kernel_fpu_begin();
- aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv,
- hash_subkey, assoc, assoclen,
- dst + req->cryptlen, auth_tag_len);
- kernel_fpu_end();
-
- /* The authTag (aka the Integrity Check Value) needs to be written
- * back to the packet. */
- if (one_entry_in_sg) {
- if (unlikely(req->src != req->dst)) {
- scatterwalk_unmap(dst - req->assoclen);
- scatterwalk_advance(&dst_sg_walk, req->dst->length);
- scatterwalk_done(&dst_sg_walk, 1, 0);
- }
- scatterwalk_unmap(assoc);
- scatterwalk_advance(&src_sg_walk, req->src->length);
- scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
- } else {
- scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
- req->cryptlen + auth_tag_len, 1);
- kfree(assoc);
- }
- return 0;
+ return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
+ aes_ctx);
}
static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
- u8 one_entry_in_sg = 0;
- u8 *src, *dst, *assoc;
- unsigned long tempCipherLen = 0;
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- u8 authTag[16];
- struct scatter_walk src_sg_walk;
- struct scatter_walk dst_sg_walk = {};
- struct gcm_context_data data AESNI_ALIGN_ATTR;
- int retval = 0;
-
- if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
- aesni_gcm_enc_tfm == aesni_gcm_enc ||
- req->cryptlen < AVX_GEN2_OPTSIZE) {
- return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
- aes_ctx);
- }
- tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
-
- if (sg_is_last(req->src) &&
- (!PageHighMem(sg_page(req->src)) ||
- req->src->offset + req->src->length <= PAGE_SIZE) &&
- sg_is_last(req->dst) && req->dst->length &&
- (!PageHighMem(sg_page(req->dst)) ||
- req->dst->offset + req->dst->length <= PAGE_SIZE)) {
- one_entry_in_sg = 1;
- scatterwalk_start(&src_sg_walk, req->src);
- assoc = scatterwalk_map(&src_sg_walk);
- src = assoc + req->assoclen;
- dst = src;
- if (unlikely(req->src != req->dst)) {
- scatterwalk_start(&dst_sg_walk, req->dst);
- dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
- }
- } else {
- /* Allocate memory for src, dst, assoc */
- assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
- if (!assoc)
- return -ENOMEM;
- scatterwalk_map_and_copy(assoc, req->src, 0,
- req->assoclen + req->cryptlen, 0);
- src = assoc + req->assoclen;
- dst = src;
- }
-
-
- kernel_fpu_begin();
- aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv,
- hash_subkey, assoc, assoclen,
- authTag, auth_tag_len);
- kernel_fpu_end();
-
- /* Compare generated tag with passed in tag. */
- retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
- -EBADMSG : 0;
-
- if (one_entry_in_sg) {
- if (unlikely(req->src != req->dst)) {
- scatterwalk_unmap(dst - req->assoclen);
- scatterwalk_advance(&dst_sg_walk, req->dst->length);
- scatterwalk_done(&dst_sg_walk, 1, 0);
- }
- scatterwalk_unmap(assoc);
- scatterwalk_advance(&src_sg_walk, req->src->length);
- scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
- } else {
- scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
- tempCipherLen, 1);
- kfree(assoc);
- }
- return retval;
-
+ return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
+ aes_ctx);
}
static int helper_rfc4106_encrypt(struct aead_request *req)
@@ -1110,41 +900,9 @@
return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
aes_ctx);
}
-
-static int gcmaes_wrapper_encrypt(struct aead_request *req)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- tfm = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- tfm = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, tfm);
-
- return crypto_aead_encrypt(req);
-}
-
-static int gcmaes_wrapper_decrypt(struct aead_request *req)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- tfm = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- tfm = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, tfm);
-
- return crypto_aead_decrypt(req);
-}
#endif
-static struct crypto_alg aesni_algs[] = { {
+static struct crypto_alg aesni_cipher_alg = {
.cra_name = "aes",
.cra_driver_name = "aes-aesni",
.cra_priority = 300,
@@ -1157,28 +915,11 @@
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
.cia_setkey = aes_set_key,
- .cia_encrypt = aes_encrypt,
- .cia_decrypt = aes_decrypt
+ .cia_encrypt = aesni_encrypt,
+ .cia_decrypt = aesni_decrypt
}
}
-}, {
- .cra_name = "__aes",
- .cra_driver_name = "__aes-aesni",
- .cra_priority = 300,
- .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
- .cra_module = THIS_MODULE,
- .cra_u = {
- .cipher = {
- .cia_min_keysize = AES_MIN_KEY_SIZE,
- .cia_max_keysize = AES_MAX_KEY_SIZE,
- .cia_setkey = aes_set_key,
- .cia_encrypt = __aes_encrypt,
- .cia_decrypt = __aes_decrypt
- }
- }
-} };
+};
static struct skcipher_alg aesni_skciphers[] = {
{
@@ -1253,22 +994,6 @@
static
struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
-static struct {
- const char *algname;
- const char *drvname;
- const char *basename;
- struct simd_skcipher_alg *simd;
-} aesni_simd_skciphers2[] = {
-#if (defined(MODULE) && IS_ENABLED(CONFIG_CRYPTO_PCBC)) || \
- IS_BUILTIN(CONFIG_CRYPTO_PCBC)
- {
- .algname = "pcbc(aes)",
- .drvname = "pcbc-aes-aesni",
- .basename = "fpu(pcbc(__aes-aesni))",
- },
-#endif
-};
-
#ifdef CONFIG_X86_64
static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
unsigned int key_len)
@@ -1310,31 +1035,7 @@
aes_ctx);
}
-static int generic_gcmaes_init(struct crypto_aead *aead)
-{
- struct cryptd_aead *cryptd_tfm;
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_tfm = cryptd_alloc_aead("__driver-generic-gcm-aes-aesni",
- CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
-
- *ctx = cryptd_tfm;
- crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
-
- return 0;
-}
-
-static void generic_gcmaes_exit(struct crypto_aead *aead)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_free_aead(*ctx);
-}
-
-static struct aead_alg aesni_aead_algs[] = { {
+static struct aead_alg aesni_aeads[] = { {
.setkey = common_rfc4106_set_key,
.setauthsize = common_rfc4106_set_authsize,
.encrypt = helper_rfc4106_encrypt,
@@ -1342,8 +1043,9 @@
.ivsize = GCM_RFC4106_IV_SIZE,
.maxauthsize = 16,
.base = {
- .cra_name = "__gcm-aes-aesni",
- .cra_driver_name = "__driver-gcm-aes-aesni",
+ .cra_name = "__rfc4106(gcm(aes))",
+ .cra_driver_name = "__rfc4106-gcm-aesni",
+ .cra_priority = 400,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx),
@@ -1351,24 +1053,6 @@
.cra_module = THIS_MODULE,
},
}, {
- .init = rfc4106_init,
- .exit = rfc4106_exit,
- .setkey = gcmaes_wrapper_set_key,
- .setauthsize = gcmaes_wrapper_set_authsize,
- .encrypt = gcmaes_wrapper_encrypt,
- .decrypt = gcmaes_wrapper_decrypt,
- .ivsize = GCM_RFC4106_IV_SIZE,
- .maxauthsize = 16,
- .base = {
- .cra_name = "rfc4106(gcm(aes))",
- .cra_driver_name = "rfc4106-gcm-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_ASYNC,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct cryptd_aead *),
- .cra_module = THIS_MODULE,
- },
-}, {
.setkey = generic_gcmaes_set_key,
.setauthsize = generic_gcmaes_set_authsize,
.encrypt = generic_gcmaes_encrypt,
@@ -1376,38 +1060,21 @@
.ivsize = GCM_AES_IV_SIZE,
.maxauthsize = 16,
.base = {
- .cra_name = "__generic-gcm-aes-aesni",
- .cra_driver_name = "__driver-generic-gcm-aes-aesni",
- .cra_priority = 0,
+ .cra_name = "__gcm(aes)",
+ .cra_driver_name = "__generic-gcm-aesni",
+ .cra_priority = 400,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
.cra_alignmask = AESNI_ALIGN - 1,
.cra_module = THIS_MODULE,
},
-}, {
- .init = generic_gcmaes_init,
- .exit = generic_gcmaes_exit,
- .setkey = gcmaes_wrapper_set_key,
- .setauthsize = gcmaes_wrapper_set_authsize,
- .encrypt = gcmaes_wrapper_encrypt,
- .decrypt = gcmaes_wrapper_decrypt,
- .ivsize = GCM_AES_IV_SIZE,
- .maxauthsize = 16,
- .base = {
- .cra_name = "gcm(aes)",
- .cra_driver_name = "generic-gcm-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_ASYNC,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct cryptd_aead *),
- .cra_module = THIS_MODULE,
- },
} };
#else
-static struct aead_alg aesni_aead_algs[0];
+static struct aead_alg aesni_aeads[0];
#endif
+static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)];
static const struct x86_cpu_id aesni_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_AES),
@@ -1415,27 +1082,9 @@
};
MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
-static void aesni_free_simds(void)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers) &&
- aesni_simd_skciphers[i]; i++)
- simd_skcipher_free(aesni_simd_skciphers[i]);
-
- for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++)
- if (aesni_simd_skciphers2[i].simd)
- simd_skcipher_free(aesni_simd_skciphers2[i].simd);
-}
-
static int __init aesni_init(void)
{
- struct simd_skcipher_alg *simd;
- const char *basename;
- const char *algname;
- const char *drvname;
int err;
- int i;
if (!x86_match_cpu(aesni_cpu_id))
return -ENODEV;
@@ -1443,21 +1092,18 @@
#ifdef CONFIG_AS_AVX2
if (boot_cpu_has(X86_FEATURE_AVX2)) {
pr_info("AVX2 version of gcm_enc/dec engaged.\n");
- aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
- aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
+ aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
} else
#endif
#ifdef CONFIG_AS_AVX
if (boot_cpu_has(X86_FEATURE_AVX)) {
pr_info("AVX version of gcm_enc/dec engaged.\n");
- aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
- aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
+ aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2;
} else
#endif
{
pr_info("SSE version of gcm_enc/dec engaged.\n");
- aesni_gcm_enc_tfm = aesni_gcm_enc;
- aesni_gcm_dec_tfm = aesni_gcm_dec;
+ aesni_gcm_tfm = &aesni_gcm_tfm_sse;
}
aesni_ctr_enc_tfm = aesni_ctr_enc;
#ifdef CONFIG_AS_AVX
@@ -1469,72 +1115,38 @@
#endif
#endif
- err = crypto_fpu_init();
+ err = crypto_register_alg(&aesni_cipher_alg);
if (err)
return err;
- err = crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
+ err = simd_register_skciphers_compat(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers),
+ aesni_simd_skciphers);
if (err)
- goto fpu_exit;
+ goto unregister_cipher;
- err = crypto_register_skciphers(aesni_skciphers,
- ARRAY_SIZE(aesni_skciphers));
- if (err)
- goto unregister_algs;
-
- err = crypto_register_aeads(aesni_aead_algs,
- ARRAY_SIZE(aesni_aead_algs));
+ err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads),
+ aesni_simd_aeads);
if (err)
goto unregister_skciphers;
- for (i = 0; i < ARRAY_SIZE(aesni_skciphers); i++) {
- algname = aesni_skciphers[i].base.cra_name + 2;
- drvname = aesni_skciphers[i].base.cra_driver_name + 2;
- basename = aesni_skciphers[i].base.cra_driver_name;
- simd = simd_skcipher_create_compat(algname, drvname, basename);
- err = PTR_ERR(simd);
- if (IS_ERR(simd))
- goto unregister_simds;
-
- aesni_simd_skciphers[i] = simd;
- }
-
- for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++) {
- algname = aesni_simd_skciphers2[i].algname;
- drvname = aesni_simd_skciphers2[i].drvname;
- basename = aesni_simd_skciphers2[i].basename;
- simd = simd_skcipher_create_compat(algname, drvname, basename);
- err = PTR_ERR(simd);
- if (IS_ERR(simd))
- continue;
-
- aesni_simd_skciphers2[i].simd = simd;
- }
-
return 0;
-unregister_simds:
- aesni_free_simds();
- crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs));
unregister_skciphers:
- crypto_unregister_skciphers(aesni_skciphers,
- ARRAY_SIZE(aesni_skciphers));
-unregister_algs:
- crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
-fpu_exit:
- crypto_fpu_exit();
+ simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
+ aesni_simd_skciphers);
+unregister_cipher:
+ crypto_unregister_alg(&aesni_cipher_alg);
return err;
}
static void __exit aesni_exit(void)
{
- aesni_free_simds();
- crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs));
- crypto_unregister_skciphers(aesni_skciphers,
- ARRAY_SIZE(aesni_skciphers));
- crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
-
- crypto_fpu_exit();
+ simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
+ aesni_simd_aeads);
+ simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
+ aesni_simd_skciphers);
+ crypto_unregister_alg(&aesni_cipher_alg);
}
late_initcall(aesni_init);
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 8c1fcb6..330db7a 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -1,23 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Blowfish Cipher Algorithm (x86_64)
*
* Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 3e0c07c..cedfdba 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue Code for assembler optimized version of Blowfish
*
@@ -7,22 +8,6 @@
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
* CTR part based on code (crypto/ctr.c) by:
* (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <crypto/algapi.h>
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index b66bbfa..4be4c7c 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -1,13 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* x86_64/AVX2/AES-NI assembler implementation of Camellia
*
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 95ba695..23528bc 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -1,23 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Camellia Cipher Algorithm (x86_64)
*
* Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index d4992e4..a4f0012 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia
*
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
*/
#include <asm/crypto/camellia.h>
@@ -187,7 +182,7 @@
return glue_xts_req_128bit(&camellia_enc_xts, req,
XTS_TWEAK_CAST(camellia_enc_blk),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, false);
}
static int xts_decrypt(struct skcipher_request *req)
@@ -197,7 +192,7 @@
return glue_xts_req_128bit(&camellia_dec_xts, req,
XTS_TWEAK_CAST(camellia_enc_blk),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, true);
}
static struct skcipher_alg camellia_algs[] = {
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index d09f652..f28d282 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
*/
#include <asm/crypto/camellia.h>
@@ -213,7 +208,7 @@
return glue_xts_req_128bit(&camellia_enc_xts, req,
XTS_TWEAK_CAST(camellia_enc_blk),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, false);
}
static int xts_decrypt(struct skcipher_request *req)
@@ -223,7 +218,7 @@
return glue_xts_req_128bit(&camellia_dec_xts, req,
XTS_TWEAK_CAST(camellia_enc_blk),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, true);
}
static struct skcipher_alg camellia_algs[] = {
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index dcd5e0f..7c62db5 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue Code for assembler optimized version of Camellia
*
@@ -5,22 +6,6 @@
*
* Camellia parts based on code by:
* Copyright (C) 2006 NTT (Nippon Telegraph and Telephone Corporation)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <asm/unaligned.h>
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 86107c9..dc55c33 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
*
@@ -5,22 +6,6 @@
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*
* Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 4103474..384ccb0 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -1,24 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Glue Code for the AVX assembler implemention of the Cast5 Cipher
+ * Glue Code for the AVX assembler implementation of the Cast5 Cipher
*
* Copyright (C) 2012 Johannes Goetzfried
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <asm/crypto/glue_helper.h>
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 7f30b6f..4f0a7cd 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
*
@@ -5,22 +6,6 @@
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 9fb66b5..a8a38ff 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -1,26 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Glue Code for the AVX assembler implemention of the Cast6 Cipher
+ * Glue Code for the AVX assembler implementation of the Cast6 Cipher
*
* Copyright (C) 2012 Johannes Goetzfried
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/module.h>
@@ -216,7 +201,7 @@
return glue_xts_req_128bit(&cast6_enc_xts, req,
XTS_TWEAK_CAST(__cast6_encrypt),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, false);
}
static int xts_decrypt(struct skcipher_request *req)
@@ -226,7 +211,7 @@
return glue_xts_req_128bit(&cast6_dec_xts, req,
XTS_TWEAK_CAST(__cast6_encrypt),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, true);
}
static struct skcipher_alg cast6_algs[] = {
diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S
new file mode 100644
index 0000000..831e443
--- /dev/null
+++ b/arch/x86/crypto/chacha-avx2-x86_64.S
@@ -0,0 +1,1021 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.ROT8, "aM", @progbits, 32
+.align 32
+ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
+ .octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section .rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+ .octa 0x0d0c0f0e09080b0a0504070601000302
+
+.section .rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
+CTRINC: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
+.text
+
+ENTRY(chacha_2block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts two ChaCha blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
+ vzeroupper
+
+ # x0..3[0-2] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+ vmovdqa ROT8(%rip),%ymm4
+ vmovdqa ROT16(%rip),%ymm5
+
+ mov %rcx,%rax
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rax
+ jl .Lxorpart2
+ vpxor 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rax
+ jl .Lxorpart2
+ vpxor 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rax
+ jl .Lxorpart2
+ vpxor 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rax
+ jl .Lxorpart2
+ vpxor 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rax
+ jl .Lxorpart2
+ vpxor 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rax
+ jl .Lxorpart2
+ vpxor 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rax
+ jl .Lxorpart2
+ vpxor 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rax
+ jl .Lxorpart2
+ vpxor 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ ret
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone2
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm7,%xmm7
+ vmovdqa %xmm7,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone2
+
+ENDPROC(chacha_2block_xor_avx2)
+
+ENTRY(chacha_4block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts four ChaCha blocks by loading the state
+ # matrix four times across eight AVX registers. It performs matrix
+ # operations on four words in two matrices in parallel, sequentially
+ # to the operations on the four words of the other two matrices. The
+ # required word shuffling has a rather high latency, we can do the
+ # arithmetic on two matrix-pairs without much slowdown.
+
+ vzeroupper
+
+ # x0..3[0-4] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+ vmovdqa ROT8(%rip),%ymm8
+ vmovdqa ROT16(%rip),%ymm9
+
+ mov %rcx,%rax
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rax
+ jl .Lxorpart4
+ vpxor 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ vpxor 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ vpxor 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ vpxor 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rax
+ jl .Lxorpart4
+ vpxor 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ vpxor 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ vpxor 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ vpxor 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ vpxor 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ vpxor 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ vpxor 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ vpxor 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rax
+ jl .Lxorpart4
+ vpxor 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rax
+ jl .Lxorpart4
+ vpxor 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rax
+ jl .Lxorpart4
+ vpxor 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rax
+ jl .Lxorpart4
+ vpxor 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ ret
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm10,%xmm10
+ vmovdqa %xmm10,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone4
+
+ENDPROC(chacha_4block_xor_avx2)
+
+ENTRY(chacha_8block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts eight consecutive ChaCha blocks by loading
+ # the state matrix in AVX registers eight times. As we need some
+ # scratch registers, we save the first four registers on the stack. The
+ # algorithm performs each operation on the corresponding word of each
+ # state matrix, hence requires no word shuffling. For final XORing step
+ # we transpose the matrix by interleaving 32-, 64- and then 128-bit
+ # words, which allows us to do XOR in AVX registers. 8/16-bit word
+ # rotation is done with the slightly better performing byte shuffling,
+ # 7/12-bit word rotation uses traditional shift+OR.
+
+ vzeroupper
+ # 4 * 32 byte stack, 32-byte aligned
+ lea 8(%rsp),%r10
+ and $~31, %rsp
+ sub $0x80, %rsp
+ mov %rcx,%rax
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+ # x0..3 on stack
+ vmovdqa %ymm0,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm3,0x60(%rsp)
+
+ vmovdqa CTRINC(%rip),%ymm1
+ vmovdqa ROT8(%rip),%ymm2
+ vmovdqa ROT16(%rip),%ymm3
+
+ # x12 += counter values 0-3
+ vpaddd %ymm1,%ymm12,%ymm12
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround8
+
+ # x0..15[0-3] += s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpaddd 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpbroadcastd 0x04(%rdi),%ymm0
+ vpaddd 0x20(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpbroadcastd 0x08(%rdi),%ymm0
+ vpaddd 0x40(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpbroadcastd 0x0c(%rdi),%ymm0
+ vpaddd 0x60(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpbroadcastd 0x10(%rdi),%ymm0
+ vpaddd %ymm0,%ymm4,%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm0
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm0
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm7,%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm0
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm0
+ vpaddd %ymm0,%ymm9,%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm0
+ vpaddd %ymm0,%ymm10,%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm0
+ vpaddd %ymm0,%ymm12,%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm0
+ vpaddd %ymm0,%ymm13,%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm0
+ vpaddd %ymm0,%ymm14,%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm15,%ymm15
+
+ # x12 += counter values 0-3
+ vpaddd %ymm1,%ymm12,%ymm12
+
+ # interleave 32-bit words in state n, n+1
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x20(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa 0x40(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm1,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpckldq %ymm5,%ymm0,%ymm4
+ vpunpckhdq %ymm5,%ymm0,%ymm5
+ vmovdqa %ymm6,%ymm0
+ vpunpckldq %ymm7,%ymm0,%ymm6
+ vpunpckhdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpckldq %ymm9,%ymm0,%ymm8
+ vpunpckhdq %ymm9,%ymm0,%ymm9
+ vmovdqa %ymm10,%ymm0
+ vpunpckldq %ymm11,%ymm0,%ymm10
+ vpunpckhdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpckldq %ymm13,%ymm0,%ymm12
+ vpunpckhdq %ymm13,%ymm0,%ymm13
+ vmovdqa %ymm14,%ymm0
+ vpunpckldq %ymm15,%ymm0,%ymm14
+ vpunpckhdq %ymm15,%ymm0,%ymm15
+
+ # interleave 64-bit words in state n, n+2
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x40(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x00(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa 0x20(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpcklqdq %ymm6,%ymm0,%ymm4
+ vpunpckhqdq %ymm6,%ymm0,%ymm6
+ vmovdqa %ymm5,%ymm0
+ vpunpcklqdq %ymm7,%ymm0,%ymm5
+ vpunpckhqdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpcklqdq %ymm10,%ymm0,%ymm8
+ vpunpckhqdq %ymm10,%ymm0,%ymm10
+ vmovdqa %ymm9,%ymm0
+ vpunpcklqdq %ymm11,%ymm0,%ymm9
+ vpunpckhqdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpcklqdq %ymm14,%ymm0,%ymm12
+ vpunpckhqdq %ymm14,%ymm0,%ymm14
+ vmovdqa %ymm13,%ymm0
+ vpunpcklqdq %ymm15,%ymm0,%ymm13
+ vpunpckhqdq %ymm15,%ymm0,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ # xor/write first four blocks
+ vmovdqa 0x00(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
+ cmp $0x0020,%rax
+ jl .Lxorpart8
+ vpxor 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0000(%rsi)
+ vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
+
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rax
+ jl .Lxorpart8
+ vpxor 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0020(%rsi)
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+
+ vmovdqa 0x40(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
+ cmp $0x0060,%rax
+ jl .Lxorpart8
+ vpxor 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
+
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rax
+ jl .Lxorpart8
+ vpxor 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0060(%rsi)
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+
+ vmovdqa 0x20(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rax
+ jl .Lxorpart8
+ vpxor 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0080(%rsi)
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rax
+ jl .Lxorpart8
+ vpxor 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vmovdqa 0x60(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
+ cmp $0x00e0,%rax
+ jl .Lxorpart8
+ vpxor 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00c0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rax
+ jl .Lxorpart8
+ vpxor 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa %ymm4,%ymm0
+ cmp $0x0120,%rax
+ jl .Lxorpart8
+ vpxor 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0100(%rsi)
+
+ vmovdqa %ymm12,%ymm0
+ cmp $0x0140,%rax
+ jl .Lxorpart8
+ vpxor 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0120(%rsi)
+
+ vmovdqa %ymm6,%ymm0
+ cmp $0x0160,%rax
+ jl .Lxorpart8
+ vpxor 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0140(%rsi)
+
+ vmovdqa %ymm14,%ymm0
+ cmp $0x0180,%rax
+ jl .Lxorpart8
+ vpxor 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0160(%rsi)
+
+ vmovdqa %ymm5,%ymm0
+ cmp $0x01a0,%rax
+ jl .Lxorpart8
+ vpxor 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0180(%rsi)
+
+ vmovdqa %ymm13,%ymm0
+ cmp $0x01c0,%rax
+ jl .Lxorpart8
+ vpxor 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01a0(%rsi)
+
+ vmovdqa %ymm7,%ymm0
+ cmp $0x01e0,%rax
+ jl .Lxorpart8
+ vpxor 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01c0(%rsi)
+
+ vmovdqa %ymm15,%ymm0
+ cmp $0x0200,%rax
+ jl .Lxorpart8
+ vpxor 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+ vzeroupper
+ lea -8(%r10),%rsp
+ ret
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x1f,%r9
+ jz .Ldone8
+ and $~0x1f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone8
+
+ENDPROC(chacha_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S
new file mode 100644
index 0000000..848f9c7
--- /dev/null
+++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S
@@ -0,0 +1,836 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
+.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha_2block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts two ChaCha blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
+ vzeroupper
+
+ # x0..3[0-2] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rcx
+ jl .Lxorpart2
+ vpxord 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rcx
+ jl .Lxorpart2
+ vpxord 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rcx
+ jl .Lxorpart2
+ vpxord 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rcx
+ jl .Lxorpart2
+ vpxord 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rcx
+ jl .Lxorpart2
+ vpxord 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rcx
+ jl .Lxorpart2
+ vpxord 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rcx
+ jl .Lxorpart2
+ vpxord 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rcx
+ jl .Lxorpart2
+ vpxord 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ ret
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm7,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone2
+
+ENDPROC(chacha_2block_xor_avx512vl)
+
+ENTRY(chacha_4block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts four ChaCha blocks by loading the state
+ # matrix four times across eight AVX registers. It performs matrix
+ # operations on four words in two matrices in parallel, sequentially
+ # to the operations on the four words of the other two matrices. The
+ # required word shuffling has a rather high latency, we can do the
+ # arithmetic on two matrix-pairs without much slowdown.
+
+ vzeroupper
+
+ # x0..3[0-4] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rcx
+ jl .Lxorpart4
+ vpxord 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rcx
+ jl .Lxorpart4
+ vpxord 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rcx
+ jl .Lxorpart4
+ vpxord 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rcx
+ jl .Lxorpart4
+ vpxord 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rcx
+ jl .Lxorpart4
+ vpxord 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rcx
+ jl .Lxorpart4
+ vpxord 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rcx
+ jl .Lxorpart4
+ vpxord 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rcx
+ jl .Lxorpart4
+ vpxord 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rcx
+ jl .Lxorpart4
+ vpxord 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rcx
+ jl .Lxorpart4
+ vpxord 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rcx
+ jl .Lxorpart4
+ vpxord 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rcx
+ jl .Lxorpart4
+ vpxord 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rcx
+ jl .Lxorpart4
+ vpxord 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rcx
+ jl .Lxorpart4
+ vpxord 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rcx
+ jl .Lxorpart4
+ vpxord 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rcx
+ jl .Lxorpart4
+ vpxord 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ ret
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm10,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone4
+
+ENDPROC(chacha_4block_xor_avx512vl)
+
+ENTRY(chacha_8block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts eight consecutive ChaCha blocks by loading
+ # the state matrix in AVX registers eight times. Compared to AVX2, this
+ # mostly benefits from the new rotate instructions in VL and the
+ # additional registers.
+
+ vzeroupper
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+
+ # x12 += counter values 0-3
+ vpaddd CTR8BL(%rip),%ymm12,%ymm12
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround8
+
+ # x0..15[0-3] += s[0..15]
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ # interleave 32-bit words in state n, n+1
+ vpunpckldq %ymm1,%ymm0,%ymm16
+ vpunpckhdq %ymm1,%ymm0,%ymm17
+ vpunpckldq %ymm3,%ymm2,%ymm18
+ vpunpckhdq %ymm3,%ymm2,%ymm19
+ vpunpckldq %ymm5,%ymm4,%ymm20
+ vpunpckhdq %ymm5,%ymm4,%ymm21
+ vpunpckldq %ymm7,%ymm6,%ymm22
+ vpunpckhdq %ymm7,%ymm6,%ymm23
+ vpunpckldq %ymm9,%ymm8,%ymm24
+ vpunpckhdq %ymm9,%ymm8,%ymm25
+ vpunpckldq %ymm11,%ymm10,%ymm26
+ vpunpckhdq %ymm11,%ymm10,%ymm27
+ vpunpckldq %ymm13,%ymm12,%ymm28
+ vpunpckhdq %ymm13,%ymm12,%ymm29
+ vpunpckldq %ymm15,%ymm14,%ymm30
+ vpunpckhdq %ymm15,%ymm14,%ymm31
+
+ # interleave 64-bit words in state n, n+2
+ vpunpcklqdq %ymm18,%ymm16,%ymm0
+ vpunpcklqdq %ymm19,%ymm17,%ymm1
+ vpunpckhqdq %ymm18,%ymm16,%ymm2
+ vpunpckhqdq %ymm19,%ymm17,%ymm3
+ vpunpcklqdq %ymm22,%ymm20,%ymm4
+ vpunpcklqdq %ymm23,%ymm21,%ymm5
+ vpunpckhqdq %ymm22,%ymm20,%ymm6
+ vpunpckhqdq %ymm23,%ymm21,%ymm7
+ vpunpcklqdq %ymm26,%ymm24,%ymm8
+ vpunpcklqdq %ymm27,%ymm25,%ymm9
+ vpunpckhqdq %ymm26,%ymm24,%ymm10
+ vpunpckhqdq %ymm27,%ymm25,%ymm11
+ vpunpcklqdq %ymm30,%ymm28,%ymm12
+ vpunpcklqdq %ymm31,%ymm29,%ymm13
+ vpunpckhqdq %ymm30,%ymm28,%ymm14
+ vpunpckhqdq %ymm31,%ymm29,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ # xor/write first four blocks
+ vmovdqa64 %ymm0,%ymm16
+ vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
+ cmp $0x0020,%rcx
+ jl .Lxorpart8
+ vpxord 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0000(%rsi)
+ vmovdqa64 %ymm16,%ymm0
+ vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
+
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rcx
+ jl .Lxorpart8
+ vpxord 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0020(%rsi)
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+
+ vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
+ cmp $0x0060,%rcx
+ jl .Lxorpart8
+ vpxord 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
+
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rcx
+ jl .Lxorpart8
+ vpxord 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0060(%rsi)
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0080(%rsi)
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
+ cmp $0x00e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00c0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rcx
+ jl .Lxorpart8
+ vpxord 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa64 %ymm4,%ymm0
+ cmp $0x0120,%rcx
+ jl .Lxorpart8
+ vpxord 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0100(%rsi)
+
+ vmovdqa64 %ymm12,%ymm0
+ cmp $0x0140,%rcx
+ jl .Lxorpart8
+ vpxord 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0120(%rsi)
+
+ vmovdqa64 %ymm6,%ymm0
+ cmp $0x0160,%rcx
+ jl .Lxorpart8
+ vpxord 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0140(%rsi)
+
+ vmovdqa64 %ymm14,%ymm0
+ cmp $0x0180,%rcx
+ jl .Lxorpart8
+ vpxord 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0160(%rsi)
+
+ vmovdqa64 %ymm5,%ymm0
+ cmp $0x01a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0180(%rsi)
+
+ vmovdqa64 %ymm13,%ymm0
+ cmp $0x01c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01a0(%rsi)
+
+ vmovdqa64 %ymm7,%ymm0
+ cmp $0x01e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01c0(%rsi)
+
+ vmovdqa64 %ymm15,%ymm0
+ cmp $0x0200,%rcx
+ jl .Lxorpart8
+ vpxord 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+ vzeroupper
+ ret
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0x1f,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0x1f,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
+ vpxord %ymm0,%ymm1,%ymm1
+ vmovdqu8 %ymm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone8
+
+ENDPROC(chacha_8block_xor_avx512vl)
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S
similarity index 76%
rename from arch/x86/crypto/chacha20-ssse3-x86_64.S
rename to arch/x86/crypto/chacha-ssse3-x86_64.S
index 512a2b5..2d86c7d 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -1,15 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
@@ -23,35 +20,25 @@
.text
-ENTRY(chacha20_block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: 1 data block output, o
- # %rdx: 1 data block input, i
-
- # This function encrypts one ChaCha20 block by loading the state matrix
- # in four SSE registers. It performs matrix operation on four words in
- # parallel, but requireds shuffling to rearrange the words after each
- # round. 8/16-bit word rotation is done with the slightly better
- # performing SSSE3 byte shuffling, 7/12-bit word rotation uses
- # traditional shift+OR.
-
- # x0..3 = s0..3
- movdqa 0x00(%rdi),%xmm0
- movdqa 0x10(%rdi),%xmm1
- movdqa 0x20(%rdi),%xmm2
- movdqa 0x30(%rdi),%xmm3
- movdqa %xmm0,%xmm8
- movdqa %xmm1,%xmm9
- movdqa %xmm2,%xmm10
- movdqa %xmm3,%xmm11
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
+ * function performs matrix operations on four words in parallel, but requires
+ * shuffling to rearrange the words after each round. 8/16-bit word rotation is
+ * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
+ * rotation uses traditional shift+OR.
+ *
+ * The round count is given in %r8d.
+ *
+ * Clobbers: %r8d, %xmm4-%xmm7
+ */
+chacha_permute:
movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5
- mov $10,%ecx
-
.Ldoubleround:
-
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
@@ -118,39 +105,129 @@
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
pshufd $0x39,%xmm3,%xmm3
- dec %ecx
+ sub $2,%r8d
jnz .Ldoubleround
+ ret
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 1 data block output, o
+ # %rdx: up to 1 data block input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+ FRAME_BEGIN
+
+ # x0..3 = s0..3
+ movdqa 0x00(%rdi),%xmm0
+ movdqa 0x10(%rdi),%xmm1
+ movdqa 0x20(%rdi),%xmm2
+ movdqa 0x30(%rdi),%xmm3
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+ movdqa %xmm2,%xmm10
+ movdqa %xmm3,%xmm11
+
+ mov %rcx,%rax
+ call chacha_permute
+
# o0 = i0 ^ (x0 + s0)
- movdqu 0x00(%rdx),%xmm4
paddd %xmm8,%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart
+ movdqu 0x00(%rdx),%xmm4
pxor %xmm4,%xmm0
movdqu %xmm0,0x00(%rsi)
# o1 = i1 ^ (x1 + s1)
- movdqu 0x10(%rdx),%xmm5
paddd %xmm9,%xmm1
- pxor %xmm5,%xmm1
- movdqu %xmm1,0x10(%rsi)
+ movdqa %xmm1,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart
+ movdqu 0x10(%rdx),%xmm0
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x10(%rsi)
# o2 = i2 ^ (x2 + s2)
- movdqu 0x20(%rdx),%xmm6
paddd %xmm10,%xmm2
- pxor %xmm6,%xmm2
- movdqu %xmm2,0x20(%rsi)
+ movdqa %xmm2,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart
+ movdqu 0x20(%rdx),%xmm0
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,0x20(%rsi)
# o3 = i3 ^ (x3 + s3)
- movdqu 0x30(%rdx),%xmm7
paddd %xmm11,%xmm3
- pxor %xmm7,%xmm3
- movdqu %xmm3,0x30(%rsi)
+ movdqa %xmm3,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart
+ movdqu 0x30(%rdx),%xmm0
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+.Ldone:
+ FRAME_END
ret
-ENDPROC(chacha20_block_xor_ssse3)
-ENTRY(chacha20_4block_xor_ssse3)
+.Lxorpart:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone
+
+ENDPROC(chacha_block_xor_ssse3)
+
+ENTRY(hchacha_block_ssse3)
# %rdi: Input state matrix, s
- # %rsi: 4 data blocks output, o
- # %rdx: 4 data blocks input, i
+ # %rsi: output (8 32-bit words)
+ # %edx: nrounds
+ FRAME_BEGIN
- # This function encrypts four consecutive ChaCha20 blocks by loading the
+ movdqa 0x00(%rdi),%xmm0
+ movdqa 0x10(%rdi),%xmm1
+ movdqa 0x20(%rdi),%xmm2
+ movdqa 0x30(%rdi),%xmm3
+
+ mov %edx,%r8d
+ call chacha_permute
+
+ movdqu %xmm0,0x00(%rsi)
+ movdqu %xmm3,0x10(%rsi)
+
+ FRAME_END
+ ret
+ENDPROC(hchacha_block_ssse3)
+
+ENTRY(chacha_4block_xor_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts four consecutive ChaCha blocks by loading the
# the state matrix in SSE registers four times. As we need some scratch
# registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each
@@ -163,6 +240,7 @@
lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
+ mov %rcx,%rax
# x0..15[0-3] = s0..3[0..3]
movq 0x00(%rdi),%xmm1
@@ -202,8 +280,6 @@
# x12 += counter values 0-3
paddd %xmm1,%xmm12
- mov $10,%ecx
-
.Ldoubleround4:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
movdqa 0x00(%rsp),%xmm0
@@ -421,7 +497,7 @@
psrld $25,%xmm4
por %xmm0,%xmm4
- dec %ecx
+ sub $2,%r8d
jnz .Ldoubleround4
# x0[0-3] += s0[0]
@@ -573,58 +649,143 @@
# xor with corresponding input, write to output
movdqa 0x00(%rsp),%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart4
movdqu 0x00(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x00(%rsi)
- movdqa 0x10(%rsp),%xmm0
- movdqu 0x80(%rdx),%xmm1
+
+ movdqu %xmm4,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm0
- movdqu %xmm0,0x80(%rsi)
+ movdqu %xmm0,0x10(%rsi)
+
+ movdqu %xmm8,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ movdqu 0x20(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x20(%rsi)
+
+ movdqu %xmm12,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ movdqu 0x30(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+
movdqa 0x20(%rsp),%xmm0
+ cmp $0x50,%rax
+ jl .Lxorpart4
movdqu 0x40(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x40(%rsi)
+
+ movdqu %xmm6,%xmm0
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ movdqu 0x50(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x50(%rsi)
+
+ movdqu %xmm10,%xmm0
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ movdqu 0x60(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x60(%rsi)
+
+ movdqu %xmm14,%xmm0
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ movdqu 0x70(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x70(%rsi)
+
+ movdqa 0x10(%rsp),%xmm0
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ movdqu 0x80(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x80(%rsi)
+
+ movdqu %xmm5,%xmm0
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ movdqu 0x90(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x90(%rsi)
+
+ movdqu %xmm9,%xmm0
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ movdqu 0xa0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xa0(%rsi)
+
+ movdqu %xmm13,%xmm0
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ movdqu 0xb0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xb0(%rsi)
+
movdqa 0x30(%rsp),%xmm0
+ cmp $0xd0,%rax
+ jl .Lxorpart4
movdqu 0xc0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xc0(%rsi)
- movdqu 0x10(%rdx),%xmm1
- pxor %xmm1,%xmm4
- movdqu %xmm4,0x10(%rsi)
- movdqu 0x90(%rdx),%xmm1
- pxor %xmm1,%xmm5
- movdqu %xmm5,0x90(%rsi)
- movdqu 0x50(%rdx),%xmm1
- pxor %xmm1,%xmm6
- movdqu %xmm6,0x50(%rsi)
- movdqu 0xd0(%rdx),%xmm1
- pxor %xmm1,%xmm7
- movdqu %xmm7,0xd0(%rsi)
- movdqu 0x20(%rdx),%xmm1
- pxor %xmm1,%xmm8
- movdqu %xmm8,0x20(%rsi)
- movdqu 0xa0(%rdx),%xmm1
- pxor %xmm1,%xmm9
- movdqu %xmm9,0xa0(%rsi)
- movdqu 0x60(%rdx),%xmm1
- pxor %xmm1,%xmm10
- movdqu %xmm10,0x60(%rsi)
- movdqu 0xe0(%rdx),%xmm1
- pxor %xmm1,%xmm11
- movdqu %xmm11,0xe0(%rsi)
- movdqu 0x30(%rdx),%xmm1
- pxor %xmm1,%xmm12
- movdqu %xmm12,0x30(%rsi)
- movdqu 0xb0(%rdx),%xmm1
- pxor %xmm1,%xmm13
- movdqu %xmm13,0xb0(%rsi)
- movdqu 0x70(%rdx),%xmm1
- pxor %xmm1,%xmm14
- movdqu %xmm14,0x70(%rsi)
- movdqu 0xf0(%rdx),%xmm1
- pxor %xmm1,%xmm15
- movdqu %xmm15,0xf0(%rsi)
+ movdqu %xmm7,%xmm0
+ cmp $0xe0,%rax
+ jl .Lxorpart4
+ movdqu 0xd0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xd0(%rsi)
+
+ movdqu %xmm11,%xmm0
+ cmp $0xf0,%rax
+ jl .Lxorpart4
+ movdqu 0xe0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xe0(%rsi)
+
+ movdqu %xmm15,%xmm0
+ cmp $0x100,%rax
+ jl .Lxorpart4
+ movdqu 0xf0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xf0(%rsi)
+
+.Ldone4:
lea -8(%r10),%rsp
ret
-ENDPROC(chacha20_4block_xor_ssse3)
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone4
+
+ENDPROC(chacha_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
deleted file mode 100644
index f3cd26f..0000000
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ /dev/null
@@ -1,448 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.ROT8, "aM", @progbits, 32
-.align 32
-ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
- .octa 0x0e0d0c0f0a09080b0605040702010003
-
-.section .rodata.cst32.ROT16, "aM", @progbits, 32
-.align 32
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
- .octa 0x0d0c0f0e09080b0a0504070601000302
-
-.section .rodata.cst32.CTRINC, "aM", @progbits, 32
-.align 32
-CTRINC: .octa 0x00000003000000020000000100000000
- .octa 0x00000007000000060000000500000004
-
-.text
-
-ENTRY(chacha20_8block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: 8 data blocks output, o
- # %rdx: 8 data blocks input, i
-
- # This function encrypts eight consecutive ChaCha20 blocks by loading
- # the state matrix in AVX registers eight times. As we need some
- # scratch registers, we save the first four registers on the stack. The
- # algorithm performs each operation on the corresponding word of each
- # state matrix, hence requires no word shuffling. For final XORing step
- # we transpose the matrix by interleaving 32-, 64- and then 128-bit
- # words, which allows us to do XOR in AVX registers. 8/16-bit word
- # rotation is done with the slightly better performing byte shuffling,
- # 7/12-bit word rotation uses traditional shift+OR.
-
- vzeroupper
- # 4 * 32 byte stack, 32-byte aligned
- lea 8(%rsp),%r10
- and $~31, %rsp
- sub $0x80, %rsp
-
- # x0..15[0-7] = s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpbroadcastd 0x04(%rdi),%ymm1
- vpbroadcastd 0x08(%rdi),%ymm2
- vpbroadcastd 0x0c(%rdi),%ymm3
- vpbroadcastd 0x10(%rdi),%ymm4
- vpbroadcastd 0x14(%rdi),%ymm5
- vpbroadcastd 0x18(%rdi),%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm7
- vpbroadcastd 0x20(%rdi),%ymm8
- vpbroadcastd 0x24(%rdi),%ymm9
- vpbroadcastd 0x28(%rdi),%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm11
- vpbroadcastd 0x30(%rdi),%ymm12
- vpbroadcastd 0x34(%rdi),%ymm13
- vpbroadcastd 0x38(%rdi),%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm15
- # x0..3 on stack
- vmovdqa %ymm0,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm3,0x60(%rsp)
-
- vmovdqa CTRINC(%rip),%ymm1
- vmovdqa ROT8(%rip),%ymm2
- vmovdqa ROT16(%rip),%ymm3
-
- # x12 += counter values 0-3
- vpaddd %ymm1,%ymm12,%ymm12
-
- mov $10,%ecx
-
-.Ldoubleround8:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- dec %ecx
- jnz .Ldoubleround8
-
- # x0..15[0-3] += s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpaddd 0x00(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpbroadcastd 0x04(%rdi),%ymm0
- vpaddd 0x20(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpbroadcastd 0x08(%rdi),%ymm0
- vpaddd 0x40(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpbroadcastd 0x0c(%rdi),%ymm0
- vpaddd 0x60(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpbroadcastd 0x10(%rdi),%ymm0
- vpaddd %ymm0,%ymm4,%ymm4
- vpbroadcastd 0x14(%rdi),%ymm0
- vpaddd %ymm0,%ymm5,%ymm5
- vpbroadcastd 0x18(%rdi),%ymm0
- vpaddd %ymm0,%ymm6,%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm0
- vpaddd %ymm0,%ymm7,%ymm7
- vpbroadcastd 0x20(%rdi),%ymm0
- vpaddd %ymm0,%ymm8,%ymm8
- vpbroadcastd 0x24(%rdi),%ymm0
- vpaddd %ymm0,%ymm9,%ymm9
- vpbroadcastd 0x28(%rdi),%ymm0
- vpaddd %ymm0,%ymm10,%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm0
- vpaddd %ymm0,%ymm11,%ymm11
- vpbroadcastd 0x30(%rdi),%ymm0
- vpaddd %ymm0,%ymm12,%ymm12
- vpbroadcastd 0x34(%rdi),%ymm0
- vpaddd %ymm0,%ymm13,%ymm13
- vpbroadcastd 0x38(%rdi),%ymm0
- vpaddd %ymm0,%ymm14,%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm0
- vpaddd %ymm0,%ymm15,%ymm15
-
- # x12 += counter values 0-3
- vpaddd %ymm1,%ymm12,%ymm12
-
- # interleave 32-bit words in state n, n+1
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x20(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa 0x40(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm1,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpckldq %ymm5,%ymm0,%ymm4
- vpunpckhdq %ymm5,%ymm0,%ymm5
- vmovdqa %ymm6,%ymm0
- vpunpckldq %ymm7,%ymm0,%ymm6
- vpunpckhdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpckldq %ymm9,%ymm0,%ymm8
- vpunpckhdq %ymm9,%ymm0,%ymm9
- vmovdqa %ymm10,%ymm0
- vpunpckldq %ymm11,%ymm0,%ymm10
- vpunpckhdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpckldq %ymm13,%ymm0,%ymm12
- vpunpckhdq %ymm13,%ymm0,%ymm13
- vmovdqa %ymm14,%ymm0
- vpunpckldq %ymm15,%ymm0,%ymm14
- vpunpckhdq %ymm15,%ymm0,%ymm15
-
- # interleave 64-bit words in state n, n+2
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x40(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x00(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa 0x20(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpcklqdq %ymm6,%ymm0,%ymm4
- vpunpckhqdq %ymm6,%ymm0,%ymm6
- vmovdqa %ymm5,%ymm0
- vpunpcklqdq %ymm7,%ymm0,%ymm5
- vpunpckhqdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpcklqdq %ymm10,%ymm0,%ymm8
- vpunpckhqdq %ymm10,%ymm0,%ymm10
- vmovdqa %ymm9,%ymm0
- vpunpcklqdq %ymm11,%ymm0,%ymm9
- vpunpckhqdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpcklqdq %ymm14,%ymm0,%ymm12
- vpunpckhqdq %ymm14,%ymm0,%ymm14
- vmovdqa %ymm13,%ymm0
- vpunpcklqdq %ymm15,%ymm0,%ymm13
- vpunpckhqdq %ymm15,%ymm0,%ymm15
-
- # interleave 128-bit words in state n, n+4
- vmovdqa 0x00(%rsp),%ymm0
- vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
- vmovdqa %ymm1,0x00(%rsp)
- vmovdqa 0x20(%rsp),%ymm0
- vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa 0x40(%rsp),%ymm0
- vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
- vmovdqa %ymm1,0x40(%rsp)
- vmovdqa 0x60(%rsp),%ymm0
- vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
- vmovdqa %ymm1,0x60(%rsp)
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
- vmovdqa %ymm0,%ymm8
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
- vmovdqa %ymm0,%ymm9
- vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
- vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
- vmovdqa %ymm0,%ymm10
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
- vmovdqa %ymm0,%ymm11
-
- # xor with corresponding input, write to output
- vmovdqa 0x00(%rsp),%ymm0
- vpxor 0x0000(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0000(%rsi)
- vmovdqa 0x20(%rsp),%ymm0
- vpxor 0x0080(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0080(%rsi)
- vmovdqa 0x40(%rsp),%ymm0
- vpxor 0x0040(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0040(%rsi)
- vmovdqa 0x60(%rsp),%ymm0
- vpxor 0x00c0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00c0(%rsi)
- vpxor 0x0100(%rdx),%ymm4,%ymm4
- vmovdqu %ymm4,0x0100(%rsi)
- vpxor 0x0180(%rdx),%ymm5,%ymm5
- vmovdqu %ymm5,0x00180(%rsi)
- vpxor 0x0140(%rdx),%ymm6,%ymm6
- vmovdqu %ymm6,0x0140(%rsi)
- vpxor 0x01c0(%rdx),%ymm7,%ymm7
- vmovdqu %ymm7,0x01c0(%rsi)
- vpxor 0x0020(%rdx),%ymm8,%ymm8
- vmovdqu %ymm8,0x0020(%rsi)
- vpxor 0x00a0(%rdx),%ymm9,%ymm9
- vmovdqu %ymm9,0x00a0(%rsi)
- vpxor 0x0060(%rdx),%ymm10,%ymm10
- vmovdqu %ymm10,0x0060(%rsi)
- vpxor 0x00e0(%rdx),%ymm11,%ymm11
- vmovdqu %ymm11,0x00e0(%rsi)
- vpxor 0x0120(%rdx),%ymm12,%ymm12
- vmovdqu %ymm12,0x0120(%rsi)
- vpxor 0x01a0(%rdx),%ymm13,%ymm13
- vmovdqu %ymm13,0x01a0(%rsi)
- vpxor 0x0160(%rdx),%ymm14,%ymm14
- vmovdqu %ymm14,0x0160(%rsi)
- vpxor 0x01e0(%rdx),%ymm15,%ymm15
- vmovdqu %ymm15,0x01e0(%rsi)
-
- vzeroupper
- lea -8(%r10),%rsp
- ret
-ENDPROC(chacha20_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
deleted file mode 100644
index dce7c5d..0000000
--- a/arch/x86/crypto/chacha20_glue.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/fpu/api.h>
-#include <asm/simd.h>
-
-#define CHACHA20_STATE_ALIGN 16
-
-asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
-#ifdef CONFIG_AS_AVX2
-asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
-static bool chacha20_use_avx2;
-#endif
-
-static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes)
-{
- u8 buf[CHACHA20_BLOCK_SIZE];
-
-#ifdef CONFIG_AS_AVX2
- if (chacha20_use_avx2) {
- while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
- chacha20_8block_xor_avx2(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE * 8;
- src += CHACHA20_BLOCK_SIZE * 8;
- dst += CHACHA20_BLOCK_SIZE * 8;
- state[12] += 8;
- }
- }
-#endif
- while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
- chacha20_4block_xor_ssse3(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE * 4;
- src += CHACHA20_BLOCK_SIZE * 4;
- dst += CHACHA20_BLOCK_SIZE * 4;
- state[12] += 4;
- }
- while (bytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_block_xor_ssse3(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE;
- src += CHACHA20_BLOCK_SIZE;
- dst += CHACHA20_BLOCK_SIZE;
- state[12]++;
- }
- if (bytes) {
- memcpy(buf, src, bytes);
- chacha20_block_xor_ssse3(state, buf, buf);
- memcpy(dst, buf, bytes);
- }
-}
-
-static int chacha20_simd(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
- u32 *state, state_buf[16 + 2] __aligned(8);
- struct skcipher_walk walk;
- int err;
-
- BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
- state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
-
- if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
- return crypto_chacha20_crypt(req);
-
- err = skcipher_walk_virt(&walk, req, true);
-
- crypto_chacha20_init(state, ctx, walk.iv);
-
- kernel_fpu_begin();
-
- while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
- rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
- err = skcipher_walk_done(&walk,
- walk.nbytes % CHACHA20_BLOCK_SIZE);
- }
-
- if (walk.nbytes) {
- chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes);
- err = skcipher_walk_done(&walk, 0);
- }
-
- kernel_fpu_end();
-
- return err;
-}
-
-static struct skcipher_alg alg = {
- .base.cra_name = "chacha20",
- .base.cra_driver_name = "chacha20-simd",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha20_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA20_KEY_SIZE,
- .max_keysize = CHACHA20_KEY_SIZE,
- .ivsize = CHACHA20_IV_SIZE,
- .chunksize = CHACHA20_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
- .encrypt = chacha20_simd,
- .decrypt = chacha20_simd,
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
- if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return -ENODEV;
-
-#ifdef CONFIG_AS_AVX2
- chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#endif
- return crypto_register_skcipher(&alg);
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
- crypto_unregister_skcipher(&alg);
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-simd");
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
new file mode 100644
index 0000000..388f95a
--- /dev/null
+++ b/arch/x86/crypto/chacha_glue.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/simd.h>
+
+#define CHACHA_STATE_ALIGN 16
+
+asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+static bool chacha_use_avx2;
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+static bool chacha_use_avx512vl;
+#endif
+#endif
+
+static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+{
+ len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
+ return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
+
+static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+#ifdef CONFIG_AS_AVX2
+#ifdef CONFIG_AS_AVX512
+ if (chacha_use_avx512vl) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
+ state[12] += 8;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 4) {
+ chacha_8block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state[12] += chacha_advance(bytes, 8);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 2) {
+ chacha_4block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes) {
+ chacha_2block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state[12] += chacha_advance(bytes, 2);
+ return;
+ }
+ }
+#endif
+ if (chacha_use_avx2) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
+ state[12] += 8;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 4) {
+ chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 8);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 2) {
+ chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE) {
+ chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 2);
+ return;
+ }
+ }
+#endif
+ while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+ chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 4;
+ src += CHACHA_BLOCK_SIZE * 4;
+ dst += CHACHA_BLOCK_SIZE * 4;
+ state[12] += 4;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE) {
+ chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes) {
+ chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
+ state[12]++;
+ }
+}
+
+static int chacha_simd_stream_xor(struct skcipher_walk *walk,
+ const struct chacha_ctx *ctx, const u8 *iv)
+{
+ u32 *state, state_buf[16 + 2] __aligned(8);
+ int next_yield = 4096; /* bytes until next FPU yield */
+ int err = 0;
+
+ BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+ state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+
+ crypto_chacha_init(state, ctx, iv);
+
+ while (walk->nbytes > 0) {
+ unsigned int nbytes = walk->nbytes;
+
+ if (nbytes < walk->total) {
+ nbytes = round_down(nbytes, walk->stride);
+ next_yield -= nbytes;
+ }
+
+ chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
+ nbytes, ctx->nrounds);
+
+ if (next_yield <= 0) {
+ /* temporarily allow preemption */
+ kernel_fpu_end();
+ kernel_fpu_begin();
+ next_yield = 4096;
+ }
+
+ err = skcipher_walk_done(walk, walk->nbytes - nbytes);
+ }
+
+ return err;
+}
+
+static int chacha_simd(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ int err;
+
+ if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+ return crypto_chacha_crypt(req);
+
+ err = skcipher_walk_virt(&walk, req, true);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ err = chacha_simd_stream_xor(&walk, ctx, req->iv);
+ kernel_fpu_end();
+ return err;
+}
+
+static int xchacha_simd(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ struct chacha_ctx subctx;
+ u32 *state, state_buf[16 + 2] __aligned(8);
+ u8 real_iv[16];
+ int err;
+
+ if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+ return crypto_xchacha_crypt(req);
+
+ err = skcipher_walk_virt(&walk, req, true);
+ if (err)
+ return err;
+
+ BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+ state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+ crypto_chacha_init(state, ctx, req->iv);
+
+ kernel_fpu_begin();
+
+ hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+ subctx.nrounds = ctx->nrounds;
+
+ memcpy(&real_iv[0], req->iv + 24, 8);
+ memcpy(&real_iv[8], req->iv + 16, 8);
+ err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
+
+ kernel_fpu_end();
+
+ return err;
+}
+
+static struct skcipher_alg algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-simd",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = chacha_simd,
+ .decrypt = chacha_simd,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-simd",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = xchacha_simd,
+ .decrypt = xchacha_simd,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-simd",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha12_setkey,
+ .encrypt = xchacha_simd,
+ .decrypt = xchacha_simd,
+ },
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
+ return -ENODEV;
+
+#ifdef CONFIG_AS_AVX2
+ chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifdef CONFIG_AS_AVX512
+ chacha_use_avx512vl = chacha_use_avx2 &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
+#endif
+#endif
+ return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-simd");
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index c8d9cda..cb4ab66 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -32,10 +32,11 @@
#include <linux/kernel.h>
#include <linux/crc32.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <asm/cpufeatures.h>
#include <asm/cpu_device_id.h>
-#include <asm/fpu/api.h>
+#include <asm/simd.h>
#define CHKSUM_BLOCK_SIZE 1
#define CHKSUM_DIGEST_SIZE 4
@@ -54,7 +55,7 @@
unsigned int iremainder;
unsigned int prealign;
- if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !irq_fpu_usable())
+ if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable())
return crc32_le(crc, p, len);
if ((long)p & SCALE_F_MASK) {
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 5773e11..eefa086 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
* CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
@@ -9,30 +10,17 @@
* Copyright (C) 2008 Intel Corporation
* Authors: Austin Zhang <austin_zhang@linux.intel.com>
* Kent Liu <kent.liu@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <asm/cpufeatures.h>
#include <asm/cpu_device_id.h>
-#include <asm/fpu/internal.h>
+#include <asm/simd.h>
#define CHKSUM_BLOCK_SIZE 1
#define CHKSUM_DIGEST_SIZE 4
@@ -177,7 +165,7 @@
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
- if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) {
+ if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
*crcp = crc_pcl(data, len, *crcp);
kernel_fpu_end();
@@ -189,7 +177,7 @@
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
- if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) {
+ if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) {
kernel_fpu_begin();
*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
kernel_fpu_end();
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
index de04d3e..3d873e6 100644
--- a/arch/x86/crypto/crct10dif-pcl-asm_64.S
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -43,609 +43,291 @@
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-########################################################################
-# Function API:
-# UINT16 crc_t10dif_pcl(
-# UINT16 init_crc, //initial CRC value, 16 bits
-# const unsigned char *buf, //buffer pointer to calculate CRC on
-# UINT64 len //buffer length in bytes (64-bit data)
-# );
#
# Reference paper titled "Fast CRC Computation for Generic
# Polynomials Using PCLMULQDQ Instruction"
# URL: http://www.intel.com/content/dam/www/public/us/en/documents
# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
#
-#
#include <linux/linkage.h>
.text
-#define arg1 %rdi
-#define arg2 %rsi
-#define arg3 %rdx
+#define init_crc %edi
+#define buf %rsi
+#define len %rdx
-#define arg1_low32 %edi
+#define FOLD_CONSTS %xmm10
+#define BSWAP_MASK %xmm11
-ENTRY(crc_t10dif_pcl)
+# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
+# reg1, reg2.
+.macro fold_32_bytes offset, reg1, reg2
+ movdqu \offset(buf), %xmm9
+ movdqu \offset+16(buf), %xmm12
+ pshufb BSWAP_MASK, %xmm9
+ pshufb BSWAP_MASK, %xmm12
+ movdqa \reg1, %xmm8
+ movdqa \reg2, %xmm13
+ pclmulqdq $0x00, FOLD_CONSTS, \reg1
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm8
+ pclmulqdq $0x00, FOLD_CONSTS, \reg2
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm13
+ pxor %xmm9 , \reg1
+ xorps %xmm8 , \reg1
+ pxor %xmm12, \reg2
+ xorps %xmm13, \reg2
+.endm
+
+# Fold src_reg into dst_reg.
+.macro fold_16_bytes src_reg, dst_reg
+ movdqa \src_reg, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, \src_reg
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
+ pxor %xmm8, \dst_reg
+ xorps \src_reg, \dst_reg
+.endm
+
+#
+# u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len);
+#
+# Assumes len >= 16.
+#
.align 16
+ENTRY(crc_t10dif_pcl)
- # adjust the 16-bit initial_crc value, scale it to 32 bits
- shl $16, arg1_low32
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
- # Allocate Stack Space
- mov %rsp, %rcx
- sub $16*2, %rsp
- # align stack to 16 byte boundary
- and $~(0x10 - 1), %rsp
+ # For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp $256, len
+ jl .Lless_than_256_bytes
- # check if smaller than 256
- cmp $256, arg3
+ # Load the first 128 data bytes. Byte swapping is necessary to make the
+ # bit order match the polynomial coefficient order.
+ movdqu 16*0(buf), %xmm0
+ movdqu 16*1(buf), %xmm1
+ movdqu 16*2(buf), %xmm2
+ movdqu 16*3(buf), %xmm3
+ movdqu 16*4(buf), %xmm4
+ movdqu 16*5(buf), %xmm5
+ movdqu 16*6(buf), %xmm6
+ movdqu 16*7(buf), %xmm7
+ add $128, buf
+ pshufb BSWAP_MASK, %xmm0
+ pshufb BSWAP_MASK, %xmm1
+ pshufb BSWAP_MASK, %xmm2
+ pshufb BSWAP_MASK, %xmm3
+ pshufb BSWAP_MASK, %xmm4
+ pshufb BSWAP_MASK, %xmm5
+ pshufb BSWAP_MASK, %xmm6
+ pshufb BSWAP_MASK, %xmm7
- # for sizes less than 128, we can't fold 64B at a time...
- jl _less_than_128
+ # XOR the first 16 data *bits* with the initial CRC value.
+ pxor %xmm8, %xmm8
+ pinsrw $7, init_crc, %xmm8
+ pxor %xmm8, %xmm0
+ movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS
- # load the initial crc value
- movd arg1_low32, %xmm10 # initial crc
+ # Subtract 128 for the 128 data bytes just consumed. Subtract another
+ # 128 to simplify the termination condition of the following loop.
+ sub $256, len
- # crc value does not need to be byte-reflected, but it needs
- # to be moved to the high part of the register.
- # because data will be byte-reflected and will align with
- # initial crc at correct place.
- pslldq $12, %xmm10
+ # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
+ # bytes xmm0-7 into them, storing the result back into xmm0-7.
+.Lfold_128_bytes_loop:
+ fold_32_bytes 0, %xmm0, %xmm1
+ fold_32_bytes 32, %xmm2, %xmm3
+ fold_32_bytes 64, %xmm4, %xmm5
+ fold_32_bytes 96, %xmm6, %xmm7
+ add $128, buf
+ sub $128, len
+ jge .Lfold_128_bytes_loop
- movdqa SHUF_MASK(%rip), %xmm11
- # receive the initial 64B data, xor the initial crc value
- movdqu 16*0(arg2), %xmm0
- movdqu 16*1(arg2), %xmm1
- movdqu 16*2(arg2), %xmm2
- movdqu 16*3(arg2), %xmm3
- movdqu 16*4(arg2), %xmm4
- movdqu 16*5(arg2), %xmm5
- movdqu 16*6(arg2), %xmm6
- movdqu 16*7(arg2), %xmm7
+ # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
- pshufb %xmm11, %xmm0
- # XOR the initial_crc value
- pxor %xmm10, %xmm0
- pshufb %xmm11, %xmm1
- pshufb %xmm11, %xmm2
- pshufb %xmm11, %xmm3
- pshufb %xmm11, %xmm4
- pshufb %xmm11, %xmm5
- pshufb %xmm11, %xmm6
- pshufb %xmm11, %xmm7
+ # Fold across 64 bytes.
+ movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm0, %xmm4
+ fold_16_bytes %xmm1, %xmm5
+ fold_16_bytes %xmm2, %xmm6
+ fold_16_bytes %xmm3, %xmm7
+ # Fold across 32 bytes.
+ movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm4, %xmm6
+ fold_16_bytes %xmm5, %xmm7
+ # Fold across 16 bytes.
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm6, %xmm7
- movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4
- #imm value of pclmulqdq instruction
- #will determine which constant to use
+ # Add 128 to get the correct number of data bytes remaining in 0...127
+ # (not counting xmm7), following the previous extra subtraction by 128.
+ # Then subtract 16 to simplify the termination condition of the
+ # following loop.
+ add $128-16, len
- #################################################################
- # we subtract 256 instead of 128 to save one instruction from the loop
- sub $256, arg3
-
- # at this section of the code, there is 64*x+y (0<=y<64) bytes of
- # buffer. The _fold_64_B_loop will fold 64B at a time
- # until we have 64+y Bytes of buffer
-
-
- # fold 64B at a time. This section of the code folds 4 xmm
- # registers in parallel
-_fold_64_B_loop:
-
- # update the buffer pointer
- add $128, arg2 # buf += 64#
-
- movdqu 16*0(arg2), %xmm9
- movdqu 16*1(arg2), %xmm12
- pshufb %xmm11, %xmm9
- pshufb %xmm11, %xmm12
- movdqa %xmm0, %xmm8
- movdqa %xmm1, %xmm13
- pclmulqdq $0x0 , %xmm10, %xmm0
- pclmulqdq $0x11, %xmm10, %xmm8
- pclmulqdq $0x0 , %xmm10, %xmm1
- pclmulqdq $0x11, %xmm10, %xmm13
- pxor %xmm9 , %xmm0
- xorps %xmm8 , %xmm0
- pxor %xmm12, %xmm1
- xorps %xmm13, %xmm1
-
- movdqu 16*2(arg2), %xmm9
- movdqu 16*3(arg2), %xmm12
- pshufb %xmm11, %xmm9
- pshufb %xmm11, %xmm12
- movdqa %xmm2, %xmm8
- movdqa %xmm3, %xmm13
- pclmulqdq $0x0, %xmm10, %xmm2
- pclmulqdq $0x11, %xmm10, %xmm8
- pclmulqdq $0x0, %xmm10, %xmm3
- pclmulqdq $0x11, %xmm10, %xmm13
- pxor %xmm9 , %xmm2
- xorps %xmm8 , %xmm2
- pxor %xmm12, %xmm3
- xorps %xmm13, %xmm3
-
- movdqu 16*4(arg2), %xmm9
- movdqu 16*5(arg2), %xmm12
- pshufb %xmm11, %xmm9
- pshufb %xmm11, %xmm12
- movdqa %xmm4, %xmm8
- movdqa %xmm5, %xmm13
- pclmulqdq $0x0, %xmm10, %xmm4
- pclmulqdq $0x11, %xmm10, %xmm8
- pclmulqdq $0x0, %xmm10, %xmm5
- pclmulqdq $0x11, %xmm10, %xmm13
- pxor %xmm9 , %xmm4
- xorps %xmm8 , %xmm4
- pxor %xmm12, %xmm5
- xorps %xmm13, %xmm5
-
- movdqu 16*6(arg2), %xmm9
- movdqu 16*7(arg2), %xmm12
- pshufb %xmm11, %xmm9
- pshufb %xmm11, %xmm12
- movdqa %xmm6 , %xmm8
- movdqa %xmm7 , %xmm13
- pclmulqdq $0x0 , %xmm10, %xmm6
- pclmulqdq $0x11, %xmm10, %xmm8
- pclmulqdq $0x0 , %xmm10, %xmm7
- pclmulqdq $0x11, %xmm10, %xmm13
- pxor %xmm9 , %xmm6
- xorps %xmm8 , %xmm6
- pxor %xmm12, %xmm7
- xorps %xmm13, %xmm7
-
- sub $128, arg3
-
- # check if there is another 64B in the buffer to be able to fold
- jge _fold_64_B_loop
- ##################################################################
-
-
- add $128, arg2
- # at this point, the buffer pointer is pointing at the last y Bytes
- # of the buffer the 64B of folded data is in 4 of the xmm
- # registers: xmm0, xmm1, xmm2, xmm3
-
-
- # fold the 8 xmm registers to 1 xmm register with different constants
-
- movdqa rk9(%rip), %xmm10
- movdqa %xmm0, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm0
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- xorps %xmm0, %xmm7
-
- movdqa rk11(%rip), %xmm10
- movdqa %xmm1, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm1
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- xorps %xmm1, %xmm7
-
- movdqa rk13(%rip), %xmm10
- movdqa %xmm2, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm2
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- pxor %xmm2, %xmm7
-
- movdqa rk15(%rip), %xmm10
- movdqa %xmm3, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm3
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- xorps %xmm3, %xmm7
-
- movdqa rk17(%rip), %xmm10
- movdqa %xmm4, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm4
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- pxor %xmm4, %xmm7
-
- movdqa rk19(%rip), %xmm10
- movdqa %xmm5, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm5
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- xorps %xmm5, %xmm7
-
- movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
- #imm value of pclmulqdq instruction
- #will determine which constant to use
- movdqa %xmm6, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm6
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- pxor %xmm6, %xmm7
-
-
- # instead of 64, we add 48 to the loop counter to save 1 instruction
- # from the loop instead of a cmp instruction, we use the negative
- # flag with the jl instruction
- add $128-16, arg3
- jl _final_reduction_for_128
-
- # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
- # and the rest is in memory. We can fold 16 bytes at a time if y>=16
- # continue folding 16B at a time
-
-_16B_reduction_loop:
+ # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
+ # xmm7 into them, storing the result back into xmm7.
+ jl .Lfold_16_bytes_loop_done
+.Lfold_16_bytes_loop:
movdqa %xmm7, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm7
- pclmulqdq $0x0 , %xmm10, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
pxor %xmm8, %xmm7
- movdqu (arg2), %xmm0
- pshufb %xmm11, %xmm0
+ movdqu (buf), %xmm0
+ pshufb BSWAP_MASK, %xmm0
pxor %xmm0 , %xmm7
- add $16, arg2
- sub $16, arg3
- # instead of a cmp instruction, we utilize the flags with the
- # jge instruction equivalent of: cmp arg3, 16-16
- # check if there is any more 16B in the buffer to be able to fold
- jge _16B_reduction_loop
+ add $16, buf
+ sub $16, len
+ jge .Lfold_16_bytes_loop
- #now we have 16+z bytes left to reduce, where 0<= z < 16.
- #first, we reduce the data in the xmm7 register
+.Lfold_16_bytes_loop_done:
+ # Add 16 to get the correct number of data bytes remaining in 0...15
+ # (not counting xmm7), following the previous extra subtraction by 16.
+ add $16, len
+ je .Lreduce_final_16_bytes
+.Lhandle_partial_segment:
+ # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
+ # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do
+ # this without needing a fold constant for each possible 'len', redivide
+ # the bytes into a first chunk of 'len' bytes and a second chunk of 16
+ # bytes, then fold the first chunk into the second.
-_final_reduction_for_128:
- # check if any more data to fold. If not, compute the CRC of
- # the final 128 bits
- add $16, arg3
- je _128_done
-
- # here we are getting data that is less than 16 bytes.
- # since we know that there was data before the pointer, we can
- # offset the input pointer before the actual point, to receive
- # exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_xmms:
movdqa %xmm7, %xmm2
- movdqu -16(arg2, arg3), %xmm1
- pshufb %xmm11, %xmm1
+ # xmm1 = last 16 original data bytes
+ movdqu -16(buf, len), %xmm1
+ pshufb BSWAP_MASK, %xmm1
- # get rid of the extra data that was loaded before
- # load the shift constant
- lea pshufb_shf_table+16(%rip), %rax
- sub arg3, %rax
+ # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
+ lea .Lbyteshift_table+16(%rip), %rax
+ sub len, %rax
movdqu (%rax), %xmm0
-
- # shift xmm2 to the left by arg3 bytes
pshufb %xmm0, %xmm2
- # shift xmm7 to the right by 16-arg3 bytes
- pxor mask1(%rip), %xmm0
+ # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
+ pxor .Lmask1(%rip), %xmm0
pshufb %xmm0, %xmm7
+
+ # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
+ # then '16-len' bytes from xmm2 (high-order bytes).
pblendvb %xmm2, %xmm1 #xmm0 is implicit
- # fold 16 Bytes
- movdqa %xmm1, %xmm2
+ # Fold the first chunk into the second chunk, storing the result in xmm7.
movdqa %xmm7, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm7
- pclmulqdq $0x0 , %xmm10, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
pxor %xmm8, %xmm7
- pxor %xmm2, %xmm7
+ pxor %xmm1, %xmm7
-_128_done:
- # compute crc of a 128-bit value
- movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
+.Lreduce_final_16_bytes:
+ # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
+
+ # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS
+
+ # Fold the high 64 bits into the low 64 bits, while also multiplying by
+ # x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ # whose low 48 bits are 0.
movdqa %xmm7, %xmm0
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
+ pslldq $8, %xmm0
+ pxor %xmm0, %xmm7 # + low bits * x^64
- #64b fold
- pclmulqdq $0x1, %xmm10, %xmm7
- pslldq $8 , %xmm0
- pxor %xmm0, %xmm7
-
- #32b fold
+ # Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ # value congruent to x^64 * M(x) and whose low 48 bits are 0.
movdqa %xmm7, %xmm0
+ pand .Lmask2(%rip), %xmm0 # zero high 32 bits
+ psrldq $12, %xmm7 # extract high 32 bits
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
+ pxor %xmm0, %xmm7 # + low bits
- pand mask2(%rip), %xmm0
+ # Load G(x) and floor(x^48 / G(x)).
+ movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS
- psrldq $12, %xmm7
- pclmulqdq $0x10, %xmm10, %xmm7
- pxor %xmm0, %xmm7
-
- #barrett reduction
-_barrett:
- movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
+ # Use Barrett reduction to compute the final CRC value.
movdqa %xmm7, %xmm0
- pclmulqdq $0x01, %xmm10, %xmm7
- pslldq $4, %xmm7
- pclmulqdq $0x11, %xmm10, %xmm7
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
+ psrlq $32, %xmm7 # /= x^32
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x)
+ psrlq $48, %xmm0
+ pxor %xmm7, %xmm0 # + low 16 nonzero bits
+ # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
- pslldq $4, %xmm7
- pxor %xmm0, %xmm7
- pextrd $1, %xmm7, %eax
-
-_cleanup:
- # scale the result back to 16 bits
- shr $16, %eax
- mov %rcx, %rsp
+ pextrw $0, %xmm0, %eax
ret
-########################################################################
-
.align 16
-_less_than_128:
+.Lless_than_256_bytes:
+ # Checksumming a buffer of length 16...255 bytes
- # check if there is enough buffer to be able to fold 16B at a time
- cmp $32, arg3
- jl _less_than_32
- movdqa SHUF_MASK(%rip), %xmm11
+ # Load the first 16 data bytes.
+ movdqu (buf), %xmm7
+ pshufb BSWAP_MASK, %xmm7
+ add $16, buf
- # now if there is, load the constants
- movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
-
- movd arg1_low32, %xmm0 # get the initial crc value
- pslldq $12, %xmm0 # align it to its correct place
- movdqu (arg2), %xmm7 # load the plaintext
- pshufb %xmm11, %xmm7 # byte-reflect the plaintext
+ # XOR the first 16 data *bits* with the initial CRC value.
+ pxor %xmm0, %xmm0
+ pinsrw $7, init_crc, %xmm0
pxor %xmm0, %xmm7
-
- # update the buffer pointer
- add $16, arg2
-
- # update the counter. subtract 32 instead of 16 to save one
- # instruction from the loop
- sub $32, arg3
-
- jmp _16B_reduction_loop
-
-
-.align 16
-_less_than_32:
- # mov initial crc to the return value. this is necessary for
- # zero-length buffers.
- mov arg1_low32, %eax
- test arg3, arg3
- je _cleanup
-
- movdqa SHUF_MASK(%rip), %xmm11
-
- movd arg1_low32, %xmm0 # get the initial crc value
- pslldq $12, %xmm0 # align it to its correct place
-
- cmp $16, arg3
- je _exact_16_left
- jl _less_than_16_left
-
- movdqu (arg2), %xmm7 # load the plaintext
- pshufb %xmm11, %xmm7 # byte-reflect the plaintext
- pxor %xmm0 , %xmm7 # xor the initial crc value
- add $16, arg2
- sub $16, arg3
- movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
- jmp _get_last_two_xmms
-
-
-.align 16
-_less_than_16_left:
- # use stack space to load data less than 16 bytes, zero-out
- # the 16B in memory first.
-
- pxor %xmm1, %xmm1
- mov %rsp, %r11
- movdqa %xmm1, (%r11)
-
- cmp $4, arg3
- jl _only_less_than_4
-
- # backup the counter value
- mov arg3, %r9
- cmp $8, arg3
- jl _less_than_8_left
-
- # load 8 Bytes
- mov (arg2), %rax
- mov %rax, (%r11)
- add $8, %r11
- sub $8, arg3
- add $8, arg2
-_less_than_8_left:
-
- cmp $4, arg3
- jl _less_than_4_left
-
- # load 4 Bytes
- mov (arg2), %eax
- mov %eax, (%r11)
- add $4, %r11
- sub $4, arg3
- add $4, arg2
-_less_than_4_left:
-
- cmp $2, arg3
- jl _less_than_2_left
-
- # load 2 Bytes
- mov (arg2), %ax
- mov %ax, (%r11)
- add $2, %r11
- sub $2, arg3
- add $2, arg2
-_less_than_2_left:
- cmp $1, arg3
- jl _zero_left
-
- # load 1 Byte
- mov (arg2), %al
- mov %al, (%r11)
-_zero_left:
- movdqa (%rsp), %xmm7
- pshufb %xmm11, %xmm7
- pxor %xmm0 , %xmm7 # xor the initial crc value
-
- # shl r9, 4
- lea pshufb_shf_table+16(%rip), %rax
- sub %r9, %rax
- movdqu (%rax), %xmm0
- pxor mask1(%rip), %xmm0
-
- pshufb %xmm0, %xmm7
- jmp _128_done
-
-.align 16
-_exact_16_left:
- movdqu (arg2), %xmm7
- pshufb %xmm11, %xmm7
- pxor %xmm0 , %xmm7 # xor the initial crc value
-
- jmp _128_done
-
-_only_less_than_4:
- cmp $3, arg3
- jl _only_less_than_3
-
- # load 3 Bytes
- mov (arg2), %al
- mov %al, (%r11)
-
- mov 1(arg2), %al
- mov %al, 1(%r11)
-
- mov 2(arg2), %al
- mov %al, 2(%r11)
-
- movdqa (%rsp), %xmm7
- pshufb %xmm11, %xmm7
- pxor %xmm0 , %xmm7 # xor the initial crc value
-
- psrldq $5, %xmm7
-
- jmp _barrett
-_only_less_than_3:
- cmp $2, arg3
- jl _only_less_than_2
-
- # load 2 Bytes
- mov (arg2), %al
- mov %al, (%r11)
-
- mov 1(arg2), %al
- mov %al, 1(%r11)
-
- movdqa (%rsp), %xmm7
- pshufb %xmm11, %xmm7
- pxor %xmm0 , %xmm7 # xor the initial crc value
-
- psrldq $6, %xmm7
-
- jmp _barrett
-_only_less_than_2:
-
- # load 1 Byte
- mov (arg2), %al
- mov %al, (%r11)
-
- movdqa (%rsp), %xmm7
- pshufb %xmm11, %xmm7
- pxor %xmm0 , %xmm7 # xor the initial crc value
-
- psrldq $7, %xmm7
-
- jmp _barrett
-
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
+ cmp $16, len
+ je .Lreduce_final_16_bytes # len == 16
+ sub $32, len
+ jge .Lfold_16_bytes_loop # 32 <= len <= 255
+ add $16, len
+ jmp .Lhandle_partial_segment # 17 <= len <= 31
ENDPROC(crc_t10dif_pcl)
.section .rodata, "a", @progbits
.align 16
-# precomputed constants
-# these constants are precomputed from the poly:
-# 0x8bb70000 (0x8bb7 scaled to 32 bits)
-# Q = 0x18BB70000
-# rk1 = 2^(32*3) mod Q << 32
-# rk2 = 2^(32*5) mod Q << 32
-# rk3 = 2^(32*15) mod Q << 32
-# rk4 = 2^(32*17) mod Q << 32
-# rk5 = 2^(32*3) mod Q << 32
-# rk6 = 2^(32*2) mod Q << 32
-# rk7 = floor(2^64/Q)
-# rk8 = Q
-rk1:
-.quad 0x2d56000000000000
-rk2:
-.quad 0x06df000000000000
-rk3:
-.quad 0x9d9d000000000000
-rk4:
-.quad 0x7cf5000000000000
-rk5:
-.quad 0x2d56000000000000
-rk6:
-.quad 0x1368000000000000
-rk7:
-.quad 0x00000001f65a57f8
-rk8:
-.quad 0x000000018bb70000
-rk9:
-.quad 0xceae000000000000
-rk10:
-.quad 0xbfd6000000000000
-rk11:
-.quad 0x1e16000000000000
-rk12:
-.quad 0x713c000000000000
-rk13:
-.quad 0xf7f9000000000000
-rk14:
-.quad 0x80a6000000000000
-rk15:
-.quad 0x044c000000000000
-rk16:
-.quad 0xe658000000000000
-rk17:
-.quad 0xad18000000000000
-rk18:
-.quad 0xa497000000000000
-rk19:
-.quad 0x6ee3000000000000
-rk20:
-.quad 0xe7b5000000000000
-
-
+# Fold constants precomputed from the polynomial 0x18bb7
+# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 # x^(8*128) mod G(x)
+ .quad 0x0000000000002295 # x^(8*128+64) mod G(x)
+.Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 # x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 # x^(4*128+64) mod G(x)
+.Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d # x^(2*128) mod G(x)
+ .quad 0x0000000000007acc # x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 # x^(1*128) mod G(x)
+ .quad 0x0000000000001faa # x^(1*128+64) mod G(x)
+.Lfinal_fold_consts:
+ .quad 0x1368000000000000 # x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x))
+.Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 # G(x)
+ .quad 0x00000001f65a57f8 # floor(x^48 / G(x))
.section .rodata.cst16.mask1, "aM", @progbits, 16
.align 16
-mask1:
-.octa 0x80808080808080808080808080808080
+.Lmask1:
+ .octa 0x80808080808080808080808080808080
.section .rodata.cst16.mask2, "aM", @progbits, 16
.align 16
-mask2:
-.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+.Lmask2:
+ .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
-SHUF_MASK:
-.octa 0x000102030405060708090A0B0C0D0E0F
+.Lbswap_mask:
+ .octa 0x000102030405060708090A0B0C0D0E0F
-.section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
-.align 32
-pshufb_shf_table:
-# use these values for shift constants for the pshufb instruction
-# different alignments result in values as shown:
-# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
-.octa 0x8f8e8d8c8b8a89888786858483828100
-.octa 0x000e0d0c0b0a09080706050403020100
+.section .rodata.cst32.byteshift_table, "aM", @progbits, 32
+.align 16
+# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
+# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
+# 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+ .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+ .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index cd4df93..3c81e15 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -26,25 +26,20 @@
#include <linux/module.h>
#include <linux/crc-t10dif.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/kernel.h>
-#include <asm/fpu/api.h>
#include <asm/cpufeatures.h>
#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
-asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
- size_t len);
+asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
struct chksum_desc_ctx {
__u16 crc;
};
-/*
- * Steps through buffer one byte at at time, calculates reflected
- * crc using table.
- */
-
static int chksum_init(struct shash_desc *desc)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
@@ -59,7 +54,7 @@
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
- if (irq_fpu_usable()) {
+ if (length >= 16 && crypto_simd_usable()) {
kernel_fpu_begin();
ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
kernel_fpu_end();
@@ -76,15 +71,14 @@
return 0;
}
-static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
- u8 *out)
+static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
{
- if (irq_fpu_usable()) {
+ if (len >= 16 && crypto_simd_usable()) {
kernel_fpu_begin();
- *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len);
+ *(__u16 *)out = crc_t10dif_pcl(crc, data, len);
kernel_fpu_end();
} else
- *(__u16 *)out = crc_t10dif_generic(*crcp, data, len);
+ *(__u16 *)out = crc_t10dif_generic(crc, data, len);
return 0;
}
@@ -93,15 +87,13 @@
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
- return __chksum_finup(&ctx->crc, data, len, out);
+ return __chksum_finup(ctx->crc, data, len, out);
}
static int chksum_digest(struct shash_desc *desc, const u8 *data,
unsigned int length, u8 *out)
{
- struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
- return __chksum_finup(&ctx->crc, data, length, out);
+ return __chksum_finup(0, data, length, out);
}
static struct shash_alg alg = {
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index 8e49ce1..7fca430 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -1,17 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* des3_ede-asm_64.S - x86-64 assembly implementation of 3DES cipher
*
* Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index 5c610d4..89830e5 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue Code for assembler optimized version of 3DES
*
@@ -7,17 +8,6 @@
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
* CTR part based on code (crypto/ctr.c) by:
* (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <crypto/algapi.h>
@@ -29,8 +19,8 @@
#include <linux/types.h>
struct des3_ede_x86_ctx {
- u32 enc_expkey[DES3_EDE_EXPKEY_WORDS];
- u32 dec_expkey[DES3_EDE_EXPKEY_WORDS];
+ struct des3_ede_ctx enc;
+ struct des3_ede_ctx dec;
};
/* regular block cipher functions */
@@ -44,7 +34,7 @@
static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
const u8 *src)
{
- u32 *enc_ctx = ctx->enc_expkey;
+ u32 *enc_ctx = ctx->enc.expkey;
des3_ede_x86_64_crypt_blk(enc_ctx, dst, src);
}
@@ -52,7 +42,7 @@
static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
const u8 *src)
{
- u32 *dec_ctx = ctx->dec_expkey;
+ u32 *dec_ctx = ctx->dec.expkey;
des3_ede_x86_64_crypt_blk(dec_ctx, dst, src);
}
@@ -60,7 +50,7 @@
static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
const u8 *src)
{
- u32 *enc_ctx = ctx->enc_expkey;
+ u32 *enc_ctx = ctx->enc.expkey;
des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src);
}
@@ -68,7 +58,7 @@
static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
const u8 *src)
{
- u32 *dec_ctx = ctx->dec_expkey;
+ u32 *dec_ctx = ctx->dec.expkey;
des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src);
}
@@ -132,7 +122,7 @@
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
- return ecb_crypt(req, ctx->enc_expkey);
+ return ecb_crypt(req, ctx->enc.expkey);
}
static int ecb_decrypt(struct skcipher_request *req)
@@ -140,7 +130,7 @@
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
- return ecb_crypt(req, ctx->dec_expkey);
+ return ecb_crypt(req, ctx->dec.expkey);
}
static unsigned int __cbc_encrypt(struct des3_ede_x86_ctx *ctx,
@@ -358,20 +348,28 @@
u32 i, j, tmp;
int err;
- /* Generate encryption context using generic implementation. */
- err = __des3_ede_setkey(ctx->enc_expkey, &tfm->crt_flags, key, keylen);
- if (err < 0)
+ err = des3_ede_expand_key(&ctx->enc, key, keylen);
+ if (err == -ENOKEY) {
+ if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)
+ err = -EINVAL;
+ else
+ err = 0;
+ }
+
+ if (err) {
+ memset(ctx, 0, sizeof(*ctx));
return err;
+ }
/* Fix encryption context for this implementation and form decryption
* context. */
j = DES3_EDE_EXPKEY_WORDS - 2;
for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) {
- tmp = ror32(ctx->enc_expkey[i + 1], 4);
- ctx->enc_expkey[i + 1] = tmp;
+ tmp = ror32(ctx->enc.expkey[i + 1], 4);
+ ctx->enc.expkey[i + 1] = tmp;
- ctx->dec_expkey[j + 0] = ctx->enc_expkey[i + 0];
- ctx->dec_expkey[j + 1] = tmp;
+ ctx->dec.expkey[j + 0] = ctx->enc.expkey[i + 0];
+ ctx->dec.expkey[j + 1] = tmp;
}
return 0;
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
deleted file mode 100644
index 4066804..0000000
--- a/arch/x86/crypto/fpu.c
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * FPU: Wrapper for blkcipher touching fpu
- *
- * Copyright (c) Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <crypto/internal/skcipher.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/fpu/api.h>
-
-struct crypto_fpu_ctx {
- struct crypto_skcipher *child;
-};
-
-static int crypto_fpu_setkey(struct crypto_skcipher *parent, const u8 *key,
- unsigned int keylen)
-{
- struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(parent);
- struct crypto_skcipher *child = ctx->child;
- int err;
-
- crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
- crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) &
- CRYPTO_TFM_REQ_MASK);
- err = crypto_skcipher_setkey(child, key, keylen);
- crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) &
- CRYPTO_TFM_RES_MASK);
- return err;
-}
-
-static int crypto_fpu_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct crypto_skcipher *child = ctx->child;
- SKCIPHER_REQUEST_ON_STACK(subreq, child);
- int err;
-
- skcipher_request_set_tfm(subreq, child);
- skcipher_request_set_callback(subreq, 0, NULL, NULL);
- skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
- req->iv);
-
- kernel_fpu_begin();
- err = crypto_skcipher_encrypt(subreq);
- kernel_fpu_end();
-
- skcipher_request_zero(subreq);
- return err;
-}
-
-static int crypto_fpu_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct crypto_skcipher *child = ctx->child;
- SKCIPHER_REQUEST_ON_STACK(subreq, child);
- int err;
-
- skcipher_request_set_tfm(subreq, child);
- skcipher_request_set_callback(subreq, 0, NULL, NULL);
- skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
- req->iv);
-
- kernel_fpu_begin();
- err = crypto_skcipher_decrypt(subreq);
- kernel_fpu_end();
-
- skcipher_request_zero(subreq);
- return err;
-}
-
-static int crypto_fpu_init_tfm(struct crypto_skcipher *tfm)
-{
- struct skcipher_instance *inst = skcipher_alg_instance(tfm);
- struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct crypto_skcipher_spawn *spawn;
- struct crypto_skcipher *cipher;
-
- spawn = skcipher_instance_ctx(inst);
- cipher = crypto_spawn_skcipher(spawn);
- if (IS_ERR(cipher))
- return PTR_ERR(cipher);
-
- ctx->child = cipher;
-
- return 0;
-}
-
-static void crypto_fpu_exit_tfm(struct crypto_skcipher *tfm)
-{
- struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- crypto_free_skcipher(ctx->child);
-}
-
-static void crypto_fpu_free(struct skcipher_instance *inst)
-{
- crypto_drop_skcipher(skcipher_instance_ctx(inst));
- kfree(inst);
-}
-
-static int crypto_fpu_create(struct crypto_template *tmpl, struct rtattr **tb)
-{
- struct crypto_skcipher_spawn *spawn;
- struct skcipher_instance *inst;
- struct crypto_attr_type *algt;
- struct skcipher_alg *alg;
- const char *cipher_name;
- int err;
-
- algt = crypto_get_attr_type(tb);
- if (IS_ERR(algt))
- return PTR_ERR(algt);
-
- if ((algt->type ^ (CRYPTO_ALG_INTERNAL | CRYPTO_ALG_TYPE_SKCIPHER)) &
- algt->mask)
- return -EINVAL;
-
- if (!(algt->mask & CRYPTO_ALG_INTERNAL))
- return -EINVAL;
-
- cipher_name = crypto_attr_alg_name(tb[1]);
- if (IS_ERR(cipher_name))
- return PTR_ERR(cipher_name);
-
- inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
- if (!inst)
- return -ENOMEM;
-
- spawn = skcipher_instance_ctx(inst);
-
- crypto_set_skcipher_spawn(spawn, skcipher_crypto_instance(inst));
- err = crypto_grab_skcipher(spawn, cipher_name, CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL | CRYPTO_ALG_ASYNC);
- if (err)
- goto out_free_inst;
-
- alg = crypto_skcipher_spawn_alg(spawn);
-
- err = crypto_inst_setname(skcipher_crypto_instance(inst), "fpu",
- &alg->base);
- if (err)
- goto out_drop_skcipher;
-
- inst->alg.base.cra_flags = CRYPTO_ALG_INTERNAL;
- inst->alg.base.cra_priority = alg->base.cra_priority;
- inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
- inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
-
- inst->alg.ivsize = crypto_skcipher_alg_ivsize(alg);
- inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg);
- inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(alg);
-
- inst->alg.base.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
-
- inst->alg.init = crypto_fpu_init_tfm;
- inst->alg.exit = crypto_fpu_exit_tfm;
-
- inst->alg.setkey = crypto_fpu_setkey;
- inst->alg.encrypt = crypto_fpu_encrypt;
- inst->alg.decrypt = crypto_fpu_decrypt;
-
- inst->free = crypto_fpu_free;
-
- err = skcipher_register_instance(tmpl, inst);
- if (err)
- goto out_drop_skcipher;
-
-out:
- return err;
-
-out_drop_skcipher:
- crypto_drop_skcipher(spawn);
-out_free_inst:
- kfree(inst);
- goto out;
-}
-
-static struct crypto_template crypto_fpu_tmpl = {
- .name = "fpu",
- .create = crypto_fpu_create,
- .module = THIS_MODULE,
-};
-
-int __init crypto_fpu_init(void)
-{
- return crypto_register_template(&crypto_fpu_tmpl);
-}
-
-void crypto_fpu_exit(void)
-{
- crypto_unregister_template(&crypto_fpu_tmpl);
-}
-
-MODULE_ALIAS_CRYPTO("fpu");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index f94375a..5d53eff 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Accelerated GHASH implementation with Intel PCLMULQDQ-NI
* instructions. This file contains accelerated part of ghash
@@ -10,10 +11,6 @@
* Vinodh Gopal
* Erdinc Ozturk
* Deniz Karakoyunlu
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 3582ae8..04d72a5 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Accelerated GHASH implementation with Intel PCLMULQDQ-NI
* instructions. This file contains glue code.
*
* Copyright (c) 2009 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
*/
#include <linux/err.h>
@@ -19,8 +16,9 @@
#include <crypto/cryptd.h>
#include <crypto/gf128mul.h>
#include <crypto/internal/hash.h>
-#include <asm/fpu/api.h>
+#include <crypto/internal/simd.h>
#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
#define GHASH_BLOCK_SIZE 16
#define GHASH_DIGEST_SIZE 16
@@ -171,7 +169,6 @@
struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
desc->tfm = child;
- desc->flags = req->base.flags;
return crypto_shash_init(desc);
}
@@ -182,7 +179,7 @@
struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
- if (!irq_fpu_usable() ||
+ if (!crypto_simd_usable() ||
(in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
memcpy(cryptd_req, req, sizeof(*req));
ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
@@ -200,7 +197,7 @@
struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
- if (!irq_fpu_usable() ||
+ if (!crypto_simd_usable() ||
(in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
memcpy(cryptd_req, req, sizeof(*req));
ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
@@ -241,7 +238,7 @@
struct ahash_request *cryptd_req = ahash_request_ctx(req);
struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
- if (!irq_fpu_usable() ||
+ if (!crypto_simd_usable() ||
(in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
memcpy(cryptd_req, req, sizeof(*req));
ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
@@ -251,7 +248,6 @@
struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
desc->tfm = child;
- desc->flags = req->base.flags;
return shash_ahash_digest(req, desc);
}
}
@@ -361,6 +357,5 @@
module_exit(ghash_pclmulqdqni_mod_exit);
MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
- "accelerated by PCLMULQDQ-NI");
+MODULE_DESCRIPTION("GHASH hash function, accelerated by PCLMULQDQ-NI");
MODULE_ALIAS_CRYPTO("ghash");
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
index 02ee230..d08fc57 100644
--- a/arch/x86/crypto/glue_helper-asm-avx.S
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -1,18 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Shared glue code for 128bit block ciphers, AVX assembler macros
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S
index a53ac11..d84508c 100644
--- a/arch/x86/crypto/glue_helper-asm-avx2.S
+++ b/arch/x86/crypto/glue_helper-asm-avx2.S
@@ -1,13 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Shared glue code for 128bit block ciphers, AVX2 assembler macros
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
*/
#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index a78ef99..d15b993 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Shared glue code for 128bit block ciphers
*
@@ -7,28 +8,13 @@
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
* CTR part based on code (crypto/ctr.c) by:
* (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/module.h>
#include <crypto/b128ops.h>
#include <crypto/gf128mul.h>
#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
#include <crypto/xts.h>
#include <asm/crypto/glue_helper.h>
@@ -274,17 +260,36 @@
int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
struct skcipher_request *req,
common_glue_func_t tweak_fn, void *tweak_ctx,
- void *crypt_ctx)
+ void *crypt_ctx, bool decrypt)
{
+ const bool cts = (req->cryptlen % XTS_BLOCK_SIZE);
const unsigned int bsize = 128 / 8;
+ struct skcipher_request subreq;
struct skcipher_walk walk;
bool fpu_enabled = false;
- unsigned int nbytes;
+ unsigned int nbytes, tail;
int err;
+ if (req->cryptlen < XTS_BLOCK_SIZE)
+ return -EINVAL;
+
+ if (unlikely(cts)) {
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+
+ tail = req->cryptlen % XTS_BLOCK_SIZE + XTS_BLOCK_SIZE;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ crypto_skcipher_get_flags(tfm),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ req->cryptlen - tail, req->iv);
+ req = &subreq;
+ }
+
err = skcipher_walk_virt(&walk, req, false);
nbytes = walk.nbytes;
- if (!nbytes)
+ if (err)
return err;
/* set minimum length to bsize, for tweak_fn */
@@ -302,6 +307,47 @@
nbytes = walk.nbytes;
}
+ if (unlikely(cts)) {
+ u8 *next_tweak, *final_tweak = req->iv;
+ struct scatterlist *src, *dst;
+ struct scatterlist s[2], d[2];
+ le128 b[2];
+
+ dst = src = scatterwalk_ffwd(s, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(d, req->dst, req->cryptlen);
+
+ if (decrypt) {
+ next_tweak = memcpy(b, req->iv, XTS_BLOCK_SIZE);
+ gf128mul_x_ble(b, b);
+ } else {
+ next_tweak = req->iv;
+ }
+
+ skcipher_request_set_crypt(&subreq, src, dst, XTS_BLOCK_SIZE,
+ next_tweak);
+
+ err = skcipher_walk_virt(&walk, req, false) ?:
+ skcipher_walk_done(&walk,
+ __glue_xts_req_128bit(gctx, crypt_ctx, &walk));
+ if (err)
+ goto out;
+
+ scatterwalk_map_and_copy(b, dst, 0, XTS_BLOCK_SIZE, 0);
+ memcpy(b + 1, b, tail - XTS_BLOCK_SIZE);
+ scatterwalk_map_and_copy(b, src, XTS_BLOCK_SIZE,
+ tail - XTS_BLOCK_SIZE, 0);
+ scatterwalk_map_and_copy(b, dst, 0, tail, 1);
+
+ skcipher_request_set_crypt(&subreq, dst, dst, XTS_BLOCK_SIZE,
+ final_tweak);
+
+ err = skcipher_walk_virt(&walk, req, false) ?:
+ skcipher_walk_done(&walk,
+ __glue_xts_req_128bit(gctx, crypt_ctx, &walk));
+ }
+
+out:
glue_fpu_end(fpu_enabled);
return err;
diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S
deleted file mode 100644
index de182c4..0000000
--- a/arch/x86/crypto/morus1280-avx2-asm.S
+++ /dev/null
@@ -1,622 +0,0 @@
-/*
- * AVX2 implementation of MORUS-1280
- *
- * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define SHUFFLE_MASK(i0, i1, i2, i3) \
- (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
-
-#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
-#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
-#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
-
-#define STATE0 %ymm0
-#define STATE0_LOW %xmm0
-#define STATE1 %ymm1
-#define STATE2 %ymm2
-#define STATE3 %ymm3
-#define STATE4 %ymm4
-#define KEY %ymm5
-#define MSG %ymm5
-#define MSG_LOW %xmm5
-#define T0 %ymm6
-#define T0_LOW %xmm6
-#define T1 %ymm7
-
-.section .rodata.cst32.morus1280_const, "aM", @progbits, 32
-.align 32
-.Lmorus1280_const:
- .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
- .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
- .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
- .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
-
-.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32
-.align 32
-.Lmorus1280_counter:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
- .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
- .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
-
-.text
-
-.macro morus1280_round s0, s1, s2, s3, s4, b, w
- vpand \s1, \s2, T0
- vpxor T0, \s0, \s0
- vpxor \s3, \s0, \s0
- vpsllq $\b, \s0, T0
- vpsrlq $(64 - \b), \s0, \s0
- vpxor T0, \s0, \s0
- vpermq $\w, \s3, \s3
-.endm
-
-/*
- * __morus1280_update: internal ABI
- * input:
- * STATE[0-4] - input state
- * MSG - message block
- * output:
- * STATE[0-4] - output state
- * changed:
- * T0
- */
-__morus1280_update:
- morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
- vpxor MSG, STATE1, STATE1
- morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
- vpxor MSG, STATE2, STATE2
- morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
- vpxor MSG, STATE3, STATE3
- morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2
- vpxor MSG, STATE4, STATE4
- morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1
- ret
-ENDPROC(__morus1280_update)
-
-/*
- * __morus1280_update_zero: internal ABI
- * input:
- * STATE[0-4] - input state
- * output:
- * STATE[0-4] - output state
- * changed:
- * T0
- */
-__morus1280_update_zero:
- morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
- morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
- morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
- morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2
- morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1
- ret
-ENDPROC(__morus1280_update_zero)
-
-/*
- * __load_partial: internal ABI
- * input:
- * %rsi - src
- * %rcx - bytes
- * output:
- * MSG - message block
- * changed:
- * %r8
- * %r9
- */
-__load_partial:
- xor %r9d, %r9d
- vpxor MSG, MSG, MSG
-
- mov %rcx, %r8
- and $0x1, %r8
- jz .Lld_partial_1
-
- mov %rcx, %r8
- and $0x1E, %r8
- add %rsi, %r8
- mov (%r8), %r9b
-
-.Lld_partial_1:
- mov %rcx, %r8
- and $0x2, %r8
- jz .Lld_partial_2
-
- mov %rcx, %r8
- and $0x1C, %r8
- add %rsi, %r8
- shl $16, %r9
- mov (%r8), %r9w
-
-.Lld_partial_2:
- mov %rcx, %r8
- and $0x4, %r8
- jz .Lld_partial_4
-
- mov %rcx, %r8
- and $0x18, %r8
- add %rsi, %r8
- shl $32, %r9
- mov (%r8), %r8d
- xor %r8, %r9
-
-.Lld_partial_4:
- movq %r9, MSG_LOW
-
- mov %rcx, %r8
- and $0x8, %r8
- jz .Lld_partial_8
-
- mov %rcx, %r8
- and $0x10, %r8
- add %rsi, %r8
- pshufd $MASK2, MSG_LOW, MSG_LOW
- pinsrq $0, (%r8), MSG_LOW
-
-.Lld_partial_8:
- mov %rcx, %r8
- and $0x10, %r8
- jz .Lld_partial_16
-
- vpermq $MASK2, MSG, MSG
- movdqu (%rsi), MSG_LOW
-
-.Lld_partial_16:
- ret
-ENDPROC(__load_partial)
-
-/*
- * __store_partial: internal ABI
- * input:
- * %rdx - dst
- * %rcx - bytes
- * output:
- * T0 - message block
- * changed:
- * %r8
- * %r9
- * %r10
- */
-__store_partial:
- mov %rcx, %r8
- mov %rdx, %r9
-
- cmp $16, %r8
- jl .Lst_partial_16
-
- movdqu T0_LOW, (%r9)
- vpermq $MASK2, T0, T0
-
- sub $16, %r8
- add $16, %r9
-
-.Lst_partial_16:
- movq T0_LOW, %r10
-
- cmp $8, %r8
- jl .Lst_partial_8
-
- mov %r10, (%r9)
- pextrq $1, T0_LOW, %r10
-
- sub $8, %r8
- add $8, %r9
-
-.Lst_partial_8:
- cmp $4, %r8
- jl .Lst_partial_4
-
- mov %r10d, (%r9)
- shr $32, %r10
-
- sub $4, %r8
- add $4, %r9
-
-.Lst_partial_4:
- cmp $2, %r8
- jl .Lst_partial_2
-
- mov %r10w, (%r9)
- shr $16, %r10
-
- sub $2, %r8
- add $2, %r9
-
-.Lst_partial_2:
- cmp $1, %r8
- jl .Lst_partial_1
-
- mov %r10b, (%r9)
-
-.Lst_partial_1:
- ret
-ENDPROC(__store_partial)
-
-/*
- * void crypto_morus1280_avx2_init(void *state, const void *key,
- * const void *iv);
- */
-ENTRY(crypto_morus1280_avx2_init)
- FRAME_BEGIN
-
- /* load IV: */
- vpxor STATE0, STATE0, STATE0
- movdqu (%rdx), STATE0_LOW
- /* load key: */
- vmovdqu (%rsi), KEY
- vmovdqa KEY, STATE1
- /* load all ones: */
- vpcmpeqd STATE2, STATE2, STATE2
- /* load all zeros: */
- vpxor STATE3, STATE3, STATE3
- /* load the constant: */
- vmovdqa .Lmorus1280_const, STATE4
-
- /* update 16 times with zero: */
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
-
- /* xor-in the key again after updates: */
- vpxor KEY, STATE1, STATE1
-
- /* store the state: */
- vmovdqu STATE0, (0 * 32)(%rdi)
- vmovdqu STATE1, (1 * 32)(%rdi)
- vmovdqu STATE2, (2 * 32)(%rdi)
- vmovdqu STATE3, (3 * 32)(%rdi)
- vmovdqu STATE4, (4 * 32)(%rdi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_avx2_init)
-
-/*
- * void crypto_morus1280_avx2_ad(void *state, const void *data,
- * unsigned int length);
- */
-ENTRY(crypto_morus1280_avx2_ad)
- FRAME_BEGIN
-
- cmp $32, %rdx
- jb .Lad_out
-
- /* load the state: */
- vmovdqu (0 * 32)(%rdi), STATE0
- vmovdqu (1 * 32)(%rdi), STATE1
- vmovdqu (2 * 32)(%rdi), STATE2
- vmovdqu (3 * 32)(%rdi), STATE3
- vmovdqu (4 * 32)(%rdi), STATE4
-
- mov %rsi, %r8
- and $0x1F, %r8
- jnz .Lad_u_loop
-
-.align 4
-.Lad_a_loop:
- vmovdqa (%rsi), MSG
- call __morus1280_update
- sub $32, %rdx
- add $32, %rsi
- cmp $32, %rdx
- jge .Lad_a_loop
-
- jmp .Lad_cont
-.align 4
-.Lad_u_loop:
- vmovdqu (%rsi), MSG
- call __morus1280_update
- sub $32, %rdx
- add $32, %rsi
- cmp $32, %rdx
- jge .Lad_u_loop
-
-.Lad_cont:
- /* store the state: */
- vmovdqu STATE0, (0 * 32)(%rdi)
- vmovdqu STATE1, (1 * 32)(%rdi)
- vmovdqu STATE2, (2 * 32)(%rdi)
- vmovdqu STATE3, (3 * 32)(%rdi)
- vmovdqu STATE4, (4 * 32)(%rdi)
-
-.Lad_out:
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_avx2_ad)
-
-/*
- * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus1280_avx2_enc)
- FRAME_BEGIN
-
- cmp $32, %rcx
- jb .Lenc_out
-
- /* load the state: */
- vmovdqu (0 * 32)(%rdi), STATE0
- vmovdqu (1 * 32)(%rdi), STATE1
- vmovdqu (2 * 32)(%rdi), STATE2
- vmovdqu (3 * 32)(%rdi), STATE3
- vmovdqu (4 * 32)(%rdi), STATE4
-
- mov %rsi, %r8
- or %rdx, %r8
- and $0x1F, %r8
- jnz .Lenc_u_loop
-
-.align 4
-.Lenc_a_loop:
- vmovdqa (%rsi), MSG
- vmovdqa MSG, T0
- vpxor STATE0, T0, T0
- vpermq $MASK3, STATE1, T1
- vpxor T1, T0, T0
- vpand STATE2, STATE3, T1
- vpxor T1, T0, T0
- vmovdqa T0, (%rdx)
-
- call __morus1280_update
- sub $32, %rcx
- add $32, %rsi
- add $32, %rdx
- cmp $32, %rcx
- jge .Lenc_a_loop
-
- jmp .Lenc_cont
-.align 4
-.Lenc_u_loop:
- vmovdqu (%rsi), MSG
- vmovdqa MSG, T0
- vpxor STATE0, T0, T0
- vpermq $MASK3, STATE1, T1
- vpxor T1, T0, T0
- vpand STATE2, STATE3, T1
- vpxor T1, T0, T0
- vmovdqu T0, (%rdx)
-
- call __morus1280_update
- sub $32, %rcx
- add $32, %rsi
- add $32, %rdx
- cmp $32, %rcx
- jge .Lenc_u_loop
-
-.Lenc_cont:
- /* store the state: */
- vmovdqu STATE0, (0 * 32)(%rdi)
- vmovdqu STATE1, (1 * 32)(%rdi)
- vmovdqu STATE2, (2 * 32)(%rdi)
- vmovdqu STATE3, (3 * 32)(%rdi)
- vmovdqu STATE4, (4 * 32)(%rdi)
-
-.Lenc_out:
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_avx2_enc)
-
-/*
- * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus1280_avx2_enc_tail)
- FRAME_BEGIN
-
- /* load the state: */
- vmovdqu (0 * 32)(%rdi), STATE0
- vmovdqu (1 * 32)(%rdi), STATE1
- vmovdqu (2 * 32)(%rdi), STATE2
- vmovdqu (3 * 32)(%rdi), STATE3
- vmovdqu (4 * 32)(%rdi), STATE4
-
- /* encrypt message: */
- call __load_partial
-
- vmovdqa MSG, T0
- vpxor STATE0, T0, T0
- vpermq $MASK3, STATE1, T1
- vpxor T1, T0, T0
- vpand STATE2, STATE3, T1
- vpxor T1, T0, T0
-
- call __store_partial
-
- call __morus1280_update
-
- /* store the state: */
- vmovdqu STATE0, (0 * 32)(%rdi)
- vmovdqu STATE1, (1 * 32)(%rdi)
- vmovdqu STATE2, (2 * 32)(%rdi)
- vmovdqu STATE3, (3 * 32)(%rdi)
- vmovdqu STATE4, (4 * 32)(%rdi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_avx2_enc_tail)
-
-/*
- * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus1280_avx2_dec)
- FRAME_BEGIN
-
- cmp $32, %rcx
- jb .Ldec_out
-
- /* load the state: */
- vmovdqu (0 * 32)(%rdi), STATE0
- vmovdqu (1 * 32)(%rdi), STATE1
- vmovdqu (2 * 32)(%rdi), STATE2
- vmovdqu (3 * 32)(%rdi), STATE3
- vmovdqu (4 * 32)(%rdi), STATE4
-
- mov %rsi, %r8
- or %rdx, %r8
- and $0x1F, %r8
- jnz .Ldec_u_loop
-
-.align 4
-.Ldec_a_loop:
- vmovdqa (%rsi), MSG
- vpxor STATE0, MSG, MSG
- vpermq $MASK3, STATE1, T0
- vpxor T0, MSG, MSG
- vpand STATE2, STATE3, T0
- vpxor T0, MSG, MSG
- vmovdqa MSG, (%rdx)
-
- call __morus1280_update
- sub $32, %rcx
- add $32, %rsi
- add $32, %rdx
- cmp $32, %rcx
- jge .Ldec_a_loop
-
- jmp .Ldec_cont
-.align 4
-.Ldec_u_loop:
- vmovdqu (%rsi), MSG
- vpxor STATE0, MSG, MSG
- vpermq $MASK3, STATE1, T0
- vpxor T0, MSG, MSG
- vpand STATE2, STATE3, T0
- vpxor T0, MSG, MSG
- vmovdqu MSG, (%rdx)
-
- call __morus1280_update
- sub $32, %rcx
- add $32, %rsi
- add $32, %rdx
- cmp $32, %rcx
- jge .Ldec_u_loop
-
-.Ldec_cont:
- /* store the state: */
- vmovdqu STATE0, (0 * 32)(%rdi)
- vmovdqu STATE1, (1 * 32)(%rdi)
- vmovdqu STATE2, (2 * 32)(%rdi)
- vmovdqu STATE3, (3 * 32)(%rdi)
- vmovdqu STATE4, (4 * 32)(%rdi)
-
-.Ldec_out:
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_avx2_dec)
-
-/*
- * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus1280_avx2_dec_tail)
- FRAME_BEGIN
-
- /* load the state: */
- vmovdqu (0 * 32)(%rdi), STATE0
- vmovdqu (1 * 32)(%rdi), STATE1
- vmovdqu (2 * 32)(%rdi), STATE2
- vmovdqu (3 * 32)(%rdi), STATE3
- vmovdqu (4 * 32)(%rdi), STATE4
-
- /* decrypt message: */
- call __load_partial
-
- vpxor STATE0, MSG, MSG
- vpermq $MASK3, STATE1, T0
- vpxor T0, MSG, MSG
- vpand STATE2, STATE3, T0
- vpxor T0, MSG, MSG
- vmovdqa MSG, T0
-
- call __store_partial
-
- /* mask with byte count: */
- movq %rcx, T0_LOW
- vpbroadcastb T0_LOW, T0
- vmovdqa .Lmorus1280_counter, T1
- vpcmpgtb T1, T0, T0
- vpand T0, MSG, MSG
-
- call __morus1280_update
-
- /* store the state: */
- vmovdqu STATE0, (0 * 32)(%rdi)
- vmovdqu STATE1, (1 * 32)(%rdi)
- vmovdqu STATE2, (2 * 32)(%rdi)
- vmovdqu STATE3, (3 * 32)(%rdi)
- vmovdqu STATE4, (4 * 32)(%rdi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_avx2_dec_tail)
-
-/*
- * void crypto_morus1280_avx2_final(void *state, void *tag_xor,
- * u64 assoclen, u64 cryptlen);
- */
-ENTRY(crypto_morus1280_avx2_final)
- FRAME_BEGIN
-
- /* load the state: */
- vmovdqu (0 * 32)(%rdi), STATE0
- vmovdqu (1 * 32)(%rdi), STATE1
- vmovdqu (2 * 32)(%rdi), STATE2
- vmovdqu (3 * 32)(%rdi), STATE3
- vmovdqu (4 * 32)(%rdi), STATE4
-
- /* xor state[0] into state[4]: */
- vpxor STATE0, STATE4, STATE4
-
- /* prepare length block: */
- vpxor MSG, MSG, MSG
- vpinsrq $0, %rdx, MSG_LOW, MSG_LOW
- vpinsrq $1, %rcx, MSG_LOW, MSG_LOW
- vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */
-
- /* update state: */
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
-
- /* xor tag: */
- vmovdqu (%rsi), MSG
-
- vpxor STATE0, MSG, MSG
- vpermq $MASK3, STATE1, T0
- vpxor T0, MSG, MSG
- vpand STATE2, STATE3, T0
- vpxor T0, MSG, MSG
- vmovdqu MSG, (%rsi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_avx2_final)
diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c
deleted file mode 100644
index 6634907..0000000
--- a/arch/x86/crypto/morus1280-avx2-glue.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * The MORUS-1280 Authenticated-Encryption Algorithm
- * Glue for AVX2 implementation
- *
- * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-
-#include <crypto/internal/aead.h>
-#include <crypto/morus1280_glue.h>
-#include <linux/module.h>
-#include <asm/fpu/api.h>
-#include <asm/cpu_device_id.h>
-
-asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key,
- const void *iv);
-asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data,
- unsigned int length);
-
-asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src,
- void *dst, unsigned int length);
-asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src,
- void *dst, unsigned int length);
-
-asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src,
- void *dst, unsigned int length);
-asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src,
- void *dst, unsigned int length);
-
-asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor,
- u64 assoclen, u64 cryptlen);
-
-MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400);
-
-static int __init crypto_morus1280_avx2_module_init(void)
-{
- if (!boot_cpu_has(X86_FEATURE_AVX2) ||
- !boot_cpu_has(X86_FEATURE_OSXSAVE) ||
- !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
- return -ENODEV;
-
- return crypto_register_aeads(crypto_morus1280_avx2_algs,
- ARRAY_SIZE(crypto_morus1280_avx2_algs));
-}
-
-static void __exit crypto_morus1280_avx2_module_exit(void)
-{
- crypto_unregister_aeads(crypto_morus1280_avx2_algs,
- ARRAY_SIZE(crypto_morus1280_avx2_algs));
-}
-
-module_init(crypto_morus1280_avx2_module_init);
-module_exit(crypto_morus1280_avx2_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
-MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation");
-MODULE_ALIAS_CRYPTO("morus1280");
-MODULE_ALIAS_CRYPTO("morus1280-avx2");
diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S
deleted file mode 100644
index da5d290..0000000
--- a/arch/x86/crypto/morus1280-sse2-asm.S
+++ /dev/null
@@ -1,896 +0,0 @@
-/*
- * SSE2 implementation of MORUS-1280
- *
- * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define SHUFFLE_MASK(i0, i1, i2, i3) \
- (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
-
-#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
-
-#define STATE0_LO %xmm0
-#define STATE0_HI %xmm1
-#define STATE1_LO %xmm2
-#define STATE1_HI %xmm3
-#define STATE2_LO %xmm4
-#define STATE2_HI %xmm5
-#define STATE3_LO %xmm6
-#define STATE3_HI %xmm7
-#define STATE4_LO %xmm8
-#define STATE4_HI %xmm9
-#define KEY_LO %xmm10
-#define KEY_HI %xmm11
-#define MSG_LO %xmm10
-#define MSG_HI %xmm11
-#define T0_LO %xmm12
-#define T0_HI %xmm13
-#define T1_LO %xmm14
-#define T1_HI %xmm15
-
-.section .rodata.cst16.morus640_const, "aM", @progbits, 16
-.align 16
-.Lmorus640_const_0:
- .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
- .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
-.Lmorus640_const_1:
- .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
- .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
-
-.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
-.align 16
-.Lmorus640_counter_0:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-.Lmorus640_counter_1:
- .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
- .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
-
-.text
-
-.macro rol1 hi, lo
- /*
- * HI_1 | HI_0 || LO_1 | LO_0
- * ==>
- * HI_0 | HI_1 || LO_1 | LO_0
- * ==>
- * HI_0 | LO_1 || LO_0 | HI_1
- */
- pshufd $MASK2, \hi, \hi
- movdqa \hi, T0_LO
- punpcklqdq \lo, T0_LO
- punpckhqdq \hi, \lo
- movdqa \lo, \hi
- movdqa T0_LO, \lo
-.endm
-
-.macro rol2 hi, lo
- movdqa \lo, T0_LO
- movdqa \hi, \lo
- movdqa T0_LO, \hi
-.endm
-
-.macro rol3 hi, lo
- /*
- * HI_1 | HI_0 || LO_1 | LO_0
- * ==>
- * HI_0 | HI_1 || LO_1 | LO_0
- * ==>
- * LO_0 | HI_1 || HI_0 | LO_1
- */
- pshufd $MASK2, \hi, \hi
- movdqa \lo, T0_LO
- punpckhqdq \hi, T0_LO
- punpcklqdq \lo, \hi
- movdqa T0_LO, \lo
-.endm
-
-.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
- movdqa \s1_l, T0_LO
- pand \s2_l, T0_LO
- pxor T0_LO, \s0_l
-
- movdqa \s1_h, T0_LO
- pand \s2_h, T0_LO
- pxor T0_LO, \s0_h
-
- pxor \s3_l, \s0_l
- pxor \s3_h, \s0_h
-
- movdqa \s0_l, T0_LO
- psllq $\b, T0_LO
- psrlq $(64 - \b), \s0_l
- pxor T0_LO, \s0_l
-
- movdqa \s0_h, T0_LO
- psllq $\b, T0_LO
- psrlq $(64 - \b), \s0_h
- pxor T0_LO, \s0_h
-
- \w \s3_h, \s3_l
-.endm
-
-/*
- * __morus1280_update: internal ABI
- * input:
- * STATE[0-4] - input state
- * MSG - message block
- * output:
- * STATE[0-4] - output state
- * changed:
- * T0
- */
-__morus1280_update:
- morus1280_round \
- STATE0_LO, STATE0_HI, \
- STATE1_LO, STATE1_HI, \
- STATE2_LO, STATE2_HI, \
- STATE3_LO, STATE3_HI, \
- STATE4_LO, STATE4_HI, \
- 13, rol1
- pxor MSG_LO, STATE1_LO
- pxor MSG_HI, STATE1_HI
- morus1280_round \
- STATE1_LO, STATE1_HI, \
- STATE2_LO, STATE2_HI, \
- STATE3_LO, STATE3_HI, \
- STATE4_LO, STATE4_HI, \
- STATE0_LO, STATE0_HI, \
- 46, rol2
- pxor MSG_LO, STATE2_LO
- pxor MSG_HI, STATE2_HI
- morus1280_round \
- STATE2_LO, STATE2_HI, \
- STATE3_LO, STATE3_HI, \
- STATE4_LO, STATE4_HI, \
- STATE0_LO, STATE0_HI, \
- STATE1_LO, STATE1_HI, \
- 38, rol3
- pxor MSG_LO, STATE3_LO
- pxor MSG_HI, STATE3_HI
- morus1280_round \
- STATE3_LO, STATE3_HI, \
- STATE4_LO, STATE4_HI, \
- STATE0_LO, STATE0_HI, \
- STATE1_LO, STATE1_HI, \
- STATE2_LO, STATE2_HI, \
- 7, rol2
- pxor MSG_LO, STATE4_LO
- pxor MSG_HI, STATE4_HI
- morus1280_round \
- STATE4_LO, STATE4_HI, \
- STATE0_LO, STATE0_HI, \
- STATE1_LO, STATE1_HI, \
- STATE2_LO, STATE2_HI, \
- STATE3_LO, STATE3_HI, \
- 4, rol1
- ret
-ENDPROC(__morus1280_update)
-
-/*
- * __morus1280_update_zero: internal ABI
- * input:
- * STATE[0-4] - input state
- * output:
- * STATE[0-4] - output state
- * changed:
- * T0
- */
-__morus1280_update_zero:
- morus1280_round \
- STATE0_LO, STATE0_HI, \
- STATE1_LO, STATE1_HI, \
- STATE2_LO, STATE2_HI, \
- STATE3_LO, STATE3_HI, \
- STATE4_LO, STATE4_HI, \
- 13, rol1
- morus1280_round \
- STATE1_LO, STATE1_HI, \
- STATE2_LO, STATE2_HI, \
- STATE3_LO, STATE3_HI, \
- STATE4_LO, STATE4_HI, \
- STATE0_LO, STATE0_HI, \
- 46, rol2
- morus1280_round \
- STATE2_LO, STATE2_HI, \
- STATE3_LO, STATE3_HI, \
- STATE4_LO, STATE4_HI, \
- STATE0_LO, STATE0_HI, \
- STATE1_LO, STATE1_HI, \
- 38, rol3
- morus1280_round \
- STATE3_LO, STATE3_HI, \
- STATE4_LO, STATE4_HI, \
- STATE0_LO, STATE0_HI, \
- STATE1_LO, STATE1_HI, \
- STATE2_LO, STATE2_HI, \
- 7, rol2
- morus1280_round \
- STATE4_LO, STATE4_HI, \
- STATE0_LO, STATE0_HI, \
- STATE1_LO, STATE1_HI, \
- STATE2_LO, STATE2_HI, \
- STATE3_LO, STATE3_HI, \
- 4, rol1
- ret
-ENDPROC(__morus1280_update_zero)
-
-/*
- * __load_partial: internal ABI
- * input:
- * %rsi - src
- * %rcx - bytes
- * output:
- * MSG - message block
- * changed:
- * %r8
- * %r9
- */
-__load_partial:
- xor %r9d, %r9d
- pxor MSG_LO, MSG_LO
- pxor MSG_HI, MSG_HI
-
- mov %rcx, %r8
- and $0x1, %r8
- jz .Lld_partial_1
-
- mov %rcx, %r8
- and $0x1E, %r8
- add %rsi, %r8
- mov (%r8), %r9b
-
-.Lld_partial_1:
- mov %rcx, %r8
- and $0x2, %r8
- jz .Lld_partial_2
-
- mov %rcx, %r8
- and $0x1C, %r8
- add %rsi, %r8
- shl $16, %r9
- mov (%r8), %r9w
-
-.Lld_partial_2:
- mov %rcx, %r8
- and $0x4, %r8
- jz .Lld_partial_4
-
- mov %rcx, %r8
- and $0x18, %r8
- add %rsi, %r8
- shl $32, %r9
- mov (%r8), %r8d
- xor %r8, %r9
-
-.Lld_partial_4:
- movq %r9, MSG_LO
-
- mov %rcx, %r8
- and $0x8, %r8
- jz .Lld_partial_8
-
- mov %rcx, %r8
- and $0x10, %r8
- add %rsi, %r8
- pslldq $8, MSG_LO
- movq (%r8), T0_LO
- pxor T0_LO, MSG_LO
-
-.Lld_partial_8:
- mov %rcx, %r8
- and $0x10, %r8
- jz .Lld_partial_16
-
- movdqa MSG_LO, MSG_HI
- movdqu (%rsi), MSG_LO
-
-.Lld_partial_16:
- ret
-ENDPROC(__load_partial)
-
-/*
- * __store_partial: internal ABI
- * input:
- * %rdx - dst
- * %rcx - bytes
- * output:
- * T0 - message block
- * changed:
- * %r8
- * %r9
- * %r10
- */
-__store_partial:
- mov %rcx, %r8
- mov %rdx, %r9
-
- cmp $16, %r8
- jl .Lst_partial_16
-
- movdqu T0_LO, (%r9)
- movdqa T0_HI, T0_LO
-
- sub $16, %r8
- add $16, %r9
-
-.Lst_partial_16:
- movq T0_LO, %r10
-
- cmp $8, %r8
- jl .Lst_partial_8
-
- mov %r10, (%r9)
- psrldq $8, T0_LO
- movq T0_LO, %r10
-
- sub $8, %r8
- add $8, %r9
-
-.Lst_partial_8:
- cmp $4, %r8
- jl .Lst_partial_4
-
- mov %r10d, (%r9)
- shr $32, %r10
-
- sub $4, %r8
- add $4, %r9
-
-.Lst_partial_4:
- cmp $2, %r8
- jl .Lst_partial_2
-
- mov %r10w, (%r9)
- shr $16, %r10
-
- sub $2, %r8
- add $2, %r9
-
-.Lst_partial_2:
- cmp $1, %r8
- jl .Lst_partial_1
-
- mov %r10b, (%r9)
-
-.Lst_partial_1:
- ret
-ENDPROC(__store_partial)
-
-/*
- * void crypto_morus1280_sse2_init(void *state, const void *key,
- * const void *iv);
- */
-ENTRY(crypto_morus1280_sse2_init)
- FRAME_BEGIN
-
- /* load IV: */
- pxor STATE0_HI, STATE0_HI
- movdqu (%rdx), STATE0_LO
- /* load key: */
- movdqu 0(%rsi), KEY_LO
- movdqu 16(%rsi), KEY_HI
- movdqa KEY_LO, STATE1_LO
- movdqa KEY_HI, STATE1_HI
- /* load all ones: */
- pcmpeqd STATE2_LO, STATE2_LO
- pcmpeqd STATE2_HI, STATE2_HI
- /* load all zeros: */
- pxor STATE3_LO, STATE3_LO
- pxor STATE3_HI, STATE3_HI
- /* load the constant: */
- movdqa .Lmorus640_const_0, STATE4_LO
- movdqa .Lmorus640_const_1, STATE4_HI
-
- /* update 16 times with zero: */
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
- call __morus1280_update_zero
-
- /* xor-in the key again after updates: */
- pxor KEY_LO, STATE1_LO
- pxor KEY_HI, STATE1_HI
-
- /* store the state: */
- movdqu STATE0_LO, (0 * 16)(%rdi)
- movdqu STATE0_HI, (1 * 16)(%rdi)
- movdqu STATE1_LO, (2 * 16)(%rdi)
- movdqu STATE1_HI, (3 * 16)(%rdi)
- movdqu STATE2_LO, (4 * 16)(%rdi)
- movdqu STATE2_HI, (5 * 16)(%rdi)
- movdqu STATE3_LO, (6 * 16)(%rdi)
- movdqu STATE3_HI, (7 * 16)(%rdi)
- movdqu STATE4_LO, (8 * 16)(%rdi)
- movdqu STATE4_HI, (9 * 16)(%rdi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_sse2_init)
-
-/*
- * void crypto_morus1280_sse2_ad(void *state, const void *data,
- * unsigned int length);
- */
-ENTRY(crypto_morus1280_sse2_ad)
- FRAME_BEGIN
-
- cmp $32, %rdx
- jb .Lad_out
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0_LO
- movdqu (1 * 16)(%rdi), STATE0_HI
- movdqu (2 * 16)(%rdi), STATE1_LO
- movdqu (3 * 16)(%rdi), STATE1_HI
- movdqu (4 * 16)(%rdi), STATE2_LO
- movdqu (5 * 16)(%rdi), STATE2_HI
- movdqu (6 * 16)(%rdi), STATE3_LO
- movdqu (7 * 16)(%rdi), STATE3_HI
- movdqu (8 * 16)(%rdi), STATE4_LO
- movdqu (9 * 16)(%rdi), STATE4_HI
-
- mov %rsi, %r8
- and $0xF, %r8
- jnz .Lad_u_loop
-
-.align 4
-.Lad_a_loop:
- movdqa 0(%rsi), MSG_LO
- movdqa 16(%rsi), MSG_HI
- call __morus1280_update
- sub $32, %rdx
- add $32, %rsi
- cmp $32, %rdx
- jge .Lad_a_loop
-
- jmp .Lad_cont
-.align 4
-.Lad_u_loop:
- movdqu 0(%rsi), MSG_LO
- movdqu 16(%rsi), MSG_HI
- call __morus1280_update
- sub $32, %rdx
- add $32, %rsi
- cmp $32, %rdx
- jge .Lad_u_loop
-
-.Lad_cont:
- /* store the state: */
- movdqu STATE0_LO, (0 * 16)(%rdi)
- movdqu STATE0_HI, (1 * 16)(%rdi)
- movdqu STATE1_LO, (2 * 16)(%rdi)
- movdqu STATE1_HI, (3 * 16)(%rdi)
- movdqu STATE2_LO, (4 * 16)(%rdi)
- movdqu STATE2_HI, (5 * 16)(%rdi)
- movdqu STATE3_LO, (6 * 16)(%rdi)
- movdqu STATE3_HI, (7 * 16)(%rdi)
- movdqu STATE4_LO, (8 * 16)(%rdi)
- movdqu STATE4_HI, (9 * 16)(%rdi)
-
-.Lad_out:
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_sse2_ad)
-
-/*
- * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus1280_sse2_enc)
- FRAME_BEGIN
-
- cmp $32, %rcx
- jb .Lenc_out
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0_LO
- movdqu (1 * 16)(%rdi), STATE0_HI
- movdqu (2 * 16)(%rdi), STATE1_LO
- movdqu (3 * 16)(%rdi), STATE1_HI
- movdqu (4 * 16)(%rdi), STATE2_LO
- movdqu (5 * 16)(%rdi), STATE2_HI
- movdqu (6 * 16)(%rdi), STATE3_LO
- movdqu (7 * 16)(%rdi), STATE3_HI
- movdqu (8 * 16)(%rdi), STATE4_LO
- movdqu (9 * 16)(%rdi), STATE4_HI
-
- mov %rsi, %r8
- or %rdx, %r8
- and $0xF, %r8
- jnz .Lenc_u_loop
-
-.align 4
-.Lenc_a_loop:
- movdqa 0(%rsi), MSG_LO
- movdqa 16(%rsi), MSG_HI
- movdqa STATE1_LO, T1_LO
- movdqa STATE1_HI, T1_HI
- rol3 T1_HI, T1_LO
- movdqa MSG_LO, T0_LO
- movdqa MSG_HI, T0_HI
- pxor T1_LO, T0_LO
- pxor T1_HI, T0_HI
- pxor STATE0_LO, T0_LO
- pxor STATE0_HI, T0_HI
- movdqa STATE2_LO, T1_LO
- movdqa STATE2_HI, T1_HI
- pand STATE3_LO, T1_LO
- pand STATE3_HI, T1_HI
- pxor T1_LO, T0_LO
- pxor T1_HI, T0_HI
- movdqa T0_LO, 0(%rdx)
- movdqa T0_HI, 16(%rdx)
-
- call __morus1280_update
- sub $32, %rcx
- add $32, %rsi
- add $32, %rdx
- cmp $32, %rcx
- jge .Lenc_a_loop
-
- jmp .Lenc_cont
-.align 4
-.Lenc_u_loop:
- movdqu 0(%rsi), MSG_LO
- movdqu 16(%rsi), MSG_HI
- movdqa STATE1_LO, T1_LO
- movdqa STATE1_HI, T1_HI
- rol3 T1_HI, T1_LO
- movdqa MSG_LO, T0_LO
- movdqa MSG_HI, T0_HI
- pxor T1_LO, T0_LO
- pxor T1_HI, T0_HI
- pxor STATE0_LO, T0_LO
- pxor STATE0_HI, T0_HI
- movdqa STATE2_LO, T1_LO
- movdqa STATE2_HI, T1_HI
- pand STATE3_LO, T1_LO
- pand STATE3_HI, T1_HI
- pxor T1_LO, T0_LO
- pxor T1_HI, T0_HI
- movdqu T0_LO, 0(%rdx)
- movdqu T0_HI, 16(%rdx)
-
- call __morus1280_update
- sub $32, %rcx
- add $32, %rsi
- add $32, %rdx
- cmp $32, %rcx
- jge .Lenc_u_loop
-
-.Lenc_cont:
- /* store the state: */
- movdqu STATE0_LO, (0 * 16)(%rdi)
- movdqu STATE0_HI, (1 * 16)(%rdi)
- movdqu STATE1_LO, (2 * 16)(%rdi)
- movdqu STATE1_HI, (3 * 16)(%rdi)
- movdqu STATE2_LO, (4 * 16)(%rdi)
- movdqu STATE2_HI, (5 * 16)(%rdi)
- movdqu STATE3_LO, (6 * 16)(%rdi)
- movdqu STATE3_HI, (7 * 16)(%rdi)
- movdqu STATE4_LO, (8 * 16)(%rdi)
- movdqu STATE4_HI, (9 * 16)(%rdi)
-
-.Lenc_out:
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_sse2_enc)
-
-/*
- * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus1280_sse2_enc_tail)
- FRAME_BEGIN
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0_LO
- movdqu (1 * 16)(%rdi), STATE0_HI
- movdqu (2 * 16)(%rdi), STATE1_LO
- movdqu (3 * 16)(%rdi), STATE1_HI
- movdqu (4 * 16)(%rdi), STATE2_LO
- movdqu (5 * 16)(%rdi), STATE2_HI
- movdqu (6 * 16)(%rdi), STATE3_LO
- movdqu (7 * 16)(%rdi), STATE3_HI
- movdqu (8 * 16)(%rdi), STATE4_LO
- movdqu (9 * 16)(%rdi), STATE4_HI
-
- /* encrypt message: */
- call __load_partial
-
- movdqa STATE1_LO, T1_LO
- movdqa STATE1_HI, T1_HI
- rol3 T1_HI, T1_LO
- movdqa MSG_LO, T0_LO
- movdqa MSG_HI, T0_HI
- pxor T1_LO, T0_LO
- pxor T1_HI, T0_HI
- pxor STATE0_LO, T0_LO
- pxor STATE0_HI, T0_HI
- movdqa STATE2_LO, T1_LO
- movdqa STATE2_HI, T1_HI
- pand STATE3_LO, T1_LO
- pand STATE3_HI, T1_HI
- pxor T1_LO, T0_LO
- pxor T1_HI, T0_HI
-
- call __store_partial
-
- call __morus1280_update
-
- /* store the state: */
- movdqu STATE0_LO, (0 * 16)(%rdi)
- movdqu STATE0_HI, (1 * 16)(%rdi)
- movdqu STATE1_LO, (2 * 16)(%rdi)
- movdqu STATE1_HI, (3 * 16)(%rdi)
- movdqu STATE2_LO, (4 * 16)(%rdi)
- movdqu STATE2_HI, (5 * 16)(%rdi)
- movdqu STATE3_LO, (6 * 16)(%rdi)
- movdqu STATE3_HI, (7 * 16)(%rdi)
- movdqu STATE4_LO, (8 * 16)(%rdi)
- movdqu STATE4_HI, (9 * 16)(%rdi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_sse2_enc_tail)
-
-/*
- * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus1280_sse2_dec)
- FRAME_BEGIN
-
- cmp $32, %rcx
- jb .Ldec_out
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0_LO
- movdqu (1 * 16)(%rdi), STATE0_HI
- movdqu (2 * 16)(%rdi), STATE1_LO
- movdqu (3 * 16)(%rdi), STATE1_HI
- movdqu (4 * 16)(%rdi), STATE2_LO
- movdqu (5 * 16)(%rdi), STATE2_HI
- movdqu (6 * 16)(%rdi), STATE3_LO
- movdqu (7 * 16)(%rdi), STATE3_HI
- movdqu (8 * 16)(%rdi), STATE4_LO
- movdqu (9 * 16)(%rdi), STATE4_HI
-
- mov %rsi, %r8
- or %rdx, %r8
- and $0xF, %r8
- jnz .Ldec_u_loop
-
-.align 4
-.Ldec_a_loop:
- movdqa 0(%rsi), MSG_LO
- movdqa 16(%rsi), MSG_HI
- pxor STATE0_LO, MSG_LO
- pxor STATE0_HI, MSG_HI
- movdqa STATE1_LO, T1_LO
- movdqa STATE1_HI, T1_HI
- rol3 T1_HI, T1_LO
- pxor T1_LO, MSG_LO
- pxor T1_HI, MSG_HI
- movdqa STATE2_LO, T1_LO
- movdqa STATE2_HI, T1_HI
- pand STATE3_LO, T1_LO
- pand STATE3_HI, T1_HI
- pxor T1_LO, MSG_LO
- pxor T1_HI, MSG_HI
- movdqa MSG_LO, 0(%rdx)
- movdqa MSG_HI, 16(%rdx)
-
- call __morus1280_update
- sub $32, %rcx
- add $32, %rsi
- add $32, %rdx
- cmp $32, %rcx
- jge .Ldec_a_loop
-
- jmp .Ldec_cont
-.align 4
-.Ldec_u_loop:
- movdqu 0(%rsi), MSG_LO
- movdqu 16(%rsi), MSG_HI
- pxor STATE0_LO, MSG_LO
- pxor STATE0_HI, MSG_HI
- movdqa STATE1_LO, T1_LO
- movdqa STATE1_HI, T1_HI
- rol3 T1_HI, T1_LO
- pxor T1_LO, MSG_LO
- pxor T1_HI, MSG_HI
- movdqa STATE2_LO, T1_LO
- movdqa STATE2_HI, T1_HI
- pand STATE3_LO, T1_LO
- pand STATE3_HI, T1_HI
- pxor T1_LO, MSG_LO
- pxor T1_HI, MSG_HI
- movdqu MSG_LO, 0(%rdx)
- movdqu MSG_HI, 16(%rdx)
-
- call __morus1280_update
- sub $32, %rcx
- add $32, %rsi
- add $32, %rdx
- cmp $32, %rcx
- jge .Ldec_u_loop
-
-.Ldec_cont:
- /* store the state: */
- movdqu STATE0_LO, (0 * 16)(%rdi)
- movdqu STATE0_HI, (1 * 16)(%rdi)
- movdqu STATE1_LO, (2 * 16)(%rdi)
- movdqu STATE1_HI, (3 * 16)(%rdi)
- movdqu STATE2_LO, (4 * 16)(%rdi)
- movdqu STATE2_HI, (5 * 16)(%rdi)
- movdqu STATE3_LO, (6 * 16)(%rdi)
- movdqu STATE3_HI, (7 * 16)(%rdi)
- movdqu STATE4_LO, (8 * 16)(%rdi)
- movdqu STATE4_HI, (9 * 16)(%rdi)
-
-.Ldec_out:
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_sse2_dec)
-
-/*
- * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus1280_sse2_dec_tail)
- FRAME_BEGIN
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0_LO
- movdqu (1 * 16)(%rdi), STATE0_HI
- movdqu (2 * 16)(%rdi), STATE1_LO
- movdqu (3 * 16)(%rdi), STATE1_HI
- movdqu (4 * 16)(%rdi), STATE2_LO
- movdqu (5 * 16)(%rdi), STATE2_HI
- movdqu (6 * 16)(%rdi), STATE3_LO
- movdqu (7 * 16)(%rdi), STATE3_HI
- movdqu (8 * 16)(%rdi), STATE4_LO
- movdqu (9 * 16)(%rdi), STATE4_HI
-
- /* decrypt message: */
- call __load_partial
-
- pxor STATE0_LO, MSG_LO
- pxor STATE0_HI, MSG_HI
- movdqa STATE1_LO, T1_LO
- movdqa STATE1_HI, T1_HI
- rol3 T1_HI, T1_LO
- pxor T1_LO, MSG_LO
- pxor T1_HI, MSG_HI
- movdqa STATE2_LO, T1_LO
- movdqa STATE2_HI, T1_HI
- pand STATE3_LO, T1_LO
- pand STATE3_HI, T1_HI
- pxor T1_LO, MSG_LO
- pxor T1_HI, MSG_HI
- movdqa MSG_LO, T0_LO
- movdqa MSG_HI, T0_HI
-
- call __store_partial
-
- /* mask with byte count: */
- movq %rcx, T0_LO
- punpcklbw T0_LO, T0_LO
- punpcklbw T0_LO, T0_LO
- punpcklbw T0_LO, T0_LO
- punpcklbw T0_LO, T0_LO
- movdqa T0_LO, T0_HI
- movdqa .Lmorus640_counter_0, T1_LO
- movdqa .Lmorus640_counter_1, T1_HI
- pcmpgtb T1_LO, T0_LO
- pcmpgtb T1_HI, T0_HI
- pand T0_LO, MSG_LO
- pand T0_HI, MSG_HI
-
- call __morus1280_update
-
- /* store the state: */
- movdqu STATE0_LO, (0 * 16)(%rdi)
- movdqu STATE0_HI, (1 * 16)(%rdi)
- movdqu STATE1_LO, (2 * 16)(%rdi)
- movdqu STATE1_HI, (3 * 16)(%rdi)
- movdqu STATE2_LO, (4 * 16)(%rdi)
- movdqu STATE2_HI, (5 * 16)(%rdi)
- movdqu STATE3_LO, (6 * 16)(%rdi)
- movdqu STATE3_HI, (7 * 16)(%rdi)
- movdqu STATE4_LO, (8 * 16)(%rdi)
- movdqu STATE4_HI, (9 * 16)(%rdi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_sse2_dec_tail)
-
-/*
- * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
- * u64 assoclen, u64 cryptlen);
- */
-ENTRY(crypto_morus1280_sse2_final)
- FRAME_BEGIN
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0_LO
- movdqu (1 * 16)(%rdi), STATE0_HI
- movdqu (2 * 16)(%rdi), STATE1_LO
- movdqu (3 * 16)(%rdi), STATE1_HI
- movdqu (4 * 16)(%rdi), STATE2_LO
- movdqu (5 * 16)(%rdi), STATE2_HI
- movdqu (6 * 16)(%rdi), STATE3_LO
- movdqu (7 * 16)(%rdi), STATE3_HI
- movdqu (8 * 16)(%rdi), STATE4_LO
- movdqu (9 * 16)(%rdi), STATE4_HI
-
- /* xor state[0] into state[4]: */
- pxor STATE0_LO, STATE4_LO
- pxor STATE0_HI, STATE4_HI
-
- /* prepare length block: */
- movq %rdx, MSG_LO
- movq %rcx, T0_LO
- pslldq $8, T0_LO
- pxor T0_LO, MSG_LO
- psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
- pxor MSG_HI, MSG_HI
-
- /* update state: */
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
- call __morus1280_update
-
- /* xor tag: */
- movdqu 0(%rsi), MSG_LO
- movdqu 16(%rsi), MSG_HI
-
- pxor STATE0_LO, MSG_LO
- pxor STATE0_HI, MSG_HI
- movdqa STATE1_LO, T0_LO
- movdqa STATE1_HI, T0_HI
- rol3 T0_HI, T0_LO
- pxor T0_LO, MSG_LO
- pxor T0_HI, MSG_HI
- movdqa STATE2_LO, T0_LO
- movdqa STATE2_HI, T0_HI
- pand STATE3_LO, T0_LO
- pand STATE3_HI, T0_HI
- pxor T0_LO, MSG_LO
- pxor T0_HI, MSG_HI
-
- movdqu MSG_LO, 0(%rsi)
- movdqu MSG_HI, 16(%rsi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus1280_sse2_final)
diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c
deleted file mode 100644
index f40244e..0000000
--- a/arch/x86/crypto/morus1280-sse2-glue.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * The MORUS-1280 Authenticated-Encryption Algorithm
- * Glue for SSE2 implementation
- *
- * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-
-#include <crypto/internal/aead.h>
-#include <crypto/morus1280_glue.h>
-#include <linux/module.h>
-#include <asm/fpu/api.h>
-#include <asm/cpu_device_id.h>
-
-asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key,
- const void *iv);
-asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data,
- unsigned int length);
-
-asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src,
- void *dst, unsigned int length);
-asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src,
- void *dst, unsigned int length);
-
-asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src,
- void *dst, unsigned int length);
-asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src,
- void *dst, unsigned int length);
-
-asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor,
- u64 assoclen, u64 cryptlen);
-
-MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350);
-
-static int __init crypto_morus1280_sse2_module_init(void)
-{
- if (!boot_cpu_has(X86_FEATURE_XMM2) ||
- !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
- return -ENODEV;
-
- return crypto_register_aeads(crypto_morus1280_sse2_algs,
- ARRAY_SIZE(crypto_morus1280_sse2_algs));
-}
-
-static void __exit crypto_morus1280_sse2_module_exit(void)
-{
- crypto_unregister_aeads(crypto_morus1280_sse2_algs,
- ARRAY_SIZE(crypto_morus1280_sse2_algs));
-}
-
-module_init(crypto_morus1280_sse2_module_init);
-module_exit(crypto_morus1280_sse2_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
-MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation");
-MODULE_ALIAS_CRYPTO("morus1280");
-MODULE_ALIAS_CRYPTO("morus1280-sse2");
diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c
deleted file mode 100644
index 0dccdda..0000000
--- a/arch/x86/crypto/morus1280_glue.c
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * The MORUS-1280 Authenticated-Encryption Algorithm
- * Common x86 SIMD glue skeleton
- *
- * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-
-#include <crypto/cryptd.h>
-#include <crypto/internal/aead.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/morus1280_glue.h>
-#include <crypto/scatterwalk.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/scatterlist.h>
-#include <asm/fpu/api.h>
-
-struct morus1280_state {
- struct morus1280_block s[MORUS_STATE_BLOCKS];
-};
-
-struct morus1280_ops {
- int (*skcipher_walk_init)(struct skcipher_walk *walk,
- struct aead_request *req, bool atomic);
-
- void (*crypt_blocks)(void *state, const void *src, void *dst,
- unsigned int length);
- void (*crypt_tail)(void *state, const void *src, void *dst,
- unsigned int length);
-};
-
-static void crypto_morus1280_glue_process_ad(
- struct morus1280_state *state,
- const struct morus1280_glue_ops *ops,
- struct scatterlist *sg_src, unsigned int assoclen)
-{
- struct scatter_walk walk;
- struct morus1280_block buf;
- unsigned int pos = 0;
-
- scatterwalk_start(&walk, sg_src);
- while (assoclen != 0) {
- unsigned int size = scatterwalk_clamp(&walk, assoclen);
- unsigned int left = size;
- void *mapped = scatterwalk_map(&walk);
- const u8 *src = (const u8 *)mapped;
-
- if (pos + size >= MORUS1280_BLOCK_SIZE) {
- if (pos > 0) {
- unsigned int fill = MORUS1280_BLOCK_SIZE - pos;
- memcpy(buf.bytes + pos, src, fill);
- ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE);
- pos = 0;
- left -= fill;
- src += fill;
- }
-
- ops->ad(state, src, left);
- src += left & ~(MORUS1280_BLOCK_SIZE - 1);
- left &= MORUS1280_BLOCK_SIZE - 1;
- }
-
- memcpy(buf.bytes + pos, src, left);
-
- pos += left;
- assoclen -= size;
- scatterwalk_unmap(mapped);
- scatterwalk_advance(&walk, size);
- scatterwalk_done(&walk, 0, assoclen);
- }
-
- if (pos > 0) {
- memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos);
- ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE);
- }
-}
-
-static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state,
- struct morus1280_ops ops,
- struct aead_request *req)
-{
- struct skcipher_walk walk;
- u8 *cursor_src, *cursor_dst;
- unsigned int chunksize, base;
-
- ops.skcipher_walk_init(&walk, req, false);
-
- while (walk.nbytes) {
- cursor_src = walk.src.virt.addr;
- cursor_dst = walk.dst.virt.addr;
- chunksize = walk.nbytes;
-
- ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize);
-
- base = chunksize & ~(MORUS1280_BLOCK_SIZE - 1);
- cursor_src += base;
- cursor_dst += base;
- chunksize &= MORUS1280_BLOCK_SIZE - 1;
-
- if (chunksize > 0)
- ops.crypt_tail(state, cursor_src, cursor_dst,
- chunksize);
-
- skcipher_walk_done(&walk, 0);
- }
-}
-
-int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key,
- unsigned int keylen)
-{
- struct morus1280_ctx *ctx = crypto_aead_ctx(aead);
-
- if (keylen == MORUS1280_BLOCK_SIZE) {
- memcpy(ctx->key.bytes, key, MORUS1280_BLOCK_SIZE);
- } else if (keylen == MORUS1280_BLOCK_SIZE / 2) {
- memcpy(ctx->key.bytes, key, keylen);
- memcpy(ctx->key.bytes + keylen, key, keylen);
- } else {
- crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setkey);
-
-int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm,
- unsigned int authsize)
-{
- return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL;
-}
-EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setauthsize);
-
-static void crypto_morus1280_glue_crypt(struct aead_request *req,
- struct morus1280_ops ops,
- unsigned int cryptlen,
- struct morus1280_block *tag_xor)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
- struct morus1280_state state;
-
- kernel_fpu_begin();
-
- ctx->ops->init(&state, &ctx->key, req->iv);
- crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen);
- crypto_morus1280_glue_process_crypt(&state, ops, req);
- ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen);
-
- kernel_fpu_end();
-}
-
-int crypto_morus1280_glue_encrypt(struct aead_request *req)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
- struct morus1280_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_encrypt,
- .crypt_blocks = ctx->ops->enc,
- .crypt_tail = ctx->ops->enc_tail,
- };
-
- struct morus1280_block tag = {};
- unsigned int authsize = crypto_aead_authsize(tfm);
- unsigned int cryptlen = req->cryptlen;
-
- crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag);
-
- scatterwalk_map_and_copy(tag.bytes, req->dst,
- req->assoclen + cryptlen, authsize, 1);
- return 0;
-}
-EXPORT_SYMBOL_GPL(crypto_morus1280_glue_encrypt);
-
-int crypto_morus1280_glue_decrypt(struct aead_request *req)
-{
- static const u8 zeros[MORUS1280_BLOCK_SIZE] = {};
-
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
- struct morus1280_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_decrypt,
- .crypt_blocks = ctx->ops->dec,
- .crypt_tail = ctx->ops->dec_tail,
- };
-
- struct morus1280_block tag;
- unsigned int authsize = crypto_aead_authsize(tfm);
- unsigned int cryptlen = req->cryptlen - authsize;
-
- scatterwalk_map_and_copy(tag.bytes, req->src,
- req->assoclen + cryptlen, authsize, 0);
-
- crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag);
-
- return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0;
-}
-EXPORT_SYMBOL_GPL(crypto_morus1280_glue_decrypt);
-
-void crypto_morus1280_glue_init_ops(struct crypto_aead *aead,
- const struct morus1280_glue_ops *ops)
-{
- struct morus1280_ctx *ctx = crypto_aead_ctx(aead);
- ctx->ops = ops;
-}
-EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops);
-
-int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key,
- unsigned int keylen)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
-}
-EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setkey);
-
-int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead,
- unsigned int authsize)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
-}
-EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setauthsize);
-
-int cryptd_morus1280_glue_encrypt(struct aead_request *req)
-{
- struct crypto_aead *aead = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- aead = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- aead = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, aead);
-
- return crypto_aead_encrypt(req);
-}
-EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_encrypt);
-
-int cryptd_morus1280_glue_decrypt(struct aead_request *req)
-{
- struct crypto_aead *aead = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- aead = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- aead = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, aead);
-
- return crypto_aead_decrypt(req);
-}
-EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_decrypt);
-
-int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead)
-{
- struct cryptd_aead *cryptd_tfm;
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- const char *name = crypto_aead_alg(aead)->base.cra_driver_name;
- char internal_name[CRYPTO_MAX_ALG_NAME];
-
- if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name)
- >= CRYPTO_MAX_ALG_NAME)
- return -ENAMETOOLONG;
-
- cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
-
- *ctx = cryptd_tfm;
- crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
- return 0;
-}
-EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_init_tfm);
-
-void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_free_aead(*ctx);
-}
-EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_exit_tfm);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
-MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations");
diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S
deleted file mode 100644
index 414db48..0000000
--- a/arch/x86/crypto/morus640-sse2-asm.S
+++ /dev/null
@@ -1,615 +0,0 @@
-/*
- * SSE2 implementation of MORUS-640
- *
- * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-#define SHUFFLE_MASK(i0, i1, i2, i3) \
- (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
-
-#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
-#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
-#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
-
-#define STATE0 %xmm0
-#define STATE1 %xmm1
-#define STATE2 %xmm2
-#define STATE3 %xmm3
-#define STATE4 %xmm4
-#define KEY %xmm5
-#define MSG %xmm5
-#define T0 %xmm6
-#define T1 %xmm7
-
-.section .rodata.cst16.morus640_const, "aM", @progbits, 32
-.align 16
-.Lmorus640_const_0:
- .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
- .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
-.Lmorus640_const_1:
- .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
- .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
-
-.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
-.align 16
-.Lmorus640_counter:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-
-.text
-
-.macro morus640_round s0, s1, s2, s3, s4, b, w
- movdqa \s1, T0
- pand \s2, T0
- pxor T0, \s0
- pxor \s3, \s0
- movdqa \s0, T0
- pslld $\b, T0
- psrld $(32 - \b), \s0
- pxor T0, \s0
- pshufd $\w, \s3, \s3
-.endm
-
-/*
- * __morus640_update: internal ABI
- * input:
- * STATE[0-4] - input state
- * MSG - message block
- * output:
- * STATE[0-4] - output state
- * changed:
- * T0
- */
-__morus640_update:
- morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1
- pxor MSG, STATE1
- morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
- pxor MSG, STATE2
- morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3
- pxor MSG, STATE3
- morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
- pxor MSG, STATE4
- morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
- ret
-ENDPROC(__morus640_update)
-
-
-/*
- * __morus640_update_zero: internal ABI
- * input:
- * STATE[0-4] - input state
- * output:
- * STATE[0-4] - output state
- * changed:
- * T0
- */
-__morus640_update_zero:
- morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1
- morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
- morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3
- morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
- morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
- ret
-ENDPROC(__morus640_update_zero)
-
-/*
- * __load_partial: internal ABI
- * input:
- * %rsi - src
- * %rcx - bytes
- * output:
- * MSG - message block
- * changed:
- * T0
- * %r8
- * %r9
- */
-__load_partial:
- xor %r9d, %r9d
- pxor MSG, MSG
-
- mov %rcx, %r8
- and $0x1, %r8
- jz .Lld_partial_1
-
- mov %rcx, %r8
- and $0x1E, %r8
- add %rsi, %r8
- mov (%r8), %r9b
-
-.Lld_partial_1:
- mov %rcx, %r8
- and $0x2, %r8
- jz .Lld_partial_2
-
- mov %rcx, %r8
- and $0x1C, %r8
- add %rsi, %r8
- shl $16, %r9
- mov (%r8), %r9w
-
-.Lld_partial_2:
- mov %rcx, %r8
- and $0x4, %r8
- jz .Lld_partial_4
-
- mov %rcx, %r8
- and $0x18, %r8
- add %rsi, %r8
- shl $32, %r9
- mov (%r8), %r8d
- xor %r8, %r9
-
-.Lld_partial_4:
- movq %r9, MSG
-
- mov %rcx, %r8
- and $0x8, %r8
- jz .Lld_partial_8
-
- mov %rcx, %r8
- and $0x10, %r8
- add %rsi, %r8
- pslldq $8, MSG
- movq (%r8), T0
- pxor T0, MSG
-
-.Lld_partial_8:
- ret
-ENDPROC(__load_partial)
-
-/*
- * __store_partial: internal ABI
- * input:
- * %rdx - dst
- * %rcx - bytes
- * output:
- * T0 - message block
- * changed:
- * %r8
- * %r9
- * %r10
- */
-__store_partial:
- mov %rcx, %r8
- mov %rdx, %r9
-
- movq T0, %r10
-
- cmp $8, %r8
- jl .Lst_partial_8
-
- mov %r10, (%r9)
- psrldq $8, T0
- movq T0, %r10
-
- sub $8, %r8
- add $8, %r9
-
-.Lst_partial_8:
- cmp $4, %r8
- jl .Lst_partial_4
-
- mov %r10d, (%r9)
- shr $32, %r10
-
- sub $4, %r8
- add $4, %r9
-
-.Lst_partial_4:
- cmp $2, %r8
- jl .Lst_partial_2
-
- mov %r10w, (%r9)
- shr $16, %r10
-
- sub $2, %r8
- add $2, %r9
-
-.Lst_partial_2:
- cmp $1, %r8
- jl .Lst_partial_1
-
- mov %r10b, (%r9)
-
-.Lst_partial_1:
- ret
-ENDPROC(__store_partial)
-
-/*
- * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv);
- */
-ENTRY(crypto_morus640_sse2_init)
- FRAME_BEGIN
-
- /* load IV: */
- movdqu (%rdx), STATE0
- /* load key: */
- movdqu (%rsi), KEY
- movdqa KEY, STATE1
- /* load all ones: */
- pcmpeqd STATE2, STATE2
- /* load the constants: */
- movdqa .Lmorus640_const_0, STATE3
- movdqa .Lmorus640_const_1, STATE4
-
- /* update 16 times with zero: */
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
- call __morus640_update_zero
-
- /* xor-in the key again after updates: */
- pxor KEY, STATE1
-
- /* store the state: */
- movdqu STATE0, (0 * 16)(%rdi)
- movdqu STATE1, (1 * 16)(%rdi)
- movdqu STATE2, (2 * 16)(%rdi)
- movdqu STATE3, (3 * 16)(%rdi)
- movdqu STATE4, (4 * 16)(%rdi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus640_sse2_init)
-
-/*
- * void crypto_morus640_sse2_ad(void *state, const void *data,
- * unsigned int length);
- */
-ENTRY(crypto_morus640_sse2_ad)
- FRAME_BEGIN
-
- cmp $16, %rdx
- jb .Lad_out
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0
- movdqu (1 * 16)(%rdi), STATE1
- movdqu (2 * 16)(%rdi), STATE2
- movdqu (3 * 16)(%rdi), STATE3
- movdqu (4 * 16)(%rdi), STATE4
-
- mov %rsi, %r8
- and $0xF, %r8
- jnz .Lad_u_loop
-
-.align 4
-.Lad_a_loop:
- movdqa (%rsi), MSG
- call __morus640_update
- sub $16, %rdx
- add $16, %rsi
- cmp $16, %rdx
- jge .Lad_a_loop
-
- jmp .Lad_cont
-.align 4
-.Lad_u_loop:
- movdqu (%rsi), MSG
- call __morus640_update
- sub $16, %rdx
- add $16, %rsi
- cmp $16, %rdx
- jge .Lad_u_loop
-
-.Lad_cont:
- /* store the state: */
- movdqu STATE0, (0 * 16)(%rdi)
- movdqu STATE1, (1 * 16)(%rdi)
- movdqu STATE2, (2 * 16)(%rdi)
- movdqu STATE3, (3 * 16)(%rdi)
- movdqu STATE4, (4 * 16)(%rdi)
-
-.Lad_out:
- FRAME_END
- ret
-ENDPROC(crypto_morus640_sse2_ad)
-
-/*
- * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus640_sse2_enc)
- FRAME_BEGIN
-
- cmp $16, %rcx
- jb .Lenc_out
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0
- movdqu (1 * 16)(%rdi), STATE1
- movdqu (2 * 16)(%rdi), STATE2
- movdqu (3 * 16)(%rdi), STATE3
- movdqu (4 * 16)(%rdi), STATE4
-
- mov %rsi, %r8
- or %rdx, %r8
- and $0xF, %r8
- jnz .Lenc_u_loop
-
-.align 4
-.Lenc_a_loop:
- movdqa (%rsi), MSG
- movdqa MSG, T0
- pxor STATE0, T0
- pshufd $MASK3, STATE1, T1
- pxor T1, T0
- movdqa STATE2, T1
- pand STATE3, T1
- pxor T1, T0
- movdqa T0, (%rdx)
-
- call __morus640_update
- sub $16, %rcx
- add $16, %rsi
- add $16, %rdx
- cmp $16, %rcx
- jge .Lenc_a_loop
-
- jmp .Lenc_cont
-.align 4
-.Lenc_u_loop:
- movdqu (%rsi), MSG
- movdqa MSG, T0
- pxor STATE0, T0
- pshufd $MASK3, STATE1, T1
- pxor T1, T0
- movdqa STATE2, T1
- pand STATE3, T1
- pxor T1, T0
- movdqu T0, (%rdx)
-
- call __morus640_update
- sub $16, %rcx
- add $16, %rsi
- add $16, %rdx
- cmp $16, %rcx
- jge .Lenc_u_loop
-
-.Lenc_cont:
- /* store the state: */
- movdqu STATE0, (0 * 16)(%rdi)
- movdqu STATE1, (1 * 16)(%rdi)
- movdqu STATE2, (2 * 16)(%rdi)
- movdqu STATE3, (3 * 16)(%rdi)
- movdqu STATE4, (4 * 16)(%rdi)
-
-.Lenc_out:
- FRAME_END
- ret
-ENDPROC(crypto_morus640_sse2_enc)
-
-/*
- * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus640_sse2_enc_tail)
- FRAME_BEGIN
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0
- movdqu (1 * 16)(%rdi), STATE1
- movdqu (2 * 16)(%rdi), STATE2
- movdqu (3 * 16)(%rdi), STATE3
- movdqu (4 * 16)(%rdi), STATE4
-
- /* encrypt message: */
- call __load_partial
-
- movdqa MSG, T0
- pxor STATE0, T0
- pshufd $MASK3, STATE1, T1
- pxor T1, T0
- movdqa STATE2, T1
- pand STATE3, T1
- pxor T1, T0
-
- call __store_partial
-
- call __morus640_update
-
- /* store the state: */
- movdqu STATE0, (0 * 16)(%rdi)
- movdqu STATE1, (1 * 16)(%rdi)
- movdqu STATE2, (2 * 16)(%rdi)
- movdqu STATE3, (3 * 16)(%rdi)
- movdqu STATE4, (4 * 16)(%rdi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus640_sse2_enc_tail)
-
-/*
- * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus640_sse2_dec)
- FRAME_BEGIN
-
- cmp $16, %rcx
- jb .Ldec_out
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0
- movdqu (1 * 16)(%rdi), STATE1
- movdqu (2 * 16)(%rdi), STATE2
- movdqu (3 * 16)(%rdi), STATE3
- movdqu (4 * 16)(%rdi), STATE4
-
- mov %rsi, %r8
- or %rdx, %r8
- and $0xF, %r8
- jnz .Ldec_u_loop
-
-.align 4
-.Ldec_a_loop:
- movdqa (%rsi), MSG
- pxor STATE0, MSG
- pshufd $MASK3, STATE1, T0
- pxor T0, MSG
- movdqa STATE2, T0
- pand STATE3, T0
- pxor T0, MSG
- movdqa MSG, (%rdx)
-
- call __morus640_update
- sub $16, %rcx
- add $16, %rsi
- add $16, %rdx
- cmp $16, %rcx
- jge .Ldec_a_loop
-
- jmp .Ldec_cont
-.align 4
-.Ldec_u_loop:
- movdqu (%rsi), MSG
- pxor STATE0, MSG
- pshufd $MASK3, STATE1, T0
- pxor T0, MSG
- movdqa STATE2, T0
- pand STATE3, T0
- pxor T0, MSG
- movdqu MSG, (%rdx)
-
- call __morus640_update
- sub $16, %rcx
- add $16, %rsi
- add $16, %rdx
- cmp $16, %rcx
- jge .Ldec_u_loop
-
-.Ldec_cont:
- /* store the state: */
- movdqu STATE0, (0 * 16)(%rdi)
- movdqu STATE1, (1 * 16)(%rdi)
- movdqu STATE2, (2 * 16)(%rdi)
- movdqu STATE3, (3 * 16)(%rdi)
- movdqu STATE4, (4 * 16)(%rdi)
-
-.Ldec_out:
- FRAME_END
- ret
-ENDPROC(crypto_morus640_sse2_dec)
-
-/*
- * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst,
- * unsigned int length);
- */
-ENTRY(crypto_morus640_sse2_dec_tail)
- FRAME_BEGIN
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0
- movdqu (1 * 16)(%rdi), STATE1
- movdqu (2 * 16)(%rdi), STATE2
- movdqu (3 * 16)(%rdi), STATE3
- movdqu (4 * 16)(%rdi), STATE4
-
- /* decrypt message: */
- call __load_partial
-
- pxor STATE0, MSG
- pshufd $MASK3, STATE1, T0
- pxor T0, MSG
- movdqa STATE2, T0
- pand STATE3, T0
- pxor T0, MSG
- movdqa MSG, T0
-
- call __store_partial
-
- /* mask with byte count: */
- movq %rcx, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- movdqa .Lmorus640_counter, T1
- pcmpgtb T1, T0
- pand T0, MSG
-
- call __morus640_update
-
- /* store the state: */
- movdqu STATE0, (0 * 16)(%rdi)
- movdqu STATE1, (1 * 16)(%rdi)
- movdqu STATE2, (2 * 16)(%rdi)
- movdqu STATE3, (3 * 16)(%rdi)
- movdqu STATE4, (4 * 16)(%rdi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus640_sse2_dec_tail)
-
-/*
- * void crypto_morus640_sse2_final(void *state, void *tag_xor,
- * u64 assoclen, u64 cryptlen);
- */
-ENTRY(crypto_morus640_sse2_final)
- FRAME_BEGIN
-
- /* load the state: */
- movdqu (0 * 16)(%rdi), STATE0
- movdqu (1 * 16)(%rdi), STATE1
- movdqu (2 * 16)(%rdi), STATE2
- movdqu (3 * 16)(%rdi), STATE3
- movdqu (4 * 16)(%rdi), STATE4
-
- /* xor state[0] into state[4]: */
- pxor STATE0, STATE4
-
- /* prepare length block: */
- movq %rdx, MSG
- movq %rcx, T0
- pslldq $8, T0
- pxor T0, MSG
- psllq $3, MSG /* multiply by 8 (to get bit count) */
-
- /* update state: */
- call __morus640_update
- call __morus640_update
- call __morus640_update
- call __morus640_update
- call __morus640_update
- call __morus640_update
- call __morus640_update
- call __morus640_update
- call __morus640_update
- call __morus640_update
-
- /* xor tag: */
- movdqu (%rsi), MSG
-
- pxor STATE0, MSG
- pshufd $MASK3, STATE1, T0
- pxor T0, MSG
- movdqa STATE2, T0
- pand STATE3, T0
- pxor T0, MSG
-
- movdqu MSG, (%rsi)
-
- FRAME_END
- ret
-ENDPROC(crypto_morus640_sse2_final)
diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c
deleted file mode 100644
index 9afaf8f..0000000
--- a/arch/x86/crypto/morus640-sse2-glue.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * The MORUS-640 Authenticated-Encryption Algorithm
- * Glue for SSE2 implementation
- *
- * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-
-#include <crypto/internal/aead.h>
-#include <crypto/morus640_glue.h>
-#include <linux/module.h>
-#include <asm/fpu/api.h>
-#include <asm/cpu_device_id.h>
-
-asmlinkage void crypto_morus640_sse2_init(void *state, const void *key,
- const void *iv);
-asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data,
- unsigned int length);
-
-asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src,
- void *dst, unsigned int length);
-asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src,
- void *dst, unsigned int length);
-
-asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src,
- void *dst, unsigned int length);
-asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src,
- void *dst, unsigned int length);
-
-asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor,
- u64 assoclen, u64 cryptlen);
-
-MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400);
-
-static int __init crypto_morus640_sse2_module_init(void)
-{
- if (!boot_cpu_has(X86_FEATURE_XMM2) ||
- !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
- return -ENODEV;
-
- return crypto_register_aeads(crypto_morus640_sse2_algs,
- ARRAY_SIZE(crypto_morus640_sse2_algs));
-}
-
-static void __exit crypto_morus640_sse2_module_exit(void)
-{
- crypto_unregister_aeads(crypto_morus640_sse2_algs,
- ARRAY_SIZE(crypto_morus640_sse2_algs));
-}
-
-module_init(crypto_morus640_sse2_module_init);
-module_exit(crypto_morus640_sse2_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
-MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation");
-MODULE_ALIAS_CRYPTO("morus640");
-MODULE_ALIAS_CRYPTO("morus640-sse2");
diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c
deleted file mode 100644
index 7b58fe4..0000000
--- a/arch/x86/crypto/morus640_glue.c
+++ /dev/null
@@ -1,298 +0,0 @@
-/*
- * The MORUS-640 Authenticated-Encryption Algorithm
- * Common x86 SIMD glue skeleton
- *
- * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
- * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-
-#include <crypto/cryptd.h>
-#include <crypto/internal/aead.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/morus640_glue.h>
-#include <crypto/scatterwalk.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/scatterlist.h>
-#include <asm/fpu/api.h>
-
-struct morus640_state {
- struct morus640_block s[MORUS_STATE_BLOCKS];
-};
-
-struct morus640_ops {
- int (*skcipher_walk_init)(struct skcipher_walk *walk,
- struct aead_request *req, bool atomic);
-
- void (*crypt_blocks)(void *state, const void *src, void *dst,
- unsigned int length);
- void (*crypt_tail)(void *state, const void *src, void *dst,
- unsigned int length);
-};
-
-static void crypto_morus640_glue_process_ad(
- struct morus640_state *state,
- const struct morus640_glue_ops *ops,
- struct scatterlist *sg_src, unsigned int assoclen)
-{
- struct scatter_walk walk;
- struct morus640_block buf;
- unsigned int pos = 0;
-
- scatterwalk_start(&walk, sg_src);
- while (assoclen != 0) {
- unsigned int size = scatterwalk_clamp(&walk, assoclen);
- unsigned int left = size;
- void *mapped = scatterwalk_map(&walk);
- const u8 *src = (const u8 *)mapped;
-
- if (pos + size >= MORUS640_BLOCK_SIZE) {
- if (pos > 0) {
- unsigned int fill = MORUS640_BLOCK_SIZE - pos;
- memcpy(buf.bytes + pos, src, fill);
- ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE);
- pos = 0;
- left -= fill;
- src += fill;
- }
-
- ops->ad(state, src, left);
- src += left & ~(MORUS640_BLOCK_SIZE - 1);
- left &= MORUS640_BLOCK_SIZE - 1;
- }
-
- memcpy(buf.bytes + pos, src, left);
-
- pos += left;
- assoclen -= size;
- scatterwalk_unmap(mapped);
- scatterwalk_advance(&walk, size);
- scatterwalk_done(&walk, 0, assoclen);
- }
-
- if (pos > 0) {
- memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos);
- ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE);
- }
-}
-
-static void crypto_morus640_glue_process_crypt(struct morus640_state *state,
- struct morus640_ops ops,
- struct aead_request *req)
-{
- struct skcipher_walk walk;
- u8 *cursor_src, *cursor_dst;
- unsigned int chunksize, base;
-
- ops.skcipher_walk_init(&walk, req, false);
-
- while (walk.nbytes) {
- cursor_src = walk.src.virt.addr;
- cursor_dst = walk.dst.virt.addr;
- chunksize = walk.nbytes;
-
- ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize);
-
- base = chunksize & ~(MORUS640_BLOCK_SIZE - 1);
- cursor_src += base;
- cursor_dst += base;
- chunksize &= MORUS640_BLOCK_SIZE - 1;
-
- if (chunksize > 0)
- ops.crypt_tail(state, cursor_src, cursor_dst,
- chunksize);
-
- skcipher_walk_done(&walk, 0);
- }
-}
-
-int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key,
- unsigned int keylen)
-{
- struct morus640_ctx *ctx = crypto_aead_ctx(aead);
-
- if (keylen != MORUS640_BLOCK_SIZE) {
- crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
- return -EINVAL;
- }
-
- memcpy(ctx->key.bytes, key, MORUS640_BLOCK_SIZE);
- return 0;
-}
-EXPORT_SYMBOL_GPL(crypto_morus640_glue_setkey);
-
-int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm,
- unsigned int authsize)
-{
- return (authsize <= MORUS_MAX_AUTH_SIZE) ? 0 : -EINVAL;
-}
-EXPORT_SYMBOL_GPL(crypto_morus640_glue_setauthsize);
-
-static void crypto_morus640_glue_crypt(struct aead_request *req,
- struct morus640_ops ops,
- unsigned int cryptlen,
- struct morus640_block *tag_xor)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
- struct morus640_state state;
-
- kernel_fpu_begin();
-
- ctx->ops->init(&state, &ctx->key, req->iv);
- crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen);
- crypto_morus640_glue_process_crypt(&state, ops, req);
- ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen);
-
- kernel_fpu_end();
-}
-
-int crypto_morus640_glue_encrypt(struct aead_request *req)
-{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
- struct morus640_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_encrypt,
- .crypt_blocks = ctx->ops->enc,
- .crypt_tail = ctx->ops->enc_tail,
- };
-
- struct morus640_block tag = {};
- unsigned int authsize = crypto_aead_authsize(tfm);
- unsigned int cryptlen = req->cryptlen;
-
- crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag);
-
- scatterwalk_map_and_copy(tag.bytes, req->dst,
- req->assoclen + cryptlen, authsize, 1);
- return 0;
-}
-EXPORT_SYMBOL_GPL(crypto_morus640_glue_encrypt);
-
-int crypto_morus640_glue_decrypt(struct aead_request *req)
-{
- static const u8 zeros[MORUS640_BLOCK_SIZE] = {};
-
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
- struct morus640_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_decrypt,
- .crypt_blocks = ctx->ops->dec,
- .crypt_tail = ctx->ops->dec_tail,
- };
-
- struct morus640_block tag;
- unsigned int authsize = crypto_aead_authsize(tfm);
- unsigned int cryptlen = req->cryptlen - authsize;
-
- scatterwalk_map_and_copy(tag.bytes, req->src,
- req->assoclen + cryptlen, authsize, 0);
-
- crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag);
-
- return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0;
-}
-EXPORT_SYMBOL_GPL(crypto_morus640_glue_decrypt);
-
-void crypto_morus640_glue_init_ops(struct crypto_aead *aead,
- const struct morus640_glue_ops *ops)
-{
- struct morus640_ctx *ctx = crypto_aead_ctx(aead);
- ctx->ops = ops;
-}
-EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops);
-
-int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key,
- unsigned int keylen)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- return crypto_aead_setkey(&cryptd_tfm->base, key, keylen);
-}
-EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setkey);
-
-int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead,
- unsigned int authsize)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
-}
-EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setauthsize);
-
-int cryptd_morus640_glue_encrypt(struct aead_request *req)
-{
- struct crypto_aead *aead = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- aead = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- aead = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, aead);
-
- return crypto_aead_encrypt(req);
-}
-EXPORT_SYMBOL_GPL(cryptd_morus640_glue_encrypt);
-
-int cryptd_morus640_glue_decrypt(struct aead_request *req)
-{
- struct crypto_aead *aead = crypto_aead_reqtfm(req);
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- struct cryptd_aead *cryptd_tfm = *ctx;
-
- aead = &cryptd_tfm->base;
- if (irq_fpu_usable() && (!in_atomic() ||
- !cryptd_aead_queued(cryptd_tfm)))
- aead = cryptd_aead_child(cryptd_tfm);
-
- aead_request_set_tfm(req, aead);
-
- return crypto_aead_decrypt(req);
-}
-EXPORT_SYMBOL_GPL(cryptd_morus640_glue_decrypt);
-
-int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead)
-{
- struct cryptd_aead *cryptd_tfm;
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
- const char *name = crypto_aead_alg(aead)->base.cra_driver_name;
- char internal_name[CRYPTO_MAX_ALG_NAME];
-
- if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name)
- >= CRYPTO_MAX_ALG_NAME)
- return -ENAMETOOLONG;
-
- cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
-
- *ctx = cryptd_tfm;
- crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
- return 0;
-}
-EXPORT_SYMBOL_GPL(cryptd_morus640_glue_init_tfm);
-
-void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead)
-{
- struct cryptd_aead **ctx = crypto_aead_ctx(aead);
-
- cryptd_free_aead(*ctx);
-}
-EXPORT_SYMBOL_GPL(cryptd_morus640_glue_exit_tfm);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
-MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations");
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
new file mode 100644
index 0000000..f7946ea
--- /dev/null
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#define PASS0_SUMS %ymm0
+#define PASS1_SUMS %ymm1
+#define PASS2_SUMS %ymm2
+#define PASS3_SUMS %ymm3
+#define K0 %ymm4
+#define K0_XMM %xmm4
+#define K1 %ymm5
+#define K1_XMM %xmm5
+#define K2 %ymm6
+#define K2_XMM %xmm6
+#define K3 %ymm7
+#define K3_XMM %xmm7
+#define T0 %ymm8
+#define T1 %ymm9
+#define T2 %ymm10
+#define T2_XMM %xmm10
+#define T3 %ymm11
+#define T3_XMM %xmm11
+#define T4 %ymm12
+#define T5 %ymm13
+#define T6 %ymm14
+#define T7 %ymm15
+#define KEY %rdi
+#define MESSAGE %rsi
+#define MESSAGE_LEN %rdx
+#define HASH %rcx
+
+.macro _nh_2xstride k0, k1, k2, k3
+
+ // Add message words to key words
+ vpaddd \k0, T3, T0
+ vpaddd \k1, T3, T1
+ vpaddd \k2, T3, T2
+ vpaddd \k3, T3, T3
+
+ // Multiply 32x32 => 64 and accumulate
+ vpshufd $0x10, T0, T4
+ vpshufd $0x32, T0, T0
+ vpshufd $0x10, T1, T5
+ vpshufd $0x32, T1, T1
+ vpshufd $0x10, T2, T6
+ vpshufd $0x32, T2, T2
+ vpshufd $0x10, T3, T7
+ vpshufd $0x32, T3, T3
+ vpmuludq T4, T0, T0
+ vpmuludq T5, T1, T1
+ vpmuludq T6, T2, T2
+ vpmuludq T7, T3, T3
+ vpaddq T0, PASS0_SUMS, PASS0_SUMS
+ vpaddq T1, PASS1_SUMS, PASS1_SUMS
+ vpaddq T2, PASS2_SUMS, PASS2_SUMS
+ vpaddq T3, PASS3_SUMS, PASS3_SUMS
+.endm
+
+/*
+ * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ * u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_avx2)
+
+ vmovdqu 0x00(KEY), K0
+ vmovdqu 0x10(KEY), K1
+ add $0x20, KEY
+ vpxor PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
+ vpxor PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
+ vpxor PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
+ vpxor PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
+
+ sub $0x40, MESSAGE_LEN
+ jl .Lloop4_done
+.Lloop4:
+ vmovdqu (MESSAGE), T3
+ vmovdqu 0x00(KEY), K2
+ vmovdqu 0x10(KEY), K3
+ _nh_2xstride K0, K1, K2, K3
+
+ vmovdqu 0x20(MESSAGE), T3
+ vmovdqu 0x20(KEY), K0
+ vmovdqu 0x30(KEY), K1
+ _nh_2xstride K2, K3, K0, K1
+
+ add $0x40, MESSAGE
+ add $0x40, KEY
+ sub $0x40, MESSAGE_LEN
+ jge .Lloop4
+
+.Lloop4_done:
+ and $0x3f, MESSAGE_LEN
+ jz .Ldone
+
+ cmp $0x20, MESSAGE_LEN
+ jl .Llast
+
+ // 2 or 3 strides remain; do 2 more.
+ vmovdqu (MESSAGE), T3
+ vmovdqu 0x00(KEY), K2
+ vmovdqu 0x10(KEY), K3
+ _nh_2xstride K0, K1, K2, K3
+ add $0x20, MESSAGE
+ add $0x20, KEY
+ sub $0x20, MESSAGE_LEN
+ jz .Ldone
+ vmovdqa K2, K0
+ vmovdqa K3, K1
+.Llast:
+ // Last stride. Zero the high 128 bits of the message and keys so they
+ // don't affect the result when processing them like 2 strides.
+ vmovdqu (MESSAGE), T3_XMM
+ vmovdqa K0_XMM, K0_XMM
+ vmovdqa K1_XMM, K1_XMM
+ vmovdqu 0x00(KEY), K2_XMM
+ vmovdqu 0x10(KEY), K3_XMM
+ _nh_2xstride K0, K1, K2, K3
+
+.Ldone:
+ // Sum the accumulators for each pass, then store the sums to 'hash'
+
+ // PASS0_SUMS is (0A 0B 0C 0D)
+ // PASS1_SUMS is (1A 1B 1C 1D)
+ // PASS2_SUMS is (2A 2B 2C 2D)
+ // PASS3_SUMS is (3A 3B 3C 3D)
+ // We need the horizontal sums:
+ // (0A + 0B + 0C + 0D,
+ // 1A + 1B + 1C + 1D,
+ // 2A + 2B + 2C + 2D,
+ // 3A + 3B + 3C + 3D)
+ //
+
+ vpunpcklqdq PASS1_SUMS, PASS0_SUMS, T0 // T0 = (0A 1A 0C 1C)
+ vpunpckhqdq PASS1_SUMS, PASS0_SUMS, T1 // T1 = (0B 1B 0D 1D)
+ vpunpcklqdq PASS3_SUMS, PASS2_SUMS, T2 // T2 = (2A 3A 2C 3C)
+ vpunpckhqdq PASS3_SUMS, PASS2_SUMS, T3 // T3 = (2B 3B 2D 3D)
+
+ vinserti128 $0x1, T2_XMM, T0, T4 // T4 = (0A 1A 2A 3A)
+ vinserti128 $0x1, T3_XMM, T1, T5 // T5 = (0B 1B 2B 3B)
+ vperm2i128 $0x31, T2, T0, T0 // T0 = (0C 1C 2C 3C)
+ vperm2i128 $0x31, T3, T1, T1 // T1 = (0D 1D 2D 3D)
+
+ vpaddq T5, T4, T4
+ vpaddq T1, T0, T0
+ vpaddq T4, T0, T0
+ vmovdqu T0, (HASH)
+ ret
+ENDPROC(nh_avx2)
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
new file mode 100644
index 0000000..51f52d4
--- /dev/null
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#define PASS0_SUMS %xmm0
+#define PASS1_SUMS %xmm1
+#define PASS2_SUMS %xmm2
+#define PASS3_SUMS %xmm3
+#define K0 %xmm4
+#define K1 %xmm5
+#define K2 %xmm6
+#define K3 %xmm7
+#define T0 %xmm8
+#define T1 %xmm9
+#define T2 %xmm10
+#define T3 %xmm11
+#define T4 %xmm12
+#define T5 %xmm13
+#define T6 %xmm14
+#define T7 %xmm15
+#define KEY %rdi
+#define MESSAGE %rsi
+#define MESSAGE_LEN %rdx
+#define HASH %rcx
+
+.macro _nh_stride k0, k1, k2, k3, offset
+
+ // Load next message stride
+ movdqu \offset(MESSAGE), T1
+
+ // Load next key stride
+ movdqu \offset(KEY), \k3
+
+ // Add message words to key words
+ movdqa T1, T2
+ movdqa T1, T3
+ paddd T1, \k0 // reuse k0 to avoid a move
+ paddd \k1, T1
+ paddd \k2, T2
+ paddd \k3, T3
+
+ // Multiply 32x32 => 64 and accumulate
+ pshufd $0x10, \k0, T4
+ pshufd $0x32, \k0, \k0
+ pshufd $0x10, T1, T5
+ pshufd $0x32, T1, T1
+ pshufd $0x10, T2, T6
+ pshufd $0x32, T2, T2
+ pshufd $0x10, T3, T7
+ pshufd $0x32, T3, T3
+ pmuludq T4, \k0
+ pmuludq T5, T1
+ pmuludq T6, T2
+ pmuludq T7, T3
+ paddq \k0, PASS0_SUMS
+ paddq T1, PASS1_SUMS
+ paddq T2, PASS2_SUMS
+ paddq T3, PASS3_SUMS
+.endm
+
+/*
+ * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ * u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_sse2)
+
+ movdqu 0x00(KEY), K0
+ movdqu 0x10(KEY), K1
+ movdqu 0x20(KEY), K2
+ add $0x30, KEY
+ pxor PASS0_SUMS, PASS0_SUMS
+ pxor PASS1_SUMS, PASS1_SUMS
+ pxor PASS2_SUMS, PASS2_SUMS
+ pxor PASS3_SUMS, PASS3_SUMS
+
+ sub $0x40, MESSAGE_LEN
+ jl .Lloop4_done
+.Lloop4:
+ _nh_stride K0, K1, K2, K3, 0x00
+ _nh_stride K1, K2, K3, K0, 0x10
+ _nh_stride K2, K3, K0, K1, 0x20
+ _nh_stride K3, K0, K1, K2, 0x30
+ add $0x40, KEY
+ add $0x40, MESSAGE
+ sub $0x40, MESSAGE_LEN
+ jge .Lloop4
+
+.Lloop4_done:
+ and $0x3f, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K0, K1, K2, K3, 0x00
+
+ sub $0x10, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K1, K2, K3, K0, 0x10
+
+ sub $0x10, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K2, K3, K0, K1, 0x20
+
+.Ldone:
+ // Sum the accumulators for each pass, then store the sums to 'hash'
+ movdqa PASS0_SUMS, T0
+ movdqa PASS2_SUMS, T1
+ punpcklqdq PASS1_SUMS, T0 // => (PASS0_SUM_A PASS1_SUM_A)
+ punpcklqdq PASS3_SUMS, T1 // => (PASS2_SUM_A PASS3_SUM_A)
+ punpckhqdq PASS1_SUMS, PASS0_SUMS // => (PASS0_SUM_B PASS1_SUM_B)
+ punpckhqdq PASS3_SUMS, PASS2_SUMS // => (PASS2_SUM_B PASS3_SUM_B)
+ paddq PASS0_SUMS, T0
+ paddq PASS2_SUMS, T1
+ movdqu T0, 0x00(HASH)
+ movdqu T1, 0x10(HASH)
+ ret
+ENDPROC(nh_sse2)
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
new file mode 100644
index 0000000..f7567cb
--- /dev/null
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (AVX2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <asm/simd.h>
+
+asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES])
+{
+ nh_avx2(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_avx2_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ if (srclen < 64 || !crypto_simd_usable())
+ return crypto_nhpoly1305_update(desc, src, srclen);
+
+ do {
+ unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+ kernel_fpu_begin();
+ crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
+ kernel_fpu_end();
+ src += n;
+ srclen -= n;
+ } while (srclen);
+ return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-avx2",
+ .base.cra_priority = 300,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = nhpoly1305_avx2_update,
+ .final = crypto_nhpoly1305_final,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE))
+ return -ENODEV;
+
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (AVX2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-avx2");
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
new file mode 100644
index 0000000..a661ede
--- /dev/null
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (SSE2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <asm/simd.h>
+
+asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES])
+{
+ nh_sse2(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_sse2_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ if (srclen < 64 || !crypto_simd_usable())
+ return crypto_nhpoly1305_update(desc, src, srclen);
+
+ do {
+ unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+ kernel_fpu_begin();
+ crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
+ kernel_fpu_end();
+ src += n;
+ srclen -= n;
+ } while (srclen);
+ return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-sse2",
+ .base.cra_priority = 200,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = nhpoly1305_sse2_update,
+ .final = crypto_nhpoly1305_final,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2))
+ return -ENODEV;
+
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (SSE2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-sse2");
diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S
index 3b6e70d..8b341bc 100644
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
*
* Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/linkage.h>
@@ -323,6 +319,12 @@
vpaddq t2,t1,t1
vmovq t1x,d4
+ # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+ # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+ # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+ # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+ # integers. It's true in a single-block implementation, but not here.
+
# d1 += d0 >> 26
mov d0,%rax
shr $26,%rax
@@ -361,16 +363,16 @@
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
- lea (%eax,%eax,4),%eax
- add %eax,%ebx
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
- mov %ebx,%eax
- shr $26,%eax
+ mov %rbx,%rax
+ shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
index c88c670..5578f84 100644
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
*
* Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/linkage.h>
@@ -253,16 +249,16 @@
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
- lea (%eax,%eax,4),%eax
- add %eax,%ebx
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
- mov %ebx,%eax
- shr $26,%eax
+ mov %rbx,%rax
+ shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
@@ -272,6 +268,10 @@
dec %rcx
jnz .Ldoblock
+ # Zeroing of key material
+ mov %rcx,0x00(%rsp)
+ mov %rcx,0x08(%rsp)
+
add $0x10,%rsp
pop %r12
pop %rbx
@@ -520,6 +520,12 @@
paddq t2,t1
movq t1,d4
+ # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+ # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+ # amount. Careful: we must not assume the carry bits 'd0 >> 26',
+ # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+ # integers. It's true in a single-block implementation, but not here.
+
# d1 += d0 >> 26
mov d0,%rax
shr $26,%rax
@@ -558,16 +564,16 @@
# h0 += (d4 >> 26) * 5
mov d4,%rax
shr $26,%rax
- lea (%eax,%eax,4),%eax
- add %eax,%ebx
+ lea (%rax,%rax,4),%rax
+ add %rax,%rbx
# h4 = d4 & 0x3ffffff
mov d4,%rax
and $0x3ffffff,%eax
mov %eax,h4
# h1 += h0 >> 26
- mov %ebx,%eax
- shr $26,%eax
+ mov %rbx,%rax
+ shr $26,%rax
add %eax,h1
# h0 = h0 & 0x3ffffff
andl $0x3ffffff,%ebx
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index f012b7e..4a1c05d 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -1,21 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Poly1305 authenticator algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <crypto/poly1305.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <asm/fpu/api.h>
#include <asm/simd.h>
struct poly1305_simd_desc_ctx {
@@ -83,35 +79,37 @@
if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
if (unlikely(!sctx->wset)) {
if (!sctx->uset) {
- memcpy(sctx->u, dctx->r, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u, dctx->r);
+ memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u, dctx->r.r);
sctx->uset = true;
}
memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u + 5, dctx->r);
+ poly1305_simd_mult(sctx->u + 5, dctx->r.r);
memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u + 10, dctx->r);
+ poly1305_simd_mult(sctx->u + 10, dctx->r.r);
sctx->wset = true;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
- poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u);
+ poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
+ sctx->u);
src += POLY1305_BLOCK_SIZE * 4 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
}
#endif
if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
if (unlikely(!sctx->uset)) {
- memcpy(sctx->u, dctx->r, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u, dctx->r);
+ memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u, dctx->r.r);
sctx->uset = true;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
- poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u);
+ poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
+ sctx->u);
src += POLY1305_BLOCK_SIZE * 2 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
}
if (srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_block_sse2(dctx->h, src, dctx->r, 1);
+ poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
srclen -= POLY1305_BLOCK_SIZE;
}
return srclen;
@@ -124,7 +122,7 @@
unsigned int bytes;
/* kernel_fpu_begin/end is costly, use fallback for small updates */
- if (srclen <= 288 || !may_use_simd())
+ if (srclen <= 288 || !crypto_simd_usable())
return crypto_poly1305_update(desc, src, srclen);
kernel_fpu_begin();
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 2925077..ddc51db 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
*
@@ -5,22 +6,6 @@
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*
* Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
index d67888f..37bc1d4 100644
--- a/arch/x86/crypto/serpent-avx2-asm_64.S
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* x86_64/AVX2 assembler optimized version of Serpent
*
@@ -6,12 +7,6 @@
* Based on AVX assembler implementation of Serpent by:
* Copyright © 2012 Johannes Goetzfried
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
index d348f15..e5c4a46 100644
--- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Serpent Cipher 4-way parallel algorithm (i586/SSE2)
*
@@ -6,22 +7,6 @@
* Based on crypto/serpent.c by
* Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
* 2003 Herbert Valerio Riedel <hvr@gnu.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
index acc066c..5e0b3a3 100644
--- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
*
@@ -6,22 +7,6 @@
* Based on crypto/serpent.c by
* Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
* 2003 Herbert Valerio Riedel <hvr@gnu.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 03347b1..13fd8d3 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue Code for x86_64/AVX2 assembler optimized version of Serpent
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
*/
#include <linux/module.h>
@@ -172,7 +167,7 @@
return glue_xts_req_128bit(&serpent_enc_xts, req,
XTS_TWEAK_CAST(__serpent_encrypt),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, false);
}
static int xts_decrypt(struct skcipher_request *req)
@@ -182,7 +177,7 @@
return glue_xts_req_128bit(&serpent_dec_xts, req,
XTS_TWEAK_CAST(__serpent_encrypt),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, true);
}
static struct skcipher_alg serpent_algs[] = {
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 458567e..7d3dca3 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue Code for AVX assembler versions of Serpent Cipher
*
@@ -5,22 +6,6 @@
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*
* Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/module.h>
@@ -222,7 +207,7 @@
return glue_xts_req_128bit(&serpent_enc_xts, req,
XTS_TWEAK_CAST(__serpent_encrypt),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, false);
}
static int xts_decrypt(struct skcipher_request *req)
@@ -232,7 +217,7 @@
return glue_xts_req_128bit(&serpent_dec_xts, req,
XTS_TWEAK_CAST(__serpent_encrypt),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, true);
}
static struct skcipher_alg serpent_algs[] = {
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 3dafe13..5fdf193 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue Code for SSE2 assembler versions of Serpent Cipher
*
@@ -11,22 +12,6 @@
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
* CTR part based on code (crypto/ctr.c) by:
* (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/module.h>
diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile
deleted file mode 100644
index 815ded3..0000000
--- a/arch/x86/crypto/sha1-mb/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Arch-specific CryptoAPI modules.
-#
-
-OBJECT_FILES_NON_STANDARD := y
-
-avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
- $(comma)4)$(comma)%ymm2,yes,no)
-ifeq ($(avx2_supported),yes)
- obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb.o
- sha1-mb-y := sha1_mb.o sha1_mb_mgr_flush_avx2.o \
- sha1_mb_mgr_init_avx2.o sha1_mb_mgr_submit_avx2.o sha1_x8_avx2.o
-endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c
deleted file mode 100644
index b938056..0000000
--- a/arch/x86/crypto/sha1-mb/sha1_mb.c
+++ /dev/null
@@ -1,1011 +0,0 @@
-/*
- * Multi buffer SHA1 algorithm Glue Code
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/cryptohash.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <crypto/scatterwalk.h>
-#include <crypto/sha.h>
-#include <crypto/mcryptd.h>
-#include <crypto/crypto_wq.h>
-#include <asm/byteorder.h>
-#include <linux/hardirq.h>
-#include <asm/fpu/api.h>
-#include "sha1_mb_ctx.h"
-
-#define FLUSH_INTERVAL 1000 /* in usec */
-
-static struct mcryptd_alg_state sha1_mb_alg_state;
-
-struct sha1_mb_ctx {
- struct mcryptd_ahash *mcryptd_tfm;
-};
-
-static inline struct mcryptd_hash_request_ctx
- *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx)
-{
- struct ahash_request *areq;
-
- areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
- return container_of(areq, struct mcryptd_hash_request_ctx, areq);
-}
-
-static inline struct ahash_request
- *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
-{
- return container_of((void *) ctx, struct ahash_request, __ctx);
-}
-
-static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
- struct ahash_request *areq)
-{
- rctx->flag = HASH_UPDATE;
-}
-
-static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)
- (struct sha1_mb_mgr *state, struct job_sha1 *job);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)
- (struct sha1_mb_mgr *state);
-static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)
- (struct sha1_mb_mgr *state);
-
-static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2],
- uint64_t total_len)
-{
- uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1);
-
- memset(&padblock[i], 0, SHA1_BLOCK_SIZE);
- padblock[i] = 0x80;
-
- i += ((SHA1_BLOCK_SIZE - 1) &
- (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1)))
- + 1 + SHA1_PADLENGTHFIELD_SIZE;
-
-#if SHA1_PADLENGTHFIELD_SIZE == 16
- *((uint64_t *) &padblock[i - 16]) = 0;
-#endif
-
- *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
-
- /* Number of extra blocks to hash */
- return i >> SHA1_LOG2_BLOCK_SIZE;
-}
-
-static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr,
- struct sha1_hash_ctx *ctx)
-{
- while (ctx) {
- if (ctx->status & HASH_CTX_STS_COMPLETE) {
- /* Clear PROCESSING bit */
- ctx->status = HASH_CTX_STS_COMPLETE;
- return ctx;
- }
-
- /*
- * If the extra blocks are empty, begin hashing what remains
- * in the user's buffer.
- */
- if (ctx->partial_block_buffer_length == 0 &&
- ctx->incoming_buffer_length) {
-
- const void *buffer = ctx->incoming_buffer;
- uint32_t len = ctx->incoming_buffer_length;
- uint32_t copy_len;
-
- /*
- * Only entire blocks can be hashed.
- * Copy remainder to extra blocks buffer.
- */
- copy_len = len & (SHA1_BLOCK_SIZE-1);
-
- if (copy_len) {
- len -= copy_len;
- memcpy(ctx->partial_block_buffer,
- ((const char *) buffer + len),
- copy_len);
- ctx->partial_block_buffer_length = copy_len;
- }
-
- ctx->incoming_buffer_length = 0;
-
- /* len should be a multiple of the block size now */
- assert((len % SHA1_BLOCK_SIZE) == 0);
-
- /* Set len to the number of blocks to be hashed */
- len >>= SHA1_LOG2_BLOCK_SIZE;
-
- if (len) {
-
- ctx->job.buffer = (uint8_t *) buffer;
- ctx->job.len = len;
- ctx = (struct sha1_hash_ctx *)sha1_job_mgr_submit(&mgr->mgr,
- &ctx->job);
- continue;
- }
- }
-
- /*
- * If the extra blocks are not empty, then we are
- * either on the last block(s) or we need more
- * user input before continuing.
- */
- if (ctx->status & HASH_CTX_STS_LAST) {
-
- uint8_t *buf = ctx->partial_block_buffer;
- uint32_t n_extra_blocks =
- sha1_pad(buf, ctx->total_length);
-
- ctx->status = (HASH_CTX_STS_PROCESSING |
- HASH_CTX_STS_COMPLETE);
- ctx->job.buffer = buf;
- ctx->job.len = (uint32_t) n_extra_blocks;
- ctx = (struct sha1_hash_ctx *)
- sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
- continue;
- }
-
- ctx->status = HASH_CTX_STS_IDLE;
- return ctx;
- }
-
- return NULL;
-}
-
-static struct sha1_hash_ctx
- *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr)
-{
- /*
- * If get_comp_job returns NULL, there are no jobs complete.
- * If get_comp_job returns a job, verify that it is safe to return to
- * the user.
- * If it is not ready, resubmit the job to finish processing.
- * If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
- * Otherwise, all jobs currently being managed by the hash_ctx_mgr
- * still need processing.
- */
- struct sha1_hash_ctx *ctx;
-
- ctx = (struct sha1_hash_ctx *) sha1_job_mgr_get_comp_job(&mgr->mgr);
- return sha1_ctx_mgr_resubmit(mgr, ctx);
-}
-
-static void sha1_ctx_mgr_init(struct sha1_ctx_mgr *mgr)
-{
- sha1_job_mgr_init(&mgr->mgr);
-}
-
-static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
- struct sha1_hash_ctx *ctx,
- const void *buffer,
- uint32_t len,
- int flags)
-{
- if (flags & ~(HASH_UPDATE | HASH_LAST)) {
- /* User should not pass anything other than UPDATE or LAST */
- ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
- return ctx;
- }
-
- if (ctx->status & HASH_CTX_STS_PROCESSING) {
- /* Cannot submit to a currently processing job. */
- ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
- return ctx;
- }
-
- if (ctx->status & HASH_CTX_STS_COMPLETE) {
- /* Cannot update a finished job. */
- ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
- return ctx;
- }
-
- /*
- * If we made it here, there were no errors during this call to
- * submit
- */
- ctx->error = HASH_CTX_ERROR_NONE;
-
- /* Store buffer ptr info from user */
- ctx->incoming_buffer = buffer;
- ctx->incoming_buffer_length = len;
-
- /*
- * Store the user's request flags and mark this ctx as currently
- * being processed.
- */
- ctx->status = (flags & HASH_LAST) ?
- (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
- HASH_CTX_STS_PROCESSING;
-
- /* Advance byte counter */
- ctx->total_length += len;
-
- /*
- * If there is anything currently buffered in the extra blocks,
- * append to it until it contains a whole block.
- * Or if the user's buffer contains less than a whole block,
- * append as much as possible to the extra block.
- */
- if (ctx->partial_block_buffer_length || len < SHA1_BLOCK_SIZE) {
- /*
- * Compute how many bytes to copy from user buffer into
- * extra block
- */
- uint32_t copy_len = SHA1_BLOCK_SIZE -
- ctx->partial_block_buffer_length;
- if (len < copy_len)
- copy_len = len;
-
- if (copy_len) {
- /* Copy and update relevant pointers and counters */
- memcpy(&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
- buffer, copy_len);
-
- ctx->partial_block_buffer_length += copy_len;
- ctx->incoming_buffer = (const void *)
- ((const char *)buffer + copy_len);
- ctx->incoming_buffer_length = len - copy_len;
- }
-
- /*
- * The extra block should never contain more than 1 block
- * here
- */
- assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
-
- /*
- * If the extra block buffer contains exactly 1 block, it can
- * be hashed.
- */
- if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
- ctx->partial_block_buffer_length = 0;
-
- ctx->job.buffer = ctx->partial_block_buffer;
- ctx->job.len = 1;
- ctx = (struct sha1_hash_ctx *)
- sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
- }
- }
-
- return sha1_ctx_mgr_resubmit(mgr, ctx);
-}
-
-static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr)
-{
- struct sha1_hash_ctx *ctx;
-
- while (1) {
- ctx = (struct sha1_hash_ctx *) sha1_job_mgr_flush(&mgr->mgr);
-
- /* If flush returned 0, there are no more jobs in flight. */
- if (!ctx)
- return NULL;
-
- /*
- * If flush returned a job, resubmit the job to finish
- * processing.
- */
- ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
-
- /*
- * If sha1_ctx_mgr_resubmit returned a job, it is ready to be
- * returned. Otherwise, all jobs currently being managed by the
- * sha1_ctx_mgr still need processing. Loop.
- */
- if (ctx)
- return ctx;
- }
-}
-
-static int sha1_mb_init(struct ahash_request *areq)
-{
- struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
-
- hash_ctx_init(sctx);
- sctx->job.result_digest[0] = SHA1_H0;
- sctx->job.result_digest[1] = SHA1_H1;
- sctx->job.result_digest[2] = SHA1_H2;
- sctx->job.result_digest[3] = SHA1_H3;
- sctx->job.result_digest[4] = SHA1_H4;
- sctx->total_length = 0;
- sctx->partial_block_buffer_length = 0;
- sctx->status = HASH_CTX_STS_IDLE;
-
- return 0;
-}
-
-static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
-{
- int i;
- struct sha1_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
- __be32 *dst = (__be32 *) rctx->out;
-
- for (i = 0; i < 5; ++i)
- dst[i] = cpu_to_be32(sctx->job.result_digest[i]);
-
- return 0;
-}
-
-static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
- struct mcryptd_alg_cstate *cstate, bool flush)
-{
- int flag = HASH_UPDATE;
- int nbytes, err = 0;
- struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
- struct sha1_hash_ctx *sha_ctx;
-
- /* more work ? */
- while (!(rctx->flag & HASH_DONE)) {
- nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
- if (nbytes < 0) {
- err = nbytes;
- goto out;
- }
- /* check if the walk is done */
- if (crypto_ahash_walk_last(&rctx->walk)) {
- rctx->flag |= HASH_DONE;
- if (rctx->flag & HASH_FINAL)
- flag |= HASH_LAST;
-
- }
- sha_ctx = (struct sha1_hash_ctx *)
- ahash_request_ctx(&rctx->areq);
- kernel_fpu_begin();
- sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx,
- rctx->walk.data, nbytes, flag);
- if (!sha_ctx) {
- if (flush)
- sha_ctx = sha1_ctx_mgr_flush(cstate->mgr);
- }
- kernel_fpu_end();
- if (sha_ctx)
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- else {
- rctx = NULL;
- goto out;
- }
- }
-
- /* copy the results */
- if (rctx->flag & HASH_FINAL)
- sha1_mb_set_results(rctx);
-
-out:
- *ret_rctx = rctx;
- return err;
-}
-
-static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
- struct mcryptd_alg_cstate *cstate,
- int err)
-{
- struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
- struct sha1_hash_ctx *sha_ctx;
- struct mcryptd_hash_request_ctx *req_ctx;
- int ret;
-
- /* remove from work list */
- spin_lock(&cstate->work_lock);
- list_del(&rctx->waiter);
- spin_unlock(&cstate->work_lock);
-
- if (irqs_disabled())
- rctx->complete(&req->base, err);
- else {
- local_bh_disable();
- rctx->complete(&req->base, err);
- local_bh_enable();
- }
-
- /* check to see if there are other jobs that are done */
- sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
- while (sha_ctx) {
- req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&req_ctx, cstate, false);
- if (req_ctx) {
- spin_lock(&cstate->work_lock);
- list_del(&req_ctx->waiter);
- spin_unlock(&cstate->work_lock);
-
- req = cast_mcryptd_ctx_to_req(req_ctx);
- if (irqs_disabled())
- req_ctx->complete(&req->base, ret);
- else {
- local_bh_disable();
- req_ctx->complete(&req->base, ret);
- local_bh_enable();
- }
- }
- sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
- }
-
- return 0;
-}
-
-static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
- struct mcryptd_alg_cstate *cstate)
-{
- unsigned long next_flush;
- unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
-
- /* initialize tag */
- rctx->tag.arrival = jiffies; /* tag the arrival time */
- rctx->tag.seq_num = cstate->next_seq_num++;
- next_flush = rctx->tag.arrival + delay;
- rctx->tag.expire = next_flush;
-
- spin_lock(&cstate->work_lock);
- list_add_tail(&rctx->waiter, &cstate->work_list);
- spin_unlock(&cstate->work_lock);
-
- mcryptd_arm_flusher(cstate, delay);
-}
-
-static int sha1_mb_update(struct ahash_request *areq)
-{
- struct mcryptd_hash_request_ctx *rctx =
- container_of(areq, struct mcryptd_hash_request_ctx, areq);
- struct mcryptd_alg_cstate *cstate =
- this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
-
- struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
- struct sha1_hash_ctx *sha_ctx;
- int ret = 0, nbytes;
-
-
- /* sanity check */
- if (rctx->tag.cpu != smp_processor_id()) {
- pr_err("mcryptd error: cpu clash\n");
- goto done;
- }
-
- /* need to init context */
- req_ctx_init(rctx, areq);
-
- nbytes = crypto_ahash_walk_first(req, &rctx->walk);
-
- if (nbytes < 0) {
- ret = nbytes;
- goto done;
- }
-
- if (crypto_ahash_walk_last(&rctx->walk))
- rctx->flag |= HASH_DONE;
-
- /* submit */
- sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
- sha1_mb_add_list(rctx, cstate);
- kernel_fpu_begin();
- sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
- nbytes, HASH_UPDATE);
- kernel_fpu_end();
-
- /* check if anything is returned */
- if (!sha_ctx)
- return -EINPROGRESS;
-
- if (sha_ctx->error) {
- ret = sha_ctx->error;
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- goto done;
- }
-
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&rctx, cstate, false);
-
- if (!rctx)
- return -EINPROGRESS;
-done:
- sha_complete_job(rctx, cstate, ret);
- return ret;
-}
-
-static int sha1_mb_finup(struct ahash_request *areq)
-{
- struct mcryptd_hash_request_ctx *rctx =
- container_of(areq, struct mcryptd_hash_request_ctx, areq);
- struct mcryptd_alg_cstate *cstate =
- this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
-
- struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
- struct sha1_hash_ctx *sha_ctx;
- int ret = 0, flag = HASH_UPDATE, nbytes;
-
- /* sanity check */
- if (rctx->tag.cpu != smp_processor_id()) {
- pr_err("mcryptd error: cpu clash\n");
- goto done;
- }
-
- /* need to init context */
- req_ctx_init(rctx, areq);
-
- nbytes = crypto_ahash_walk_first(req, &rctx->walk);
-
- if (nbytes < 0) {
- ret = nbytes;
- goto done;
- }
-
- if (crypto_ahash_walk_last(&rctx->walk)) {
- rctx->flag |= HASH_DONE;
- flag = HASH_LAST;
- }
-
- /* submit */
- rctx->flag |= HASH_FINAL;
- sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
- sha1_mb_add_list(rctx, cstate);
-
- kernel_fpu_begin();
- sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
- nbytes, flag);
- kernel_fpu_end();
-
- /* check if anything is returned */
- if (!sha_ctx)
- return -EINPROGRESS;
-
- if (sha_ctx->error) {
- ret = sha_ctx->error;
- goto done;
- }
-
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&rctx, cstate, false);
- if (!rctx)
- return -EINPROGRESS;
-done:
- sha_complete_job(rctx, cstate, ret);
- return ret;
-}
-
-static int sha1_mb_final(struct ahash_request *areq)
-{
- struct mcryptd_hash_request_ctx *rctx =
- container_of(areq, struct mcryptd_hash_request_ctx, areq);
- struct mcryptd_alg_cstate *cstate =
- this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
-
- struct sha1_hash_ctx *sha_ctx;
- int ret = 0;
- u8 data;
-
- /* sanity check */
- if (rctx->tag.cpu != smp_processor_id()) {
- pr_err("mcryptd error: cpu clash\n");
- goto done;
- }
-
- /* need to init context */
- req_ctx_init(rctx, areq);
-
- rctx->flag |= HASH_DONE | HASH_FINAL;
-
- sha_ctx = (struct sha1_hash_ctx *) ahash_request_ctx(areq);
- /* flag HASH_FINAL and 0 data size */
- sha1_mb_add_list(rctx, cstate);
- kernel_fpu_begin();
- sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
- HASH_LAST);
- kernel_fpu_end();
-
- /* check if anything is returned */
- if (!sha_ctx)
- return -EINPROGRESS;
-
- if (sha_ctx->error) {
- ret = sha_ctx->error;
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- goto done;
- }
-
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&rctx, cstate, false);
- if (!rctx)
- return -EINPROGRESS;
-done:
- sha_complete_job(rctx, cstate, ret);
- return ret;
-}
-
-static int sha1_mb_export(struct ahash_request *areq, void *out)
-{
- struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
-
- memcpy(out, sctx, sizeof(*sctx));
-
- return 0;
-}
-
-static int sha1_mb_import(struct ahash_request *areq, const void *in)
-{
- struct sha1_hash_ctx *sctx = ahash_request_ctx(areq);
-
- memcpy(sctx, in, sizeof(*sctx));
-
- return 0;
-}
-
-static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
-{
- struct mcryptd_ahash *mcryptd_tfm;
- struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
- struct mcryptd_hash_ctx *mctx;
-
- mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
- CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
- if (IS_ERR(mcryptd_tfm))
- return PTR_ERR(mcryptd_tfm);
- mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
- mctx->alg_state = &sha1_mb_alg_state;
- ctx->mcryptd_tfm = mcryptd_tfm;
- crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
- sizeof(struct ahash_request) +
- crypto_ahash_reqsize(&mcryptd_tfm->base));
-
- return 0;
-}
-
-static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm)
-{
- struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-
- mcryptd_free_ahash(ctx->mcryptd_tfm);
-}
-
-static int sha1_mb_areq_init_tfm(struct crypto_tfm *tfm)
-{
- crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
- sizeof(struct ahash_request) +
- sizeof(struct sha1_hash_ctx));
-
- return 0;
-}
-
-static void sha1_mb_areq_exit_tfm(struct crypto_tfm *tfm)
-{
- struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-
- mcryptd_free_ahash(ctx->mcryptd_tfm);
-}
-
-static struct ahash_alg sha1_mb_areq_alg = {
- .init = sha1_mb_init,
- .update = sha1_mb_update,
- .final = sha1_mb_final,
- .finup = sha1_mb_finup,
- .export = sha1_mb_export,
- .import = sha1_mb_import,
- .halg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .statesize = sizeof(struct sha1_hash_ctx),
- .base = {
- .cra_name = "__sha1-mb",
- .cra_driver_name = "__intel_sha1-mb",
- .cra_priority = 100,
- /*
- * use ASYNC flag as some buffers in multi-buffer
- * algo may not have completed before hashing thread
- * sleep
- */
- .cra_flags = CRYPTO_ALG_ASYNC |
- CRYPTO_ALG_INTERNAL,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT
- (sha1_mb_areq_alg.halg.base.cra_list),
- .cra_init = sha1_mb_areq_init_tfm,
- .cra_exit = sha1_mb_areq_exit_tfm,
- .cra_ctxsize = sizeof(struct sha1_hash_ctx),
- }
- }
-};
-
-static int sha1_mb_async_init(struct ahash_request *req)
-{
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_init(mcryptd_req);
-}
-
-static int sha1_mb_async_update(struct ahash_request *req)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_update(mcryptd_req);
-}
-
-static int sha1_mb_async_finup(struct ahash_request *req)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_finup(mcryptd_req);
-}
-
-static int sha1_mb_async_final(struct ahash_request *req)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_final(mcryptd_req);
-}
-
-static int sha1_mb_async_digest(struct ahash_request *req)
-{
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_digest(mcryptd_req);
-}
-
-static int sha1_mb_async_export(struct ahash_request *req, void *out)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_export(mcryptd_req, out);
-}
-
-static int sha1_mb_async_import(struct ahash_request *req, const void *in)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
- struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
- struct mcryptd_hash_request_ctx *rctx;
- struct ahash_request *areq;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- rctx = ahash_request_ctx(mcryptd_req);
- areq = &rctx->areq;
-
- ahash_request_set_tfm(areq, child);
- ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
- rctx->complete, req);
-
- return crypto_ahash_import(mcryptd_req, in);
-}
-
-static struct ahash_alg sha1_mb_async_alg = {
- .init = sha1_mb_async_init,
- .update = sha1_mb_async_update,
- .final = sha1_mb_async_final,
- .finup = sha1_mb_async_finup,
- .digest = sha1_mb_async_digest,
- .export = sha1_mb_async_export,
- .import = sha1_mb_async_import,
- .halg = {
- .digestsize = SHA1_DIGEST_SIZE,
- .statesize = sizeof(struct sha1_hash_ctx),
- .base = {
- .cra_name = "sha1",
- .cra_driver_name = "sha1_mb",
- /*
- * Low priority, since with few concurrent hash requests
- * this is extremely slow due to the flush delay. Users
- * whose workloads would benefit from this can request
- * it explicitly by driver name, or can increase its
- * priority at runtime using NETLINK_CRYPTO.
- */
- .cra_priority = 50,
- .cra_flags = CRYPTO_ALG_ASYNC,
- .cra_blocksize = SHA1_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT(sha1_mb_async_alg.halg.base.cra_list),
- .cra_init = sha1_mb_async_init_tfm,
- .cra_exit = sha1_mb_async_exit_tfm,
- .cra_ctxsize = sizeof(struct sha1_mb_ctx),
- .cra_alignmask = 0,
- },
- },
-};
-
-static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
-{
- struct mcryptd_hash_request_ctx *rctx;
- unsigned long cur_time;
- unsigned long next_flush = 0;
- struct sha1_hash_ctx *sha_ctx;
-
-
- cur_time = jiffies;
-
- while (!list_empty(&cstate->work_list)) {
- rctx = list_entry(cstate->work_list.next,
- struct mcryptd_hash_request_ctx, waiter);
- if (time_before(cur_time, rctx->tag.expire))
- break;
- kernel_fpu_begin();
- sha_ctx = (struct sha1_hash_ctx *)
- sha1_ctx_mgr_flush(cstate->mgr);
- kernel_fpu_end();
- if (!sha_ctx) {
- pr_err("sha1_mb error: nothing got flushed for non-empty list\n");
- break;
- }
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- sha_finish_walk(&rctx, cstate, true);
- sha_complete_job(rctx, cstate, 0);
- }
-
- if (!list_empty(&cstate->work_list)) {
- rctx = list_entry(cstate->work_list.next,
- struct mcryptd_hash_request_ctx, waiter);
- /* get the hash context and then flush time */
- next_flush = rctx->tag.expire;
- mcryptd_arm_flusher(cstate, get_delay(next_flush));
- }
- return next_flush;
-}
-
-static int __init sha1_mb_mod_init(void)
-{
-
- int cpu;
- int err;
- struct mcryptd_alg_cstate *cpu_state;
-
- /* check for dependent cpu features */
- if (!boot_cpu_has(X86_FEATURE_AVX2) ||
- !boot_cpu_has(X86_FEATURE_BMI2))
- return -ENODEV;
-
- /* initialize multibuffer structures */
- sha1_mb_alg_state.alg_cstate = alloc_percpu(struct mcryptd_alg_cstate);
-
- sha1_job_mgr_init = sha1_mb_mgr_init_avx2;
- sha1_job_mgr_submit = sha1_mb_mgr_submit_avx2;
- sha1_job_mgr_flush = sha1_mb_mgr_flush_avx2;
- sha1_job_mgr_get_comp_job = sha1_mb_mgr_get_comp_job_avx2;
-
- if (!sha1_mb_alg_state.alg_cstate)
- return -ENOMEM;
- for_each_possible_cpu(cpu) {
- cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
- cpu_state->next_flush = 0;
- cpu_state->next_seq_num = 0;
- cpu_state->flusher_engaged = false;
- INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
- cpu_state->cpu = cpu;
- cpu_state->alg_state = &sha1_mb_alg_state;
- cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr),
- GFP_KERNEL);
- if (!cpu_state->mgr)
- goto err2;
- sha1_ctx_mgr_init(cpu_state->mgr);
- INIT_LIST_HEAD(&cpu_state->work_list);
- spin_lock_init(&cpu_state->work_lock);
- }
- sha1_mb_alg_state.flusher = &sha1_mb_flusher;
-
- err = crypto_register_ahash(&sha1_mb_areq_alg);
- if (err)
- goto err2;
- err = crypto_register_ahash(&sha1_mb_async_alg);
- if (err)
- goto err1;
-
-
- return 0;
-err1:
- crypto_unregister_ahash(&sha1_mb_areq_alg);
-err2:
- for_each_possible_cpu(cpu) {
- cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
- kfree(cpu_state->mgr);
- }
- free_percpu(sha1_mb_alg_state.alg_cstate);
- return -ENODEV;
-}
-
-static void __exit sha1_mb_mod_fini(void)
-{
- int cpu;
- struct mcryptd_alg_cstate *cpu_state;
-
- crypto_unregister_ahash(&sha1_mb_async_alg);
- crypto_unregister_ahash(&sha1_mb_areq_alg);
- for_each_possible_cpu(cpu) {
- cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
- kfree(cpu_state->mgr);
- }
- free_percpu(sha1_mb_alg_state.alg_cstate);
-}
-
-module_init(sha1_mb_mod_init);
-module_exit(sha1_mb_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, multi buffer accelerated");
-
-MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
deleted file mode 100644
index 9454bd1..0000000
--- a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Header file for multi buffer SHA context
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _SHA_MB_CTX_INTERNAL_H
-#define _SHA_MB_CTX_INTERNAL_H
-
-#include "sha1_mb_mgr.h"
-
-#define HASH_UPDATE 0x00
-#define HASH_LAST 0x01
-#define HASH_DONE 0x02
-#define HASH_FINAL 0x04
-
-#define HASH_CTX_STS_IDLE 0x00
-#define HASH_CTX_STS_PROCESSING 0x01
-#define HASH_CTX_STS_LAST 0x02
-#define HASH_CTX_STS_COMPLETE 0x04
-
-enum hash_ctx_error {
- HASH_CTX_ERROR_NONE = 0,
- HASH_CTX_ERROR_INVALID_FLAGS = -1,
- HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
- HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
-
-#ifdef HASH_CTX_DEBUG
- HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
-#endif
-};
-
-
-#define hash_ctx_user_data(ctx) ((ctx)->user_data)
-#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
-#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
-#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
-#define hash_ctx_status(ctx) ((ctx)->status)
-#define hash_ctx_error(ctx) ((ctx)->error)
-#define hash_ctx_init(ctx) \
- do { \
- (ctx)->error = HASH_CTX_ERROR_NONE; \
- (ctx)->status = HASH_CTX_STS_COMPLETE; \
- } while (0)
-
-
-/* Hash Constants and Typedefs */
-#define SHA1_DIGEST_LENGTH 5
-#define SHA1_LOG2_BLOCK_SIZE 6
-
-#define SHA1_PADLENGTHFIELD_SIZE 8
-
-#ifdef SHA_MB_DEBUG
-#define assert(expr) \
-do { \
- if (unlikely(!(expr))) { \
- printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
- #expr, __FILE__, __func__, __LINE__); \
- } \
-} while (0)
-#else
-#define assert(expr) do {} while (0)
-#endif
-
-struct sha1_ctx_mgr {
- struct sha1_mb_mgr mgr;
-};
-
-/* typedef struct sha1_ctx_mgr sha1_ctx_mgr; */
-
-struct sha1_hash_ctx {
- /* Must be at struct offset 0 */
- struct job_sha1 job;
- /* status flag */
- int status;
- /* error flag */
- int error;
-
- uint64_t total_length;
- const void *incoming_buffer;
- uint32_t incoming_buffer_length;
- uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2];
- uint32_t partial_block_buffer_length;
- void *user_data;
-};
-
-#endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h b/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h
deleted file mode 100644
index 08ad1a9..0000000
--- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Header file for multi buffer SHA1 algorithm manager
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * James Guilford <james.guilford@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#ifndef __SHA_MB_MGR_H
-#define __SHA_MB_MGR_H
-
-
-#include <linux/types.h>
-
-#define NUM_SHA1_DIGEST_WORDS 5
-
-enum job_sts { STS_UNKNOWN = 0,
- STS_BEING_PROCESSED = 1,
- STS_COMPLETED = 2,
- STS_INTERNAL_ERROR = 3,
- STS_ERROR = 4
-};
-
-struct job_sha1 {
- u8 *buffer;
- u32 len;
- u32 result_digest[NUM_SHA1_DIGEST_WORDS] __aligned(32);
- enum job_sts status;
- void *user_data;
-};
-
-/* SHA1 out-of-order scheduler */
-
-/* typedef uint32_t sha1_digest_array[5][8]; */
-
-struct sha1_args_x8 {
- uint32_t digest[5][8];
- uint8_t *data_ptr[8];
-};
-
-struct sha1_lane_data {
- struct job_sha1 *job_in_lane;
-};
-
-struct sha1_mb_mgr {
- struct sha1_args_x8 args;
-
- uint32_t lens[8];
-
- /* each byte is index (0...7) of unused lanes */
- uint64_t unused_lanes;
- /* byte 4 is set to FF as a flag */
- struct sha1_lane_data ldata[8];
-};
-
-
-#define SHA1_MB_MGR_NUM_LANES_AVX2 8
-
-void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state);
-struct job_sha1 *sha1_mb_mgr_submit_avx2(struct sha1_mb_mgr *state,
- struct job_sha1 *job);
-struct job_sha1 *sha1_mb_mgr_flush_avx2(struct sha1_mb_mgr *state);
-struct job_sha1 *sha1_mb_mgr_get_comp_job_avx2(struct sha1_mb_mgr *state);
-
-#endif
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S
deleted file mode 100644
index 86688c6..0000000
--- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_datastruct.S
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * Header file for multi buffer SHA1 algorithm data structure
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * James Guilford <james.guilford@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-# Macros for defining data structures
-
-# Usage example
-
-#START_FIELDS # JOB_AES
-### name size align
-#FIELD _plaintext, 8, 8 # pointer to plaintext
-#FIELD _ciphertext, 8, 8 # pointer to ciphertext
-#FIELD _IV, 16, 8 # IV
-#FIELD _keys, 8, 8 # pointer to keys
-#FIELD _len, 4, 4 # length in bytes
-#FIELD _status, 4, 4 # status enumeration
-#FIELD _user_data, 8, 8 # pointer to user data
-#UNION _union, size1, align1, \
-# size2, align2, \
-# size3, align3, \
-# ...
-#END_FIELDS
-#%assign _JOB_AES_size _FIELD_OFFSET
-#%assign _JOB_AES_align _STRUCT_ALIGN
-
-#########################################################################
-
-# Alternate "struc-like" syntax:
-# STRUCT job_aes2
-# RES_Q .plaintext, 1
-# RES_Q .ciphertext, 1
-# RES_DQ .IV, 1
-# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
-# RES_U .union, size1, align1, \
-# size2, align2, \
-# ...
-# ENDSTRUCT
-# # Following only needed if nesting
-# %assign job_aes2_size _FIELD_OFFSET
-# %assign job_aes2_align _STRUCT_ALIGN
-#
-# RES_* macros take a name, a count and an optional alignment.
-# The count in in terms of the base size of the macro, and the
-# default alignment is the base size.
-# The macros are:
-# Macro Base size
-# RES_B 1
-# RES_W 2
-# RES_D 4
-# RES_Q 8
-# RES_DQ 16
-# RES_Y 32
-# RES_Z 64
-#
-# RES_U defines a union. It's arguments are a name and two or more
-# pairs of "size, alignment"
-#
-# The two assigns are only needed if this structure is being nested
-# within another. Even if the assigns are not done, one can still use
-# STRUCT_NAME_size as the size of the structure.
-#
-# Note that for nesting, you still need to assign to STRUCT_NAME_size.
-#
-# The differences between this and using "struc" directly are that each
-# type is implicitly aligned to its natural length (although this can be
-# over-ridden with an explicit third parameter), and that the structure
-# is padded at the end to its overall alignment.
-#
-
-#########################################################################
-
-#ifndef _SHA1_MB_MGR_DATASTRUCT_ASM_
-#define _SHA1_MB_MGR_DATASTRUCT_ASM_
-
-## START_FIELDS
-.macro START_FIELDS
- _FIELD_OFFSET = 0
- _STRUCT_ALIGN = 0
-.endm
-
-## FIELD name size align
-.macro FIELD name size align
- _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
- \name = _FIELD_OFFSET
- _FIELD_OFFSET = _FIELD_OFFSET + (\size)
-.if (\align > _STRUCT_ALIGN)
- _STRUCT_ALIGN = \align
-.endif
-.endm
-
-## END_FIELDS
-.macro END_FIELDS
- _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
-.endm
-
-########################################################################
-
-.macro STRUCT p1
-START_FIELDS
-.struc \p1
-.endm
-
-.macro ENDSTRUCT
- tmp = _FIELD_OFFSET
- END_FIELDS
- tmp = (_FIELD_OFFSET - %%tmp)
-.if (tmp > 0)
- .lcomm tmp
-.endif
-.endstruc
-.endm
-
-## RES_int name size align
-.macro RES_int p1 p2 p3
- name = \p1
- size = \p2
- align = .\p3
-
- _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
-.align align
-.lcomm name size
- _FIELD_OFFSET = _FIELD_OFFSET + (size)
-.if (align > _STRUCT_ALIGN)
- _STRUCT_ALIGN = align
-.endif
-.endm
-
-
-
-# macro RES_B name, size [, align]
-.macro RES_B _name, _size, _align=1
-RES_int _name _size _align
-.endm
-
-# macro RES_W name, size [, align]
-.macro RES_W _name, _size, _align=2
-RES_int _name 2*(_size) _align
-.endm
-
-# macro RES_D name, size [, align]
-.macro RES_D _name, _size, _align=4
-RES_int _name 4*(_size) _align
-.endm
-
-# macro RES_Q name, size [, align]
-.macro RES_Q _name, _size, _align=8
-RES_int _name 8*(_size) _align
-.endm
-
-# macro RES_DQ name, size [, align]
-.macro RES_DQ _name, _size, _align=16
-RES_int _name 16*(_size) _align
-.endm
-
-# macro RES_Y name, size [, align]
-.macro RES_Y _name, _size, _align=32
-RES_int _name 32*(_size) _align
-.endm
-
-# macro RES_Z name, size [, align]
-.macro RES_Z _name, _size, _align=64
-RES_int _name 64*(_size) _align
-.endm
-
-
-#endif
-
-########################################################################
-#### Define constants
-########################################################################
-
-########################################################################
-#### Define SHA1 Out Of Order Data Structures
-########################################################################
-
-START_FIELDS # LANE_DATA
-### name size align
-FIELD _job_in_lane, 8, 8 # pointer to job object
-END_FIELDS
-
-_LANE_DATA_size = _FIELD_OFFSET
-_LANE_DATA_align = _STRUCT_ALIGN
-
-########################################################################
-
-START_FIELDS # SHA1_ARGS_X8
-### name size align
-FIELD _digest, 4*5*8, 16 # transposed digest
-FIELD _data_ptr, 8*8, 8 # array of pointers to data
-END_FIELDS
-
-_SHA1_ARGS_X4_size = _FIELD_OFFSET
-_SHA1_ARGS_X4_align = _STRUCT_ALIGN
-_SHA1_ARGS_X8_size = _FIELD_OFFSET
-_SHA1_ARGS_X8_align = _STRUCT_ALIGN
-
-########################################################################
-
-START_FIELDS # MB_MGR
-### name size align
-FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align
-FIELD _lens, 4*8, 8
-FIELD _unused_lanes, 8, 8
-FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
-END_FIELDS
-
-_MB_MGR_size = _FIELD_OFFSET
-_MB_MGR_align = _STRUCT_ALIGN
-
-_args_digest = _args + _digest
-_args_data_ptr = _args + _data_ptr
-
-
-########################################################################
-#### Define constants
-########################################################################
-
-#define STS_UNKNOWN 0
-#define STS_BEING_PROCESSED 1
-#define STS_COMPLETED 2
-
-########################################################################
-#### Define JOB_SHA1 structure
-########################################################################
-
-START_FIELDS # JOB_SHA1
-
-### name size align
-FIELD _buffer, 8, 8 # pointer to buffer
-FIELD _len, 4, 4 # length in bytes
-FIELD _result_digest, 5*4, 32 # Digest (output)
-FIELD _status, 4, 4
-FIELD _user_data, 8, 8
-END_FIELDS
-
-_JOB_SHA1_size = _FIELD_OFFSET
-_JOB_SHA1_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
deleted file mode 100644
index 7cfba73..0000000
--- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Flush routine for SHA1 multibuffer
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * James Guilford <james.guilford@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#include <linux/linkage.h>
-#include <asm/frame.h>
-#include "sha1_mb_mgr_datastruct.S"
-
-
-.extern sha1_x8_avx2
-
-# LINUX register definitions
-#define arg1 %rdi
-#define arg2 %rsi
-
-# Common definitions
-#define state arg1
-#define job arg2
-#define len2 arg2
-
-# idx must be a register not clobbered by sha1_x8_avx2
-#define idx %r8
-#define DWORD_idx %r8d
-
-#define unused_lanes %rbx
-#define lane_data %rbx
-#define tmp2 %rbx
-#define tmp2_w %ebx
-
-#define job_rax %rax
-#define tmp1 %rax
-#define size_offset %rax
-#define tmp %rax
-#define start_offset %rax
-
-#define tmp3 %arg1
-
-#define extra_blocks %arg2
-#define p %arg2
-
-.macro LABEL prefix n
-\prefix\n\():
-.endm
-
-.macro JNE_SKIP i
-jne skip_\i
-.endm
-
-.altmacro
-.macro SET_OFFSET _offset
-offset = \_offset
-.endm
-.noaltmacro
-
-# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
-# arg 1 : rcx : state
-ENTRY(sha1_mb_mgr_flush_avx2)
- FRAME_BEGIN
- push %rbx
-
- # If bit (32+3) is set, then all lanes are empty
- mov _unused_lanes(state), unused_lanes
- bt $32+3, unused_lanes
- jc return_null
-
- # find a lane with a non-null job
- xor idx, idx
- offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne one(%rip), idx
- offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne two(%rip), idx
- offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne three(%rip), idx
- offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne four(%rip), idx
- offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne five(%rip), idx
- offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne six(%rip), idx
- offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne seven(%rip), idx
-
- # copy idx to empty lanes
-copy_lane_data:
- offset = (_args + _data_ptr)
- mov offset(state,idx,8), tmp
-
- I = 0
-.rep 8
- offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
-.altmacro
- JNE_SKIP %I
- offset = (_args + _data_ptr + 8*I)
- mov tmp, offset(state)
- offset = (_lens + 4*I)
- movl $0xFFFFFFFF, offset(state)
-LABEL skip_ %I
- I = (I+1)
-.noaltmacro
-.endr
-
- # Find min length
- vmovdqu _lens+0*16(state), %xmm0
- vmovdqu _lens+1*16(state), %xmm1
-
- vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
- vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
- vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
-
- vmovd %xmm2, DWORD_idx
- mov idx, len2
- and $0xF, idx
- shr $4, len2
- jz len_is_0
-
- vpand clear_low_nibble(%rip), %xmm2, %xmm2
- vpshufd $0, %xmm2, %xmm2
-
- vpsubd %xmm2, %xmm0, %xmm0
- vpsubd %xmm2, %xmm1, %xmm1
-
- vmovdqu %xmm0, _lens+0*16(state)
- vmovdqu %xmm1, _lens+1*16(state)
-
- # "state" and "args" are the same address, arg1
- # len is arg2
- call sha1_x8_avx2
- # state and idx are intact
-
-
-len_is_0:
- # process completed job "idx"
- imul $_LANE_DATA_size, idx, lane_data
- lea _ldata(state, lane_data), lane_data
-
- mov _job_in_lane(lane_data), job_rax
- movq $0, _job_in_lane(lane_data)
- movl $STS_COMPLETED, _status(job_rax)
- mov _unused_lanes(state), unused_lanes
- shl $4, unused_lanes
- or idx, unused_lanes
- mov unused_lanes, _unused_lanes(state)
-
- movl $0xFFFFFFFF, _lens(state, idx, 4)
-
- vmovd _args_digest(state , idx, 4) , %xmm0
- vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
- vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
- vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
- movl _args_digest+4*32(state, idx, 4), tmp2_w
-
- vmovdqu %xmm0, _result_digest(job_rax)
- offset = (_result_digest + 1*16)
- mov tmp2_w, offset(job_rax)
-
-return:
- pop %rbx
- FRAME_END
- ret
-
-return_null:
- xor job_rax, job_rax
- jmp return
-ENDPROC(sha1_mb_mgr_flush_avx2)
-
-
-#################################################################
-
-.align 16
-ENTRY(sha1_mb_mgr_get_comp_job_avx2)
- push %rbx
-
- ## if bit 32+3 is set, then all lanes are empty
- mov _unused_lanes(state), unused_lanes
- bt $(32+3), unused_lanes
- jc .return_null
-
- # Find min length
- vmovdqu _lens(state), %xmm0
- vmovdqu _lens+1*16(state), %xmm1
-
- vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
- vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
- vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
-
- vmovd %xmm2, DWORD_idx
- test $~0xF, idx
- jnz .return_null
-
- # process completed job "idx"
- imul $_LANE_DATA_size, idx, lane_data
- lea _ldata(state, lane_data), lane_data
-
- mov _job_in_lane(lane_data), job_rax
- movq $0, _job_in_lane(lane_data)
- movl $STS_COMPLETED, _status(job_rax)
- mov _unused_lanes(state), unused_lanes
- shl $4, unused_lanes
- or idx, unused_lanes
- mov unused_lanes, _unused_lanes(state)
-
- movl $0xFFFFFFFF, _lens(state, idx, 4)
-
- vmovd _args_digest(state, idx, 4), %xmm0
- vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
- vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
- vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
- movl _args_digest+4*32(state, idx, 4), tmp2_w
-
- vmovdqu %xmm0, _result_digest(job_rax)
- movl tmp2_w, _result_digest+1*16(job_rax)
-
- pop %rbx
-
- ret
-
-.return_null:
- xor job_rax, job_rax
- pop %rbx
- ret
-ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
-
-.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
-.align 16
-clear_low_nibble:
-.octa 0x000000000000000000000000FFFFFFF0
-
-.section .rodata.cst8, "aM", @progbits, 8
-.align 8
-one:
-.quad 1
-two:
-.quad 2
-three:
-.quad 3
-four:
-.quad 4
-five:
-.quad 5
-six:
-.quad 6
-seven:
-.quad 7
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c
deleted file mode 100644
index d2add0d..0000000
--- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_init_avx2.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Initialization code for multi buffer SHA1 algorithm for AVX2
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "sha1_mb_mgr.h"
-
-void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
-{
- unsigned int j;
- state->unused_lanes = 0xF76543210ULL;
- for (j = 0; j < 8; j++) {
- state->lens[j] = 0xFFFFFFFF;
- state->ldata[j].job_in_lane = NULL;
- }
-}
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
deleted file mode 100644
index 7a93b1c..0000000
--- a/arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Buffer submit code for multi buffer SHA1 algorithm
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * James Guilford <james.guilford@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-#include "sha1_mb_mgr_datastruct.S"
-
-
-.extern sha1_x8_avx
-
-# LINUX register definitions
-arg1 = %rdi
-arg2 = %rsi
-size_offset = %rcx
-tmp2 = %rcx
-extra_blocks = %rdx
-
-# Common definitions
-#define state arg1
-#define job %rsi
-#define len2 arg2
-#define p2 arg2
-
-# idx must be a register not clobberred by sha1_x8_avx2
-idx = %r8
-DWORD_idx = %r8d
-last_len = %r8
-
-p = %r11
-start_offset = %r11
-
-unused_lanes = %rbx
-BYTE_unused_lanes = %bl
-
-job_rax = %rax
-len = %rax
-DWORD_len = %eax
-
-lane = %r12
-tmp3 = %r12
-
-tmp = %r9
-DWORD_tmp = %r9d
-
-lane_data = %r10
-
-# JOB* submit_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
-# arg 1 : rcx : state
-# arg 2 : rdx : job
-ENTRY(sha1_mb_mgr_submit_avx2)
- FRAME_BEGIN
- push %rbx
- push %r12
-
- mov _unused_lanes(state), unused_lanes
- mov unused_lanes, lane
- and $0xF, lane
- shr $4, unused_lanes
- imul $_LANE_DATA_size, lane, lane_data
- movl $STS_BEING_PROCESSED, _status(job)
- lea _ldata(state, lane_data), lane_data
- mov unused_lanes, _unused_lanes(state)
- movl _len(job), DWORD_len
-
- mov job, _job_in_lane(lane_data)
- shl $4, len
- or lane, len
-
- movl DWORD_len, _lens(state , lane, 4)
-
- # Load digest words from result_digest
- vmovdqu _result_digest(job), %xmm0
- mov _result_digest+1*16(job), DWORD_tmp
- vmovd %xmm0, _args_digest(state, lane, 4)
- vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4)
- vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4)
- vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4)
- movl DWORD_tmp, _args_digest+4*32(state , lane, 4)
-
- mov _buffer(job), p
- mov p, _args_data_ptr(state, lane, 8)
-
- cmp $0xF, unused_lanes
- jne return_null
-
-start_loop:
- # Find min length
- vmovdqa _lens(state), %xmm0
- vmovdqa _lens+1*16(state), %xmm1
-
- vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
- vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
- vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
-
- vmovd %xmm2, DWORD_idx
- mov idx, len2
- and $0xF, idx
- shr $4, len2
- jz len_is_0
-
- vpand clear_low_nibble(%rip), %xmm2, %xmm2
- vpshufd $0, %xmm2, %xmm2
-
- vpsubd %xmm2, %xmm0, %xmm0
- vpsubd %xmm2, %xmm1, %xmm1
-
- vmovdqa %xmm0, _lens + 0*16(state)
- vmovdqa %xmm1, _lens + 1*16(state)
-
-
- # "state" and "args" are the same address, arg1
- # len is arg2
- call sha1_x8_avx2
-
- # state and idx are intact
-
-len_is_0:
- # process completed job "idx"
- imul $_LANE_DATA_size, idx, lane_data
- lea _ldata(state, lane_data), lane_data
-
- mov _job_in_lane(lane_data), job_rax
- mov _unused_lanes(state), unused_lanes
- movq $0, _job_in_lane(lane_data)
- movl $STS_COMPLETED, _status(job_rax)
- shl $4, unused_lanes
- or idx, unused_lanes
- mov unused_lanes, _unused_lanes(state)
-
- movl $0xFFFFFFFF, _lens(state, idx, 4)
-
- vmovd _args_digest(state, idx, 4), %xmm0
- vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
- vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
- vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
- movl _args_digest+4*32(state, idx, 4), DWORD_tmp
-
- vmovdqu %xmm0, _result_digest(job_rax)
- movl DWORD_tmp, _result_digest+1*16(job_rax)
-
-return:
- pop %r12
- pop %rbx
- FRAME_END
- ret
-
-return_null:
- xor job_rax, job_rax
- jmp return
-
-ENDPROC(sha1_mb_mgr_submit_avx2)
-
-.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
-.align 16
-clear_low_nibble:
- .octa 0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
deleted file mode 100644
index 20f77aa..0000000
--- a/arch/x86/crypto/sha1-mb/sha1_x8_avx2.S
+++ /dev/null
@@ -1,492 +0,0 @@
-/*
- * Multi-buffer SHA1 algorithm hash compute routine
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * James Guilford <james.guilford@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/linkage.h>
-#include "sha1_mb_mgr_datastruct.S"
-
-## code to compute oct SHA1 using SSE-256
-## outer calling routine takes care of save and restore of XMM registers
-
-## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15# ymm0-15
-##
-## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
-## Linux preserves: rdi rbp r8
-##
-## clobbers ymm0-15
-
-
-# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
-# "transpose" data in {r0...r7} using temps {t0...t1}
-# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
-# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
-# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
-# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
-# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
-# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
-# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
-# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
-# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
-#
-# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
-# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
-# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
-# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
-# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
-# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
-# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
-# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
-# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
-#
-
-.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
- # process top half (r0..r3) {a...d}
- vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
- vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
- vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
- vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
- vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
- vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
- vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
- vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
-
- # use r2 in place of t0
- # process bottom half (r4..r7) {e...h}
- vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
- vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
- vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
- vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
- vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
- vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
- vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
- vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
-
- vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
- vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
- vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
- vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
- vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
- vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
- vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
- vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0
-
-.endm
-##
-## Magic functions defined in FIPS 180-1
-##
-# macro MAGIC_F0 F,B,C,D,T ## F = (D ^ (B & (C ^ D)))
-.macro MAGIC_F0 regF regB regC regD regT
- vpxor \regD, \regC, \regF
- vpand \regB, \regF, \regF
- vpxor \regD, \regF, \regF
-.endm
-
-# macro MAGIC_F1 F,B,C,D,T ## F = (B ^ C ^ D)
-.macro MAGIC_F1 regF regB regC regD regT
- vpxor \regC, \regD, \regF
- vpxor \regB, \regF, \regF
-.endm
-
-# macro MAGIC_F2 F,B,C,D,T ## F = ((B & C) | (B & D) | (C & D))
-.macro MAGIC_F2 regF regB regC regD regT
- vpor \regC, \regB, \regF
- vpand \regC, \regB, \regT
- vpand \regD, \regF, \regF
- vpor \regT, \regF, \regF
-.endm
-
-# macro MAGIC_F3 F,B,C,D,T ## F = (B ^ C ^ D)
-.macro MAGIC_F3 regF regB regC regD regT
- MAGIC_F1 \regF,\regB,\regC,\regD,\regT
-.endm
-
-# PROLD reg, imm, tmp
-.macro PROLD reg imm tmp
- vpsrld $(32-\imm), \reg, \tmp
- vpslld $\imm, \reg, \reg
- vpor \tmp, \reg, \reg
-.endm
-
-.macro PROLD_nd reg imm tmp src
- vpsrld $(32-\imm), \src, \tmp
- vpslld $\imm, \src, \reg
- vpor \tmp, \reg, \reg
-.endm
-
-.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
- vpaddd \immCNT, \regE, \regE
- vpaddd \memW*32(%rsp), \regE, \regE
- PROLD_nd \regT, 5, \regF, \regA
- vpaddd \regT, \regE, \regE
- \MAGIC \regF, \regB, \regC, \regD, \regT
- PROLD \regB, 30, \regT
- vpaddd \regF, \regE, \regE
-.endm
-
-.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
- vpaddd \immCNT, \regE, \regE
- offset = ((\memW - 14) & 15) * 32
- vmovdqu offset(%rsp), W14
- vpxor W14, W16, W16
- offset = ((\memW - 8) & 15) * 32
- vpxor offset(%rsp), W16, W16
- offset = ((\memW - 3) & 15) * 32
- vpxor offset(%rsp), W16, W16
- vpsrld $(32-1), W16, \regF
- vpslld $1, W16, W16
- vpor W16, \regF, \regF
-
- ROTATE_W
-
- offset = ((\memW - 0) & 15) * 32
- vmovdqu \regF, offset(%rsp)
- vpaddd \regF, \regE, \regE
- PROLD_nd \regT, 5, \regF, \regA
- vpaddd \regT, \regE, \regE
- \MAGIC \regF,\regB,\regC,\regD,\regT ## FUN = MAGIC_Fi(B,C,D)
- PROLD \regB,30, \regT
- vpaddd \regF, \regE, \regE
-.endm
-
-########################################################################
-########################################################################
-########################################################################
-
-## FRAMESZ plus pushes must be an odd multiple of 8
-YMM_SAVE = (15-15)*32
-FRAMESZ = 32*16 + YMM_SAVE
-_YMM = FRAMESZ - YMM_SAVE
-
-#define VMOVPS vmovups
-
-IDX = %rax
-inp0 = %r9
-inp1 = %r10
-inp2 = %r11
-inp3 = %r12
-inp4 = %r13
-inp5 = %r14
-inp6 = %r15
-inp7 = %rcx
-arg1 = %rdi
-arg2 = %rsi
-RSP_SAVE = %rdx
-
-# ymm0 A
-# ymm1 B
-# ymm2 C
-# ymm3 D
-# ymm4 E
-# ymm5 F AA
-# ymm6 T0 BB
-# ymm7 T1 CC
-# ymm8 T2 DD
-# ymm9 T3 EE
-# ymm10 T4 TMP
-# ymm11 T5 FUN
-# ymm12 T6 K
-# ymm13 T7 W14
-# ymm14 T8 W15
-# ymm15 T9 W16
-
-
-A = %ymm0
-B = %ymm1
-C = %ymm2
-D = %ymm3
-E = %ymm4
-F = %ymm5
-T0 = %ymm6
-T1 = %ymm7
-T2 = %ymm8
-T3 = %ymm9
-T4 = %ymm10
-T5 = %ymm11
-T6 = %ymm12
-T7 = %ymm13
-T8 = %ymm14
-T9 = %ymm15
-
-AA = %ymm5
-BB = %ymm6
-CC = %ymm7
-DD = %ymm8
-EE = %ymm9
-TMP = %ymm10
-FUN = %ymm11
-K = %ymm12
-W14 = %ymm13
-W15 = %ymm14
-W16 = %ymm15
-
-.macro ROTATE_ARGS
- TMP_ = E
- E = D
- D = C
- C = B
- B = A
- A = TMP_
-.endm
-
-.macro ROTATE_W
-TMP_ = W16
-W16 = W15
-W15 = W14
-W14 = TMP_
-.endm
-
-# 8 streams x 5 32bit words per digest x 4 bytes per word
-#define DIGEST_SIZE (8*5*4)
-
-.align 32
-
-# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size)
-# arg 1 : pointer to array[4] of pointer to input data
-# arg 2 : size (in blocks) ;; assumed to be >= 1
-#
-ENTRY(sha1_x8_avx2)
-
- # save callee-saved clobbered registers to comply with C function ABI
- push %r12
- push %r13
- push %r14
- push %r15
-
- #save rsp
- mov %rsp, RSP_SAVE
- sub $FRAMESZ, %rsp
-
- #align rsp to 32 Bytes
- and $~0x1F, %rsp
-
- ## Initialize digests
- vmovdqu 0*32(arg1), A
- vmovdqu 1*32(arg1), B
- vmovdqu 2*32(arg1), C
- vmovdqu 3*32(arg1), D
- vmovdqu 4*32(arg1), E
-
- ## transpose input onto stack
- mov _data_ptr+0*8(arg1),inp0
- mov _data_ptr+1*8(arg1),inp1
- mov _data_ptr+2*8(arg1),inp2
- mov _data_ptr+3*8(arg1),inp3
- mov _data_ptr+4*8(arg1),inp4
- mov _data_ptr+5*8(arg1),inp5
- mov _data_ptr+6*8(arg1),inp6
- mov _data_ptr+7*8(arg1),inp7
-
- xor IDX, IDX
-lloop:
- vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F
- I=0
-.rep 2
- VMOVPS (inp0, IDX), T0
- VMOVPS (inp1, IDX), T1
- VMOVPS (inp2, IDX), T2
- VMOVPS (inp3, IDX), T3
- VMOVPS (inp4, IDX), T4
- VMOVPS (inp5, IDX), T5
- VMOVPS (inp6, IDX), T6
- VMOVPS (inp7, IDX), T7
-
- TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
- vpshufb F, T0, T0
- vmovdqu T0, (I*8)*32(%rsp)
- vpshufb F, T1, T1
- vmovdqu T1, (I*8+1)*32(%rsp)
- vpshufb F, T2, T2
- vmovdqu T2, (I*8+2)*32(%rsp)
- vpshufb F, T3, T3
- vmovdqu T3, (I*8+3)*32(%rsp)
- vpshufb F, T4, T4
- vmovdqu T4, (I*8+4)*32(%rsp)
- vpshufb F, T5, T5
- vmovdqu T5, (I*8+5)*32(%rsp)
- vpshufb F, T6, T6
- vmovdqu T6, (I*8+6)*32(%rsp)
- vpshufb F, T7, T7
- vmovdqu T7, (I*8+7)*32(%rsp)
- add $32, IDX
- I = (I+1)
-.endr
- # save old digests
- vmovdqu A,AA
- vmovdqu B,BB
- vmovdqu C,CC
- vmovdqu D,DD
- vmovdqu E,EE
-
-##
-## perform 0-79 steps
-##
- vmovdqu K00_19(%rip), K
-## do rounds 0...15
- I = 0
-.rep 16
- SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
- ROTATE_ARGS
- I = (I+1)
-.endr
-
-## do rounds 16...19
- vmovdqu ((16 - 16) & 15) * 32 (%rsp), W16
- vmovdqu ((16 - 15) & 15) * 32 (%rsp), W15
-.rep 4
- SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
- ROTATE_ARGS
- I = (I+1)
-.endr
-
-## do rounds 20...39
- vmovdqu K20_39(%rip), K
-.rep 20
- SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
- ROTATE_ARGS
- I = (I+1)
-.endr
-
-## do rounds 40...59
- vmovdqu K40_59(%rip), K
-.rep 20
- SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
- ROTATE_ARGS
- I = (I+1)
-.endr
-
-## do rounds 60...79
- vmovdqu K60_79(%rip), K
-.rep 20
- SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
- ROTATE_ARGS
- I = (I+1)
-.endr
-
- vpaddd AA,A,A
- vpaddd BB,B,B
- vpaddd CC,C,C
- vpaddd DD,D,D
- vpaddd EE,E,E
-
- sub $1, arg2
- jne lloop
-
- # write out digests
- vmovdqu A, 0*32(arg1)
- vmovdqu B, 1*32(arg1)
- vmovdqu C, 2*32(arg1)
- vmovdqu D, 3*32(arg1)
- vmovdqu E, 4*32(arg1)
-
- # update input pointers
- add IDX, inp0
- add IDX, inp1
- add IDX, inp2
- add IDX, inp3
- add IDX, inp4
- add IDX, inp5
- add IDX, inp6
- add IDX, inp7
- mov inp0, _data_ptr (arg1)
- mov inp1, _data_ptr + 1*8(arg1)
- mov inp2, _data_ptr + 2*8(arg1)
- mov inp3, _data_ptr + 3*8(arg1)
- mov inp4, _data_ptr + 4*8(arg1)
- mov inp5, _data_ptr + 5*8(arg1)
- mov inp6, _data_ptr + 6*8(arg1)
- mov inp7, _data_ptr + 7*8(arg1)
-
- ################
- ## Postamble
-
- mov RSP_SAVE, %rsp
-
- # restore callee-saved clobbered registers
- pop %r15
- pop %r14
- pop %r13
- pop %r12
-
- ret
-ENDPROC(sha1_x8_avx2)
-
-
-.section .rodata.cst32.K00_19, "aM", @progbits, 32
-.align 32
-K00_19:
-.octa 0x5A8279995A8279995A8279995A827999
-.octa 0x5A8279995A8279995A8279995A827999
-
-.section .rodata.cst32.K20_39, "aM", @progbits, 32
-.align 32
-K20_39:
-.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
-.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
-
-.section .rodata.cst32.K40_59, "aM", @progbits, 32
-.align 32
-K40_59:
-.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
-.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
-
-.section .rodata.cst32.K60_79, "aM", @progbits, 32
-.align 32
-K60_79:
-.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
-.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-PSHUFFLE_BYTE_FLIP_MASK:
-.octa 0x0c0d0e0f08090a0b0405060700010203
-.octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 613d0bf..99c5b8c 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
* SSE3 instruction set extensions introduced in Intel Core Microarchitecture
@@ -21,11 +22,6 @@
*
* Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
* Author: Mathias Krause <minipli@googlemail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 7391c7d..639d4c2 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Cryptographic API.
*
@@ -11,17 +12,12 @@
* Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
* Copyright (c) Mathias Krause <minipli@googlemail.com>
* Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -29,7 +25,7 @@
#include <linux/types.h>
#include <crypto/sha.h>
#include <crypto/sha1_base.h>
-#include <asm/fpu/api.h>
+#include <asm/simd.h>
typedef void (sha1_transform_fn)(u32 *digest, const char *data,
unsigned int rounds);
@@ -39,7 +35,7 @@
{
struct sha1_state *sctx = shash_desc_ctx(desc);
- if (!irq_fpu_usable() ||
+ if (!crypto_simd_usable() ||
(sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
return crypto_sha1_update(desc, data, len);
@@ -57,7 +53,7 @@
static int sha1_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out, sha1_transform_fn *sha1_xform)
{
- if (!irq_fpu_usable())
+ if (!crypto_simd_usable())
return crypto_sha1_finup(desc, data, len, out);
kernel_fpu_begin();
diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile
deleted file mode 100644
index 53ad6e7..0000000
--- a/arch/x86/crypto/sha256-mb/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Arch-specific CryptoAPI modules.
-#
-
-OBJECT_FILES_NON_STANDARD := y
-
-avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
- $(comma)4)$(comma)%ymm2,yes,no)
-ifeq ($(avx2_supported),yes)
- obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb.o
- sha256-mb-y := sha256_mb.o sha256_mb_mgr_flush_avx2.o \
- sha256_mb_mgr_init_avx2.o sha256_mb_mgr_submit_avx2.o sha256_x8_avx2.o
-endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c
deleted file mode 100644
index 97c5fc4..0000000
--- a/arch/x86/crypto/sha256-mb/sha256_mb.c
+++ /dev/null
@@ -1,1013 +0,0 @@
-/*
- * Multi buffer SHA256 algorithm Glue Code
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/cryptohash.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <crypto/scatterwalk.h>
-#include <crypto/sha.h>
-#include <crypto/mcryptd.h>
-#include <crypto/crypto_wq.h>
-#include <asm/byteorder.h>
-#include <linux/hardirq.h>
-#include <asm/fpu/api.h>
-#include "sha256_mb_ctx.h"
-
-#define FLUSH_INTERVAL 1000 /* in usec */
-
-static struct mcryptd_alg_state sha256_mb_alg_state;
-
-struct sha256_mb_ctx {
- struct mcryptd_ahash *mcryptd_tfm;
-};
-
-static inline struct mcryptd_hash_request_ctx
- *cast_hash_to_mcryptd_ctx(struct sha256_hash_ctx *hash_ctx)
-{
- struct ahash_request *areq;
-
- areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
- return container_of(areq, struct mcryptd_hash_request_ctx, areq);
-}
-
-static inline struct ahash_request
- *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
-{
- return container_of((void *) ctx, struct ahash_request, __ctx);
-}
-
-static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
- struct ahash_request *areq)
-{
- rctx->flag = HASH_UPDATE;
-}
-
-static asmlinkage void (*sha256_job_mgr_init)(struct sha256_mb_mgr *state);
-static asmlinkage struct job_sha256* (*sha256_job_mgr_submit)
- (struct sha256_mb_mgr *state, struct job_sha256 *job);
-static asmlinkage struct job_sha256* (*sha256_job_mgr_flush)
- (struct sha256_mb_mgr *state);
-static asmlinkage struct job_sha256* (*sha256_job_mgr_get_comp_job)
- (struct sha256_mb_mgr *state);
-
-inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2],
- uint64_t total_len)
-{
- uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1);
-
- memset(&padblock[i], 0, SHA256_BLOCK_SIZE);
- padblock[i] = 0x80;
-
- i += ((SHA256_BLOCK_SIZE - 1) &
- (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1)))
- + 1 + SHA256_PADLENGTHFIELD_SIZE;
-
-#if SHA256_PADLENGTHFIELD_SIZE == 16
- *((uint64_t *) &padblock[i - 16]) = 0;
-#endif
-
- *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
-
- /* Number of extra blocks to hash */
- return i >> SHA256_LOG2_BLOCK_SIZE;
-}
-
-static struct sha256_hash_ctx
- *sha256_ctx_mgr_resubmit(struct sha256_ctx_mgr *mgr,
- struct sha256_hash_ctx *ctx)
-{
- while (ctx) {
- if (ctx->status & HASH_CTX_STS_COMPLETE) {
- /* Clear PROCESSING bit */
- ctx->status = HASH_CTX_STS_COMPLETE;
- return ctx;
- }
-
- /*
- * If the extra blocks are empty, begin hashing what remains
- * in the user's buffer.
- */
- if (ctx->partial_block_buffer_length == 0 &&
- ctx->incoming_buffer_length) {
-
- const void *buffer = ctx->incoming_buffer;
- uint32_t len = ctx->incoming_buffer_length;
- uint32_t copy_len;
-
- /*
- * Only entire blocks can be hashed.
- * Copy remainder to extra blocks buffer.
- */
- copy_len = len & (SHA256_BLOCK_SIZE-1);
-
- if (copy_len) {
- len -= copy_len;
- memcpy(ctx->partial_block_buffer,
- ((const char *) buffer + len),
- copy_len);
- ctx->partial_block_buffer_length = copy_len;
- }
-
- ctx->incoming_buffer_length = 0;
-
- /* len should be a multiple of the block size now */
- assert((len % SHA256_BLOCK_SIZE) == 0);
-
- /* Set len to the number of blocks to be hashed */
- len >>= SHA256_LOG2_BLOCK_SIZE;
-
- if (len) {
-
- ctx->job.buffer = (uint8_t *) buffer;
- ctx->job.len = len;
- ctx = (struct sha256_hash_ctx *)
- sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
- continue;
- }
- }
-
- /*
- * If the extra blocks are not empty, then we are
- * either on the last block(s) or we need more
- * user input before continuing.
- */
- if (ctx->status & HASH_CTX_STS_LAST) {
-
- uint8_t *buf = ctx->partial_block_buffer;
- uint32_t n_extra_blocks =
- sha256_pad(buf, ctx->total_length);
-
- ctx->status = (HASH_CTX_STS_PROCESSING |
- HASH_CTX_STS_COMPLETE);
- ctx->job.buffer = buf;
- ctx->job.len = (uint32_t) n_extra_blocks;
- ctx = (struct sha256_hash_ctx *)
- sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
- continue;
- }
-
- ctx->status = HASH_CTX_STS_IDLE;
- return ctx;
- }
-
- return NULL;
-}
-
-static struct sha256_hash_ctx
- *sha256_ctx_mgr_get_comp_ctx(struct sha256_ctx_mgr *mgr)
-{
- /*
- * If get_comp_job returns NULL, there are no jobs complete.
- * If get_comp_job returns a job, verify that it is safe to return to
- * the user. If it is not ready, resubmit the job to finish processing.
- * If sha256_ctx_mgr_resubmit returned a job, it is ready to be
- * returned. Otherwise, all jobs currently being managed by the
- * hash_ctx_mgr still need processing.
- */
- struct sha256_hash_ctx *ctx;
-
- ctx = (struct sha256_hash_ctx *) sha256_job_mgr_get_comp_job(&mgr->mgr);
- return sha256_ctx_mgr_resubmit(mgr, ctx);
-}
-
-static void sha256_ctx_mgr_init(struct sha256_ctx_mgr *mgr)
-{
- sha256_job_mgr_init(&mgr->mgr);
-}
-
-static struct sha256_hash_ctx *sha256_ctx_mgr_submit(struct sha256_ctx_mgr *mgr,
- struct sha256_hash_ctx *ctx,
- const void *buffer,
- uint32_t len,
- int flags)
-{
- if (flags & ~(HASH_UPDATE | HASH_LAST)) {
- /* User should not pass anything other than UPDATE or LAST */
- ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
- return ctx;
- }
-
- if (ctx->status & HASH_CTX_STS_PROCESSING) {
- /* Cannot submit to a currently processing job. */
- ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
- return ctx;
- }
-
- if (ctx->status & HASH_CTX_STS_COMPLETE) {
- /* Cannot update a finished job. */
- ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
- return ctx;
- }
-
- /* If we made it here, there was no error during this call to submit */
- ctx->error = HASH_CTX_ERROR_NONE;
-
- /* Store buffer ptr info from user */
- ctx->incoming_buffer = buffer;
- ctx->incoming_buffer_length = len;
-
- /*
- * Store the user's request flags and mark this ctx as currently
- * being processed.
- */
- ctx->status = (flags & HASH_LAST) ?
- (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
- HASH_CTX_STS_PROCESSING;
-
- /* Advance byte counter */
- ctx->total_length += len;
-
- /*
- * If there is anything currently buffered in the extra blocks,
- * append to it until it contains a whole block.
- * Or if the user's buffer contains less than a whole block,
- * append as much as possible to the extra block.
- */
- if (ctx->partial_block_buffer_length || len < SHA256_BLOCK_SIZE) {
- /*
- * Compute how many bytes to copy from user buffer into
- * extra block
- */
- uint32_t copy_len = SHA256_BLOCK_SIZE -
- ctx->partial_block_buffer_length;
- if (len < copy_len)
- copy_len = len;
-
- if (copy_len) {
- /* Copy and update relevant pointers and counters */
- memcpy(
- &ctx->partial_block_buffer[ctx->partial_block_buffer_length],
- buffer, copy_len);
-
- ctx->partial_block_buffer_length += copy_len;
- ctx->incoming_buffer = (const void *)
- ((const char *)buffer + copy_len);
- ctx->incoming_buffer_length = len - copy_len;
- }
-
- /* The extra block should never contain more than 1 block */
- assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
-
- /*
- * If the extra block buffer contains exactly 1 block,
- * it can be hashed.
- */
- if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
- ctx->partial_block_buffer_length = 0;
-
- ctx->job.buffer = ctx->partial_block_buffer;
- ctx->job.len = 1;
- ctx = (struct sha256_hash_ctx *)
- sha256_job_mgr_submit(&mgr->mgr, &ctx->job);
- }
- }
-
- return sha256_ctx_mgr_resubmit(mgr, ctx);
-}
-
-static struct sha256_hash_ctx *sha256_ctx_mgr_flush(struct sha256_ctx_mgr *mgr)
-{
- struct sha256_hash_ctx *ctx;
-
- while (1) {
- ctx = (struct sha256_hash_ctx *)
- sha256_job_mgr_flush(&mgr->mgr);
-
- /* If flush returned 0, there are no more jobs in flight. */
- if (!ctx)
- return NULL;
-
- /*
- * If flush returned a job, resubmit the job to finish
- * processing.
- */
- ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
-
- /*
- * If sha256_ctx_mgr_resubmit returned a job, it is ready to
- * be returned. Otherwise, all jobs currently being managed by
- * the sha256_ctx_mgr still need processing. Loop.
- */
- if (ctx)
- return ctx;
- }
-}
-
-static int sha256_mb_init(struct ahash_request *areq)
-{
- struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
-
- hash_ctx_init(sctx);
- sctx->job.result_digest[0] = SHA256_H0;
- sctx->job.result_digest[1] = SHA256_H1;
- sctx->job.result_digest[2] = SHA256_H2;
- sctx->job.result_digest[3] = SHA256_H3;
- sctx->job.result_digest[4] = SHA256_H4;
- sctx->job.result_digest[5] = SHA256_H5;
- sctx->job.result_digest[6] = SHA256_H6;
- sctx->job.result_digest[7] = SHA256_H7;
- sctx->total_length = 0;
- sctx->partial_block_buffer_length = 0;
- sctx->status = HASH_CTX_STS_IDLE;
-
- return 0;
-}
-
-static int sha256_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
-{
- int i;
- struct sha256_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
- __be32 *dst = (__be32 *) rctx->out;
-
- for (i = 0; i < 8; ++i)
- dst[i] = cpu_to_be32(sctx->job.result_digest[i]);
-
- return 0;
-}
-
-static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
- struct mcryptd_alg_cstate *cstate, bool flush)
-{
- int flag = HASH_UPDATE;
- int nbytes, err = 0;
- struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
- struct sha256_hash_ctx *sha_ctx;
-
- /* more work ? */
- while (!(rctx->flag & HASH_DONE)) {
- nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
- if (nbytes < 0) {
- err = nbytes;
- goto out;
- }
- /* check if the walk is done */
- if (crypto_ahash_walk_last(&rctx->walk)) {
- rctx->flag |= HASH_DONE;
- if (rctx->flag & HASH_FINAL)
- flag |= HASH_LAST;
-
- }
- sha_ctx = (struct sha256_hash_ctx *)
- ahash_request_ctx(&rctx->areq);
- kernel_fpu_begin();
- sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx,
- rctx->walk.data, nbytes, flag);
- if (!sha_ctx) {
- if (flush)
- sha_ctx = sha256_ctx_mgr_flush(cstate->mgr);
- }
- kernel_fpu_end();
- if (sha_ctx)
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- else {
- rctx = NULL;
- goto out;
- }
- }
-
- /* copy the results */
- if (rctx->flag & HASH_FINAL)
- sha256_mb_set_results(rctx);
-
-out:
- *ret_rctx = rctx;
- return err;
-}
-
-static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
- struct mcryptd_alg_cstate *cstate,
- int err)
-{
- struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
- struct sha256_hash_ctx *sha_ctx;
- struct mcryptd_hash_request_ctx *req_ctx;
- int ret;
-
- /* remove from work list */
- spin_lock(&cstate->work_lock);
- list_del(&rctx->waiter);
- spin_unlock(&cstate->work_lock);
-
- if (irqs_disabled())
- rctx->complete(&req->base, err);
- else {
- local_bh_disable();
- rctx->complete(&req->base, err);
- local_bh_enable();
- }
-
- /* check to see if there are other jobs that are done */
- sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
- while (sha_ctx) {
- req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&req_ctx, cstate, false);
- if (req_ctx) {
- spin_lock(&cstate->work_lock);
- list_del(&req_ctx->waiter);
- spin_unlock(&cstate->work_lock);
-
- req = cast_mcryptd_ctx_to_req(req_ctx);
- if (irqs_disabled())
- req_ctx->complete(&req->base, ret);
- else {
- local_bh_disable();
- req_ctx->complete(&req->base, ret);
- local_bh_enable();
- }
- }
- sha_ctx = sha256_ctx_mgr_get_comp_ctx(cstate->mgr);
- }
-
- return 0;
-}
-
-static void sha256_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
- struct mcryptd_alg_cstate *cstate)
-{
- unsigned long next_flush;
- unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
-
- /* initialize tag */
- rctx->tag.arrival = jiffies; /* tag the arrival time */
- rctx->tag.seq_num = cstate->next_seq_num++;
- next_flush = rctx->tag.arrival + delay;
- rctx->tag.expire = next_flush;
-
- spin_lock(&cstate->work_lock);
- list_add_tail(&rctx->waiter, &cstate->work_list);
- spin_unlock(&cstate->work_lock);
-
- mcryptd_arm_flusher(cstate, delay);
-}
-
-static int sha256_mb_update(struct ahash_request *areq)
-{
- struct mcryptd_hash_request_ctx *rctx =
- container_of(areq, struct mcryptd_hash_request_ctx, areq);
- struct mcryptd_alg_cstate *cstate =
- this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
-
- struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
- struct sha256_hash_ctx *sha_ctx;
- int ret = 0, nbytes;
-
- /* sanity check */
- if (rctx->tag.cpu != smp_processor_id()) {
- pr_err("mcryptd error: cpu clash\n");
- goto done;
- }
-
- /* need to init context */
- req_ctx_init(rctx, areq);
-
- nbytes = crypto_ahash_walk_first(req, &rctx->walk);
-
- if (nbytes < 0) {
- ret = nbytes;
- goto done;
- }
-
- if (crypto_ahash_walk_last(&rctx->walk))
- rctx->flag |= HASH_DONE;
-
- /* submit */
- sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
- sha256_mb_add_list(rctx, cstate);
- kernel_fpu_begin();
- sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
- nbytes, HASH_UPDATE);
- kernel_fpu_end();
-
- /* check if anything is returned */
- if (!sha_ctx)
- return -EINPROGRESS;
-
- if (sha_ctx->error) {
- ret = sha_ctx->error;
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- goto done;
- }
-
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&rctx, cstate, false);
-
- if (!rctx)
- return -EINPROGRESS;
-done:
- sha_complete_job(rctx, cstate, ret);
- return ret;
-}
-
-static int sha256_mb_finup(struct ahash_request *areq)
-{
- struct mcryptd_hash_request_ctx *rctx =
- container_of(areq, struct mcryptd_hash_request_ctx, areq);
- struct mcryptd_alg_cstate *cstate =
- this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
-
- struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
- struct sha256_hash_ctx *sha_ctx;
- int ret = 0, flag = HASH_UPDATE, nbytes;
-
- /* sanity check */
- if (rctx->tag.cpu != smp_processor_id()) {
- pr_err("mcryptd error: cpu clash\n");
- goto done;
- }
-
- /* need to init context */
- req_ctx_init(rctx, areq);
-
- nbytes = crypto_ahash_walk_first(req, &rctx->walk);
-
- if (nbytes < 0) {
- ret = nbytes;
- goto done;
- }
-
- if (crypto_ahash_walk_last(&rctx->walk)) {
- rctx->flag |= HASH_DONE;
- flag = HASH_LAST;
- }
-
- /* submit */
- rctx->flag |= HASH_FINAL;
- sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
- sha256_mb_add_list(rctx, cstate);
-
- kernel_fpu_begin();
- sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
- nbytes, flag);
- kernel_fpu_end();
-
- /* check if anything is returned */
- if (!sha_ctx)
- return -EINPROGRESS;
-
- if (sha_ctx->error) {
- ret = sha_ctx->error;
- goto done;
- }
-
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&rctx, cstate, false);
- if (!rctx)
- return -EINPROGRESS;
-done:
- sha_complete_job(rctx, cstate, ret);
- return ret;
-}
-
-static int sha256_mb_final(struct ahash_request *areq)
-{
- struct mcryptd_hash_request_ctx *rctx =
- container_of(areq, struct mcryptd_hash_request_ctx,
- areq);
- struct mcryptd_alg_cstate *cstate =
- this_cpu_ptr(sha256_mb_alg_state.alg_cstate);
-
- struct sha256_hash_ctx *sha_ctx;
- int ret = 0;
- u8 data;
-
- /* sanity check */
- if (rctx->tag.cpu != smp_processor_id()) {
- pr_err("mcryptd error: cpu clash\n");
- goto done;
- }
-
- /* need to init context */
- req_ctx_init(rctx, areq);
-
- rctx->flag |= HASH_DONE | HASH_FINAL;
-
- sha_ctx = (struct sha256_hash_ctx *) ahash_request_ctx(areq);
- /* flag HASH_FINAL and 0 data size */
- sha256_mb_add_list(rctx, cstate);
- kernel_fpu_begin();
- sha_ctx = sha256_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
- HASH_LAST);
- kernel_fpu_end();
-
- /* check if anything is returned */
- if (!sha_ctx)
- return -EINPROGRESS;
-
- if (sha_ctx->error) {
- ret = sha_ctx->error;
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- goto done;
- }
-
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&rctx, cstate, false);
- if (!rctx)
- return -EINPROGRESS;
-done:
- sha_complete_job(rctx, cstate, ret);
- return ret;
-}
-
-static int sha256_mb_export(struct ahash_request *areq, void *out)
-{
- struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
-
- memcpy(out, sctx, sizeof(*sctx));
-
- return 0;
-}
-
-static int sha256_mb_import(struct ahash_request *areq, const void *in)
-{
- struct sha256_hash_ctx *sctx = ahash_request_ctx(areq);
-
- memcpy(sctx, in, sizeof(*sctx));
-
- return 0;
-}
-
-static int sha256_mb_async_init_tfm(struct crypto_tfm *tfm)
-{
- struct mcryptd_ahash *mcryptd_tfm;
- struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
- struct mcryptd_hash_ctx *mctx;
-
- mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha256-mb",
- CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
- if (IS_ERR(mcryptd_tfm))
- return PTR_ERR(mcryptd_tfm);
- mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
- mctx->alg_state = &sha256_mb_alg_state;
- ctx->mcryptd_tfm = mcryptd_tfm;
- crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
- sizeof(struct ahash_request) +
- crypto_ahash_reqsize(&mcryptd_tfm->base));
-
- return 0;
-}
-
-static void sha256_mb_async_exit_tfm(struct crypto_tfm *tfm)
-{
- struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-
- mcryptd_free_ahash(ctx->mcryptd_tfm);
-}
-
-static int sha256_mb_areq_init_tfm(struct crypto_tfm *tfm)
-{
- crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
- sizeof(struct ahash_request) +
- sizeof(struct sha256_hash_ctx));
-
- return 0;
-}
-
-static void sha256_mb_areq_exit_tfm(struct crypto_tfm *tfm)
-{
- struct sha256_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-
- mcryptd_free_ahash(ctx->mcryptd_tfm);
-}
-
-static struct ahash_alg sha256_mb_areq_alg = {
- .init = sha256_mb_init,
- .update = sha256_mb_update,
- .final = sha256_mb_final,
- .finup = sha256_mb_finup,
- .export = sha256_mb_export,
- .import = sha256_mb_import,
- .halg = {
- .digestsize = SHA256_DIGEST_SIZE,
- .statesize = sizeof(struct sha256_hash_ctx),
- .base = {
- .cra_name = "__sha256-mb",
- .cra_driver_name = "__intel_sha256-mb",
- .cra_priority = 100,
- /*
- * use ASYNC flag as some buffers in multi-buffer
- * algo may not have completed before hashing thread
- * sleep
- */
- .cra_flags = CRYPTO_ALG_ASYNC |
- CRYPTO_ALG_INTERNAL,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT
- (sha256_mb_areq_alg.halg.base.cra_list),
- .cra_init = sha256_mb_areq_init_tfm,
- .cra_exit = sha256_mb_areq_exit_tfm,
- .cra_ctxsize = sizeof(struct sha256_hash_ctx),
- }
- }
-};
-
-static int sha256_mb_async_init(struct ahash_request *req)
-{
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_init(mcryptd_req);
-}
-
-static int sha256_mb_async_update(struct ahash_request *req)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_update(mcryptd_req);
-}
-
-static int sha256_mb_async_finup(struct ahash_request *req)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_finup(mcryptd_req);
-}
-
-static int sha256_mb_async_final(struct ahash_request *req)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_final(mcryptd_req);
-}
-
-static int sha256_mb_async_digest(struct ahash_request *req)
-{
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_digest(mcryptd_req);
-}
-
-static int sha256_mb_async_export(struct ahash_request *req, void *out)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_export(mcryptd_req, out);
-}
-
-static int sha256_mb_async_import(struct ahash_request *req, const void *in)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha256_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
- struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
- struct mcryptd_hash_request_ctx *rctx;
- struct ahash_request *areq;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- rctx = ahash_request_ctx(mcryptd_req);
- areq = &rctx->areq;
-
- ahash_request_set_tfm(areq, child);
- ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
- rctx->complete, req);
-
- return crypto_ahash_import(mcryptd_req, in);
-}
-
-static struct ahash_alg sha256_mb_async_alg = {
- .init = sha256_mb_async_init,
- .update = sha256_mb_async_update,
- .final = sha256_mb_async_final,
- .finup = sha256_mb_async_finup,
- .export = sha256_mb_async_export,
- .import = sha256_mb_async_import,
- .digest = sha256_mb_async_digest,
- .halg = {
- .digestsize = SHA256_DIGEST_SIZE,
- .statesize = sizeof(struct sha256_hash_ctx),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256_mb",
- /*
- * Low priority, since with few concurrent hash requests
- * this is extremely slow due to the flush delay. Users
- * whose workloads would benefit from this can request
- * it explicitly by driver name, or can increase its
- * priority at runtime using NETLINK_CRYPTO.
- */
- .cra_priority = 50,
- .cra_flags = CRYPTO_ALG_ASYNC,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT
- (sha256_mb_async_alg.halg.base.cra_list),
- .cra_init = sha256_mb_async_init_tfm,
- .cra_exit = sha256_mb_async_exit_tfm,
- .cra_ctxsize = sizeof(struct sha256_mb_ctx),
- .cra_alignmask = 0,
- },
- },
-};
-
-static unsigned long sha256_mb_flusher(struct mcryptd_alg_cstate *cstate)
-{
- struct mcryptd_hash_request_ctx *rctx;
- unsigned long cur_time;
- unsigned long next_flush = 0;
- struct sha256_hash_ctx *sha_ctx;
-
-
- cur_time = jiffies;
-
- while (!list_empty(&cstate->work_list)) {
- rctx = list_entry(cstate->work_list.next,
- struct mcryptd_hash_request_ctx, waiter);
- if (time_before(cur_time, rctx->tag.expire))
- break;
- kernel_fpu_begin();
- sha_ctx = (struct sha256_hash_ctx *)
- sha256_ctx_mgr_flush(cstate->mgr);
- kernel_fpu_end();
- if (!sha_ctx) {
- pr_err("sha256_mb error: nothing got"
- " flushed for non-empty list\n");
- break;
- }
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- sha_finish_walk(&rctx, cstate, true);
- sha_complete_job(rctx, cstate, 0);
- }
-
- if (!list_empty(&cstate->work_list)) {
- rctx = list_entry(cstate->work_list.next,
- struct mcryptd_hash_request_ctx, waiter);
- /* get the hash context and then flush time */
- next_flush = rctx->tag.expire;
- mcryptd_arm_flusher(cstate, get_delay(next_flush));
- }
- return next_flush;
-}
-
-static int __init sha256_mb_mod_init(void)
-{
-
- int cpu;
- int err;
- struct mcryptd_alg_cstate *cpu_state;
-
- /* check for dependent cpu features */
- if (!boot_cpu_has(X86_FEATURE_AVX2) ||
- !boot_cpu_has(X86_FEATURE_BMI2))
- return -ENODEV;
-
- /* initialize multibuffer structures */
- sha256_mb_alg_state.alg_cstate = alloc_percpu
- (struct mcryptd_alg_cstate);
-
- sha256_job_mgr_init = sha256_mb_mgr_init_avx2;
- sha256_job_mgr_submit = sha256_mb_mgr_submit_avx2;
- sha256_job_mgr_flush = sha256_mb_mgr_flush_avx2;
- sha256_job_mgr_get_comp_job = sha256_mb_mgr_get_comp_job_avx2;
-
- if (!sha256_mb_alg_state.alg_cstate)
- return -ENOMEM;
- for_each_possible_cpu(cpu) {
- cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
- cpu_state->next_flush = 0;
- cpu_state->next_seq_num = 0;
- cpu_state->flusher_engaged = false;
- INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
- cpu_state->cpu = cpu;
- cpu_state->alg_state = &sha256_mb_alg_state;
- cpu_state->mgr = kzalloc(sizeof(struct sha256_ctx_mgr),
- GFP_KERNEL);
- if (!cpu_state->mgr)
- goto err2;
- sha256_ctx_mgr_init(cpu_state->mgr);
- INIT_LIST_HEAD(&cpu_state->work_list);
- spin_lock_init(&cpu_state->work_lock);
- }
- sha256_mb_alg_state.flusher = &sha256_mb_flusher;
-
- err = crypto_register_ahash(&sha256_mb_areq_alg);
- if (err)
- goto err2;
- err = crypto_register_ahash(&sha256_mb_async_alg);
- if (err)
- goto err1;
-
-
- return 0;
-err1:
- crypto_unregister_ahash(&sha256_mb_areq_alg);
-err2:
- for_each_possible_cpu(cpu) {
- cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
- kfree(cpu_state->mgr);
- }
- free_percpu(sha256_mb_alg_state.alg_cstate);
- return -ENODEV;
-}
-
-static void __exit sha256_mb_mod_fini(void)
-{
- int cpu;
- struct mcryptd_alg_cstate *cpu_state;
-
- crypto_unregister_ahash(&sha256_mb_async_alg);
- crypto_unregister_ahash(&sha256_mb_areq_alg);
- for_each_possible_cpu(cpu) {
- cpu_state = per_cpu_ptr(sha256_mb_alg_state.alg_cstate, cpu);
- kfree(cpu_state->mgr);
- }
- free_percpu(sha256_mb_alg_state.alg_cstate);
-}
-
-module_init(sha256_mb_mod_init);
-module_exit(sha256_mb_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, multi buffer accelerated");
-
-MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
deleted file mode 100644
index 7c43254..0000000
--- a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Header file for multi buffer SHA256 context
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _SHA_MB_CTX_INTERNAL_H
-#define _SHA_MB_CTX_INTERNAL_H
-
-#include "sha256_mb_mgr.h"
-
-#define HASH_UPDATE 0x00
-#define HASH_LAST 0x01
-#define HASH_DONE 0x02
-#define HASH_FINAL 0x04
-
-#define HASH_CTX_STS_IDLE 0x00
-#define HASH_CTX_STS_PROCESSING 0x01
-#define HASH_CTX_STS_LAST 0x02
-#define HASH_CTX_STS_COMPLETE 0x04
-
-enum hash_ctx_error {
- HASH_CTX_ERROR_NONE = 0,
- HASH_CTX_ERROR_INVALID_FLAGS = -1,
- HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
- HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
-
-#ifdef HASH_CTX_DEBUG
- HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
-#endif
-};
-
-
-#define hash_ctx_user_data(ctx) ((ctx)->user_data)
-#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
-#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
-#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
-#define hash_ctx_status(ctx) ((ctx)->status)
-#define hash_ctx_error(ctx) ((ctx)->error)
-#define hash_ctx_init(ctx) \
- do { \
- (ctx)->error = HASH_CTX_ERROR_NONE; \
- (ctx)->status = HASH_CTX_STS_COMPLETE; \
- } while (0)
-
-
-/* Hash Constants and Typedefs */
-#define SHA256_DIGEST_LENGTH 8
-#define SHA256_LOG2_BLOCK_SIZE 6
-
-#define SHA256_PADLENGTHFIELD_SIZE 8
-
-#ifdef SHA_MB_DEBUG
-#define assert(expr) \
-do { \
- if (unlikely(!(expr))) { \
- printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
- #expr, __FILE__, __func__, __LINE__); \
- } \
-} while (0)
-#else
-#define assert(expr) do {} while (0)
-#endif
-
-struct sha256_ctx_mgr {
- struct sha256_mb_mgr mgr;
-};
-
-/* typedef struct sha256_ctx_mgr sha256_ctx_mgr; */
-
-struct sha256_hash_ctx {
- /* Must be at struct offset 0 */
- struct job_sha256 job;
- /* status flag */
- int status;
- /* error flag */
- int error;
-
- uint64_t total_length;
- const void *incoming_buffer;
- uint32_t incoming_buffer_length;
- uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2];
- uint32_t partial_block_buffer_length;
- void *user_data;
-};
-
-#endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h b/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h
deleted file mode 100644
index b01ae40..0000000
--- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Header file for multi buffer SHA256 algorithm manager
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#ifndef __SHA_MB_MGR_H
-#define __SHA_MB_MGR_H
-
-#include <linux/types.h>
-
-#define NUM_SHA256_DIGEST_WORDS 8
-
-enum job_sts { STS_UNKNOWN = 0,
- STS_BEING_PROCESSED = 1,
- STS_COMPLETED = 2,
- STS_INTERNAL_ERROR = 3,
- STS_ERROR = 4
-};
-
-struct job_sha256 {
- u8 *buffer;
- u32 len;
- u32 result_digest[NUM_SHA256_DIGEST_WORDS] __aligned(32);
- enum job_sts status;
- void *user_data;
-};
-
-/* SHA256 out-of-order scheduler */
-
-/* typedef uint32_t sha8_digest_array[8][8]; */
-
-struct sha256_args_x8 {
- uint32_t digest[8][8];
- uint8_t *data_ptr[8];
-};
-
-struct sha256_lane_data {
- struct job_sha256 *job_in_lane;
-};
-
-struct sha256_mb_mgr {
- struct sha256_args_x8 args;
-
- uint32_t lens[8];
-
- /* each byte is index (0...7) of unused lanes */
- uint64_t unused_lanes;
- /* byte 4 is set to FF as a flag */
- struct sha256_lane_data ldata[8];
-};
-
-
-#define SHA256_MB_MGR_NUM_LANES_AVX2 8
-
-void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state);
-struct job_sha256 *sha256_mb_mgr_submit_avx2(struct sha256_mb_mgr *state,
- struct job_sha256 *job);
-struct job_sha256 *sha256_mb_mgr_flush_avx2(struct sha256_mb_mgr *state);
-struct job_sha256 *sha256_mb_mgr_get_comp_job_avx2(struct sha256_mb_mgr *state);
-
-#endif
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
deleted file mode 100644
index 5c377ba..0000000
--- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_datastruct.S
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Header file for multi buffer SHA256 algorithm data structure
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-# Macros for defining data structures
-
-# Usage example
-
-#START_FIELDS # JOB_AES
-### name size align
-#FIELD _plaintext, 8, 8 # pointer to plaintext
-#FIELD _ciphertext, 8, 8 # pointer to ciphertext
-#FIELD _IV, 16, 8 # IV
-#FIELD _keys, 8, 8 # pointer to keys
-#FIELD _len, 4, 4 # length in bytes
-#FIELD _status, 4, 4 # status enumeration
-#FIELD _user_data, 8, 8 # pointer to user data
-#UNION _union, size1, align1, \
-# size2, align2, \
-# size3, align3, \
-# ...
-#END_FIELDS
-#%assign _JOB_AES_size _FIELD_OFFSET
-#%assign _JOB_AES_align _STRUCT_ALIGN
-
-#########################################################################
-
-# Alternate "struc-like" syntax:
-# STRUCT job_aes2
-# RES_Q .plaintext, 1
-# RES_Q .ciphertext, 1
-# RES_DQ .IV, 1
-# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
-# RES_U .union, size1, align1, \
-# size2, align2, \
-# ...
-# ENDSTRUCT
-# # Following only needed if nesting
-# %assign job_aes2_size _FIELD_OFFSET
-# %assign job_aes2_align _STRUCT_ALIGN
-#
-# RES_* macros take a name, a count and an optional alignment.
-# The count in in terms of the base size of the macro, and the
-# default alignment is the base size.
-# The macros are:
-# Macro Base size
-# RES_B 1
-# RES_W 2
-# RES_D 4
-# RES_Q 8
-# RES_DQ 16
-# RES_Y 32
-# RES_Z 64
-#
-# RES_U defines a union. It's arguments are a name and two or more
-# pairs of "size, alignment"
-#
-# The two assigns are only needed if this structure is being nested
-# within another. Even if the assigns are not done, one can still use
-# STRUCT_NAME_size as the size of the structure.
-#
-# Note that for nesting, you still need to assign to STRUCT_NAME_size.
-#
-# The differences between this and using "struc" directly are that each
-# type is implicitly aligned to its natural length (although this can be
-# over-ridden with an explicit third parameter), and that the structure
-# is padded at the end to its overall alignment.
-#
-
-#########################################################################
-
-#ifndef _DATASTRUCT_ASM_
-#define _DATASTRUCT_ASM_
-
-#define SZ8 8*SHA256_DIGEST_WORD_SIZE
-#define ROUNDS 64*SZ8
-#define PTR_SZ 8
-#define SHA256_DIGEST_WORD_SIZE 4
-#define MAX_SHA256_LANES 8
-#define SHA256_DIGEST_WORDS 8
-#define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
-#define SHA256_DIGEST_SIZE (SHA256_DIGEST_ROW_SIZE * SHA256_DIGEST_WORDS)
-#define SHA256_BLK_SZ 64
-
-# START_FIELDS
-.macro START_FIELDS
- _FIELD_OFFSET = 0
- _STRUCT_ALIGN = 0
-.endm
-
-# FIELD name size align
-.macro FIELD name size align
- _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
- \name = _FIELD_OFFSET
- _FIELD_OFFSET = _FIELD_OFFSET + (\size)
-.if (\align > _STRUCT_ALIGN)
- _STRUCT_ALIGN = \align
-.endif
-.endm
-
-# END_FIELDS
-.macro END_FIELDS
- _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
-.endm
-
-########################################################################
-
-.macro STRUCT p1
-START_FIELDS
-.struc \p1
-.endm
-
-.macro ENDSTRUCT
- tmp = _FIELD_OFFSET
- END_FIELDS
- tmp = (_FIELD_OFFSET - %%tmp)
-.if (tmp > 0)
- .lcomm tmp
-.endif
-.endstruc
-.endm
-
-## RES_int name size align
-.macro RES_int p1 p2 p3
- name = \p1
- size = \p2
- align = .\p3
-
- _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
-.align align
-.lcomm name size
- _FIELD_OFFSET = _FIELD_OFFSET + (size)
-.if (align > _STRUCT_ALIGN)
- _STRUCT_ALIGN = align
-.endif
-.endm
-
-# macro RES_B name, size [, align]
-.macro RES_B _name, _size, _align=1
-RES_int _name _size _align
-.endm
-
-# macro RES_W name, size [, align]
-.macro RES_W _name, _size, _align=2
-RES_int _name 2*(_size) _align
-.endm
-
-# macro RES_D name, size [, align]
-.macro RES_D _name, _size, _align=4
-RES_int _name 4*(_size) _align
-.endm
-
-# macro RES_Q name, size [, align]
-.macro RES_Q _name, _size, _align=8
-RES_int _name 8*(_size) _align
-.endm
-
-# macro RES_DQ name, size [, align]
-.macro RES_DQ _name, _size, _align=16
-RES_int _name 16*(_size) _align
-.endm
-
-# macro RES_Y name, size [, align]
-.macro RES_Y _name, _size, _align=32
-RES_int _name 32*(_size) _align
-.endm
-
-# macro RES_Z name, size [, align]
-.macro RES_Z _name, _size, _align=64
-RES_int _name 64*(_size) _align
-.endm
-
-#endif
-
-
-########################################################################
-#### Define SHA256 Out Of Order Data Structures
-########################################################################
-
-START_FIELDS # LANE_DATA
-### name size align
-FIELD _job_in_lane, 8, 8 # pointer to job object
-END_FIELDS
-
- _LANE_DATA_size = _FIELD_OFFSET
- _LANE_DATA_align = _STRUCT_ALIGN
-
-########################################################################
-
-START_FIELDS # SHA256_ARGS_X4
-### name size align
-FIELD _digest, 4*8*8, 4 # transposed digest
-FIELD _data_ptr, 8*8, 8 # array of pointers to data
-END_FIELDS
-
- _SHA256_ARGS_X4_size = _FIELD_OFFSET
- _SHA256_ARGS_X4_align = _STRUCT_ALIGN
- _SHA256_ARGS_X8_size = _FIELD_OFFSET
- _SHA256_ARGS_X8_align = _STRUCT_ALIGN
-
-#######################################################################
-
-START_FIELDS # MB_MGR
-### name size align
-FIELD _args, _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align
-FIELD _lens, 4*8, 8
-FIELD _unused_lanes, 8, 8
-FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
-END_FIELDS
-
- _MB_MGR_size = _FIELD_OFFSET
- _MB_MGR_align = _STRUCT_ALIGN
-
-_args_digest = _args + _digest
-_args_data_ptr = _args + _data_ptr
-
-#######################################################################
-
-START_FIELDS #STACK_FRAME
-### name size align
-FIELD _data, 16*SZ8, 1 # transposed digest
-FIELD _digest, 8*SZ8, 1 # array of pointers to data
-FIELD _ytmp, 4*SZ8, 1
-FIELD _rsp, 8, 1
-END_FIELDS
-
- _STACK_FRAME_size = _FIELD_OFFSET
- _STACK_FRAME_align = _STRUCT_ALIGN
-
-#######################################################################
-
-########################################################################
-#### Define constants
-########################################################################
-
-#define STS_UNKNOWN 0
-#define STS_BEING_PROCESSED 1
-#define STS_COMPLETED 2
-
-########################################################################
-#### Define JOB_SHA256 structure
-########################################################################
-
-START_FIELDS # JOB_SHA256
-
-### name size align
-FIELD _buffer, 8, 8 # pointer to buffer
-FIELD _len, 8, 8 # length in bytes
-FIELD _result_digest, 8*4, 32 # Digest (output)
-FIELD _status, 4, 4
-FIELD _user_data, 8, 8
-END_FIELDS
-
- _JOB_SHA256_size = _FIELD_OFFSET
- _JOB_SHA256_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
deleted file mode 100644
index d2364c5..0000000
--- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Flush routine for SHA256 multibuffer
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-#include <linux/linkage.h>
-#include <asm/frame.h>
-#include "sha256_mb_mgr_datastruct.S"
-
-.extern sha256_x8_avx2
-
-#LINUX register definitions
-#define arg1 %rdi
-#define arg2 %rsi
-
-# Common register definitions
-#define state arg1
-#define job arg2
-#define len2 arg2
-
-# idx must be a register not clobberred by sha1_mult
-#define idx %r8
-#define DWORD_idx %r8d
-
-#define unused_lanes %rbx
-#define lane_data %rbx
-#define tmp2 %rbx
-#define tmp2_w %ebx
-
-#define job_rax %rax
-#define tmp1 %rax
-#define size_offset %rax
-#define tmp %rax
-#define start_offset %rax
-
-#define tmp3 %arg1
-
-#define extra_blocks %arg2
-#define p %arg2
-
-.macro LABEL prefix n
-\prefix\n\():
-.endm
-
-.macro JNE_SKIP i
-jne skip_\i
-.endm
-
-.altmacro
-.macro SET_OFFSET _offset
-offset = \_offset
-.endm
-.noaltmacro
-
-# JOB_SHA256* sha256_mb_mgr_flush_avx2(MB_MGR *state)
-# arg 1 : rcx : state
-ENTRY(sha256_mb_mgr_flush_avx2)
- FRAME_BEGIN
- push %rbx
-
- # If bit (32+3) is set, then all lanes are empty
- mov _unused_lanes(state), unused_lanes
- bt $32+3, unused_lanes
- jc return_null
-
- # find a lane with a non-null job
- xor idx, idx
- offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne one(%rip), idx
- offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne two(%rip), idx
- offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne three(%rip), idx
- offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne four(%rip), idx
- offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne five(%rip), idx
- offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne six(%rip), idx
- offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne seven(%rip), idx
-
- # copy idx to empty lanes
-copy_lane_data:
- offset = (_args + _data_ptr)
- mov offset(state,idx,8), tmp
-
- I = 0
-.rep 8
- offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
-.altmacro
- JNE_SKIP %I
- offset = (_args + _data_ptr + 8*I)
- mov tmp, offset(state)
- offset = (_lens + 4*I)
- movl $0xFFFFFFFF, offset(state)
-LABEL skip_ %I
- I = (I+1)
-.noaltmacro
-.endr
-
- # Find min length
- vmovdqu _lens+0*16(state), %xmm0
- vmovdqu _lens+1*16(state), %xmm1
-
- vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
- vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
- vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
-
- vmovd %xmm2, DWORD_idx
- mov idx, len2
- and $0xF, idx
- shr $4, len2
- jz len_is_0
-
- vpand clear_low_nibble(%rip), %xmm2, %xmm2
- vpshufd $0, %xmm2, %xmm2
-
- vpsubd %xmm2, %xmm0, %xmm0
- vpsubd %xmm2, %xmm1, %xmm1
-
- vmovdqu %xmm0, _lens+0*16(state)
- vmovdqu %xmm1, _lens+1*16(state)
-
- # "state" and "args" are the same address, arg1
- # len is arg2
- call sha256_x8_avx2
- # state and idx are intact
-
-len_is_0:
- # process completed job "idx"
- imul $_LANE_DATA_size, idx, lane_data
- lea _ldata(state, lane_data), lane_data
-
- mov _job_in_lane(lane_data), job_rax
- movq $0, _job_in_lane(lane_data)
- movl $STS_COMPLETED, _status(job_rax)
- mov _unused_lanes(state), unused_lanes
- shl $4, unused_lanes
- or idx, unused_lanes
-
- mov unused_lanes, _unused_lanes(state)
- movl $0xFFFFFFFF, _lens(state,idx,4)
-
- vmovd _args_digest(state , idx, 4) , %xmm0
- vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
- vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
- vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
- vmovd _args_digest+4*32(state, idx, 4), %xmm1
- vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
- vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
- vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
-
- vmovdqu %xmm0, _result_digest(job_rax)
- offset = (_result_digest + 1*16)
- vmovdqu %xmm1, offset(job_rax)
-
-return:
- pop %rbx
- FRAME_END
- ret
-
-return_null:
- xor job_rax, job_rax
- jmp return
-ENDPROC(sha256_mb_mgr_flush_avx2)
-
-##############################################################################
-
-.align 16
-ENTRY(sha256_mb_mgr_get_comp_job_avx2)
- push %rbx
-
- ## if bit 32+3 is set, then all lanes are empty
- mov _unused_lanes(state), unused_lanes
- bt $(32+3), unused_lanes
- jc .return_null
-
- # Find min length
- vmovdqu _lens(state), %xmm0
- vmovdqu _lens+1*16(state), %xmm1
-
- vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
- vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
- vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
-
- vmovd %xmm2, DWORD_idx
- test $~0xF, idx
- jnz .return_null
-
- # process completed job "idx"
- imul $_LANE_DATA_size, idx, lane_data
- lea _ldata(state, lane_data), lane_data
-
- mov _job_in_lane(lane_data), job_rax
- movq $0, _job_in_lane(lane_data)
- movl $STS_COMPLETED, _status(job_rax)
- mov _unused_lanes(state), unused_lanes
- shl $4, unused_lanes
- or idx, unused_lanes
- mov unused_lanes, _unused_lanes(state)
-
- movl $0xFFFFFFFF, _lens(state, idx, 4)
-
- vmovd _args_digest(state, idx, 4), %xmm0
- vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
- vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
- vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
- vmovd _args_digest+4*32(state, idx, 4), %xmm1
- vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
- vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
- vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1
-
- vmovdqu %xmm0, _result_digest(job_rax)
- offset = (_result_digest + 1*16)
- vmovdqu %xmm1, offset(job_rax)
-
- pop %rbx
-
- ret
-
-.return_null:
- xor job_rax, job_rax
- pop %rbx
- ret
-ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
-
-.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
-.align 16
-clear_low_nibble:
-.octa 0x000000000000000000000000FFFFFFF0
-
-.section .rodata.cst8, "aM", @progbits, 8
-.align 8
-one:
-.quad 1
-two:
-.quad 2
-three:
-.quad 3
-four:
-.quad 4
-five:
-.quad 5
-six:
-.quad 6
-seven:
-.quad 7
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
deleted file mode 100644
index b0c4983..0000000
--- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_init_avx2.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Initialization code for multi buffer SHA256 algorithm for AVX2
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "sha256_mb_mgr.h"
-
-void sha256_mb_mgr_init_avx2(struct sha256_mb_mgr *state)
-{
- unsigned int j;
-
- state->unused_lanes = 0xF76543210ULL;
- for (j = 0; j < 8; j++) {
- state->lens[j] = 0xFFFFFFFF;
- state->ldata[j].job_in_lane = NULL;
- }
-}
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
deleted file mode 100644
index b36ae74..0000000
--- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Buffer submit code for multi buffer SHA256 algorithm
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-#include "sha256_mb_mgr_datastruct.S"
-
-.extern sha256_x8_avx2
-
-# LINUX register definitions
-arg1 = %rdi
-arg2 = %rsi
-size_offset = %rcx
-tmp2 = %rcx
-extra_blocks = %rdx
-
-# Common definitions
-#define state arg1
-#define job %rsi
-#define len2 arg2
-#define p2 arg2
-
-# idx must be a register not clobberred by sha1_x8_avx2
-idx = %r8
-DWORD_idx = %r8d
-last_len = %r8
-
-p = %r11
-start_offset = %r11
-
-unused_lanes = %rbx
-BYTE_unused_lanes = %bl
-
-job_rax = %rax
-len = %rax
-DWORD_len = %eax
-
-lane = %r12
-tmp3 = %r12
-
-tmp = %r9
-DWORD_tmp = %r9d
-
-lane_data = %r10
-
-# JOB* sha256_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA256 *job)
-# arg 1 : rcx : state
-# arg 2 : rdx : job
-ENTRY(sha256_mb_mgr_submit_avx2)
- FRAME_BEGIN
- push %rbx
- push %r12
-
- mov _unused_lanes(state), unused_lanes
- mov unused_lanes, lane
- and $0xF, lane
- shr $4, unused_lanes
- imul $_LANE_DATA_size, lane, lane_data
- movl $STS_BEING_PROCESSED, _status(job)
- lea _ldata(state, lane_data), lane_data
- mov unused_lanes, _unused_lanes(state)
- movl _len(job), DWORD_len
-
- mov job, _job_in_lane(lane_data)
- shl $4, len
- or lane, len
-
- movl DWORD_len, _lens(state , lane, 4)
-
- # Load digest words from result_digest
- vmovdqu _result_digest(job), %xmm0
- vmovdqu _result_digest+1*16(job), %xmm1
- vmovd %xmm0, _args_digest(state, lane, 4)
- vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4)
- vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4)
- vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4)
- vmovd %xmm1, _args_digest+4*32(state , lane, 4)
-
- vpextrd $1, %xmm1, _args_digest+5*32(state , lane, 4)
- vpextrd $2, %xmm1, _args_digest+6*32(state , lane, 4)
- vpextrd $3, %xmm1, _args_digest+7*32(state , lane, 4)
-
- mov _buffer(job), p
- mov p, _args_data_ptr(state, lane, 8)
-
- cmp $0xF, unused_lanes
- jne return_null
-
-start_loop:
- # Find min length
- vmovdqa _lens(state), %xmm0
- vmovdqa _lens+1*16(state), %xmm1
-
- vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
- vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
- vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
- vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min val in low dword
-
- vmovd %xmm2, DWORD_idx
- mov idx, len2
- and $0xF, idx
- shr $4, len2
- jz len_is_0
-
- vpand clear_low_nibble(%rip), %xmm2, %xmm2
- vpshufd $0, %xmm2, %xmm2
-
- vpsubd %xmm2, %xmm0, %xmm0
- vpsubd %xmm2, %xmm1, %xmm1
-
- vmovdqa %xmm0, _lens + 0*16(state)
- vmovdqa %xmm1, _lens + 1*16(state)
-
- # "state" and "args" are the same address, arg1
- # len is arg2
- call sha256_x8_avx2
-
- # state and idx are intact
-
-len_is_0:
- # process completed job "idx"
- imul $_LANE_DATA_size, idx, lane_data
- lea _ldata(state, lane_data), lane_data
-
- mov _job_in_lane(lane_data), job_rax
- mov _unused_lanes(state), unused_lanes
- movq $0, _job_in_lane(lane_data)
- movl $STS_COMPLETED, _status(job_rax)
- shl $4, unused_lanes
- or idx, unused_lanes
- mov unused_lanes, _unused_lanes(state)
-
- movl $0xFFFFFFFF, _lens(state,idx,4)
-
- vmovd _args_digest(state, idx, 4), %xmm0
- vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
- vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
- vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
- vmovd _args_digest+4*32(state, idx, 4), %xmm1
-
- vpinsrd $1, _args_digest+5*32(state , idx, 4), %xmm1, %xmm1
- vpinsrd $2, _args_digest+6*32(state , idx, 4), %xmm1, %xmm1
- vpinsrd $3, _args_digest+7*32(state , idx, 4), %xmm1, %xmm1
-
- vmovdqu %xmm0, _result_digest(job_rax)
- vmovdqu %xmm1, _result_digest+1*16(job_rax)
-
-return:
- pop %r12
- pop %rbx
- FRAME_END
- ret
-
-return_null:
- xor job_rax, job_rax
- jmp return
-
-ENDPROC(sha256_mb_mgr_submit_avx2)
-
-.section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
-.align 16
-clear_low_nibble:
- .octa 0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S b/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
deleted file mode 100644
index 1687c80..0000000
--- a/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
+++ /dev/null
@@ -1,598 +0,0 @@
-/*
- * Multi-buffer SHA256 algorithm hash compute routine
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/linkage.h>
-#include "sha256_mb_mgr_datastruct.S"
-
-## code to compute oct SHA256 using SSE-256
-## outer calling routine takes care of save and restore of XMM registers
-## Logic designed/laid out by JDG
-
-## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; %ymm0-15
-## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
-## Linux preserves: rdi rbp r8
-##
-## clobbers %ymm0-15
-
-arg1 = %rdi
-arg2 = %rsi
-reg3 = %rcx
-reg4 = %rdx
-
-# Common definitions
-STATE = arg1
-INP_SIZE = arg2
-
-IDX = %rax
-ROUND = %rbx
-TBL = reg3
-
-inp0 = %r9
-inp1 = %r10
-inp2 = %r11
-inp3 = %r12
-inp4 = %r13
-inp5 = %r14
-inp6 = %r15
-inp7 = reg4
-
-a = %ymm0
-b = %ymm1
-c = %ymm2
-d = %ymm3
-e = %ymm4
-f = %ymm5
-g = %ymm6
-h = %ymm7
-
-T1 = %ymm8
-
-a0 = %ymm12
-a1 = %ymm13
-a2 = %ymm14
-TMP = %ymm15
-TMP0 = %ymm6
-TMP1 = %ymm7
-
-TT0 = %ymm8
-TT1 = %ymm9
-TT2 = %ymm10
-TT3 = %ymm11
-TT4 = %ymm12
-TT5 = %ymm13
-TT6 = %ymm14
-TT7 = %ymm15
-
-# Define stack usage
-
-# Assume stack aligned to 32 bytes before call
-# Therefore FRAMESZ mod 32 must be 32-8 = 24
-
-#define FRAMESZ 0x388
-
-#define VMOVPS vmovups
-
-# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
-# "transpose" data in {r0...r7} using temps {t0...t1}
-# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
-# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
-# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
-# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
-# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
-# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
-# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
-# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
-# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
-#
-# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
-# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
-# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
-# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
-# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
-# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
-# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
-# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
-# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
-#
-
-.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
- # process top half (r0..r3) {a...d}
- vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
- vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
- vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
- vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
- vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
- vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
- vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
- vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
-
- # use r2 in place of t0
- # process bottom half (r4..r7) {e...h}
- vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
- vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
- vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
- vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
- vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
- vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
- vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
- vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
-
- vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
- vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
- vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
- vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
- vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
- vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
- vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
- vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0
-
-.endm
-
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro _PRORD reg imm tmp
- vpslld $(32-\imm),\reg,\tmp
- vpsrld $\imm,\reg, \reg
- vpor \tmp,\reg, \reg
-.endm
-
-# PRORD_nd reg, imm, tmp, src
-.macro _PRORD_nd reg imm tmp src
- vpslld $(32-\imm), \src, \tmp
- vpsrld $\imm, \src, \reg
- vpor \tmp, \reg, \reg
-.endm
-
-# PRORD dst/src, amt
-.macro PRORD reg imm
- _PRORD \reg,\imm,TMP
-.endm
-
-# PRORD_nd dst, src, amt
-.macro PRORD_nd reg tmp imm
- _PRORD_nd \reg, \imm, TMP, \tmp
-.endm
-
-# arguments passed implicitly in preprocessor symbols i, a...h
-.macro ROUND_00_15 _T1 i
- PRORD_nd a0,e,5 # sig1: a0 = (e >> 5)
-
- vpxor g, f, a2 # ch: a2 = f^g
- vpand e,a2, a2 # ch: a2 = (f^g)&e
- vpxor g, a2, a2 # a2 = ch
-
- PRORD_nd a1,e,25 # sig1: a1 = (e >> 25)
-
- vmovdqu \_T1,(SZ8*(\i & 0xf))(%rsp)
- vpaddd (TBL,ROUND,1), \_T1, \_T1 # T1 = W + K
- vpxor e,a0, a0 # sig1: a0 = e ^ (e >> 5)
- PRORD a0, 6 # sig1: a0 = (e >> 6) ^ (e >> 11)
- vpaddd a2, h, h # h = h + ch
- PRORD_nd a2,a,11 # sig0: a2 = (a >> 11)
- vpaddd \_T1,h, h # h = h + ch + W + K
- vpxor a1, a0, a0 # a0 = sigma1
- PRORD_nd a1,a,22 # sig0: a1 = (a >> 22)
- vpxor c, a, \_T1 # maj: T1 = a^c
- add $SZ8, ROUND # ROUND++
- vpand b, \_T1, \_T1 # maj: T1 = (a^c)&b
- vpaddd a0, h, h
- vpaddd h, d, d
- vpxor a, a2, a2 # sig0: a2 = a ^ (a >> 11)
- PRORD a2,2 # sig0: a2 = (a >> 2) ^ (a >> 13)
- vpxor a1, a2, a2 # a2 = sig0
- vpand c, a, a1 # maj: a1 = a&c
- vpor \_T1, a1, a1 # a1 = maj
- vpaddd a1, h, h # h = h + ch + W + K + maj
- vpaddd a2, h, h # h = h + ch + W + K + maj + sigma0
- ROTATE_ARGS
-.endm
-
-# arguments passed implicitly in preprocessor symbols i, a...h
-.macro ROUND_16_XX _T1 i
- vmovdqu (SZ8*((\i-15)&0xf))(%rsp), \_T1
- vmovdqu (SZ8*((\i-2)&0xf))(%rsp), a1
- vmovdqu \_T1, a0
- PRORD \_T1,11
- vmovdqu a1, a2
- PRORD a1,2
- vpxor a0, \_T1, \_T1
- PRORD \_T1, 7
- vpxor a2, a1, a1
- PRORD a1, 17
- vpsrld $3, a0, a0
- vpxor a0, \_T1, \_T1
- vpsrld $10, a2, a2
- vpxor a2, a1, a1
- vpaddd (SZ8*((\i-16)&0xf))(%rsp), \_T1, \_T1
- vpaddd (SZ8*((\i-7)&0xf))(%rsp), a1, a1
- vpaddd a1, \_T1, \_T1
-
- ROUND_00_15 \_T1,\i
-.endm
-
-# SHA256_ARGS:
-# UINT128 digest[8]; // transposed digests
-# UINT8 *data_ptr[4];
-
-# void sha256_x8_avx2(SHA256_ARGS *args, UINT64 bytes);
-# arg 1 : STATE : pointer to array of pointers to input data
-# arg 2 : INP_SIZE : size of input in blocks
- # general registers preserved in outer calling routine
- # outer calling routine saves all the XMM registers
- # save rsp, allocate 32-byte aligned for local variables
-ENTRY(sha256_x8_avx2)
-
- # save callee-saved clobbered registers to comply with C function ABI
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov %rsp, IDX
- sub $FRAMESZ, %rsp
- and $~0x1F, %rsp
- mov IDX, _rsp(%rsp)
-
- # Load the pre-transposed incoming digest.
- vmovdqu 0*SHA256_DIGEST_ROW_SIZE(STATE),a
- vmovdqu 1*SHA256_DIGEST_ROW_SIZE(STATE),b
- vmovdqu 2*SHA256_DIGEST_ROW_SIZE(STATE),c
- vmovdqu 3*SHA256_DIGEST_ROW_SIZE(STATE),d
- vmovdqu 4*SHA256_DIGEST_ROW_SIZE(STATE),e
- vmovdqu 5*SHA256_DIGEST_ROW_SIZE(STATE),f
- vmovdqu 6*SHA256_DIGEST_ROW_SIZE(STATE),g
- vmovdqu 7*SHA256_DIGEST_ROW_SIZE(STATE),h
-
- lea K256_8(%rip),TBL
-
- # load the address of each of the 4 message lanes
- # getting ready to transpose input onto stack
- mov _args_data_ptr+0*PTR_SZ(STATE),inp0
- mov _args_data_ptr+1*PTR_SZ(STATE),inp1
- mov _args_data_ptr+2*PTR_SZ(STATE),inp2
- mov _args_data_ptr+3*PTR_SZ(STATE),inp3
- mov _args_data_ptr+4*PTR_SZ(STATE),inp4
- mov _args_data_ptr+5*PTR_SZ(STATE),inp5
- mov _args_data_ptr+6*PTR_SZ(STATE),inp6
- mov _args_data_ptr+7*PTR_SZ(STATE),inp7
-
- xor IDX, IDX
-lloop:
- xor ROUND, ROUND
-
- # save old digest
- vmovdqu a, _digest(%rsp)
- vmovdqu b, _digest+1*SZ8(%rsp)
- vmovdqu c, _digest+2*SZ8(%rsp)
- vmovdqu d, _digest+3*SZ8(%rsp)
- vmovdqu e, _digest+4*SZ8(%rsp)
- vmovdqu f, _digest+5*SZ8(%rsp)
- vmovdqu g, _digest+6*SZ8(%rsp)
- vmovdqu h, _digest+7*SZ8(%rsp)
- i = 0
-.rep 2
- VMOVPS i*32(inp0, IDX), TT0
- VMOVPS i*32(inp1, IDX), TT1
- VMOVPS i*32(inp2, IDX), TT2
- VMOVPS i*32(inp3, IDX), TT3
- VMOVPS i*32(inp4, IDX), TT4
- VMOVPS i*32(inp5, IDX), TT5
- VMOVPS i*32(inp6, IDX), TT6
- VMOVPS i*32(inp7, IDX), TT7
- vmovdqu g, _ytmp(%rsp)
- vmovdqu h, _ytmp+1*SZ8(%rsp)
- TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
- vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP1
- vmovdqu _ytmp(%rsp), g
- vpshufb TMP1, TT0, TT0
- vpshufb TMP1, TT1, TT1
- vpshufb TMP1, TT2, TT2
- vpshufb TMP1, TT3, TT3
- vpshufb TMP1, TT4, TT4
- vpshufb TMP1, TT5, TT5
- vpshufb TMP1, TT6, TT6
- vpshufb TMP1, TT7, TT7
- vmovdqu _ytmp+1*SZ8(%rsp), h
- vmovdqu TT4, _ytmp(%rsp)
- vmovdqu TT5, _ytmp+1*SZ8(%rsp)
- vmovdqu TT6, _ytmp+2*SZ8(%rsp)
- vmovdqu TT7, _ytmp+3*SZ8(%rsp)
- ROUND_00_15 TT0,(i*8+0)
- vmovdqu _ytmp(%rsp), TT0
- ROUND_00_15 TT1,(i*8+1)
- vmovdqu _ytmp+1*SZ8(%rsp), TT1
- ROUND_00_15 TT2,(i*8+2)
- vmovdqu _ytmp+2*SZ8(%rsp), TT2
- ROUND_00_15 TT3,(i*8+3)
- vmovdqu _ytmp+3*SZ8(%rsp), TT3
- ROUND_00_15 TT0,(i*8+4)
- ROUND_00_15 TT1,(i*8+5)
- ROUND_00_15 TT2,(i*8+6)
- ROUND_00_15 TT3,(i*8+7)
- i = (i+1)
-.endr
- add $64, IDX
- i = (i*8)
-
- jmp Lrounds_16_xx
-.align 16
-Lrounds_16_xx:
-.rep 16
- ROUND_16_XX T1, i
- i = (i+1)
-.endr
-
- cmp $ROUNDS,ROUND
- jb Lrounds_16_xx
-
- # add old digest
- vpaddd _digest+0*SZ8(%rsp), a, a
- vpaddd _digest+1*SZ8(%rsp), b, b
- vpaddd _digest+2*SZ8(%rsp), c, c
- vpaddd _digest+3*SZ8(%rsp), d, d
- vpaddd _digest+4*SZ8(%rsp), e, e
- vpaddd _digest+5*SZ8(%rsp), f, f
- vpaddd _digest+6*SZ8(%rsp), g, g
- vpaddd _digest+7*SZ8(%rsp), h, h
-
- sub $1, INP_SIZE # unit is blocks
- jne lloop
-
- # write back to memory (state object) the transposed digest
- vmovdqu a, 0*SHA256_DIGEST_ROW_SIZE(STATE)
- vmovdqu b, 1*SHA256_DIGEST_ROW_SIZE(STATE)
- vmovdqu c, 2*SHA256_DIGEST_ROW_SIZE(STATE)
- vmovdqu d, 3*SHA256_DIGEST_ROW_SIZE(STATE)
- vmovdqu e, 4*SHA256_DIGEST_ROW_SIZE(STATE)
- vmovdqu f, 5*SHA256_DIGEST_ROW_SIZE(STATE)
- vmovdqu g, 6*SHA256_DIGEST_ROW_SIZE(STATE)
- vmovdqu h, 7*SHA256_DIGEST_ROW_SIZE(STATE)
-
- # update input pointers
- add IDX, inp0
- mov inp0, _args_data_ptr+0*8(STATE)
- add IDX, inp1
- mov inp1, _args_data_ptr+1*8(STATE)
- add IDX, inp2
- mov inp2, _args_data_ptr+2*8(STATE)
- add IDX, inp3
- mov inp3, _args_data_ptr+3*8(STATE)
- add IDX, inp4
- mov inp4, _args_data_ptr+4*8(STATE)
- add IDX, inp5
- mov inp5, _args_data_ptr+5*8(STATE)
- add IDX, inp6
- mov inp6, _args_data_ptr+6*8(STATE)
- add IDX, inp7
- mov inp7, _args_data_ptr+7*8(STATE)
-
- # Postamble
- mov _rsp(%rsp), %rsp
-
- # restore callee-saved clobbered registers
- pop %r15
- pop %r14
- pop %r13
- pop %r12
-
- ret
-ENDPROC(sha256_x8_avx2)
-
-.section .rodata.K256_8, "a", @progbits
-.align 64
-K256_8:
- .octa 0x428a2f98428a2f98428a2f98428a2f98
- .octa 0x428a2f98428a2f98428a2f98428a2f98
- .octa 0x71374491713744917137449171374491
- .octa 0x71374491713744917137449171374491
- .octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
- .octa 0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
- .octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
- .octa 0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
- .octa 0x3956c25b3956c25b3956c25b3956c25b
- .octa 0x3956c25b3956c25b3956c25b3956c25b
- .octa 0x59f111f159f111f159f111f159f111f1
- .octa 0x59f111f159f111f159f111f159f111f1
- .octa 0x923f82a4923f82a4923f82a4923f82a4
- .octa 0x923f82a4923f82a4923f82a4923f82a4
- .octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
- .octa 0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
- .octa 0xd807aa98d807aa98d807aa98d807aa98
- .octa 0xd807aa98d807aa98d807aa98d807aa98
- .octa 0x12835b0112835b0112835b0112835b01
- .octa 0x12835b0112835b0112835b0112835b01
- .octa 0x243185be243185be243185be243185be
- .octa 0x243185be243185be243185be243185be
- .octa 0x550c7dc3550c7dc3550c7dc3550c7dc3
- .octa 0x550c7dc3550c7dc3550c7dc3550c7dc3
- .octa 0x72be5d7472be5d7472be5d7472be5d74
- .octa 0x72be5d7472be5d7472be5d7472be5d74
- .octa 0x80deb1fe80deb1fe80deb1fe80deb1fe
- .octa 0x80deb1fe80deb1fe80deb1fe80deb1fe
- .octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7
- .octa 0x9bdc06a79bdc06a79bdc06a79bdc06a7
- .octa 0xc19bf174c19bf174c19bf174c19bf174
- .octa 0xc19bf174c19bf174c19bf174c19bf174
- .octa 0xe49b69c1e49b69c1e49b69c1e49b69c1
- .octa 0xe49b69c1e49b69c1e49b69c1e49b69c1
- .octa 0xefbe4786efbe4786efbe4786efbe4786
- .octa 0xefbe4786efbe4786efbe4786efbe4786
- .octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6
- .octa 0x0fc19dc60fc19dc60fc19dc60fc19dc6
- .octa 0x240ca1cc240ca1cc240ca1cc240ca1cc
- .octa 0x240ca1cc240ca1cc240ca1cc240ca1cc
- .octa 0x2de92c6f2de92c6f2de92c6f2de92c6f
- .octa 0x2de92c6f2de92c6f2de92c6f2de92c6f
- .octa 0x4a7484aa4a7484aa4a7484aa4a7484aa
- .octa 0x4a7484aa4a7484aa4a7484aa4a7484aa
- .octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
- .octa 0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
- .octa 0x76f988da76f988da76f988da76f988da
- .octa 0x76f988da76f988da76f988da76f988da
- .octa 0x983e5152983e5152983e5152983e5152
- .octa 0x983e5152983e5152983e5152983e5152
- .octa 0xa831c66da831c66da831c66da831c66d
- .octa 0xa831c66da831c66da831c66da831c66d
- .octa 0xb00327c8b00327c8b00327c8b00327c8
- .octa 0xb00327c8b00327c8b00327c8b00327c8
- .octa 0xbf597fc7bf597fc7bf597fc7bf597fc7
- .octa 0xbf597fc7bf597fc7bf597fc7bf597fc7
- .octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
- .octa 0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
- .octa 0xd5a79147d5a79147d5a79147d5a79147
- .octa 0xd5a79147d5a79147d5a79147d5a79147
- .octa 0x06ca635106ca635106ca635106ca6351
- .octa 0x06ca635106ca635106ca635106ca6351
- .octa 0x14292967142929671429296714292967
- .octa 0x14292967142929671429296714292967
- .octa 0x27b70a8527b70a8527b70a8527b70a85
- .octa 0x27b70a8527b70a8527b70a8527b70a85
- .octa 0x2e1b21382e1b21382e1b21382e1b2138
- .octa 0x2e1b21382e1b21382e1b21382e1b2138
- .octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
- .octa 0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
- .octa 0x53380d1353380d1353380d1353380d13
- .octa 0x53380d1353380d1353380d1353380d13
- .octa 0x650a7354650a7354650a7354650a7354
- .octa 0x650a7354650a7354650a7354650a7354
- .octa 0x766a0abb766a0abb766a0abb766a0abb
- .octa 0x766a0abb766a0abb766a0abb766a0abb
- .octa 0x81c2c92e81c2c92e81c2c92e81c2c92e
- .octa 0x81c2c92e81c2c92e81c2c92e81c2c92e
- .octa 0x92722c8592722c8592722c8592722c85
- .octa 0x92722c8592722c8592722c8592722c85
- .octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
- .octa 0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
- .octa 0xa81a664ba81a664ba81a664ba81a664b
- .octa 0xa81a664ba81a664ba81a664ba81a664b
- .octa 0xc24b8b70c24b8b70c24b8b70c24b8b70
- .octa 0xc24b8b70c24b8b70c24b8b70c24b8b70
- .octa 0xc76c51a3c76c51a3c76c51a3c76c51a3
- .octa 0xc76c51a3c76c51a3c76c51a3c76c51a3
- .octa 0xd192e819d192e819d192e819d192e819
- .octa 0xd192e819d192e819d192e819d192e819
- .octa 0xd6990624d6990624d6990624d6990624
- .octa 0xd6990624d6990624d6990624d6990624
- .octa 0xf40e3585f40e3585f40e3585f40e3585
- .octa 0xf40e3585f40e3585f40e3585f40e3585
- .octa 0x106aa070106aa070106aa070106aa070
- .octa 0x106aa070106aa070106aa070106aa070
- .octa 0x19a4c11619a4c11619a4c11619a4c116
- .octa 0x19a4c11619a4c11619a4c11619a4c116
- .octa 0x1e376c081e376c081e376c081e376c08
- .octa 0x1e376c081e376c081e376c081e376c08
- .octa 0x2748774c2748774c2748774c2748774c
- .octa 0x2748774c2748774c2748774c2748774c
- .octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5
- .octa 0x34b0bcb534b0bcb534b0bcb534b0bcb5
- .octa 0x391c0cb3391c0cb3391c0cb3391c0cb3
- .octa 0x391c0cb3391c0cb3391c0cb3391c0cb3
- .octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
- .octa 0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
- .octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
- .octa 0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
- .octa 0x682e6ff3682e6ff3682e6ff3682e6ff3
- .octa 0x682e6ff3682e6ff3682e6ff3682e6ff3
- .octa 0x748f82ee748f82ee748f82ee748f82ee
- .octa 0x748f82ee748f82ee748f82ee748f82ee
- .octa 0x78a5636f78a5636f78a5636f78a5636f
- .octa 0x78a5636f78a5636f78a5636f78a5636f
- .octa 0x84c8781484c8781484c8781484c87814
- .octa 0x84c8781484c8781484c8781484c87814
- .octa 0x8cc702088cc702088cc702088cc70208
- .octa 0x8cc702088cc702088cc702088cc70208
- .octa 0x90befffa90befffa90befffa90befffa
- .octa 0x90befffa90befffa90befffa90befffa
- .octa 0xa4506ceba4506ceba4506ceba4506ceb
- .octa 0xa4506ceba4506ceba4506ceba4506ceb
- .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
- .octa 0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
- .octa 0xc67178f2c67178f2c67178f2c67178f2
- .octa 0xc67178f2c67178f2c67178f2c67178f2
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-PSHUFFLE_BYTE_FLIP_MASK:
-.octa 0x0c0d0e0f08090a0b0405060700010203
-.octa 0x0c0d0e0f08090a0b0405060700010203
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-.global K256
-K256:
- .int 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .int 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .int 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .int 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .int 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .int 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .int 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .int 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .int 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .int 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .int 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .int 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .int 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .int 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .int 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .int 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 773a873..f9aff31 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -30,6 +30,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -37,19 +38,19 @@
#include <linux/types.h>
#include <crypto/sha.h>
#include <crypto/sha256_base.h>
-#include <asm/fpu/api.h>
#include <linux/string.h>
+#include <asm/simd.h>
asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
u64 rounds);
typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds);
-static int sha256_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha256_transform_fn *sha256_xform)
+static int _sha256_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha256_transform_fn *sha256_xform)
{
struct sha256_state *sctx = shash_desc_ctx(desc);
- if (!irq_fpu_usable() ||
+ if (!crypto_simd_usable() ||
(sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
return crypto_sha256_update(desc, data, len);
@@ -67,7 +68,7 @@
static int sha256_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out, sha256_transform_fn *sha256_xform)
{
- if (!irq_fpu_usable())
+ if (!crypto_simd_usable())
return crypto_sha256_finup(desc, data, len, out);
kernel_fpu_begin();
@@ -83,7 +84,7 @@
static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- return sha256_update(desc, data, len, sha256_transform_ssse3);
+ return _sha256_update(desc, data, len, sha256_transform_ssse3);
}
static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
@@ -150,7 +151,7 @@
static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- return sha256_update(desc, data, len, sha256_transform_avx);
+ return _sha256_update(desc, data, len, sha256_transform_avx);
}
static int sha256_avx_finup(struct shash_desc *desc, const u8 *data,
@@ -232,7 +233,7 @@
static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- return sha256_update(desc, data, len, sha256_transform_rorx);
+ return _sha256_update(desc, data, len, sha256_transform_rorx);
}
static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data,
@@ -312,7 +313,7 @@
static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- return sha256_update(desc, data, len, sha256_ni_transform);
+ return _sha256_update(desc, data, len, sha256_ni_transform);
}
static int sha256_ni_finup(struct shash_desc *desc, const u8 *data,
diff --git a/arch/x86/crypto/sha512-mb/Makefile b/arch/x86/crypto/sha512-mb/Makefile
deleted file mode 100644
index 90f1ef6..0000000
--- a/arch/x86/crypto/sha512-mb/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Arch-specific CryptoAPI modules.
-#
-
-avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
- $(comma)4)$(comma)%ymm2,yes,no)
-ifeq ($(avx2_supported),yes)
- obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb.o
- sha512-mb-y := sha512_mb.o sha512_mb_mgr_flush_avx2.o \
- sha512_mb_mgr_init_avx2.o sha512_mb_mgr_submit_avx2.o sha512_x4_avx2.o
-endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c
deleted file mode 100644
index 26b8567..0000000
--- a/arch/x86/crypto/sha512-mb/sha512_mb.c
+++ /dev/null
@@ -1,1047 +0,0 @@
-/*
- * Multi buffer SHA512 algorithm Glue Code
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/cryptohash.h>
-#include <linux/types.h>
-#include <linux/list.h>
-#include <crypto/scatterwalk.h>
-#include <crypto/sha.h>
-#include <crypto/mcryptd.h>
-#include <crypto/crypto_wq.h>
-#include <asm/byteorder.h>
-#include <linux/hardirq.h>
-#include <asm/fpu/api.h>
-#include "sha512_mb_ctx.h"
-
-#define FLUSH_INTERVAL 1000 /* in usec */
-
-static struct mcryptd_alg_state sha512_mb_alg_state;
-
-struct sha512_mb_ctx {
- struct mcryptd_ahash *mcryptd_tfm;
-};
-
-static inline struct mcryptd_hash_request_ctx
- *cast_hash_to_mcryptd_ctx(struct sha512_hash_ctx *hash_ctx)
-{
- struct ahash_request *areq;
-
- areq = container_of((void *) hash_ctx, struct ahash_request, __ctx);
- return container_of(areq, struct mcryptd_hash_request_ctx, areq);
-}
-
-static inline struct ahash_request
- *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
-{
- return container_of((void *) ctx, struct ahash_request, __ctx);
-}
-
-static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
- struct ahash_request *areq)
-{
- rctx->flag = HASH_UPDATE;
-}
-
-static asmlinkage void (*sha512_job_mgr_init)(struct sha512_mb_mgr *state);
-static asmlinkage struct job_sha512* (*sha512_job_mgr_submit)
- (struct sha512_mb_mgr *state,
- struct job_sha512 *job);
-static asmlinkage struct job_sha512* (*sha512_job_mgr_flush)
- (struct sha512_mb_mgr *state);
-static asmlinkage struct job_sha512* (*sha512_job_mgr_get_comp_job)
- (struct sha512_mb_mgr *state);
-
-inline uint32_t sha512_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2],
- uint64_t total_len)
-{
- uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
-
- memset(&padblock[i], 0, SHA512_BLOCK_SIZE);
- padblock[i] = 0x80;
-
- i += ((SHA512_BLOCK_SIZE - 1) &
- (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1)))
- + 1 + SHA512_PADLENGTHFIELD_SIZE;
-
-#if SHA512_PADLENGTHFIELD_SIZE == 16
- *((uint64_t *) &padblock[i - 16]) = 0;
-#endif
-
- *((uint64_t *) &padblock[i - 8]) = cpu_to_be64(total_len << 3);
-
- /* Number of extra blocks to hash */
- return i >> SHA512_LOG2_BLOCK_SIZE;
-}
-
-static struct sha512_hash_ctx *sha512_ctx_mgr_resubmit
- (struct sha512_ctx_mgr *mgr, struct sha512_hash_ctx *ctx)
-{
- while (ctx) {
- if (ctx->status & HASH_CTX_STS_COMPLETE) {
- /* Clear PROCESSING bit */
- ctx->status = HASH_CTX_STS_COMPLETE;
- return ctx;
- }
-
- /*
- * If the extra blocks are empty, begin hashing what remains
- * in the user's buffer.
- */
- if (ctx->partial_block_buffer_length == 0 &&
- ctx->incoming_buffer_length) {
-
- const void *buffer = ctx->incoming_buffer;
- uint32_t len = ctx->incoming_buffer_length;
- uint32_t copy_len;
-
- /*
- * Only entire blocks can be hashed.
- * Copy remainder to extra blocks buffer.
- */
- copy_len = len & (SHA512_BLOCK_SIZE-1);
-
- if (copy_len) {
- len -= copy_len;
- memcpy(ctx->partial_block_buffer,
- ((const char *) buffer + len),
- copy_len);
- ctx->partial_block_buffer_length = copy_len;
- }
-
- ctx->incoming_buffer_length = 0;
-
- /* len should be a multiple of the block size now */
- assert((len % SHA512_BLOCK_SIZE) == 0);
-
- /* Set len to the number of blocks to be hashed */
- len >>= SHA512_LOG2_BLOCK_SIZE;
-
- if (len) {
-
- ctx->job.buffer = (uint8_t *) buffer;
- ctx->job.len = len;
- ctx = (struct sha512_hash_ctx *)
- sha512_job_mgr_submit(&mgr->mgr,
- &ctx->job);
- continue;
- }
- }
-
- /*
- * If the extra blocks are not empty, then we are
- * either on the last block(s) or we need more
- * user input before continuing.
- */
- if (ctx->status & HASH_CTX_STS_LAST) {
-
- uint8_t *buf = ctx->partial_block_buffer;
- uint32_t n_extra_blocks =
- sha512_pad(buf, ctx->total_length);
-
- ctx->status = (HASH_CTX_STS_PROCESSING |
- HASH_CTX_STS_COMPLETE);
- ctx->job.buffer = buf;
- ctx->job.len = (uint32_t) n_extra_blocks;
- ctx = (struct sha512_hash_ctx *)
- sha512_job_mgr_submit(&mgr->mgr, &ctx->job);
- continue;
- }
-
- if (ctx)
- ctx->status = HASH_CTX_STS_IDLE;
- return ctx;
- }
-
- return NULL;
-}
-
-static struct sha512_hash_ctx
- *sha512_ctx_mgr_get_comp_ctx(struct mcryptd_alg_cstate *cstate)
-{
- /*
- * If get_comp_job returns NULL, there are no jobs complete.
- * If get_comp_job returns a job, verify that it is safe to return to
- * the user.
- * If it is not ready, resubmit the job to finish processing.
- * If sha512_ctx_mgr_resubmit returned a job, it is ready to be
- * returned.
- * Otherwise, all jobs currently being managed by the hash_ctx_mgr
- * still need processing.
- */
- struct sha512_ctx_mgr *mgr;
- struct sha512_hash_ctx *ctx;
- unsigned long flags;
-
- mgr = cstate->mgr;
- spin_lock_irqsave(&cstate->work_lock, flags);
- ctx = (struct sha512_hash_ctx *)
- sha512_job_mgr_get_comp_job(&mgr->mgr);
- ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
- spin_unlock_irqrestore(&cstate->work_lock, flags);
- return ctx;
-}
-
-static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr)
-{
- sha512_job_mgr_init(&mgr->mgr);
-}
-
-static struct sha512_hash_ctx
- *sha512_ctx_mgr_submit(struct mcryptd_alg_cstate *cstate,
- struct sha512_hash_ctx *ctx,
- const void *buffer,
- uint32_t len,
- int flags)
-{
- struct sha512_ctx_mgr *mgr;
- unsigned long irqflags;
-
- mgr = cstate->mgr;
- spin_lock_irqsave(&cstate->work_lock, irqflags);
- if (flags & ~(HASH_UPDATE | HASH_LAST)) {
- /* User should not pass anything other than UPDATE or LAST */
- ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
- goto unlock;
- }
-
- if (ctx->status & HASH_CTX_STS_PROCESSING) {
- /* Cannot submit to a currently processing job. */
- ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
- goto unlock;
- }
-
- if (ctx->status & HASH_CTX_STS_COMPLETE) {
- /* Cannot update a finished job. */
- ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
- goto unlock;
- }
-
- /*
- * If we made it here, there were no errors during this call to
- * submit
- */
- ctx->error = HASH_CTX_ERROR_NONE;
-
- /* Store buffer ptr info from user */
- ctx->incoming_buffer = buffer;
- ctx->incoming_buffer_length = len;
-
- /*
- * Store the user's request flags and mark this ctx as currently being
- * processed.
- */
- ctx->status = (flags & HASH_LAST) ?
- (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
- HASH_CTX_STS_PROCESSING;
-
- /* Advance byte counter */
- ctx->total_length += len;
-
- /*
- * If there is anything currently buffered in the extra blocks,
- * append to it until it contains a whole block.
- * Or if the user's buffer contains less than a whole block,
- * append as much as possible to the extra block.
- */
- if (ctx->partial_block_buffer_length || len < SHA512_BLOCK_SIZE) {
- /* Compute how many bytes to copy from user buffer into extra
- * block
- */
- uint32_t copy_len = SHA512_BLOCK_SIZE -
- ctx->partial_block_buffer_length;
- if (len < copy_len)
- copy_len = len;
-
- if (copy_len) {
- /* Copy and update relevant pointers and counters */
- memcpy
- (&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
- buffer, copy_len);
-
- ctx->partial_block_buffer_length += copy_len;
- ctx->incoming_buffer = (const void *)
- ((const char *)buffer + copy_len);
- ctx->incoming_buffer_length = len - copy_len;
- }
-
- /* The extra block should never contain more than 1 block
- * here
- */
- assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
-
- /* If the extra block buffer contains exactly 1 block, it can
- * be hashed.
- */
- if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
- ctx->partial_block_buffer_length = 0;
-
- ctx->job.buffer = ctx->partial_block_buffer;
- ctx->job.len = 1;
- ctx = (struct sha512_hash_ctx *)
- sha512_job_mgr_submit(&mgr->mgr, &ctx->job);
- }
- }
-
- ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
-unlock:
- spin_unlock_irqrestore(&cstate->work_lock, irqflags);
- return ctx;
-}
-
-static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct mcryptd_alg_cstate *cstate)
-{
- struct sha512_ctx_mgr *mgr;
- struct sha512_hash_ctx *ctx;
- unsigned long flags;
-
- mgr = cstate->mgr;
- spin_lock_irqsave(&cstate->work_lock, flags);
- while (1) {
- ctx = (struct sha512_hash_ctx *)
- sha512_job_mgr_flush(&mgr->mgr);
-
- /* If flush returned 0, there are no more jobs in flight. */
- if (!ctx)
- break;
-
- /*
- * If flush returned a job, resubmit the job to finish
- * processing.
- */
- ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
-
- /*
- * If sha512_ctx_mgr_resubmit returned a job, it is ready to
- * be returned. Otherwise, all jobs currently being managed by
- * the sha512_ctx_mgr still need processing. Loop.
- */
- if (ctx)
- break;
- }
- spin_unlock_irqrestore(&cstate->work_lock, flags);
- return ctx;
-}
-
-static int sha512_mb_init(struct ahash_request *areq)
-{
- struct sha512_hash_ctx *sctx = ahash_request_ctx(areq);
-
- hash_ctx_init(sctx);
- sctx->job.result_digest[0] = SHA512_H0;
- sctx->job.result_digest[1] = SHA512_H1;
- sctx->job.result_digest[2] = SHA512_H2;
- sctx->job.result_digest[3] = SHA512_H3;
- sctx->job.result_digest[4] = SHA512_H4;
- sctx->job.result_digest[5] = SHA512_H5;
- sctx->job.result_digest[6] = SHA512_H6;
- sctx->job.result_digest[7] = SHA512_H7;
- sctx->total_length = 0;
- sctx->partial_block_buffer_length = 0;
- sctx->status = HASH_CTX_STS_IDLE;
-
- return 0;
-}
-
-static int sha512_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
-{
- int i;
- struct sha512_hash_ctx *sctx = ahash_request_ctx(&rctx->areq);
- __be64 *dst = (__be64 *) rctx->out;
-
- for (i = 0; i < 8; ++i)
- dst[i] = cpu_to_be64(sctx->job.result_digest[i]);
-
- return 0;
-}
-
-static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
- struct mcryptd_alg_cstate *cstate, bool flush)
-{
- int flag = HASH_UPDATE;
- int nbytes, err = 0;
- struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
- struct sha512_hash_ctx *sha_ctx;
-
- /* more work ? */
- while (!(rctx->flag & HASH_DONE)) {
- nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
- if (nbytes < 0) {
- err = nbytes;
- goto out;
- }
- /* check if the walk is done */
- if (crypto_ahash_walk_last(&rctx->walk)) {
- rctx->flag |= HASH_DONE;
- if (rctx->flag & HASH_FINAL)
- flag |= HASH_LAST;
-
- }
- sha_ctx = (struct sha512_hash_ctx *)
- ahash_request_ctx(&rctx->areq);
- kernel_fpu_begin();
- sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx,
- rctx->walk.data, nbytes, flag);
- if (!sha_ctx) {
- if (flush)
- sha_ctx = sha512_ctx_mgr_flush(cstate);
- }
- kernel_fpu_end();
- if (sha_ctx)
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- else {
- rctx = NULL;
- goto out;
- }
- }
-
- /* copy the results */
- if (rctx->flag & HASH_FINAL)
- sha512_mb_set_results(rctx);
-
-out:
- *ret_rctx = rctx;
- return err;
-}
-
-static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
- struct mcryptd_alg_cstate *cstate,
- int err)
-{
- struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
- struct sha512_hash_ctx *sha_ctx;
- struct mcryptd_hash_request_ctx *req_ctx;
- int ret;
- unsigned long flags;
-
- /* remove from work list */
- spin_lock_irqsave(&cstate->work_lock, flags);
- list_del(&rctx->waiter);
- spin_unlock_irqrestore(&cstate->work_lock, flags);
-
- if (irqs_disabled())
- rctx->complete(&req->base, err);
- else {
- local_bh_disable();
- rctx->complete(&req->base, err);
- local_bh_enable();
- }
-
- /* check to see if there are other jobs that are done */
- sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate);
- while (sha_ctx) {
- req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&req_ctx, cstate, false);
- if (req_ctx) {
- spin_lock_irqsave(&cstate->work_lock, flags);
- list_del(&req_ctx->waiter);
- spin_unlock_irqrestore(&cstate->work_lock, flags);
-
- req = cast_mcryptd_ctx_to_req(req_ctx);
- if (irqs_disabled())
- req_ctx->complete(&req->base, ret);
- else {
- local_bh_disable();
- req_ctx->complete(&req->base, ret);
- local_bh_enable();
- }
- }
- sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate);
- }
-
- return 0;
-}
-
-static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
- struct mcryptd_alg_cstate *cstate)
-{
- unsigned long next_flush;
- unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
- unsigned long flags;
-
- /* initialize tag */
- rctx->tag.arrival = jiffies; /* tag the arrival time */
- rctx->tag.seq_num = cstate->next_seq_num++;
- next_flush = rctx->tag.arrival + delay;
- rctx->tag.expire = next_flush;
-
- spin_lock_irqsave(&cstate->work_lock, flags);
- list_add_tail(&rctx->waiter, &cstate->work_list);
- spin_unlock_irqrestore(&cstate->work_lock, flags);
-
- mcryptd_arm_flusher(cstate, delay);
-}
-
-static int sha512_mb_update(struct ahash_request *areq)
-{
- struct mcryptd_hash_request_ctx *rctx =
- container_of(areq, struct mcryptd_hash_request_ctx,
- areq);
- struct mcryptd_alg_cstate *cstate =
- this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
-
- struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
- struct sha512_hash_ctx *sha_ctx;
- int ret = 0, nbytes;
-
-
- /* sanity check */
- if (rctx->tag.cpu != smp_processor_id()) {
- pr_err("mcryptd error: cpu clash\n");
- goto done;
- }
-
- /* need to init context */
- req_ctx_init(rctx, areq);
-
- nbytes = crypto_ahash_walk_first(req, &rctx->walk);
-
- if (nbytes < 0) {
- ret = nbytes;
- goto done;
- }
-
- if (crypto_ahash_walk_last(&rctx->walk))
- rctx->flag |= HASH_DONE;
-
- /* submit */
- sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
- sha512_mb_add_list(rctx, cstate);
- kernel_fpu_begin();
- sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data,
- nbytes, HASH_UPDATE);
- kernel_fpu_end();
-
- /* check if anything is returned */
- if (!sha_ctx)
- return -EINPROGRESS;
-
- if (sha_ctx->error) {
- ret = sha_ctx->error;
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- goto done;
- }
-
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&rctx, cstate, false);
-
- if (!rctx)
- return -EINPROGRESS;
-done:
- sha_complete_job(rctx, cstate, ret);
- return ret;
-}
-
-static int sha512_mb_finup(struct ahash_request *areq)
-{
- struct mcryptd_hash_request_ctx *rctx =
- container_of(areq, struct mcryptd_hash_request_ctx,
- areq);
- struct mcryptd_alg_cstate *cstate =
- this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
-
- struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
- struct sha512_hash_ctx *sha_ctx;
- int ret = 0, flag = HASH_UPDATE, nbytes;
-
- /* sanity check */
- if (rctx->tag.cpu != smp_processor_id()) {
- pr_err("mcryptd error: cpu clash\n");
- goto done;
- }
-
- /* need to init context */
- req_ctx_init(rctx, areq);
-
- nbytes = crypto_ahash_walk_first(req, &rctx->walk);
-
- if (nbytes < 0) {
- ret = nbytes;
- goto done;
- }
-
- if (crypto_ahash_walk_last(&rctx->walk)) {
- rctx->flag |= HASH_DONE;
- flag = HASH_LAST;
- }
-
- /* submit */
- rctx->flag |= HASH_FINAL;
- sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
- sha512_mb_add_list(rctx, cstate);
-
- kernel_fpu_begin();
- sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data,
- nbytes, flag);
- kernel_fpu_end();
-
- /* check if anything is returned */
- if (!sha_ctx)
- return -EINPROGRESS;
-
- if (sha_ctx->error) {
- ret = sha_ctx->error;
- goto done;
- }
-
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&rctx, cstate, false);
- if (!rctx)
- return -EINPROGRESS;
-done:
- sha_complete_job(rctx, cstate, ret);
- return ret;
-}
-
-static int sha512_mb_final(struct ahash_request *areq)
-{
- struct mcryptd_hash_request_ctx *rctx =
- container_of(areq, struct mcryptd_hash_request_ctx,
- areq);
- struct mcryptd_alg_cstate *cstate =
- this_cpu_ptr(sha512_mb_alg_state.alg_cstate);
-
- struct sha512_hash_ctx *sha_ctx;
- int ret = 0;
- u8 data;
-
- /* sanity check */
- if (rctx->tag.cpu != smp_processor_id()) {
- pr_err("mcryptd error: cpu clash\n");
- goto done;
- }
-
- /* need to init context */
- req_ctx_init(rctx, areq);
-
- rctx->flag |= HASH_DONE | HASH_FINAL;
-
- sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
- /* flag HASH_FINAL and 0 data size */
- sha512_mb_add_list(rctx, cstate);
- kernel_fpu_begin();
- sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, &data, 0, HASH_LAST);
- kernel_fpu_end();
-
- /* check if anything is returned */
- if (!sha_ctx)
- return -EINPROGRESS;
-
- if (sha_ctx->error) {
- ret = sha_ctx->error;
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- goto done;
- }
-
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- ret = sha_finish_walk(&rctx, cstate, false);
- if (!rctx)
- return -EINPROGRESS;
-done:
- sha_complete_job(rctx, cstate, ret);
- return ret;
-}
-
-static int sha512_mb_export(struct ahash_request *areq, void *out)
-{
- struct sha512_hash_ctx *sctx = ahash_request_ctx(areq);
-
- memcpy(out, sctx, sizeof(*sctx));
-
- return 0;
-}
-
-static int sha512_mb_import(struct ahash_request *areq, const void *in)
-{
- struct sha512_hash_ctx *sctx = ahash_request_ctx(areq);
-
- memcpy(sctx, in, sizeof(*sctx));
-
- return 0;
-}
-
-static int sha512_mb_async_init_tfm(struct crypto_tfm *tfm)
-{
- struct mcryptd_ahash *mcryptd_tfm;
- struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
- struct mcryptd_hash_ctx *mctx;
-
- mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha512-mb",
- CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
- if (IS_ERR(mcryptd_tfm))
- return PTR_ERR(mcryptd_tfm);
- mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
- mctx->alg_state = &sha512_mb_alg_state;
- ctx->mcryptd_tfm = mcryptd_tfm;
- crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
- sizeof(struct ahash_request) +
- crypto_ahash_reqsize(&mcryptd_tfm->base));
-
- return 0;
-}
-
-static void sha512_mb_async_exit_tfm(struct crypto_tfm *tfm)
-{
- struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-
- mcryptd_free_ahash(ctx->mcryptd_tfm);
-}
-
-static int sha512_mb_areq_init_tfm(struct crypto_tfm *tfm)
-{
- crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
- sizeof(struct ahash_request) +
- sizeof(struct sha512_hash_ctx));
-
- return 0;
-}
-
-static void sha512_mb_areq_exit_tfm(struct crypto_tfm *tfm)
-{
- struct sha512_mb_ctx *ctx = crypto_tfm_ctx(tfm);
-
- mcryptd_free_ahash(ctx->mcryptd_tfm);
-}
-
-static struct ahash_alg sha512_mb_areq_alg = {
- .init = sha512_mb_init,
- .update = sha512_mb_update,
- .final = sha512_mb_final,
- .finup = sha512_mb_finup,
- .export = sha512_mb_export,
- .import = sha512_mb_import,
- .halg = {
- .digestsize = SHA512_DIGEST_SIZE,
- .statesize = sizeof(struct sha512_hash_ctx),
- .base = {
- .cra_name = "__sha512-mb",
- .cra_driver_name = "__intel_sha512-mb",
- .cra_priority = 100,
- /*
- * use ASYNC flag as some buffers in multi-buffer
- * algo may not have completed before hashing thread
- * sleep
- */
- .cra_flags = CRYPTO_ALG_ASYNC |
- CRYPTO_ALG_INTERNAL,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT
- (sha512_mb_areq_alg.halg.base.cra_list),
- .cra_init = sha512_mb_areq_init_tfm,
- .cra_exit = sha512_mb_areq_exit_tfm,
- .cra_ctxsize = sizeof(struct sha512_hash_ctx),
- }
- }
-};
-
-static int sha512_mb_async_init(struct ahash_request *req)
-{
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_init(mcryptd_req);
-}
-
-static int sha512_mb_async_update(struct ahash_request *req)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_update(mcryptd_req);
-}
-
-static int sha512_mb_async_finup(struct ahash_request *req)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_finup(mcryptd_req);
-}
-
-static int sha512_mb_async_final(struct ahash_request *req)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
-
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_final(mcryptd_req);
-}
-
-static int sha512_mb_async_digest(struct ahash_request *req)
-{
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_digest(mcryptd_req);
-}
-
-static int sha512_mb_async_export(struct ahash_request *req, void *out)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- return crypto_ahash_export(mcryptd_req, out);
-}
-
-static int sha512_mb_async_import(struct ahash_request *req, const void *in)
-{
- struct ahash_request *mcryptd_req = ahash_request_ctx(req);
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct sha512_mb_ctx *ctx = crypto_ahash_ctx(tfm);
- struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
- struct crypto_ahash *child = mcryptd_ahash_child(mcryptd_tfm);
- struct mcryptd_hash_request_ctx *rctx;
- struct ahash_request *areq;
-
- memcpy(mcryptd_req, req, sizeof(*req));
- ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
- rctx = ahash_request_ctx(mcryptd_req);
-
- areq = &rctx->areq;
-
- ahash_request_set_tfm(areq, child);
- ahash_request_set_callback(areq, CRYPTO_TFM_REQ_MAY_SLEEP,
- rctx->complete, req);
-
- return crypto_ahash_import(mcryptd_req, in);
-}
-
-static struct ahash_alg sha512_mb_async_alg = {
- .init = sha512_mb_async_init,
- .update = sha512_mb_async_update,
- .final = sha512_mb_async_final,
- .finup = sha512_mb_async_finup,
- .digest = sha512_mb_async_digest,
- .export = sha512_mb_async_export,
- .import = sha512_mb_async_import,
- .halg = {
- .digestsize = SHA512_DIGEST_SIZE,
- .statesize = sizeof(struct sha512_hash_ctx),
- .base = {
- .cra_name = "sha512",
- .cra_driver_name = "sha512_mb",
- /*
- * Low priority, since with few concurrent hash requests
- * this is extremely slow due to the flush delay. Users
- * whose workloads would benefit from this can request
- * it explicitly by driver name, or can increase its
- * priority at runtime using NETLINK_CRYPTO.
- */
- .cra_priority = 50,
- .cra_flags = CRYPTO_ALG_ASYNC,
- .cra_blocksize = SHA512_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- .cra_list = LIST_HEAD_INIT
- (sha512_mb_async_alg.halg.base.cra_list),
- .cra_init = sha512_mb_async_init_tfm,
- .cra_exit = sha512_mb_async_exit_tfm,
- .cra_ctxsize = sizeof(struct sha512_mb_ctx),
- .cra_alignmask = 0,
- },
- },
-};
-
-static unsigned long sha512_mb_flusher(struct mcryptd_alg_cstate *cstate)
-{
- struct mcryptd_hash_request_ctx *rctx;
- unsigned long cur_time;
- unsigned long next_flush = 0;
- struct sha512_hash_ctx *sha_ctx;
-
-
- cur_time = jiffies;
-
- while (!list_empty(&cstate->work_list)) {
- rctx = list_entry(cstate->work_list.next,
- struct mcryptd_hash_request_ctx, waiter);
- if time_before(cur_time, rctx->tag.expire)
- break;
- kernel_fpu_begin();
- sha_ctx = (struct sha512_hash_ctx *)
- sha512_ctx_mgr_flush(cstate);
- kernel_fpu_end();
- if (!sha_ctx) {
- pr_err("sha512_mb error: nothing got flushed for"
- " non-empty list\n");
- break;
- }
- rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
- sha_finish_walk(&rctx, cstate, true);
- sha_complete_job(rctx, cstate, 0);
- }
-
- if (!list_empty(&cstate->work_list)) {
- rctx = list_entry(cstate->work_list.next,
- struct mcryptd_hash_request_ctx, waiter);
- /* get the hash context and then flush time */
- next_flush = rctx->tag.expire;
- mcryptd_arm_flusher(cstate, get_delay(next_flush));
- }
- return next_flush;
-}
-
-static int __init sha512_mb_mod_init(void)
-{
-
- int cpu;
- int err;
- struct mcryptd_alg_cstate *cpu_state;
-
- /* check for dependent cpu features */
- if (!boot_cpu_has(X86_FEATURE_AVX2) ||
- !boot_cpu_has(X86_FEATURE_BMI2))
- return -ENODEV;
-
- /* initialize multibuffer structures */
- sha512_mb_alg_state.alg_cstate =
- alloc_percpu(struct mcryptd_alg_cstate);
-
- sha512_job_mgr_init = sha512_mb_mgr_init_avx2;
- sha512_job_mgr_submit = sha512_mb_mgr_submit_avx2;
- sha512_job_mgr_flush = sha512_mb_mgr_flush_avx2;
- sha512_job_mgr_get_comp_job = sha512_mb_mgr_get_comp_job_avx2;
-
- if (!sha512_mb_alg_state.alg_cstate)
- return -ENOMEM;
- for_each_possible_cpu(cpu) {
- cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
- cpu_state->next_flush = 0;
- cpu_state->next_seq_num = 0;
- cpu_state->flusher_engaged = false;
- INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
- cpu_state->cpu = cpu;
- cpu_state->alg_state = &sha512_mb_alg_state;
- cpu_state->mgr = kzalloc(sizeof(struct sha512_ctx_mgr),
- GFP_KERNEL);
- if (!cpu_state->mgr)
- goto err2;
- sha512_ctx_mgr_init(cpu_state->mgr);
- INIT_LIST_HEAD(&cpu_state->work_list);
- spin_lock_init(&cpu_state->work_lock);
- }
- sha512_mb_alg_state.flusher = &sha512_mb_flusher;
-
- err = crypto_register_ahash(&sha512_mb_areq_alg);
- if (err)
- goto err2;
- err = crypto_register_ahash(&sha512_mb_async_alg);
- if (err)
- goto err1;
-
-
- return 0;
-err1:
- crypto_unregister_ahash(&sha512_mb_areq_alg);
-err2:
- for_each_possible_cpu(cpu) {
- cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
- kfree(cpu_state->mgr);
- }
- free_percpu(sha512_mb_alg_state.alg_cstate);
- return -ENODEV;
-}
-
-static void __exit sha512_mb_mod_fini(void)
-{
- int cpu;
- struct mcryptd_alg_cstate *cpu_state;
-
- crypto_unregister_ahash(&sha512_mb_async_alg);
- crypto_unregister_ahash(&sha512_mb_areq_alg);
- for_each_possible_cpu(cpu) {
- cpu_state = per_cpu_ptr(sha512_mb_alg_state.alg_cstate, cpu);
- kfree(cpu_state->mgr);
- }
- free_percpu(sha512_mb_alg_state.alg_cstate);
-}
-
-module_init(sha512_mb_mod_init);
-module_exit(sha512_mb_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, multi buffer accelerated");
-
-MODULE_ALIAS("sha512");
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
deleted file mode 100644
index e5c465b..0000000
--- a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Header file for multi buffer SHA512 context
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _SHA_MB_CTX_INTERNAL_H
-#define _SHA_MB_CTX_INTERNAL_H
-
-#include "sha512_mb_mgr.h"
-
-#define HASH_UPDATE 0x00
-#define HASH_LAST 0x01
-#define HASH_DONE 0x02
-#define HASH_FINAL 0x04
-
-#define HASH_CTX_STS_IDLE 0x00
-#define HASH_CTX_STS_PROCESSING 0x01
-#define HASH_CTX_STS_LAST 0x02
-#define HASH_CTX_STS_COMPLETE 0x04
-
-enum hash_ctx_error {
- HASH_CTX_ERROR_NONE = 0,
- HASH_CTX_ERROR_INVALID_FLAGS = -1,
- HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
- HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
-};
-
-#define hash_ctx_user_data(ctx) ((ctx)->user_data)
-#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
-#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
-#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
-#define hash_ctx_status(ctx) ((ctx)->status)
-#define hash_ctx_error(ctx) ((ctx)->error)
-#define hash_ctx_init(ctx) \
- do { \
- (ctx)->error = HASH_CTX_ERROR_NONE; \
- (ctx)->status = HASH_CTX_STS_COMPLETE; \
- } while (0)
-
-/* Hash Constants and Typedefs */
-#define SHA512_DIGEST_LENGTH 8
-#define SHA512_LOG2_BLOCK_SIZE 7
-
-#define SHA512_PADLENGTHFIELD_SIZE 16
-
-#ifdef SHA_MB_DEBUG
-#define assert(expr) \
-do { \
- if (unlikely(!(expr))) { \
- printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
- #expr, __FILE__, __func__, __LINE__); \
- } \
-} while (0)
-#else
-#define assert(expr) do {} while (0)
-#endif
-
-struct sha512_ctx_mgr {
- struct sha512_mb_mgr mgr;
-};
-
-/* typedef struct sha512_ctx_mgr sha512_ctx_mgr; */
-
-struct sha512_hash_ctx {
- /* Must be at struct offset 0 */
- struct job_sha512 job;
- /* status flag */
- int status;
- /* error flag */
- int error;
-
- uint64_t total_length;
- const void *incoming_buffer;
- uint32_t incoming_buffer_length;
- uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2];
- uint32_t partial_block_buffer_length;
- void *user_data;
-};
-
-#endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h b/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h
deleted file mode 100644
index 178f17e..0000000
--- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Header file for multi buffer SHA512 algorithm manager
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef __SHA_MB_MGR_H
-#define __SHA_MB_MGR_H
-
-#include <linux/types.h>
-
-#define NUM_SHA512_DIGEST_WORDS 8
-
-enum job_sts {STS_UNKNOWN = 0,
- STS_BEING_PROCESSED = 1,
- STS_COMPLETED = 2,
- STS_INTERNAL_ERROR = 3,
- STS_ERROR = 4
-};
-
-struct job_sha512 {
- u8 *buffer;
- u64 len;
- u64 result_digest[NUM_SHA512_DIGEST_WORDS] __aligned(32);
- enum job_sts status;
- void *user_data;
-};
-
-struct sha512_args_x4 {
- uint64_t digest[8][4];
- uint8_t *data_ptr[4];
-};
-
-struct sha512_lane_data {
- struct job_sha512 *job_in_lane;
-};
-
-struct sha512_mb_mgr {
- struct sha512_args_x4 args;
-
- uint64_t lens[4];
-
- /* each byte is index (0...7) of unused lanes */
- uint64_t unused_lanes;
- /* byte 4 is set to FF as a flag */
- struct sha512_lane_data ldata[4];
-};
-
-#define SHA512_MB_MGR_NUM_LANES_AVX2 4
-
-void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state);
-struct job_sha512 *sha512_mb_mgr_submit_avx2(struct sha512_mb_mgr *state,
- struct job_sha512 *job);
-struct job_sha512 *sha512_mb_mgr_flush_avx2(struct sha512_mb_mgr *state);
-struct job_sha512 *sha512_mb_mgr_get_comp_job_avx2(struct sha512_mb_mgr *state);
-
-#endif
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S
deleted file mode 100644
index cf2636d..0000000
--- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_datastruct.S
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Header file for multi buffer SHA256 algorithm data structure
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-# Macros for defining data structures
-
-# Usage example
-
-#START_FIELDS # JOB_AES
-### name size align
-#FIELD _plaintext, 8, 8 # pointer to plaintext
-#FIELD _ciphertext, 8, 8 # pointer to ciphertext
-#FIELD _IV, 16, 8 # IV
-#FIELD _keys, 8, 8 # pointer to keys
-#FIELD _len, 4, 4 # length in bytes
-#FIELD _status, 4, 4 # status enumeration
-#FIELD _user_data, 8, 8 # pointer to user data
-#UNION _union, size1, align1, \
-# size2, align2, \
-# size3, align3, \
-# ...
-#END_FIELDS
-#%assign _JOB_AES_size _FIELD_OFFSET
-#%assign _JOB_AES_align _STRUCT_ALIGN
-
-#########################################################################
-
-# Alternate "struc-like" syntax:
-# STRUCT job_aes2
-# RES_Q .plaintext, 1
-# RES_Q .ciphertext, 1
-# RES_DQ .IV, 1
-# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
-# RES_U .union, size1, align1, \
-# size2, align2, \
-# ...
-# ENDSTRUCT
-# # Following only needed if nesting
-# %assign job_aes2_size _FIELD_OFFSET
-# %assign job_aes2_align _STRUCT_ALIGN
-#
-# RES_* macros take a name, a count and an optional alignment.
-# The count in in terms of the base size of the macro, and the
-# default alignment is the base size.
-# The macros are:
-# Macro Base size
-# RES_B 1
-# RES_W 2
-# RES_D 4
-# RES_Q 8
-# RES_DQ 16
-# RES_Y 32
-# RES_Z 64
-#
-# RES_U defines a union. It's arguments are a name and two or more
-# pairs of "size, alignment"
-#
-# The two assigns are only needed if this structure is being nested
-# within another. Even if the assigns are not done, one can still use
-# STRUCT_NAME_size as the size of the structure.
-#
-# Note that for nesting, you still need to assign to STRUCT_NAME_size.
-#
-# The differences between this and using "struc" directly are that each
-# type is implicitly aligned to its natural length (although this can be
-# over-ridden with an explicit third parameter), and that the structure
-# is padded at the end to its overall alignment.
-#
-
-#########################################################################
-
-#ifndef _DATASTRUCT_ASM_
-#define _DATASTRUCT_ASM_
-
-#define PTR_SZ 8
-#define SHA512_DIGEST_WORD_SIZE 8
-#define SHA512_MB_MGR_NUM_LANES_AVX2 4
-#define NUM_SHA512_DIGEST_WORDS 8
-#define SZ4 4*SHA512_DIGEST_WORD_SIZE
-#define ROUNDS 80*SZ4
-#define SHA512_DIGEST_ROW_SIZE (SHA512_MB_MGR_NUM_LANES_AVX2 * 8)
-
-# START_FIELDS
-.macro START_FIELDS
- _FIELD_OFFSET = 0
- _STRUCT_ALIGN = 0
-.endm
-
-# FIELD name size align
-.macro FIELD name size align
- _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1))
- \name = _FIELD_OFFSET
- _FIELD_OFFSET = _FIELD_OFFSET + (\size)
-.if (\align > _STRUCT_ALIGN)
- _STRUCT_ALIGN = \align
-.endif
-.endm
-
-# END_FIELDS
-.macro END_FIELDS
- _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
-.endm
-
-.macro STRUCT p1
-START_FIELDS
-.struc \p1
-.endm
-
-.macro ENDSTRUCT
- tmp = _FIELD_OFFSET
- END_FIELDS
- tmp = (_FIELD_OFFSET - ##tmp)
-.if (tmp > 0)
- .lcomm tmp
-.endm
-
-## RES_int name size align
-.macro RES_int p1 p2 p3
- name = \p1
- size = \p2
- align = .\p3
-
- _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1))
-.align align
-.lcomm name size
- _FIELD_OFFSET = _FIELD_OFFSET + (size)
-.if (align > _STRUCT_ALIGN)
- _STRUCT_ALIGN = align
-.endif
-.endm
-
-# macro RES_B name, size [, align]
-.macro RES_B _name, _size, _align=1
-RES_int _name _size _align
-.endm
-
-# macro RES_W name, size [, align]
-.macro RES_W _name, _size, _align=2
-RES_int _name 2*(_size) _align
-.endm
-
-# macro RES_D name, size [, align]
-.macro RES_D _name, _size, _align=4
-RES_int _name 4*(_size) _align
-.endm
-
-# macro RES_Q name, size [, align]
-.macro RES_Q _name, _size, _align=8
-RES_int _name 8*(_size) _align
-.endm
-
-# macro RES_DQ name, size [, align]
-.macro RES_DQ _name, _size, _align=16
-RES_int _name 16*(_size) _align
-.endm
-
-# macro RES_Y name, size [, align]
-.macro RES_Y _name, _size, _align=32
-RES_int _name 32*(_size) _align
-.endm
-
-# macro RES_Z name, size [, align]
-.macro RES_Z _name, _size, _align=64
-RES_int _name 64*(_size) _align
-.endm
-
-#endif
-
-###################################################################
-### Define SHA512 Out Of Order Data Structures
-###################################################################
-
-START_FIELDS # LANE_DATA
-### name size align
-FIELD _job_in_lane, 8, 8 # pointer to job object
-END_FIELDS
-
- _LANE_DATA_size = _FIELD_OFFSET
- _LANE_DATA_align = _STRUCT_ALIGN
-
-####################################################################
-
-START_FIELDS # SHA512_ARGS_X4
-### name size align
-FIELD _digest, 8*8*4, 4 # transposed digest
-FIELD _data_ptr, 8*4, 8 # array of pointers to data
-END_FIELDS
-
- _SHA512_ARGS_X4_size = _FIELD_OFFSET
- _SHA512_ARGS_X4_align = _STRUCT_ALIGN
-
-#####################################################################
-
-START_FIELDS # MB_MGR
-### name size align
-FIELD _args, _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align
-FIELD _lens, 8*4, 8
-FIELD _unused_lanes, 8, 8
-FIELD _ldata, _LANE_DATA_size*4, _LANE_DATA_align
-END_FIELDS
-
- _MB_MGR_size = _FIELD_OFFSET
- _MB_MGR_align = _STRUCT_ALIGN
-
-_args_digest = _args + _digest
-_args_data_ptr = _args + _data_ptr
-
-#######################################################################
-
-#######################################################################
-#### Define constants
-#######################################################################
-
-#define STS_UNKNOWN 0
-#define STS_BEING_PROCESSED 1
-#define STS_COMPLETED 2
-
-#######################################################################
-#### Define JOB_SHA512 structure
-#######################################################################
-
-START_FIELDS # JOB_SHA512
-### name size align
-FIELD _buffer, 8, 8 # pointer to buffer
-FIELD _len, 8, 8 # length in bytes
-FIELD _result_digest, 8*8, 32 # Digest (output)
-FIELD _status, 4, 4
-FIELD _user_data, 8, 8
-END_FIELDS
-
- _JOB_SHA512_size = _FIELD_OFFSET
- _JOB_SHA512_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
deleted file mode 100644
index 7c629ca..0000000
--- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * Flush routine for SHA512 multibuffer
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-#include "sha512_mb_mgr_datastruct.S"
-
-.extern sha512_x4_avx2
-
-# LINUX register definitions
-#define arg1 %rdi
-#define arg2 %rsi
-
-# idx needs to be other than arg1, arg2, rbx, r12
-#define idx %rdx
-
-# Common definitions
-#define state arg1
-#define job arg2
-#define len2 arg2
-
-#define unused_lanes %rbx
-#define lane_data %rbx
-#define tmp2 %rbx
-
-#define job_rax %rax
-#define tmp1 %rax
-#define size_offset %rax
-#define tmp %rax
-#define start_offset %rax
-
-#define tmp3 arg1
-
-#define extra_blocks arg2
-#define p arg2
-
-#define tmp4 %r8
-#define lens0 %r8
-
-#define lens1 %r9
-#define lens2 %r10
-#define lens3 %r11
-
-.macro LABEL prefix n
-\prefix\n\():
-.endm
-
-.macro JNE_SKIP i
-jne skip_\i
-.endm
-
-.altmacro
-.macro SET_OFFSET _offset
-offset = \_offset
-.endm
-.noaltmacro
-
-# JOB* sha512_mb_mgr_flush_avx2(MB_MGR *state)
-# arg 1 : rcx : state
-ENTRY(sha512_mb_mgr_flush_avx2)
- FRAME_BEGIN
- push %rbx
-
- # If bit (32+3) is set, then all lanes are empty
- mov _unused_lanes(state), unused_lanes
- bt $32+7, unused_lanes
- jc return_null
-
- # find a lane with a non-null job
- xor idx, idx
- offset = (_ldata + 1*_LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne one(%rip), idx
- offset = (_ldata + 2*_LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne two(%rip), idx
- offset = (_ldata + 3*_LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
- cmovne three(%rip), idx
-
- # copy idx to empty lanes
-copy_lane_data:
- offset = (_args + _data_ptr)
- mov offset(state,idx,8), tmp
-
- I = 0
-.rep 4
- offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
- cmpq $0, offset(state)
-.altmacro
- JNE_SKIP %I
- offset = (_args + _data_ptr + 8*I)
- mov tmp, offset(state)
- offset = (_lens + 8*I +4)
- movl $0xFFFFFFFF, offset(state)
-LABEL skip_ %I
- I = (I+1)
-.noaltmacro
-.endr
-
- # Find min length
- mov _lens + 0*8(state),lens0
- mov lens0,idx
- mov _lens + 1*8(state),lens1
- cmp idx,lens1
- cmovb lens1,idx
- mov _lens + 2*8(state),lens2
- cmp idx,lens2
- cmovb lens2,idx
- mov _lens + 3*8(state),lens3
- cmp idx,lens3
- cmovb lens3,idx
- mov idx,len2
- and $0xF,idx
- and $~0xFF,len2
- jz len_is_0
-
- sub len2, lens0
- sub len2, lens1
- sub len2, lens2
- sub len2, lens3
- shr $32,len2
- mov lens0, _lens + 0*8(state)
- mov lens1, _lens + 1*8(state)
- mov lens2, _lens + 2*8(state)
- mov lens3, _lens + 3*8(state)
-
- # "state" and "args" are the same address, arg1
- # len is arg2
- call sha512_x4_avx2
- # state and idx are intact
-
-len_is_0:
- # process completed job "idx"
- imul $_LANE_DATA_size, idx, lane_data
- lea _ldata(state, lane_data), lane_data
-
- mov _job_in_lane(lane_data), job_rax
- movq $0, _job_in_lane(lane_data)
- movl $STS_COMPLETED, _status(job_rax)
- mov _unused_lanes(state), unused_lanes
- shl $8, unused_lanes
- or idx, unused_lanes
- mov unused_lanes, _unused_lanes(state)
-
- movl $0xFFFFFFFF, _lens+4(state, idx, 8)
-
- vmovq _args_digest+0*32(state, idx, 8), %xmm0
- vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
- vmovq _args_digest+2*32(state, idx, 8), %xmm1
- vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
- vmovq _args_digest+4*32(state, idx, 8), %xmm2
- vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
- vmovq _args_digest+6*32(state, idx, 8), %xmm3
- vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
-
- vmovdqu %xmm0, _result_digest(job_rax)
- vmovdqu %xmm1, _result_digest+1*16(job_rax)
- vmovdqu %xmm2, _result_digest+2*16(job_rax)
- vmovdqu %xmm3, _result_digest+3*16(job_rax)
-
-return:
- pop %rbx
- FRAME_END
- ret
-
-return_null:
- xor job_rax, job_rax
- jmp return
-ENDPROC(sha512_mb_mgr_flush_avx2)
-.align 16
-
-ENTRY(sha512_mb_mgr_get_comp_job_avx2)
- push %rbx
-
- mov _unused_lanes(state), unused_lanes
- bt $(32+7), unused_lanes
- jc .return_null
-
- # Find min length
- mov _lens(state),lens0
- mov lens0,idx
- mov _lens+1*8(state),lens1
- cmp idx,lens1
- cmovb lens1,idx
- mov _lens+2*8(state),lens2
- cmp idx,lens2
- cmovb lens2,idx
- mov _lens+3*8(state),lens3
- cmp idx,lens3
- cmovb lens3,idx
- test $~0xF,idx
- jnz .return_null
- and $0xF,idx
-
- #process completed job "idx"
- imul $_LANE_DATA_size, idx, lane_data
- lea _ldata(state, lane_data), lane_data
-
- mov _job_in_lane(lane_data), job_rax
- movq $0, _job_in_lane(lane_data)
- movl $STS_COMPLETED, _status(job_rax)
- mov _unused_lanes(state), unused_lanes
- shl $8, unused_lanes
- or idx, unused_lanes
- mov unused_lanes, _unused_lanes(state)
-
- movl $0xFFFFFFFF, _lens+4(state, idx, 8)
-
- vmovq _args_digest(state, idx, 8), %xmm0
- vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
- vmovq _args_digest+2*32(state, idx, 8), %xmm1
- vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
- vmovq _args_digest+4*32(state, idx, 8), %xmm2
- vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
- vmovq _args_digest+6*32(state, idx, 8), %xmm3
- vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3
-
- vmovdqu %xmm0, _result_digest+0*16(job_rax)
- vmovdqu %xmm1, _result_digest+1*16(job_rax)
- vmovdqu %xmm2, _result_digest+2*16(job_rax)
- vmovdqu %xmm3, _result_digest+3*16(job_rax)
-
- pop %rbx
-
- ret
-
-.return_null:
- xor job_rax, job_rax
- pop %rbx
- ret
-ENDPROC(sha512_mb_mgr_get_comp_job_avx2)
-
-.section .rodata.cst8.one, "aM", @progbits, 8
-.align 8
-one:
-.quad 1
-
-.section .rodata.cst8.two, "aM", @progbits, 8
-.align 8
-two:
-.quad 2
-
-.section .rodata.cst8.three, "aM", @progbits, 8
-.align 8
-three:
-.quad 3
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
deleted file mode 100644
index d088050..0000000
--- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Initialization code for multi buffer SHA256 algorithm for AVX2
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "sha512_mb_mgr.h"
-
-void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state)
-{
- unsigned int j;
-
- /* initially all lanes are unused */
- state->lens[0] = 0xFFFFFFFF00000000;
- state->lens[1] = 0xFFFFFFFF00000001;
- state->lens[2] = 0xFFFFFFFF00000002;
- state->lens[3] = 0xFFFFFFFF00000003;
-
- state->unused_lanes = 0xFF03020100;
- for (j = 0; j < 4; j++)
- state->ldata[j].job_in_lane = NULL;
-}
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
deleted file mode 100644
index 4ba709b..0000000
--- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Buffer submit code for multi buffer SHA512 algorithm
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-#include "sha512_mb_mgr_datastruct.S"
-
-.extern sha512_x4_avx2
-
-#define arg1 %rdi
-#define arg2 %rsi
-
-#define idx %rdx
-#define last_len %rdx
-
-#define size_offset %rcx
-#define tmp2 %rcx
-
-# Common definitions
-#define state arg1
-#define job arg2
-#define len2 arg2
-#define p2 arg2
-
-#define p %r11
-#define start_offset %r11
-
-#define unused_lanes %rbx
-
-#define job_rax %rax
-#define len %rax
-
-#define lane %r12
-#define tmp3 %r12
-#define lens3 %r12
-
-#define extra_blocks %r8
-#define lens0 %r8
-
-#define tmp %r9
-#define lens1 %r9
-
-#define lane_data %r10
-#define lens2 %r10
-
-#define DWORD_len %eax
-
-# JOB* sha512_mb_mgr_submit_avx2(MB_MGR *state, JOB *job)
-# arg 1 : rcx : state
-# arg 2 : rdx : job
-ENTRY(sha512_mb_mgr_submit_avx2)
- FRAME_BEGIN
- push %rbx
- push %r12
-
- mov _unused_lanes(state), unused_lanes
- movzb %bl,lane
- shr $8, unused_lanes
- imul $_LANE_DATA_size, lane,lane_data
- movl $STS_BEING_PROCESSED, _status(job)
- lea _ldata(state, lane_data), lane_data
- mov unused_lanes, _unused_lanes(state)
- movl _len(job), DWORD_len
-
- mov job, _job_in_lane(lane_data)
- movl DWORD_len,_lens+4(state , lane, 8)
-
- # Load digest words from result_digest
- vmovdqu _result_digest+0*16(job), %xmm0
- vmovdqu _result_digest+1*16(job), %xmm1
- vmovdqu _result_digest+2*16(job), %xmm2
- vmovdqu _result_digest+3*16(job), %xmm3
-
- vmovq %xmm0, _args_digest(state, lane, 8)
- vpextrq $1, %xmm0, _args_digest+1*32(state , lane, 8)
- vmovq %xmm1, _args_digest+2*32(state , lane, 8)
- vpextrq $1, %xmm1, _args_digest+3*32(state , lane, 8)
- vmovq %xmm2, _args_digest+4*32(state , lane, 8)
- vpextrq $1, %xmm2, _args_digest+5*32(state , lane, 8)
- vmovq %xmm3, _args_digest+6*32(state , lane, 8)
- vpextrq $1, %xmm3, _args_digest+7*32(state , lane, 8)
-
- mov _buffer(job), p
- mov p, _args_data_ptr(state, lane, 8)
-
- cmp $0xFF, unused_lanes
- jne return_null
-
-start_loop:
-
- # Find min length
- mov _lens+0*8(state),lens0
- mov lens0,idx
- mov _lens+1*8(state),lens1
- cmp idx,lens1
- cmovb lens1, idx
- mov _lens+2*8(state),lens2
- cmp idx,lens2
- cmovb lens2,idx
- mov _lens+3*8(state),lens3
- cmp idx,lens3
- cmovb lens3,idx
- mov idx,len2
- and $0xF,idx
- and $~0xFF,len2
- jz len_is_0
-
- sub len2,lens0
- sub len2,lens1
- sub len2,lens2
- sub len2,lens3
- shr $32,len2
- mov lens0, _lens + 0*8(state)
- mov lens1, _lens + 1*8(state)
- mov lens2, _lens + 2*8(state)
- mov lens3, _lens + 3*8(state)
-
- # "state" and "args" are the same address, arg1
- # len is arg2
- call sha512_x4_avx2
- # state and idx are intact
-
-len_is_0:
-
- # process completed job "idx"
- imul $_LANE_DATA_size, idx, lane_data
- lea _ldata(state, lane_data), lane_data
-
- mov _job_in_lane(lane_data), job_rax
- mov _unused_lanes(state), unused_lanes
- movq $0, _job_in_lane(lane_data)
- movl $STS_COMPLETED, _status(job_rax)
- shl $8, unused_lanes
- or idx, unused_lanes
- mov unused_lanes, _unused_lanes(state)
-
- movl $0xFFFFFFFF,_lens+4(state,idx,8)
- vmovq _args_digest+0*32(state , idx, 8), %xmm0
- vpinsrq $1, _args_digest+1*32(state , idx, 8), %xmm0, %xmm0
- vmovq _args_digest+2*32(state , idx, 8), %xmm1
- vpinsrq $1, _args_digest+3*32(state , idx, 8), %xmm1, %xmm1
- vmovq _args_digest+4*32(state , idx, 8), %xmm2
- vpinsrq $1, _args_digest+5*32(state , idx, 8), %xmm2, %xmm2
- vmovq _args_digest+6*32(state , idx, 8), %xmm3
- vpinsrq $1, _args_digest+7*32(state , idx, 8), %xmm3, %xmm3
-
- vmovdqu %xmm0, _result_digest + 0*16(job_rax)
- vmovdqu %xmm1, _result_digest + 1*16(job_rax)
- vmovdqu %xmm2, _result_digest + 2*16(job_rax)
- vmovdqu %xmm3, _result_digest + 3*16(job_rax)
-
-return:
- pop %r12
- pop %rbx
- FRAME_END
- ret
-
-return_null:
- xor job_rax, job_rax
- jmp return
-ENDPROC(sha512_mb_mgr_submit_avx2)
-
-/* UNUSED?
-.section .rodata.cst16, "aM", @progbits, 16
-.align 16
-H0: .int 0x6a09e667
-H1: .int 0xbb67ae85
-H2: .int 0x3c6ef372
-H3: .int 0xa54ff53a
-H4: .int 0x510e527f
-H5: .int 0x9b05688c
-H6: .int 0x1f83d9ab
-H7: .int 0x5be0cd19
-*/
diff --git a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S b/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
deleted file mode 100644
index e22e907..0000000
--- a/arch/x86/crypto/sha512-mb/sha512_x4_avx2.S
+++ /dev/null
@@ -1,531 +0,0 @@
-/*
- * Multi-buffer SHA512 algorithm hash compute routine
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Megha Dey <megha.dey@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2016 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-# code to compute quad SHA512 using AVX2
-# use YMMs to tackle the larger digest size
-# outer calling routine takes care of save and restore of XMM registers
-# Logic designed/laid out by JDG
-
-# Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
-# Stack must be aligned to 32 bytes before call
-# Linux clobbers: rax rbx rcx rsi r8 r9 r10 r11 r12
-# Linux preserves: rcx rdx rdi rbp r13 r14 r15
-# clobbers ymm0-15
-
-#include <linux/linkage.h>
-#include "sha512_mb_mgr_datastruct.S"
-
-arg1 = %rdi
-arg2 = %rsi
-
-# Common definitions
-STATE = arg1
-INP_SIZE = arg2
-
-IDX = %rax
-ROUND = %rbx
-TBL = %r8
-
-inp0 = %r9
-inp1 = %r10
-inp2 = %r11
-inp3 = %r12
-
-a = %ymm0
-b = %ymm1
-c = %ymm2
-d = %ymm3
-e = %ymm4
-f = %ymm5
-g = %ymm6
-h = %ymm7
-
-a0 = %ymm8
-a1 = %ymm9
-a2 = %ymm10
-
-TT0 = %ymm14
-TT1 = %ymm13
-TT2 = %ymm12
-TT3 = %ymm11
-TT4 = %ymm10
-TT5 = %ymm9
-
-T1 = %ymm14
-TMP = %ymm15
-
-# Define stack usage
-STACK_SPACE1 = SZ4*16 + NUM_SHA512_DIGEST_WORDS*SZ4 + 24
-
-#define VMOVPD vmovupd
-_digest = SZ4*16
-
-# transpose r0, r1, r2, r3, t0, t1
-# "transpose" data in {r0..r3} using temps {t0..t3}
-# Input looks like: {r0 r1 r2 r3}
-# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
-# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
-# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
-# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
-#
-# output looks like: {t0 r1 r0 r3}
-# t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
-# r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
-# r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
-# r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
-
-.macro TRANSPOSE r0 r1 r2 r3 t0 t1
- vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
- vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
- vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
- vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
-
- vperm2f128 $0x20, \r2, \r0, \r1 # h6...a6
- vperm2f128 $0x31, \r2, \r0, \r3 # h2...a2
- vperm2f128 $0x31, \t1, \t0, \r0 # h5...a5
- vperm2f128 $0x20, \t1, \t0, \t0 # h1...a1
-.endm
-
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-# PRORQ reg, imm, tmp
-# packed-rotate-right-double
-# does a rotate by doing two shifts and an or
-.macro _PRORQ reg imm tmp
- vpsllq $(64-\imm),\reg,\tmp
- vpsrlq $\imm,\reg, \reg
- vpor \tmp,\reg, \reg
-.endm
-
-# non-destructive
-# PRORQ_nd reg, imm, tmp, src
-.macro _PRORQ_nd reg imm tmp src
- vpsllq $(64-\imm), \src, \tmp
- vpsrlq $\imm, \src, \reg
- vpor \tmp, \reg, \reg
-.endm
-
-# PRORQ dst/src, amt
-.macro PRORQ reg imm
- _PRORQ \reg, \imm, TMP
-.endm
-
-# PRORQ_nd dst, src, amt
-.macro PRORQ_nd reg tmp imm
- _PRORQ_nd \reg, \imm, TMP, \tmp
-.endm
-
-#; arguments passed implicitly in preprocessor symbols i, a...h
-.macro ROUND_00_15 _T1 i
- PRORQ_nd a0, e, (18-14) # sig1: a0 = (e >> 4)
-
- vpxor g, f, a2 # ch: a2 = f^g
- vpand e,a2, a2 # ch: a2 = (f^g)&e
- vpxor g, a2, a2 # a2 = ch
-
- PRORQ_nd a1,e,41 # sig1: a1 = (e >> 25)
-
- offset = SZ4*(\i & 0xf)
- vmovdqu \_T1,offset(%rsp)
- vpaddq (TBL,ROUND,1), \_T1, \_T1 # T1 = W + K
- vpxor e,a0, a0 # sig1: a0 = e ^ (e >> 5)
- PRORQ a0, 14 # sig1: a0 = (e >> 6) ^ (e >> 11)
- vpaddq a2, h, h # h = h + ch
- PRORQ_nd a2,a,6 # sig0: a2 = (a >> 11)
- vpaddq \_T1,h, h # h = h + ch + W + K
- vpxor a1, a0, a0 # a0 = sigma1
- vmovdqu a,\_T1
- PRORQ_nd a1,a,39 # sig0: a1 = (a >> 22)
- vpxor c, \_T1, \_T1 # maj: T1 = a^c
- add $SZ4, ROUND # ROUND++
- vpand b, \_T1, \_T1 # maj: T1 = (a^c)&b
- vpaddq a0, h, h
- vpaddq h, d, d
- vpxor a, a2, a2 # sig0: a2 = a ^ (a >> 11)
- PRORQ a2,28 # sig0: a2 = (a >> 2) ^ (a >> 13)
- vpxor a1, a2, a2 # a2 = sig0
- vpand c, a, a1 # maj: a1 = a&c
- vpor \_T1, a1, a1 # a1 = maj
- vpaddq a1, h, h # h = h + ch + W + K + maj
- vpaddq a2, h, h # h = h + ch + W + K + maj + sigma0
- ROTATE_ARGS
-.endm
-
-
-#; arguments passed implicitly in preprocessor symbols i, a...h
-.macro ROUND_16_XX _T1 i
- vmovdqu SZ4*((\i-15)&0xf)(%rsp), \_T1
- vmovdqu SZ4*((\i-2)&0xf)(%rsp), a1
- vmovdqu \_T1, a0
- PRORQ \_T1,7
- vmovdqu a1, a2
- PRORQ a1,42
- vpxor a0, \_T1, \_T1
- PRORQ \_T1, 1
- vpxor a2, a1, a1
- PRORQ a1, 19
- vpsrlq $7, a0, a0
- vpxor a0, \_T1, \_T1
- vpsrlq $6, a2, a2
- vpxor a2, a1, a1
- vpaddq SZ4*((\i-16)&0xf)(%rsp), \_T1, \_T1
- vpaddq SZ4*((\i-7)&0xf)(%rsp), a1, a1
- vpaddq a1, \_T1, \_T1
-
- ROUND_00_15 \_T1,\i
-.endm
-
-
-# void sha512_x4_avx2(void *STATE, const int INP_SIZE)
-# arg 1 : STATE : pointer to input data
-# arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
-ENTRY(sha512_x4_avx2)
- # general registers preserved in outer calling routine
- # outer calling routine saves all the XMM registers
- # save callee-saved clobbered registers to comply with C function ABI
- push %r12
- push %r13
- push %r14
- push %r15
-
- sub $STACK_SPACE1, %rsp
-
- # Load the pre-transposed incoming digest.
- vmovdqu 0*SHA512_DIGEST_ROW_SIZE(STATE),a
- vmovdqu 1*SHA512_DIGEST_ROW_SIZE(STATE),b
- vmovdqu 2*SHA512_DIGEST_ROW_SIZE(STATE),c
- vmovdqu 3*SHA512_DIGEST_ROW_SIZE(STATE),d
- vmovdqu 4*SHA512_DIGEST_ROW_SIZE(STATE),e
- vmovdqu 5*SHA512_DIGEST_ROW_SIZE(STATE),f
- vmovdqu 6*SHA512_DIGEST_ROW_SIZE(STATE),g
- vmovdqu 7*SHA512_DIGEST_ROW_SIZE(STATE),h
-
- lea K512_4(%rip),TBL
-
- # load the address of each of the 4 message lanes
- # getting ready to transpose input onto stack
- mov _data_ptr+0*PTR_SZ(STATE),inp0
- mov _data_ptr+1*PTR_SZ(STATE),inp1
- mov _data_ptr+2*PTR_SZ(STATE),inp2
- mov _data_ptr+3*PTR_SZ(STATE),inp3
-
- xor IDX, IDX
-lloop:
- xor ROUND, ROUND
-
- # save old digest
- vmovdqu a, _digest(%rsp)
- vmovdqu b, _digest+1*SZ4(%rsp)
- vmovdqu c, _digest+2*SZ4(%rsp)
- vmovdqu d, _digest+3*SZ4(%rsp)
- vmovdqu e, _digest+4*SZ4(%rsp)
- vmovdqu f, _digest+5*SZ4(%rsp)
- vmovdqu g, _digest+6*SZ4(%rsp)
- vmovdqu h, _digest+7*SZ4(%rsp)
- i = 0
-.rep 4
- vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP
- VMOVPD i*32(inp0, IDX), TT2
- VMOVPD i*32(inp1, IDX), TT1
- VMOVPD i*32(inp2, IDX), TT4
- VMOVPD i*32(inp3, IDX), TT3
- TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
- vpshufb TMP, TT0, TT0
- vpshufb TMP, TT1, TT1
- vpshufb TMP, TT2, TT2
- vpshufb TMP, TT3, TT3
- ROUND_00_15 TT0,(i*4+0)
- ROUND_00_15 TT1,(i*4+1)
- ROUND_00_15 TT2,(i*4+2)
- ROUND_00_15 TT3,(i*4+3)
- i = (i+1)
-.endr
- add $128, IDX
-
- i = (i*4)
-
- jmp Lrounds_16_xx
-.align 16
-Lrounds_16_xx:
-.rep 16
- ROUND_16_XX T1, i
- i = (i+1)
-.endr
- cmp $0xa00,ROUND
- jb Lrounds_16_xx
-
- # add old digest
- vpaddq _digest(%rsp), a, a
- vpaddq _digest+1*SZ4(%rsp), b, b
- vpaddq _digest+2*SZ4(%rsp), c, c
- vpaddq _digest+3*SZ4(%rsp), d, d
- vpaddq _digest+4*SZ4(%rsp), e, e
- vpaddq _digest+5*SZ4(%rsp), f, f
- vpaddq _digest+6*SZ4(%rsp), g, g
- vpaddq _digest+7*SZ4(%rsp), h, h
-
- sub $1, INP_SIZE # unit is blocks
- jne lloop
-
- # write back to memory (state object) the transposed digest
- vmovdqu a, 0*SHA512_DIGEST_ROW_SIZE(STATE)
- vmovdqu b, 1*SHA512_DIGEST_ROW_SIZE(STATE)
- vmovdqu c, 2*SHA512_DIGEST_ROW_SIZE(STATE)
- vmovdqu d, 3*SHA512_DIGEST_ROW_SIZE(STATE)
- vmovdqu e, 4*SHA512_DIGEST_ROW_SIZE(STATE)
- vmovdqu f, 5*SHA512_DIGEST_ROW_SIZE(STATE)
- vmovdqu g, 6*SHA512_DIGEST_ROW_SIZE(STATE)
- vmovdqu h, 7*SHA512_DIGEST_ROW_SIZE(STATE)
-
- # update input data pointers
- add IDX, inp0
- mov inp0, _data_ptr+0*PTR_SZ(STATE)
- add IDX, inp1
- mov inp1, _data_ptr+1*PTR_SZ(STATE)
- add IDX, inp2
- mov inp2, _data_ptr+2*PTR_SZ(STATE)
- add IDX, inp3
- mov inp3, _data_ptr+3*PTR_SZ(STATE)
-
- #;;;;;;;;;;;;;;;
- #; Postamble
- add $STACK_SPACE1, %rsp
- # restore callee-saved clobbered registers
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
-
- # outer calling routine restores XMM and other GP registers
- ret
-ENDPROC(sha512_x4_avx2)
-
-.section .rodata.K512_4, "a", @progbits
-.align 64
-K512_4:
- .octa 0x428a2f98d728ae22428a2f98d728ae22,\
- 0x428a2f98d728ae22428a2f98d728ae22
- .octa 0x7137449123ef65cd7137449123ef65cd,\
- 0x7137449123ef65cd7137449123ef65cd
- .octa 0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f,\
- 0xb5c0fbcfec4d3b2fb5c0fbcfec4d3b2f
- .octa 0xe9b5dba58189dbbce9b5dba58189dbbc,\
- 0xe9b5dba58189dbbce9b5dba58189dbbc
- .octa 0x3956c25bf348b5383956c25bf348b538,\
- 0x3956c25bf348b5383956c25bf348b538
- .octa 0x59f111f1b605d01959f111f1b605d019,\
- 0x59f111f1b605d01959f111f1b605d019
- .octa 0x923f82a4af194f9b923f82a4af194f9b,\
- 0x923f82a4af194f9b923f82a4af194f9b
- .octa 0xab1c5ed5da6d8118ab1c5ed5da6d8118,\
- 0xab1c5ed5da6d8118ab1c5ed5da6d8118
- .octa 0xd807aa98a3030242d807aa98a3030242,\
- 0xd807aa98a3030242d807aa98a3030242
- .octa 0x12835b0145706fbe12835b0145706fbe,\
- 0x12835b0145706fbe12835b0145706fbe
- .octa 0x243185be4ee4b28c243185be4ee4b28c,\
- 0x243185be4ee4b28c243185be4ee4b28c
- .octa 0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2,\
- 0x550c7dc3d5ffb4e2550c7dc3d5ffb4e2
- .octa 0x72be5d74f27b896f72be5d74f27b896f,\
- 0x72be5d74f27b896f72be5d74f27b896f
- .octa 0x80deb1fe3b1696b180deb1fe3b1696b1,\
- 0x80deb1fe3b1696b180deb1fe3b1696b1
- .octa 0x9bdc06a725c712359bdc06a725c71235,\
- 0x9bdc06a725c712359bdc06a725c71235
- .octa 0xc19bf174cf692694c19bf174cf692694,\
- 0xc19bf174cf692694c19bf174cf692694
- .octa 0xe49b69c19ef14ad2e49b69c19ef14ad2,\
- 0xe49b69c19ef14ad2e49b69c19ef14ad2
- .octa 0xefbe4786384f25e3efbe4786384f25e3,\
- 0xefbe4786384f25e3efbe4786384f25e3
- .octa 0x0fc19dc68b8cd5b50fc19dc68b8cd5b5,\
- 0x0fc19dc68b8cd5b50fc19dc68b8cd5b5
- .octa 0x240ca1cc77ac9c65240ca1cc77ac9c65,\
- 0x240ca1cc77ac9c65240ca1cc77ac9c65
- .octa 0x2de92c6f592b02752de92c6f592b0275,\
- 0x2de92c6f592b02752de92c6f592b0275
- .octa 0x4a7484aa6ea6e4834a7484aa6ea6e483,\
- 0x4a7484aa6ea6e4834a7484aa6ea6e483
- .octa 0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4,\
- 0x5cb0a9dcbd41fbd45cb0a9dcbd41fbd4
- .octa 0x76f988da831153b576f988da831153b5,\
- 0x76f988da831153b576f988da831153b5
- .octa 0x983e5152ee66dfab983e5152ee66dfab,\
- 0x983e5152ee66dfab983e5152ee66dfab
- .octa 0xa831c66d2db43210a831c66d2db43210,\
- 0xa831c66d2db43210a831c66d2db43210
- .octa 0xb00327c898fb213fb00327c898fb213f,\
- 0xb00327c898fb213fb00327c898fb213f
- .octa 0xbf597fc7beef0ee4bf597fc7beef0ee4,\
- 0xbf597fc7beef0ee4bf597fc7beef0ee4
- .octa 0xc6e00bf33da88fc2c6e00bf33da88fc2,\
- 0xc6e00bf33da88fc2c6e00bf33da88fc2
- .octa 0xd5a79147930aa725d5a79147930aa725,\
- 0xd5a79147930aa725d5a79147930aa725
- .octa 0x06ca6351e003826f06ca6351e003826f,\
- 0x06ca6351e003826f06ca6351e003826f
- .octa 0x142929670a0e6e70142929670a0e6e70,\
- 0x142929670a0e6e70142929670a0e6e70
- .octa 0x27b70a8546d22ffc27b70a8546d22ffc,\
- 0x27b70a8546d22ffc27b70a8546d22ffc
- .octa 0x2e1b21385c26c9262e1b21385c26c926,\
- 0x2e1b21385c26c9262e1b21385c26c926
- .octa 0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed,\
- 0x4d2c6dfc5ac42aed4d2c6dfc5ac42aed
- .octa 0x53380d139d95b3df53380d139d95b3df,\
- 0x53380d139d95b3df53380d139d95b3df
- .octa 0x650a73548baf63de650a73548baf63de,\
- 0x650a73548baf63de650a73548baf63de
- .octa 0x766a0abb3c77b2a8766a0abb3c77b2a8,\
- 0x766a0abb3c77b2a8766a0abb3c77b2a8
- .octa 0x81c2c92e47edaee681c2c92e47edaee6,\
- 0x81c2c92e47edaee681c2c92e47edaee6
- .octa 0x92722c851482353b92722c851482353b,\
- 0x92722c851482353b92722c851482353b
- .octa 0xa2bfe8a14cf10364a2bfe8a14cf10364,\
- 0xa2bfe8a14cf10364a2bfe8a14cf10364
- .octa 0xa81a664bbc423001a81a664bbc423001,\
- 0xa81a664bbc423001a81a664bbc423001
- .octa 0xc24b8b70d0f89791c24b8b70d0f89791,\
- 0xc24b8b70d0f89791c24b8b70d0f89791
- .octa 0xc76c51a30654be30c76c51a30654be30,\
- 0xc76c51a30654be30c76c51a30654be30
- .octa 0xd192e819d6ef5218d192e819d6ef5218,\
- 0xd192e819d6ef5218d192e819d6ef5218
- .octa 0xd69906245565a910d69906245565a910,\
- 0xd69906245565a910d69906245565a910
- .octa 0xf40e35855771202af40e35855771202a,\
- 0xf40e35855771202af40e35855771202a
- .octa 0x106aa07032bbd1b8106aa07032bbd1b8,\
- 0x106aa07032bbd1b8106aa07032bbd1b8
- .octa 0x19a4c116b8d2d0c819a4c116b8d2d0c8,\
- 0x19a4c116b8d2d0c819a4c116b8d2d0c8
- .octa 0x1e376c085141ab531e376c085141ab53,\
- 0x1e376c085141ab531e376c085141ab53
- .octa 0x2748774cdf8eeb992748774cdf8eeb99,\
- 0x2748774cdf8eeb992748774cdf8eeb99
- .octa 0x34b0bcb5e19b48a834b0bcb5e19b48a8,\
- 0x34b0bcb5e19b48a834b0bcb5e19b48a8
- .octa 0x391c0cb3c5c95a63391c0cb3c5c95a63,\
- 0x391c0cb3c5c95a63391c0cb3c5c95a63
- .octa 0x4ed8aa4ae3418acb4ed8aa4ae3418acb,\
- 0x4ed8aa4ae3418acb4ed8aa4ae3418acb
- .octa 0x5b9cca4f7763e3735b9cca4f7763e373,\
- 0x5b9cca4f7763e3735b9cca4f7763e373
- .octa 0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3,\
- 0x682e6ff3d6b2b8a3682e6ff3d6b2b8a3
- .octa 0x748f82ee5defb2fc748f82ee5defb2fc,\
- 0x748f82ee5defb2fc748f82ee5defb2fc
- .octa 0x78a5636f43172f6078a5636f43172f60,\
- 0x78a5636f43172f6078a5636f43172f60
- .octa 0x84c87814a1f0ab7284c87814a1f0ab72,\
- 0x84c87814a1f0ab7284c87814a1f0ab72
- .octa 0x8cc702081a6439ec8cc702081a6439ec,\
- 0x8cc702081a6439ec8cc702081a6439ec
- .octa 0x90befffa23631e2890befffa23631e28,\
- 0x90befffa23631e2890befffa23631e28
- .octa 0xa4506cebde82bde9a4506cebde82bde9,\
- 0xa4506cebde82bde9a4506cebde82bde9
- .octa 0xbef9a3f7b2c67915bef9a3f7b2c67915,\
- 0xbef9a3f7b2c67915bef9a3f7b2c67915
- .octa 0xc67178f2e372532bc67178f2e372532b,\
- 0xc67178f2e372532bc67178f2e372532b
- .octa 0xca273eceea26619cca273eceea26619c,\
- 0xca273eceea26619cca273eceea26619c
- .octa 0xd186b8c721c0c207d186b8c721c0c207,\
- 0xd186b8c721c0c207d186b8c721c0c207
- .octa 0xeada7dd6cde0eb1eeada7dd6cde0eb1e,\
- 0xeada7dd6cde0eb1eeada7dd6cde0eb1e
- .octa 0xf57d4f7fee6ed178f57d4f7fee6ed178,\
- 0xf57d4f7fee6ed178f57d4f7fee6ed178
- .octa 0x06f067aa72176fba06f067aa72176fba,\
- 0x06f067aa72176fba06f067aa72176fba
- .octa 0x0a637dc5a2c898a60a637dc5a2c898a6,\
- 0x0a637dc5a2c898a60a637dc5a2c898a6
- .octa 0x113f9804bef90dae113f9804bef90dae,\
- 0x113f9804bef90dae113f9804bef90dae
- .octa 0x1b710b35131c471b1b710b35131c471b,\
- 0x1b710b35131c471b1b710b35131c471b
- .octa 0x28db77f523047d8428db77f523047d84,\
- 0x28db77f523047d8428db77f523047d84
- .octa 0x32caab7b40c7249332caab7b40c72493,\
- 0x32caab7b40c7249332caab7b40c72493
- .octa 0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc,\
- 0x3c9ebe0a15c9bebc3c9ebe0a15c9bebc
- .octa 0x431d67c49c100d4c431d67c49c100d4c,\
- 0x431d67c49c100d4c431d67c49c100d4c
- .octa 0x4cc5d4becb3e42b64cc5d4becb3e42b6,\
- 0x4cc5d4becb3e42b64cc5d4becb3e42b6
- .octa 0x597f299cfc657e2a597f299cfc657e2a,\
- 0x597f299cfc657e2a597f299cfc657e2a
- .octa 0x5fcb6fab3ad6faec5fcb6fab3ad6faec,\
- 0x5fcb6fab3ad6faec5fcb6fab3ad6faec
- .octa 0x6c44198c4a4758176c44198c4a475817,\
- 0x6c44198c4a4758176c44198c4a475817
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607
- .octa 0x18191a1b1c1d1e1f1011121314151617
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index f1b811b..458356a 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -28,16 +28,16 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/cryptohash.h>
+#include <linux/string.h>
#include <linux/types.h>
#include <crypto/sha.h>
#include <crypto/sha512_base.h>
-#include <asm/fpu/api.h>
-
-#include <linux/string.h>
+#include <asm/simd.h>
asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
u64 rounds);
@@ -49,7 +49,7 @@
{
struct sha512_state *sctx = shash_desc_ctx(desc);
- if (!irq_fpu_usable() ||
+ if (!crypto_simd_usable() ||
(sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
return crypto_sha512_update(desc, data, len);
@@ -67,7 +67,7 @@
static int sha512_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out, sha512_transform_fn *sha512_xform)
{
- if (!irq_fpu_usable())
+ if (!crypto_simd_usable())
return crypto_sha512_finup(desc, data, len, out);
kernel_fpu_begin();
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 73b471d..698b8f2 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
*
@@ -5,22 +6,6 @@
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index 694ea45..290cc4e 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -1,20 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
* *
-* This program is free software; you can redistribute it and/or modify *
-* it under the terms of the GNU General Public License as published by *
-* the Free Software Foundation; either version 2 of the License, or *
-* (at your option) any later version. *
-* *
-* This program is distributed in the hope that it will be useful, *
-* but WITHOUT ANY WARRANTY; without even the implied warranty of *
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
-* GNU General Public License for more details. *
-* *
-* You should have received a copy of the GNU General Public License *
-* along with this program; if not, write to the *
-* Free Software Foundation, Inc., *
-* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/
.file "twofish-i586-asm.S"
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
index e7273a6..e495e07 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -1,23 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Twofish Cipher 3-way parallel algorithm (x86_64)
*
* Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index a350c99..ecef2cb 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -1,20 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
* *
-* This program is free software; you can redistribute it and/or modify *
-* it under the terms of the GNU General Public License as published by *
-* the Free Software Foundation; either version 2 of the License, or *
-* (at your option) any later version. *
-* *
-* This program is distributed in the hope that it will be useful, *
-* but WITHOUT ANY WARRANTY; without even the implied warranty of *
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
-* GNU General Public License for more details. *
-* *
-* You should have received a copy of the GNU General Public License *
-* along with this program; if not, write to the *
-* Free Software Foundation, Inc., *
-* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/
.file "twofish-x86_64-asm.S"
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 66d9892..d561c82 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue Code for AVX assembler version of Twofish Cipher
*
@@ -5,22 +6,6 @@
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <linux/module.h>
@@ -225,7 +210,7 @@
return glue_xts_req_128bit(&twofish_enc_xts, req,
XTS_TWEAK_CAST(twofish_enc_blk),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, false);
}
static int xts_decrypt(struct skcipher_request *req)
@@ -235,7 +220,7 @@
return glue_xts_req_128bit(&twofish_dec_xts, req,
XTS_TWEAK_CAST(twofish_enc_blk),
- &ctx->tweak_ctx, &ctx->crypt_ctx);
+ &ctx->tweak_ctx, &ctx->crypt_ctx, true);
}
static struct skcipher_alg twofish_algs[] = {
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 5714855..1dc9e29 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -1,23 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue Code for 3-way parallel assembler optimized version of Twofish
*
* Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
- * USA
- *
*/
#include <asm/crypto/glue_helper.h>