Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index b8e69fe..043b0b1 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -69,12 +69,21 @@
help
Use optimized AES assembler routines for ARM platforms.
+ On ARM processors without the Crypto Extensions, this is the
+ fastest AES implementation for single blocks. For multiple
+ blocks, the NEON bit-sliced implementation is usually faster.
+
+ This implementation may be vulnerable to cache timing attacks,
+ since it uses lookup tables. However, as countermeasures it
+ disables IRQs and preloads the tables; it is hoped this makes
+ such attacks very difficult.
+
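As an aside, the countermeasure described in the help text above is implemented later in this patch, in arch/arm/crypto/aes-cipher-core.S. A minimal C sketch of the same idea, given only for illustration (the helper name and the 1024-byte table size are assumptions, not part of the patch):

/*
 * Illustration only, not part of this patch: keep the whole lookup
 * table hot in L1 and keep interrupts off across the data-dependent
 * loads, so an interrupt cannot evict lines and reveal which entries
 * were touched.  Assumes a cacheline size of at least 32 bytes and the
 * helpers from <linux/irqflags.h> and <linux/compiler.h>.
 */
static u32 hardened_table_lookup(const u32 table[256], unsigned int index)
{
	unsigned long flags;
	unsigned int i;
	u32 val;

	local_irq_save(flags);
	for (i = 0; i < 256; i += 8)		/* touch every 32-byte line */
		READ_ONCE(table[i]);
	val = table[index];			/* data-dependent lookup */
	local_irq_restore(flags);

	return val;
}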
config CRYPTO_AES_ARM_BS
tristate "Bit sliced AES using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
+ select CRYPTO_LIB_AES
select CRYPTO_SIMD
- select CRYPTO_AES
help
Use a faster and more secure NEON based implementation of AES in CBC,
CTR and XTS modes
@@ -89,6 +98,7 @@
tristate "Accelerated AES using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
+ select CRYPTO_LIB_AES
select CRYPTO_SIMD
help
Use an implementation of AES in CBC, CTR and XTS modes that uses
@@ -99,6 +109,7 @@
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_CRYPTD
+ select CRYPTO_GF128MUL
help
Use an implementation of GHASH (used by the GCM AEAD chaining mode)
that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
@@ -116,9 +127,14 @@
select CRYPTO_HASH
config CRYPTO_CHACHA20_NEON
- tristate "NEON accelerated ChaCha20 symmetric cipher"
+ tristate "NEON accelerated ChaCha stream cipher algorithms"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
+config CRYPTO_NHPOLY1305_NEON
+ tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_NHPOLY1305
+
endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index bd5bcee..4180f3a 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -9,7 +9,8 @@
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -52,7 +53,8 @@
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
-chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
ifdef REGENERATE_ARM_CRYPTO
quiet_cmd_perl = PERL $@
@@ -65,4 +67,4 @@
$(call cmd,perl)
endif
-targets += sha256-core.S sha512-core.S
+clean-files += sha256-core.S sha512-core.S
diff --git a/arch/arm/crypto/aes-ce-core.S b/arch/arm/crypto/aes-ce-core.S
index ba8e6a3..4d17073 100644
--- a/arch/arm/crypto/aes-ce-core.S
+++ b/arch/arm/crypto/aes-ce-core.S
@@ -1,17 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
*
* Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.text
+ .arch armv8-a
.fpu crypto-neon-fp-armv8
.align 3
@@ -47,63 +45,73 @@
veor q0, q0, \key3
.endm
- .macro enc_dround_3x, key1, key2
+ .macro enc_dround_4x, key1, key2
enc_round q0, \key1
enc_round q1, \key1
enc_round q2, \key1
+ enc_round q3, \key1
enc_round q0, \key2
enc_round q1, \key2
enc_round q2, \key2
+ enc_round q3, \key2
.endm
- .macro dec_dround_3x, key1, key2
+ .macro dec_dround_4x, key1, key2
dec_round q0, \key1
dec_round q1, \key1
dec_round q2, \key1
+ dec_round q3, \key1
dec_round q0, \key2
dec_round q1, \key2
dec_round q2, \key2
+ dec_round q3, \key2
.endm
- .macro enc_fround_3x, key1, key2, key3
+ .macro enc_fround_4x, key1, key2, key3
enc_round q0, \key1
enc_round q1, \key1
enc_round q2, \key1
+ enc_round q3, \key1
aese.8 q0, \key2
aese.8 q1, \key2
aese.8 q2, \key2
+ aese.8 q3, \key2
veor q0, q0, \key3
veor q1, q1, \key3
veor q2, q2, \key3
+ veor q3, q3, \key3
.endm
- .macro dec_fround_3x, key1, key2, key3
+ .macro dec_fround_4x, key1, key2, key3
dec_round q0, \key1
dec_round q1, \key1
dec_round q2, \key1
+ dec_round q3, \key1
aesd.8 q0, \key2
aesd.8 q1, \key2
aesd.8 q2, \key2
+ aesd.8 q3, \key2
veor q0, q0, \key3
veor q1, q1, \key3
veor q2, q2, \key3
+ veor q3, q3, \key3
.endm
.macro do_block, dround, fround
cmp r3, #12 @ which key size?
- vld1.8 {q10-q11}, [ip]!
+ vld1.32 {q10-q11}, [ip]!
\dround q8, q9
- vld1.8 {q12-q13}, [ip]!
+ vld1.32 {q12-q13}, [ip]!
\dround q10, q11
- vld1.8 {q10-q11}, [ip]!
+ vld1.32 {q10-q11}, [ip]!
\dround q12, q13
- vld1.8 {q12-q13}, [ip]!
+ vld1.32 {q12-q13}, [ip]!
\dround q10, q11
blo 0f @ AES-128: 10 rounds
- vld1.8 {q10-q11}, [ip]!
+ vld1.32 {q10-q11}, [ip]!
\dround q12, q13
beq 1f @ AES-192: 12 rounds
- vld1.8 {q12-q13}, [ip]
+ vld1.32 {q12-q13}, [ip]
\dround q10, q11
0: \fround q12, q13, q14
bx lr
@@ -117,8 +125,9 @@
* transforms. These should preserve all registers except q0 - q2 and ip
* Arguments:
* q0 : first in/output block
- * q1 : second in/output block (_3x version only)
- * q2 : third in/output block (_3x version only)
+ * q1 : second in/output block (_4x version only)
+ * q2 : third in/output block (_4x version only)
+ * q3 : fourth in/output block (_4x version only)
* q8 : first round key
* q9 : second round key
* q14 : final round key
@@ -139,44 +148,44 @@
ENDPROC(aes_decrypt)
.align 6
-aes_encrypt_3x:
+aes_encrypt_4x:
add ip, r2, #32 @ 3rd round key
- do_block enc_dround_3x, enc_fround_3x
-ENDPROC(aes_encrypt_3x)
+ do_block enc_dround_4x, enc_fround_4x
+ENDPROC(aes_encrypt_4x)
.align 6
-aes_decrypt_3x:
+aes_decrypt_4x:
add ip, r2, #32 @ 3rd round key
- do_block dec_dround_3x, dec_fround_3x
-ENDPROC(aes_decrypt_3x)
+ do_block dec_dround_4x, dec_fround_4x
+ENDPROC(aes_decrypt_4x)
.macro prepare_key, rk, rounds
add ip, \rk, \rounds, lsl #4
- vld1.8 {q8-q9}, [\rk] @ load first 2 round keys
- vld1.8 {q14}, [ip] @ load last round key
+ vld1.32 {q8-q9}, [\rk] @ load first 2 round keys
+ vld1.32 {q14}, [ip] @ load last round key
.endm
/*
- * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
* int blocks)
- * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
* int blocks)
*/
ENTRY(ce_aes_ecb_encrypt)
push {r4, lr}
ldr r4, [sp, #8]
prepare_key r2, r3
-.Lecbencloop3x:
- subs r4, r4, #3
+.Lecbencloop4x:
+ subs r4, r4, #4
bmi .Lecbenc1x
vld1.8 {q0-q1}, [r1]!
- vld1.8 {q2}, [r1]!
- bl aes_encrypt_3x
+ vld1.8 {q2-q3}, [r1]!
+ bl aes_encrypt_4x
vst1.8 {q0-q1}, [r0]!
- vst1.8 {q2}, [r0]!
- b .Lecbencloop3x
+ vst1.8 {q2-q3}, [r0]!
+ b .Lecbencloop4x
.Lecbenc1x:
- adds r4, r4, #3
+ adds r4, r4, #4
beq .Lecbencout
.Lecbencloop:
vld1.8 {q0}, [r1]!
@@ -192,17 +201,17 @@
push {r4, lr}
ldr r4, [sp, #8]
prepare_key r2, r3
-.Lecbdecloop3x:
- subs r4, r4, #3
+.Lecbdecloop4x:
+ subs r4, r4, #4
bmi .Lecbdec1x
vld1.8 {q0-q1}, [r1]!
- vld1.8 {q2}, [r1]!
- bl aes_decrypt_3x
+ vld1.8 {q2-q3}, [r1]!
+ bl aes_decrypt_4x
vst1.8 {q0-q1}, [r0]!
- vst1.8 {q2}, [r0]!
- b .Lecbdecloop3x
+ vst1.8 {q2-q3}, [r0]!
+ b .Lecbdecloop4x
.Lecbdec1x:
- adds r4, r4, #3
+ adds r4, r4, #4
beq .Lecbdecout
.Lecbdecloop:
vld1.8 {q0}, [r1]!
@@ -215,9 +224,9 @@
ENDPROC(ce_aes_ecb_decrypt)
/*
- * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
* int blocks, u8 iv[])
- * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
* int blocks, u8 iv[])
*/
ENTRY(ce_aes_cbc_encrypt)
@@ -239,123 +248,216 @@
ENTRY(ce_aes_cbc_decrypt)
push {r4-r6, lr}
ldrd r4, r5, [sp, #16]
- vld1.8 {q6}, [r5] @ keep iv in q6
+ vld1.8 {q15}, [r5] @ keep iv in q15
prepare_key r2, r3
-.Lcbcdecloop3x:
- subs r4, r4, #3
+.Lcbcdecloop4x:
+ subs r4, r4, #4
bmi .Lcbcdec1x
vld1.8 {q0-q1}, [r1]!
- vld1.8 {q2}, [r1]!
- vmov q3, q0
- vmov q4, q1
- vmov q5, q2
- bl aes_decrypt_3x
- veor q0, q0, q6
- veor q1, q1, q3
- veor q2, q2, q4
- vmov q6, q5
+ vld1.8 {q2-q3}, [r1]!
+ vmov q4, q0
+ vmov q5, q1
+ vmov q6, q2
+ vmov q7, q3
+ bl aes_decrypt_4x
+ veor q0, q0, q15
+ veor q1, q1, q4
+ veor q2, q2, q5
+ veor q3, q3, q6
+ vmov q15, q7
vst1.8 {q0-q1}, [r0]!
- vst1.8 {q2}, [r0]!
- b .Lcbcdecloop3x
+ vst1.8 {q2-q3}, [r0]!
+ b .Lcbcdecloop4x
.Lcbcdec1x:
- adds r4, r4, #3
+ adds r4, r4, #4
beq .Lcbcdecout
- vmov q15, q14 @ preserve last round key
+ vmov q6, q14 @ preserve last round key
.Lcbcdecloop:
vld1.8 {q0}, [r1]! @ get next ct block
veor q14, q15, q6 @ combine prev ct with last key
- vmov q6, q0
+ vmov q15, q0
bl aes_decrypt
vst1.8 {q0}, [r0]!
subs r4, r4, #1
bne .Lcbcdecloop
.Lcbcdecout:
- vst1.8 {q6}, [r5] @ keep iv in q6
+ vst1.8 {q15}, [r5] @ keep iv in q15
pop {r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)
+
/*
- * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+ * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ * int rounds, int bytes, u8 const iv[])
+ * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+ * int rounds, int bytes, u8 const iv[])
+ */
+
+ENTRY(ce_aes_cbc_cts_encrypt)
+ push {r4-r6, lr}
+ ldrd r4, r5, [sp, #16]
+
+ movw ip, :lower16:.Lcts_permute_table
+ movt ip, :upper16:.Lcts_permute_table
+ sub r4, r4, #16
+ add lr, ip, #32
+ add ip, ip, r4
+ sub lr, lr, r4
+ vld1.8 {q5}, [ip]
+ vld1.8 {q6}, [lr]
+
+ add ip, r1, r4
+ vld1.8 {q0}, [r1] @ overlapping loads
+ vld1.8 {q3}, [ip]
+
+ vld1.8 {q1}, [r5] @ get iv
+ prepare_key r2, r3
+
+ veor q0, q0, q1 @ xor with iv
+ bl aes_encrypt
+
+ vtbl.8 d4, {d0-d1}, d10
+ vtbl.8 d5, {d0-d1}, d11
+ vtbl.8 d2, {d6-d7}, d12
+ vtbl.8 d3, {d6-d7}, d13
+
+ veor q0, q0, q1
+ bl aes_encrypt
+
+ add r4, r0, r4
+ vst1.8 {q2}, [r4] @ overlapping stores
+ vst1.8 {q0}, [r0]
+
+ pop {r4-r6, pc}
+ENDPROC(ce_aes_cbc_cts_encrypt)
+
+ENTRY(ce_aes_cbc_cts_decrypt)
+ push {r4-r6, lr}
+ ldrd r4, r5, [sp, #16]
+
+ movw ip, :lower16:.Lcts_permute_table
+ movt ip, :upper16:.Lcts_permute_table
+ sub r4, r4, #16
+ add lr, ip, #32
+ add ip, ip, r4
+ sub lr, lr, r4
+ vld1.8 {q5}, [ip]
+ vld1.8 {q6}, [lr]
+
+ add ip, r1, r4
+ vld1.8 {q0}, [r1] @ overlapping loads
+ vld1.8 {q1}, [ip]
+
+ vld1.8 {q3}, [r5] @ get iv
+ prepare_key r2, r3
+
+ bl aes_decrypt
+
+ vtbl.8 d4, {d0-d1}, d10
+ vtbl.8 d5, {d0-d1}, d11
+ vtbx.8 d0, {d2-d3}, d12
+ vtbx.8 d1, {d2-d3}, d13
+
+ veor q1, q1, q2
+ bl aes_decrypt
+ veor q0, q0, q3 @ xor with iv
+
+ add r4, r0, r4
+ vst1.8 {q1}, [r4] @ overlapping stores
+ vst1.8 {q0}, [r0]
+
+ pop {r4-r6, pc}
+ENDPROC(ce_aes_cbc_cts_decrypt)
+
+
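The two ce_aes_cbc_cts_* routines above only process the final two (possibly overlapping) blocks of a cts(cbc(aes)) request; the bulk of the data is handled by the regular CBC routines, as the glue code changes further down show. For reference, a plain-C sketch of what the encrypt path computes, where aes_encrypt_block() is a hypothetical single-block helper and 'tail' (1..16) is the length of the short final block:

/*
 * Illustration only, not part of this patch: CBC ciphertext stealing
 * for the last two blocks, with the full block written before the
 * truncated one, matching the overlapping stores above.
 */
static void cbc_cts_encrypt_final(const u8 key[], u8 out[] /* 16 + tail bytes */,
				  const u8 p_last_full[16], const u8 *p_tail,
				  int tail, const u8 prev_cv[16])
{
	u8 d[16], x[16];
	int i;

	for (i = 0; i < 16; i++)		/* plain CBC for block n-1 */
		x[i] = p_last_full[i] ^ prev_cv[i];
	aes_encrypt_block(key, d, x);		/* D = E_K(P[n-1] ^ C[n-2]) */

	memcpy(out + 16, d, tail);		/* C[n]: first 'tail' bytes of D */

	memcpy(x, d, 16);			/* C[n-1] = E_K(zero-padded P[n] ^ D) */
	for (i = 0; i < tail; i++)
		x[i] ^= p_tail[i];
	aes_encrypt_block(key, out, x);		/* full block, stored first */
}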
+ /*
+ * aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
* int blocks, u8 ctr[])
*/
ENTRY(ce_aes_ctr_encrypt)
push {r4-r6, lr}
ldrd r4, r5, [sp, #16]
- vld1.8 {q6}, [r5] @ load ctr
+ vld1.8 {q7}, [r5] @ load ctr
prepare_key r2, r3
- vmov r6, s27 @ keep swabbed ctr in r6
+ vmov r6, s31 @ keep swabbed ctr in r6
rev r6, r6
cmn r6, r4 @ 32 bit overflow?
bcs .Lctrloop
-.Lctrloop3x:
- subs r4, r4, #3
+.Lctrloop4x:
+ subs r4, r4, #4
bmi .Lctr1x
add r6, r6, #1
- vmov q0, q6
- vmov q1, q6
+ vmov q0, q7
+ vmov q1, q7
rev ip, r6
add r6, r6, #1
- vmov q2, q6
+ vmov q2, q7
vmov s7, ip
rev ip, r6
add r6, r6, #1
+ vmov q3, q7
vmov s11, ip
- vld1.8 {q3-q4}, [r1]!
- vld1.8 {q5}, [r1]!
- bl aes_encrypt_3x
- veor q0, q0, q3
- veor q1, q1, q4
- veor q2, q2, q5
+ rev ip, r6
+ add r6, r6, #1
+ vmov s15, ip
+ vld1.8 {q4-q5}, [r1]!
+ vld1.8 {q6}, [r1]!
+ vld1.8 {q15}, [r1]!
+ bl aes_encrypt_4x
+ veor q0, q0, q4
+ veor q1, q1, q5
+ veor q2, q2, q6
+ veor q3, q3, q15
rev ip, r6
vst1.8 {q0-q1}, [r0]!
- vst1.8 {q2}, [r0]!
- vmov s27, ip
- b .Lctrloop3x
+ vst1.8 {q2-q3}, [r0]!
+ vmov s31, ip
+ b .Lctrloop4x
.Lctr1x:
- adds r4, r4, #3
+ adds r4, r4, #4
beq .Lctrout
.Lctrloop:
- vmov q0, q6
+ vmov q0, q7
bl aes_encrypt
+
+ adds r6, r6, #1 @ increment BE ctr
+ rev ip, r6
+ vmov s31, ip
+ bcs .Lctrcarry
+
+.Lctrcarrydone:
subs r4, r4, #1
bmi .Lctrtailblock @ blocks < 0 means tail block
vld1.8 {q3}, [r1]!
veor q3, q0, q3
vst1.8 {q3}, [r0]!
-
- adds r6, r6, #1 @ increment BE ctr
- rev ip, r6
- vmov s27, ip
- bcs .Lctrcarry
- teq r4, #0
bne .Lctrloop
+
.Lctrout:
- vst1.8 {q6}, [r5]
+ vst1.8 {q7}, [r5] @ return next CTR value
pop {r4-r6, pc}
.Lctrtailblock:
- vst1.8 {q0}, [r0, :64] @ return just the key stream
- pop {r4-r6, pc}
+ vst1.8 {q0}, [r0, :64] @ return the key stream
+ b .Lctrout
.Lctrcarry:
- .irp sreg, s26, s25, s24
+ .irp sreg, s30, s29, s28
vmov ip, \sreg @ load next word of ctr
rev ip, ip @ ... to handle the carry
adds ip, ip, #1
rev ip, ip
vmov \sreg, ip
- bcc 0f
+ bcc .Lctrcarrydone
.endr
-0: teq r4, #0
- beq .Lctrout
- b .Lctrloop
+ b .Lctrcarrydone
ENDPROC(ce_aes_ctr_encrypt)
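The counter kept byte-swapped in q7 above is a 128-bit big-endian counter; the .Lctrcarry path propagates the carry through the higher words one at a time. A C equivalent of that increment, shown only for illustration (the kernel's crypto_inc() performs the same operation):

/* Illustration only: big-endian 128-bit counter increment, matching
 * what .Lctrcarry does word by word in the NEON register. */
static void ctr128_be_inc(u8 ctr[16])
{
	int i;

	for (i = 15; i >= 0; i--)
		if (++ctr[i] != 0)
			break;			/* stop once a byte does not wrap */
}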
/*
- * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- * int blocks, u8 iv[], u8 const rk2[], int first)
- * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
- * int blocks, u8 iv[], u8 const rk2[], int first)
+ * aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
+ * int bytes, u8 iv[], u32 const rk2[], int first)
+ * aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
+ * int bytes, u8 iv[], u32 const rk2[], int first)
*/
.macro next_tweak, out, in, const, tmp
@@ -366,13 +468,10 @@
veor \out, \out, \tmp
.endm
- .align 3
-.Lxts_mul_x:
- .quad 1, 0x87
-
ce_aes_xts_init:
- vldr d14, .Lxts_mul_x
- vldr d15, .Lxts_mul_x + 8
+ vmov.i32 d30, #0x87 @ compose tweak mask vector
+ vmovl.u32 q15, d30
+ vshr.u64 d30, d31, #7
ldrd r4, r5, [sp, #16] @ load args
ldr r6, [sp, #28]
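The vmov.i32/vmovl.u32/vshr.u64 sequence above composes the same { 1, 0x87 } constant that the removed .Lxts_mul_x literal used to supply, and the next_tweak macro uses it to multiply the tweak by x in GF(2^128). A C sketch of that tweak update, for illustration only (get_unaligned_le64()/put_unaligned_le64() are the usual <asm/unaligned.h> helpers):

/*
 * Illustration only, not part of this patch: the GF(2^128) doubling
 * performed by next_tweak, i.e. multiplication by x modulo
 * x^128 + x^7 + x^2 + x + 1, treating the 16-byte tweak as a
 * little-endian 128-bit integer.
 */
static void xts_next_tweak(u8 out[16], const u8 in[16])
{
	u64 lo = get_unaligned_le64(in);
	u64 hi = get_unaligned_le64(in + 8);
	u64 carry = (hi & (1ULL << 63)) ? 0x87 : 0;

	put_unaligned_le64((lo << 1) ^ carry, out);
	put_unaligned_le64((hi << 1) | (lo >> 63), out + 8);
}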
@@ -393,49 +492,86 @@
bl ce_aes_xts_init @ run shared prologue
prepare_key r2, r3
- vmov q3, q0
+ vmov q4, q0
teq r6, #0 @ start of a block?
- bne .Lxtsenc3x
+ bne .Lxtsenc4x
-.Lxtsencloop3x:
- next_tweak q3, q3, q7, q6
-.Lxtsenc3x:
- subs r4, r4, #3
+.Lxtsencloop4x:
+ next_tweak q4, q4, q15, q10
+.Lxtsenc4x:
+ subs r4, r4, #64
bmi .Lxtsenc1x
- vld1.8 {q0-q1}, [r1]! @ get 3 pt blocks
- vld1.8 {q2}, [r1]!
- next_tweak q4, q3, q7, q6
- veor q0, q0, q3
- next_tweak q5, q4, q7, q6
- veor q1, q1, q4
- veor q2, q2, q5
- bl aes_encrypt_3x
- veor q0, q0, q3
- veor q1, q1, q4
- veor q2, q2, q5
- vst1.8 {q0-q1}, [r0]! @ write 3 ct blocks
- vst1.8 {q2}, [r0]!
- vmov q3, q5
+ vld1.8 {q0-q1}, [r1]! @ get 4 pt blocks
+ vld1.8 {q2-q3}, [r1]!
+ next_tweak q5, q4, q15, q10
+ veor q0, q0, q4
+ next_tweak q6, q5, q15, q10
+ veor q1, q1, q5
+ next_tweak q7, q6, q15, q10
+ veor q2, q2, q6
+ veor q3, q3, q7
+ bl aes_encrypt_4x
+ veor q0, q0, q4
+ veor q1, q1, q5
+ veor q2, q2, q6
+ veor q3, q3, q7
+ vst1.8 {q0-q1}, [r0]! @ write 4 ct blocks
+ vst1.8 {q2-q3}, [r0]!
+ vmov q4, q7
teq r4, #0
- beq .Lxtsencout
- b .Lxtsencloop3x
+ beq .Lxtsencret
+ b .Lxtsencloop4x
.Lxtsenc1x:
- adds r4, r4, #3
+ adds r4, r4, #64
beq .Lxtsencout
+ subs r4, r4, #16
+ bmi .LxtsencctsNx
.Lxtsencloop:
vld1.8 {q0}, [r1]!
- veor q0, q0, q3
+.Lxtsencctsout:
+ veor q0, q0, q4
bl aes_encrypt
- veor q0, q0, q3
- vst1.8 {q0}, [r0]!
- subs r4, r4, #1
+ veor q0, q0, q4
+ teq r4, #0
beq .Lxtsencout
- next_tweak q3, q3, q7, q6
+ subs r4, r4, #16
+ next_tweak q4, q4, q15, q6
+ bmi .Lxtsenccts
+ vst1.8 {q0}, [r0]!
b .Lxtsencloop
.Lxtsencout:
- vst1.8 {q3}, [r5]
+ vst1.8 {q0}, [r0]
+.Lxtsencret:
+ vst1.8 {q4}, [r5]
pop {r4-r6, pc}
+
+.LxtsencctsNx:
+ vmov q0, q3
+ sub r0, r0, #16
+.Lxtsenccts:
+ movw ip, :lower16:.Lcts_permute_table
+ movt ip, :upper16:.Lcts_permute_table
+
+ add r1, r1, r4 @ rewind input pointer
+ add r4, r4, #16 @ # bytes in final block
+ add lr, ip, #32
+ add ip, ip, r4
+ sub lr, lr, r4
+ add r4, r0, r4 @ output address of final block
+
+ vld1.8 {q1}, [r1] @ load final partial block
+ vld1.8 {q2}, [ip]
+ vld1.8 {q3}, [lr]
+
+ vtbl.8 d4, {d0-d1}, d4
+ vtbl.8 d5, {d0-d1}, d5
+ vtbx.8 d0, {d2-d3}, d6
+ vtbx.8 d1, {d2-d3}, d7
+
+ vst1.8 {q2}, [r4] @ overlapping stores
+ mov r4, #0
+ b .Lxtsencctsout
ENDPROC(ce_aes_xts_encrypt)
@@ -444,50 +580,90 @@
bl ce_aes_xts_init @ run shared prologue
prepare_key r2, r3
- vmov q3, q0
+ vmov q4, q0
+
+ /* subtract 16 bytes if we are doing CTS */
+ tst r4, #0xf
+ subne r4, r4, #0x10
teq r6, #0 @ start of a block?
- bne .Lxtsdec3x
+ bne .Lxtsdec4x
-.Lxtsdecloop3x:
- next_tweak q3, q3, q7, q6
-.Lxtsdec3x:
- subs r4, r4, #3
+.Lxtsdecloop4x:
+ next_tweak q4, q4, q15, q10
+.Lxtsdec4x:
+ subs r4, r4, #64
bmi .Lxtsdec1x
- vld1.8 {q0-q1}, [r1]! @ get 3 ct blocks
- vld1.8 {q2}, [r1]!
- next_tweak q4, q3, q7, q6
- veor q0, q0, q3
- next_tweak q5, q4, q7, q6
- veor q1, q1, q4
- veor q2, q2, q5
- bl aes_decrypt_3x
- veor q0, q0, q3
- veor q1, q1, q4
- veor q2, q2, q5
- vst1.8 {q0-q1}, [r0]! @ write 3 pt blocks
- vst1.8 {q2}, [r0]!
- vmov q3, q5
+ vld1.8 {q0-q1}, [r1]! @ get 4 ct blocks
+ vld1.8 {q2-q3}, [r1]!
+ next_tweak q5, q4, q15, q10
+ veor q0, q0, q4
+ next_tweak q6, q5, q15, q10
+ veor q1, q1, q5
+ next_tweak q7, q6, q15, q10
+ veor q2, q2, q6
+ veor q3, q3, q7
+ bl aes_decrypt_4x
+ veor q0, q0, q4
+ veor q1, q1, q5
+ veor q2, q2, q6
+ veor q3, q3, q7
+ vst1.8 {q0-q1}, [r0]! @ write 4 pt blocks
+ vst1.8 {q2-q3}, [r0]!
+ vmov q4, q7
teq r4, #0
beq .Lxtsdecout
- b .Lxtsdecloop3x
+ b .Lxtsdecloop4x
.Lxtsdec1x:
- adds r4, r4, #3
+ adds r4, r4, #64
beq .Lxtsdecout
+ subs r4, r4, #16
.Lxtsdecloop:
vld1.8 {q0}, [r1]!
- veor q0, q0, q3
- add ip, r2, #32 @ 3rd round key
+ bmi .Lxtsdeccts
+.Lxtsdecctsout:
+ veor q0, q0, q4
bl aes_decrypt
- veor q0, q0, q3
+ veor q0, q0, q4
vst1.8 {q0}, [r0]!
- subs r4, r4, #1
+ teq r4, #0
beq .Lxtsdecout
- next_tweak q3, q3, q7, q6
+ subs r4, r4, #16
+ next_tweak q4, q4, q15, q6
b .Lxtsdecloop
.Lxtsdecout:
- vst1.8 {q3}, [r5]
+ vst1.8 {q4}, [r5]
pop {r4-r6, pc}
+
+.Lxtsdeccts:
+ movw ip, :lower16:.Lcts_permute_table
+ movt ip, :upper16:.Lcts_permute_table
+
+ add r1, r1, r4 @ rewind input pointer
+ add r4, r4, #16 @ # bytes in final block
+ add lr, ip, #32
+ add ip, ip, r4
+ sub lr, lr, r4
+ add r4, r0, r4 @ output address of final block
+
+ next_tweak q5, q4, q15, q6
+
+ vld1.8 {q1}, [r1] @ load final partial block
+ vld1.8 {q2}, [ip]
+ vld1.8 {q3}, [lr]
+
+ veor q0, q0, q5
+ bl aes_decrypt
+ veor q0, q0, q5
+
+ vtbl.8 d4, {d0-d1}, d4
+ vtbl.8 d5, {d0-d1}, d5
+ vtbx.8 d0, {d2-d3}, d6
+ vtbx.8 d1, {d2-d3}, d7
+
+ vst1.8 {q2}, [r4] @ overlapping stores
+ mov r4, #0
+ b .Lxtsdecctsout
ENDPROC(ce_aes_xts_decrypt)
/*
@@ -508,8 +684,18 @@
* operation on round key *src
*/
ENTRY(ce_aes_invert)
- vld1.8 {q0}, [r1]
+ vld1.32 {q0}, [r1]
aesimc.8 q0, q0
- vst1.8 {q0}, [r0]
+ vst1.32 {q0}, [r0]
bx lr
ENDPROC(ce_aes_invert)
+
+ .section ".rodata", "a"
+ .align 6
+.Lcts_permute_table:
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c
index d0a9cec..cdb1a07 100644
--- a/arch/arm/crypto/aes-ce-glue.c
+++ b/arch/arm/crypto/aes-ce-glue.c
@@ -1,19 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* aes-ce-glue.c - wrapper code for ARMv8 AES
*
* Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
-#include <asm/hwcap.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
#include <crypto/aes.h>
+#include <crypto/ctr.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
#include <linux/cpufeature.h>
#include <linux/module.h>
#include <crypto/xts.h>
@@ -26,25 +26,29 @@
asmlinkage u32 ce_aes_sub(u32 input);
asmlinkage void ce_aes_invert(void *dst, void *src);
-asmlinkage void ce_aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks);
-asmlinkage void ce_aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks);
-asmlinkage void ce_aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 iv[]);
-asmlinkage void ce_aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 iv[]);
+asmlinkage void ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int bytes, u8 const iv[]);
+asmlinkage void ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
+ int rounds, int bytes, u8 const iv[]);
-asmlinkage void ce_aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+asmlinkage void ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 ctr[]);
-asmlinkage void ce_aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
- int rounds, int blocks, u8 iv[],
- u8 const rk2[], int first);
-asmlinkage void ce_aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
- int rounds, int blocks, u8 iv[],
- u8 const rk2[], int first);
+asmlinkage void ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
+ int rounds, int bytes, u8 iv[],
+ u32 const rk2[], int first);
+asmlinkage void ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[],
+ int rounds, int bytes, u8 iv[],
+ u32 const rk2[], int first);
struct aes_block {
u8 b[AES_BLOCK_SIZE];
@@ -81,21 +85,17 @@
key_len != AES_KEYSIZE_256)
return -EINVAL;
- memcpy(ctx->key_enc, in_key, key_len);
ctx->key_length = key_len;
+ for (i = 0; i < kwords; i++)
+ ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32));
kernel_neon_begin();
for (i = 0; i < sizeof(rcon); i++) {
u32 *rki = ctx->key_enc + (i * kwords);
u32 *rko = rki + kwords;
-#ifndef CONFIG_CPU_BIG_ENDIAN
rko[0] = ror32(ce_aes_sub(rki[kwords - 1]), 8);
rko[0] = rko[0] ^ rki[0] ^ rcon[i];
-#else
- rko[0] = rol32(ce_aes_sub(rki[kwords - 1]), 8);
- rko[0] = rko[0] ^ rki[0] ^ (rcon[i] << 24);
-#endif
rko[1] = rko[0] ^ rki[1];
rko[2] = rko[1] ^ rki[2];
rko[3] = rko[2] ^ rki[3];
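The setkey change above replaces the plain memcpy() with explicit little-endian word loads, so the expanded key words have the same numeric value on big- and little-endian kernels and the CONFIG_CPU_BIG_ENDIAN special case removed just above is no longer needed. For reference only, get_unaligned_le32() is equivalent to:

/* Illustration only: byte-order-independent little-endian 32-bit load,
 * as provided by <asm/unaligned.h>. */
static inline u32 le32_from_bytes(const u8 *p)
{
	return (u32)p[0] | ((u32)p[1] << 8) |
	       ((u32)p[2] << 16) | ((u32)p[3] << 24);
}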
@@ -182,15 +182,15 @@
unsigned int blocks;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+ kernel_neon_begin();
ce_aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_enc, num_rounds(ctx), blocks);
+ ctx->key_enc, num_rounds(ctx), blocks);
+ kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
- kernel_neon_end();
return err;
}
@@ -202,58 +202,192 @@
unsigned int blocks;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+ kernel_neon_begin();
ce_aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_dec, num_rounds(ctx), blocks);
+ ctx->key_dec, num_rounds(ctx), blocks);
+ kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
- kernel_neon_end();
+ return err;
+}
+
+static int cbc_encrypt_walk(struct skcipher_request *req,
+ struct skcipher_walk *walk)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ unsigned int blocks;
+ int err = 0;
+
+ while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) {
+ kernel_neon_begin();
+ ce_aes_cbc_encrypt(walk->dst.virt.addr, walk->src.virt.addr,
+ ctx->key_enc, num_rounds(ctx), blocks,
+ walk->iv);
+ kernel_neon_end();
+ err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE);
+ }
return err;
}
static int cbc_encrypt(struct skcipher_request *req)
{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
- unsigned int blocks;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
+ return cbc_encrypt_walk(req, &walk);
+}
- kernel_neon_begin();
- while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
- ce_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_enc, num_rounds(ctx), blocks,
- walk.iv);
- err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+static int cbc_decrypt_walk(struct skcipher_request *req,
+ struct skcipher_walk *walk)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ unsigned int blocks;
+ int err = 0;
+
+ while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) {
+ kernel_neon_begin();
+ ce_aes_cbc_decrypt(walk->dst.virt.addr, walk->src.virt.addr,
+ ctx->key_dec, num_rounds(ctx), blocks,
+ walk->iv);
+ kernel_neon_end();
+ err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE);
}
- kernel_neon_end();
return err;
}
static int cbc_decrypt(struct skcipher_request *req)
{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
- unsigned int blocks;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
+ return cbc_decrypt_walk(req, &walk);
+}
+
+static int cts_cbc_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int err;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false) ?:
+ cbc_encrypt_walk(&subreq, &walk);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
kernel_neon_begin();
- while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
- ce_aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_dec, num_rounds(ctx), blocks,
- walk.iv);
- err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
- }
+ ce_aes_cbc_cts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_enc, num_rounds(ctx), walk.nbytes,
+ walk.iv);
kernel_neon_end();
- return err;
+
+ return skcipher_walk_done(&walk, 0);
+}
+
+static int cts_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int err;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false) ?:
+ cbc_decrypt_walk(&subreq, &walk);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_neon_begin();
+ ce_aes_cbc_cts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key_dec, num_rounds(ctx), walk.nbytes,
+ walk.iv);
+ kernel_neon_end();
+
+ return skcipher_walk_done(&walk, 0);
}
static int ctr_encrypt(struct skcipher_request *req)
@@ -263,13 +397,14 @@
struct skcipher_walk walk;
int err, blocks;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+ kernel_neon_begin();
ce_aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key_enc, num_rounds(ctx), blocks,
+ ctx->key_enc, num_rounds(ctx), blocks,
walk.iv);
+ kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
if (walk.nbytes) {
@@ -283,36 +418,109 @@
*/
blocks = -1;
- ce_aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc,
- num_rounds(ctx), blocks, walk.iv);
+ kernel_neon_begin();
+ ce_aes_ctr_encrypt(tail, NULL, ctx->key_enc, num_rounds(ctx),
+ blocks, walk.iv);
+ kernel_neon_end();
crypto_xor_cpy(tdst, tsrc, tail, nbytes);
err = skcipher_walk_done(&walk, 0);
}
- kernel_neon_end();
-
return err;
}
+static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
+{
+ struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ unsigned long flags;
+
+ /*
+ * Temporarily disable interrupts to avoid races where
+ * cachelines are evicted when the CPU is interrupted
+ * to do something else.
+ */
+ local_irq_save(flags);
+ aes_encrypt(ctx, dst, src);
+ local_irq_restore(flags);
+}
+
+static int ctr_encrypt_sync(struct skcipher_request *req)
+{
+ if (!crypto_simd_usable())
+ return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
+
+ return ctr_encrypt(req);
+}
+
static int xts_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, first, rounds = num_rounds(&ctx->key1);
+ int tail = req->cryptlen % AES_BLOCK_SIZE;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct scatterlist *src, *dst;
struct skcipher_walk walk;
- unsigned int blocks;
- err = skcipher_walk_virt(&walk, req, true);
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+ int xts_blocks = DIV_ROUND_UP(req->cryptlen,
+ AES_BLOCK_SIZE) - 2;
+
+ skcipher_walk_abort(&walk);
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ skcipher_request_flags(req),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ xts_blocks * AES_BLOCK_SIZE,
+ req->iv);
+ req = &subreq;
+ err = skcipher_walk_virt(&walk, req, false);
+ } else {
+ tail = 0;
+ }
+
+ for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) {
+ int nbytes = walk.nbytes;
+
+ if (walk.nbytes < walk.total)
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+
+ kernel_neon_begin();
+ ce_aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key1.key_enc, rounds, nbytes, walk.iv,
+ ctx->key2.key_enc, first);
+ kernel_neon_end();
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+
+ if (err || likely(!tail))
+ return err;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+
+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
kernel_neon_begin();
- for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
- ce_aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key1.key_enc, rounds, blocks,
- walk.iv, (u8 *)ctx->key2.key_enc, first);
- err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
- }
+ ce_aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key1.key_enc, rounds, walk.nbytes, walk.iv,
+ ctx->key2.key_enc, first);
kernel_neon_end();
- return err;
+ return skcipher_walk_done(&walk, 0);
}
static int xts_decrypt(struct skcipher_request *req)
@@ -320,87 +528,165 @@
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, first, rounds = num_rounds(&ctx->key1);
+ int tail = req->cryptlen % AES_BLOCK_SIZE;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct scatterlist *src, *dst;
struct skcipher_walk walk;
- unsigned int blocks;
- err = skcipher_walk_virt(&walk, req, true);
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+ int xts_blocks = DIV_ROUND_UP(req->cryptlen,
+ AES_BLOCK_SIZE) - 2;
+
+ skcipher_walk_abort(&walk);
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ skcipher_request_flags(req),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ xts_blocks * AES_BLOCK_SIZE,
+ req->iv);
+ req = &subreq;
+ err = skcipher_walk_virt(&walk, req, false);
+ } else {
+ tail = 0;
+ }
+
+ for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) {
+ int nbytes = walk.nbytes;
+
+ if (walk.nbytes < walk.total)
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+
+ kernel_neon_begin();
+ ce_aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key1.key_dec, rounds, nbytes, walk.iv,
+ ctx->key2.key_enc, first);
+ kernel_neon_end();
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+
+ if (err || likely(!tail))
+ return err;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+
+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
kernel_neon_begin();
- for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
- ce_aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
- (u8 *)ctx->key1.key_dec, rounds, blocks,
- walk.iv, (u8 *)ctx->key2.key_enc, first);
- err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
- }
+ ce_aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+ ctx->key1.key_dec, rounds, walk.nbytes, walk.iv,
+ ctx->key2.key_enc, first);
kernel_neon_end();
- return err;
+ return skcipher_walk_done(&walk, 0);
}
static struct skcipher_alg aes_algs[] = { {
- .base = {
- .cra_name = "__ecb(aes)",
- .cra_driver_name = "__ecb-aes-ce",
- .cra_priority = 300,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx),
- .cra_module = THIS_MODULE,
- },
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .setkey = ce_aes_setkey,
- .encrypt = ecb_encrypt,
- .decrypt = ecb_decrypt,
+ .base.cra_name = "__ecb(aes)",
+ .base.cra_driver_name = "__ecb-aes-ce",
+ .base.cra_priority = 300,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .setkey = ce_aes_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
}, {
- .base = {
- .cra_name = "__cbc(aes)",
- .cra_driver_name = "__cbc-aes-ce",
- .cra_priority = 300,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx),
- .cra_module = THIS_MODULE,
- },
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = ce_aes_setkey,
- .encrypt = cbc_encrypt,
- .decrypt = cbc_decrypt,
+ .base.cra_name = "__cbc(aes)",
+ .base.cra_driver_name = "__cbc-aes-ce",
+ .base.cra_priority = 300,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = ce_aes_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
}, {
- .base = {
- .cra_name = "__ctr(aes)",
- .cra_driver_name = "__ctr-aes-ce",
- .cra_priority = 300,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx),
- .cra_module = THIS_MODULE,
- },
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .chunksize = AES_BLOCK_SIZE,
- .setkey = ce_aes_setkey,
- .encrypt = ctr_encrypt,
- .decrypt = ctr_encrypt,
+ .base.cra_name = "__cts(cbc(aes))",
+ .base.cra_driver_name = "__cts-cbc-aes-ce",
+ .base.cra_priority = 300,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
+ .setkey = ce_aes_setkey,
+ .encrypt = cts_cbc_encrypt,
+ .decrypt = cts_cbc_decrypt,
}, {
- .base = {
- .cra_name = "__xts(aes)",
- .cra_driver_name = "__xts-aes-ce",
- .cra_priority = 300,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
- .cra_module = THIS_MODULE,
- },
- .min_keysize = 2 * AES_MIN_KEY_SIZE,
- .max_keysize = 2 * AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = xts_set_key,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
+ .base.cra_name = "__ctr(aes)",
+ .base.cra_driver_name = "__ctr-aes-ce",
+ .base.cra_priority = 300,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .setkey = ce_aes_setkey,
+ .encrypt = ctr_encrypt,
+ .decrypt = ctr_encrypt,
+}, {
+ .base.cra_name = "ctr(aes)",
+ .base.cra_driver_name = "ctr-aes-ce-sync",
+ .base.cra_priority = 300 - 1,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct crypto_aes_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .setkey = ce_aes_setkey,
+ .encrypt = ctr_encrypt_sync,
+ .decrypt = ctr_encrypt_sync,
+}, {
+ .base.cra_name = "__xts(aes)",
+ .base.cra_driver_name = "__xts-aes-ce",
+ .base.cra_priority = 300,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = AES_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct crypto_aes_xts_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
+ .setkey = xts_set_key,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
} };
static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
@@ -429,6 +715,9 @@
return err;
for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+ if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
+ continue;
+
algname = aes_algs[i].base.cra_name + 2;
drvname = aes_algs[i].base.cra_driver_name + 2;
basename = aes_algs[i].base.cra_driver_name;
diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S
index 184d6c2..472e56d 100644
--- a/arch/arm/crypto/aes-cipher-core.S
+++ b/arch/arm/crypto/aes-cipher-core.S
@@ -1,15 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Scalar AES core transform
*
* Copyright (C) 2017 Linaro Ltd.
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
+#include <asm/assembler.h>
#include <asm/cache.h>
.text
@@ -41,7 +39,7 @@
.endif
.endm
- .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op
+ .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op, oldcpsr
__select \out0, \in0, 0
__select t0, \in1, 1
__load \out0, \out0, 0, \sz, \op
@@ -73,6 +71,14 @@
__load t0, t0, 3, \sz, \op
__load \t4, \t4, 3, \sz, \op
+ .ifnb \oldcpsr
+ /*
+ * This is the final round and we're done with all data-dependent table
+ * lookups, so we can safely re-enable interrupts.
+ */
+ restore_irqs \oldcpsr
+ .endif
+
eor \out1, \out1, t1, ror #24
eor \out0, \out0, t2, ror #16
ldm rk!, {t1, t2}
@@ -83,14 +89,14 @@
eor \out1, \out1, t2
.endm
- .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+ .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
__hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op
- __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op
+ __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op, \oldcpsr
.endm
- .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op
+ .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr
__hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op
- __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op
+ __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr
.endm
.macro __rev, out, in
@@ -118,13 +124,14 @@
.macro do_crypt, round, ttab, ltab, bsz
push {r3-r11, lr}
+ // Load keys first, to reduce latency in case they're not cached yet.
+ ldm rk!, {r8-r11}
+
ldr r4, [in]
ldr r5, [in, #4]
ldr r6, [in, #8]
ldr r7, [in, #12]
- ldm rk!, {r8-r11}
-
#ifdef CONFIG_CPU_BIG_ENDIAN
__rev r4, r4
__rev r5, r5
@@ -138,6 +145,25 @@
eor r7, r7, r11
__adrl ttab, \ttab
+ /*
+ * Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into
+ * L1 cache, assuming cacheline size >= 32. This is a hardening measure
+ * intended to make cache-timing attacks more difficult. They may not
+ * be fully prevented, however; see the paper
+ * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf
+ * ("Cache-timing attacks on AES") for a discussion of the many
+ * difficulties involved in writing truly constant-time AES software.
+ */
+ save_and_disable_irqs t0
+ .set i, 0
+ .rept 1024 / 128
+ ldr r8, [ttab, #i + 0]
+ ldr r9, [ttab, #i + 32]
+ ldr r10, [ttab, #i + 64]
+ ldr r11, [ttab, #i + 96]
+ .set i, i + 128
+ .endr
+ push {t0} // oldcpsr
tst rounds, #2
bne 1f
@@ -151,8 +177,21 @@
\round r4, r5, r6, r7, r8, r9, r10, r11
b 0b
-2: __adrl ttab, \ltab
- \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b
+2: .ifb \ltab
+ add ttab, ttab, #1
+ .else
+ __adrl ttab, \ltab
+ // Prefetch inverse S-box for final round; see explanation above
+ .set i, 0
+ .rept 256 / 64
+ ldr t0, [ttab, #i + 0]
+ ldr t1, [ttab, #i + 32]
+ .set i, i + 64
+ .endr
+ .endif
+
+ pop {rounds} // oldcpsr
+ \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds
#ifdef CONFIG_CPU_BIG_ENDIAN
__rev r4, r4
@@ -175,48 +214,10 @@
.endm
ENTRY(__aes_arm_encrypt)
- do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2
+ do_crypt fround, crypto_ft_tab,, 2
ENDPROC(__aes_arm_encrypt)
.align 5
ENTRY(__aes_arm_decrypt)
- do_crypt iround, crypto_it_tab, __aes_arm_inverse_sbox, 0
+ do_crypt iround, crypto_it_tab, crypto_aes_inv_sbox, 0
ENDPROC(__aes_arm_decrypt)
-
- .section ".rodata", "a"
- .align L1_CACHE_SHIFT
- .type __aes_arm_inverse_sbox, %object
-__aes_arm_inverse_sbox:
- .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
- .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
- .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
- .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
- .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
- .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
- .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
- .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
- .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
- .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
- .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
- .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
- .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
- .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
- .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
- .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
- .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
- .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
- .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
- .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
- .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
- .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
- .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
- .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
- .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
- .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
- .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
- .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
- .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
- .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
- .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
- .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
- .size __aes_arm_inverse_sbox, . - __aes_arm_inverse_sbox
diff --git a/arch/arm/crypto/aes-cipher-glue.c b/arch/arm/crypto/aes-cipher-glue.c
index c222f6e..8cd00f5 100644
--- a/arch/arm/crypto/aes-cipher-glue.c
+++ b/arch/arm/crypto/aes-cipher-glue.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Scalar AES core transform
*
* Copyright (C) 2017 Linaro Ltd.
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <crypto/aes.h>
@@ -14,12 +11,9 @@
#include <linux/module.h>
asmlinkage void __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
-EXPORT_SYMBOL(__aes_arm_encrypt);
-
asmlinkage void __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
-EXPORT_SYMBOL(__aes_arm_decrypt);
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+static void aes_arm_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@@ -27,7 +21,7 @@
__aes_arm_encrypt(ctx->key_enc, rounds, in, out);
}
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+static void aes_arm_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@@ -47,8 +41,8 @@
.cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE,
.cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE,
.cra_cipher.cia_setkey = crypto_aes_set_key,
- .cra_cipher.cia_encrypt = aes_encrypt,
- .cra_cipher.cia_decrypt = aes_decrypt,
+ .cra_cipher.cia_encrypt = aes_arm_encrypt,
+ .cra_cipher.cia_decrypt = aes_arm_decrypt,
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
.cra_alignmask = 3,
diff --git a/arch/arm/crypto/aes-neonbs-core.S b/arch/arm/crypto/aes-neonbs-core.S
index 2b625c6..cfaed4e 100644
--- a/arch/arm/crypto/aes-neonbs-core.S
+++ b/arch/arm/crypto/aes-neonbs-core.S
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Bit sliced AES using NEON instructions
*
* Copyright (C) 2017 Linaro Ltd.
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
/*
@@ -890,19 +887,17 @@
veor \out, \out, \tmp
.endm
- .align 4
-.Lxts_mul_x:
- .quad 1, 0x87
-
/*
* aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 iv[])
+ * int blocks, u8 iv[], int reorder_last_tweak)
* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
- * int blocks, u8 iv[])
+ * int blocks, u8 iv[], int reorder_last_tweak)
*/
__xts_prepare8:
vld1.8 {q14}, [r7] // load iv
- __ldr q15, .Lxts_mul_x // load tweak mask
+ vmov.i32 d30, #0x87 // compose tweak mask vector
+ vmovl.u32 q15, d30
+ vshr.u64 d30, d31, #7
vmov q12, q14
__adr ip, 0f
@@ -949,17 +944,25 @@
vld1.8 {q7}, [r1]!
next_tweak q14, q12, q15, q13
- veor q7, q7, q12
+THUMB( itt le )
+ W(cmple) r8, #0
+ ble 1f
+0: veor q7, q7, q12
vst1.8 {q12}, [r4, :128]
-0: vst1.8 {q14}, [r7] // store next iv
+ vst1.8 {q14}, [r7] // store next iv
bx lr
+
+1: vswp q12, q14
+ b 0b
ENDPROC(__xts_prepare8)
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
push {r4-r8, lr}
mov r5, sp // preserve sp
ldrd r6, r7, [sp, #24] // get blocks and iv args
+ ldr r8, [sp, #32] // reorder final tweak?
+ rsb r8, r8, #1
sub ip, sp, #128 // make room for 8x tweak
bic ip, ip, #0xf // align sp to 16 bytes
mov sp, ip
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index 07e3194..e85839a 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -1,18 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Bit sliced AES using NEON instructions
*
* Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <asm/neon.h>
+#include <asm/simd.h>
#include <crypto/aes.h>
#include <crypto/cbc.h>
+#include <crypto/ctr.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
#include <crypto/xts.h>
#include <linux/module.h>
@@ -38,9 +38,9 @@
int rounds, int blocks, u8 ctr[], u8 final[]);
asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, u8 iv[]);
+ int rounds, int blocks, u8 iv[], int);
asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, u8 iv[]);
+ int rounds, int blocks, u8 iv[], int);
struct aesbs_ctx {
int rounds;
@@ -54,9 +54,15 @@
struct aesbs_xts_ctx {
struct aesbs_ctx key;
+ struct crypto_cipher *cts_tfm;
struct crypto_cipher *tweak_tfm;
};
+struct aesbs_ctr_ctx {
+ struct aesbs_ctx key; /* must be first member */
+ struct crypto_aes_ctx fallback;
+};
+
static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
@@ -64,7 +70,7 @@
struct crypto_aes_ctx rk;
int err;
- err = crypto_aes_expand_key(&rk, in_key, key_len);
+ err = aes_expandkey(&rk, in_key, key_len);
if (err)
return err;
@@ -86,9 +92,8 @@
struct skcipher_walk walk;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
@@ -96,12 +101,13 @@
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
+ kernel_neon_begin();
fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
ctx->rounds, blocks);
+ kernel_neon_end();
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
- kernel_neon_end();
return err;
}
@@ -123,7 +129,7 @@
struct crypto_aes_ctx rk;
int err;
- err = crypto_aes_expand_key(&rk, in_key, key_len);
+ err = aes_expandkey(&rk, in_key, key_len);
if (err)
return err;
@@ -155,9 +161,8 @@
struct skcipher_walk walk;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
@@ -165,13 +170,14 @@
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
+ kernel_neon_begin();
aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key.rk, ctx->key.rounds, blocks,
walk.iv);
+ kernel_neon_end();
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
- kernel_neon_end();
return err;
}
@@ -192,6 +198,25 @@
crypto_free_cipher(ctx->enc_tfm);
}
+static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key,
+ unsigned int key_len)
+{
+ struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err;
+
+ err = aes_expandkey(&ctx->fallback, in_key, key_len);
+ if (err)
+ return err;
+
+ ctx->key.rounds = 6 + key_len / 4;
+
+ kernel_neon_begin();
+ aesbs_convert_key(ctx->key.rk, ctx->fallback.key_enc, ctx->key.rounds);
+ kernel_neon_end();
+
+ return 0;
+}
+
static int ctr_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
@@ -200,9 +225,8 @@
u8 buf[AES_BLOCK_SIZE];
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_neon_begin();
while (walk.nbytes > 0) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
@@ -213,8 +237,10 @@
final = NULL;
}
+ kernel_neon_begin();
aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->rk, ctx->rounds, blocks, walk.iv, final);
+ kernel_neon_end();
if (final) {
u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
@@ -229,11 +255,33 @@
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
- kernel_neon_end();
return err;
}
+static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
+{
+ struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
+ unsigned long flags;
+
+ /*
+ * Temporarily disable interrupts to avoid races where
+ * cachelines are evicted when the CPU is interrupted
+ * to do something else.
+ */
+ local_irq_save(flags);
+ aes_encrypt(&ctx->fallback, dst, src);
+ local_irq_restore(flags);
+}
+
+static int ctr_encrypt_sync(struct skcipher_request *req)
+{
+ if (!crypto_simd_usable())
+ return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
+
+ return ctr_encrypt(req);
+}
+
static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
@@ -245,6 +293,9 @@
return err;
key_len /= 2;
+ err = crypto_cipher_setkey(ctx->cts_tfm, in_key, key_len);
+ if (err)
+ return err;
err = crypto_cipher_setkey(ctx->tweak_tfm, in_key + key_len, key_len);
if (err)
return err;
@@ -256,7 +307,13 @@
{
struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+ ctx->cts_tfm = crypto_alloc_cipher("aes", 0, 0);
+ if (IS_ERR(ctx->cts_tfm))
+ return PTR_ERR(ctx->cts_tfm);
+
ctx->tweak_tfm = crypto_alloc_cipher("aes", 0, 0);
+ if (IS_ERR(ctx->tweak_tfm))
+ crypto_free_cipher(ctx->cts_tfm);
return PTR_ERR_OR_ZERO(ctx->tweak_tfm);
}
@@ -266,47 +323,89 @@
struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
crypto_free_cipher(ctx->tweak_tfm);
+ crypto_free_cipher(ctx->cts_tfm);
}
-static int __xts_crypt(struct skcipher_request *req,
+static int __xts_crypt(struct skcipher_request *req, bool encrypt,
void (*fn)(u8 out[], u8 const in[], u8 const rk[],
- int rounds, int blocks, u8 iv[]))
+ int rounds, int blocks, u8 iv[], int))
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int tail = req->cryptlen % AES_BLOCK_SIZE;
+ struct skcipher_request subreq;
+ u8 buf[2 * AES_BLOCK_SIZE];
struct skcipher_walk walk;
int err;
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+
+ if (unlikely(tail)) {
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ skcipher_request_flags(req),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ req->cryptlen - tail, req->iv);
+ req = &subreq;
+ }
+
err = skcipher_walk_virt(&walk, req, true);
+ if (err)
+ return err;
crypto_cipher_encrypt_one(ctx->tweak_tfm, walk.iv, walk.iv);
- kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+ int reorder_last_tweak = !encrypt && tail > 0;
- if (walk.nbytes < walk.total)
+ if (walk.nbytes < walk.total) {
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
+ reorder_last_tweak = 0;
+ }
+ kernel_neon_begin();
fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
- ctx->key.rounds, blocks, walk.iv);
+ ctx->key.rounds, blocks, walk.iv, reorder_last_tweak);
+ kernel_neon_end();
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
- kernel_neon_end();
- return err;
+ if (err || likely(!tail))
+ return err;
+
+ /* handle ciphertext stealing */
+ scatterwalk_map_and_copy(buf, req->dst, req->cryptlen - AES_BLOCK_SIZE,
+ AES_BLOCK_SIZE, 0);
+ memcpy(buf + AES_BLOCK_SIZE, buf, tail);
+ scatterwalk_map_and_copy(buf, req->src, req->cryptlen, tail, 0);
+
+ crypto_xor(buf, req->iv, AES_BLOCK_SIZE);
+
+ if (encrypt)
+ crypto_cipher_encrypt_one(ctx->cts_tfm, buf, buf);
+ else
+ crypto_cipher_decrypt_one(ctx->cts_tfm, buf, buf);
+
+ crypto_xor(buf, req->iv, AES_BLOCK_SIZE);
+
+ scatterwalk_map_and_copy(buf, req->dst, req->cryptlen - AES_BLOCK_SIZE,
+ AES_BLOCK_SIZE + tail, 1);
+ return 0;
}
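
The tail path above is XTS ciphertext stealing: the last full ciphertext
block donates its leading 'tail' bytes as the short final block, the tail
plaintext is spliced into its place, and the merged block goes through one
more XTS block operation (the two crypto_xor calls apply the tweak saved in
req->iv). A condensed sketch of that merge for the encryption direction,
assuming a hypothetical one-block xts_block primitive:

    #include <string.h>

    typedef unsigned char u8;

    /* last_c: last full ciphertext block (in/out, 16 + tail bytes)
     * tail_p: trailing partial plaintext, 1 <= tail <= 15 bytes */
    static void xts_cts_tail(u8 *last_c, const u8 *tail_p, unsigned int tail,
                             void (*xts_block)(u8 block[16],
                                               const u8 tweak[16]),
                             const u8 tweak[16])
    {
            u8 buf[2 * 16];

            memcpy(buf, last_c, 16);         /* C(n-1), last full block  */
            memcpy(buf + 16, buf, tail);     /* stolen bytes -> short Cn */
            memcpy(buf, tail_p, tail);       /* tail plaintext in front  */
            xts_block(buf, tweak);           /* one more XTS block op    */
            memcpy(last_c, buf, 16 + tail);  /* C(n-1)' followed by Cn   */
    }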
static int xts_encrypt(struct skcipher_request *req)
{
- return __xts_crypt(req, aesbs_xts_encrypt);
+ return __xts_crypt(req, true, aesbs_xts_encrypt);
}
static int xts_decrypt(struct skcipher_request *req)
{
- return __xts_crypt(req, aesbs_xts_decrypt);
+ return __xts_crypt(req, false, aesbs_xts_decrypt);
}
static struct skcipher_alg aes_algs[] = { {
@@ -360,6 +459,22 @@
.encrypt = ctr_encrypt,
.decrypt = ctr_encrypt,
}, {
+ .base.cra_name = "ctr(aes)",
+ .base.cra_driver_name = "ctr-aes-neonbs-sync",
+ .base.cra_priority = 250 - 1,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aesbs_ctr_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .walksize = 8 * AES_BLOCK_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesbs_ctr_setkey_sync,
+ .encrypt = ctr_encrypt_sync,
+ .decrypt = ctr_encrypt_sync,
+}, {
.base.cra_name = "__xts(aes)",
.base.cra_driver_name = "__xts-aes-neonbs",
.base.cra_priority = 250,
diff --git a/arch/arm/crypto/chacha-neon-core.S b/arch/arm/crypto/chacha-neon-core.S
new file mode 100644
index 0000000..eb22926
--- /dev/null
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -0,0 +1,560 @@
+/*
+ * ChaCha/XChaCha NEON helper functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+ /*
+ * NEON doesn't have a rotate instruction. The alternatives are, more or less:
+ *
+ * (a) vshl.u32 + vsri.u32 (needs temporary register)
+ * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register)
+ * (c) vrev32.16 (16-bit rotations only)
+ * (d) vtbl.8 + vtbl.8 (rotations by multiples of 8 bits only,
+ * needs index vector)
+ *
+ * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
+ * the only choices are (a) and (b). We use (a) since it takes two-thirds the
+ * cycles of (b) on both Cortex-A7 and Cortex-A53.
+ *
+ * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+ * and doesn't need a temporary register.
+ *
+ * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
+ * is twice as fast as (a), even when doing (a) on multiple registers
+ * simultaneously to eliminate the stall between vshl and vsri. Also, it
+ * parallelizes better when temporary registers are scarce.
+ *
+ * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+ * (a), so the need to load the rotation table actually makes the vtbl method
+ * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
+ * seems to be a good compromise to get a more significant speed boost on some
+ * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+ */
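
All four variants implement the same per-lane 32-bit rotation. As a scalar
reference, the vshl.u32 #n / vsri.u32 #(32-n) pair computes, per lane:

    #include <stdint.h>

    /* Valid for 1 <= n <= 31; ChaCha uses n = 16, 12, 8, 7. */
    static inline uint32_t rotl32(uint32_t x, unsigned int n)
    {
            return (x << n) | (x >> (32 - n));
    }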
+
+#include <linux/linkage.h>
+
+ .text
+ .fpu neon
+ .align 5
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3. It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in r3.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
+chacha_permute:
+
+ adr ip, .Lrol8_table
+ vld1.8 {d10}, [ip, :64]
+
+.Ldoubleround:
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vrev32.16 q3, q3
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #12
+ vsri.u32 q1, q4, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #7
+ vsri.u32 q1, q4, #25
+
+ // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vext.8 q1, q1, q1, #4
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vext.8 q2, q2, q2, #8
+ // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vext.8 q3, q3, q3, #12
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vrev32.16 q3, q3
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #12
+ vsri.u32 q1, q4, #20
+
+ // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vadd.i32 q0, q0, q1
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
+
+ // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vadd.i32 q2, q2, q3
+ veor q4, q1, q2
+ vshl.u32 q1, q4, #7
+ vsri.u32 q1, q4, #25
+
+ // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vext.8 q1, q1, q1, #12
+ // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vext.8 q2, q2, q2, #8
+ // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vext.8 q3, q3, q3, #4
+
+ subs r3, r3, #2
+ bne .Ldoubleround
+
+ bx lr
+ENDPROC(chacha_permute)
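
Each pass through .Ldoubleround above is one ChaCha double round: a column
round followed by a diagonal round (the vext.8 shuffles rotate the rows in
between), each applying the standard quarter-round to four word tuples at
once. Scalar reference, reusing rotl32 from the sketch above:

    /* ChaCha quarter-round; each vadd/veor/rotate group in .Ldoubleround
     * is one line of this, vectorized across four matrix columns. */
    #define CHACHA_QR(a, b, c, d) do {              \
            a += b; d ^= a; d = rotl32(d, 16);      \
            c += d; b ^= c; b = rotl32(b, 12);      \
            a += b; d ^= a; d = rotl32(d,  8);      \
            c += d; b ^= c; b = rotl32(b,  7);      \
    } while (0)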
+
+ENTRY(chacha_block_xor_neon)
+ // r0: Input state matrix, s
+ // r1: 1 data block output, o
+ // r2: 1 data block input, i
+ // r3: nrounds
+ push {lr}
+
+ // x0..3 = s0..3
+ add ip, r0, #0x20
+ vld1.32 {q0-q1}, [r0]
+ vld1.32 {q2-q3}, [ip]
+
+ vmov q8, q0
+ vmov q9, q1
+ vmov q10, q2
+ vmov q11, q3
+
+ bl chacha_permute
+
+ add ip, r2, #0x20
+ vld1.8 {q4-q5}, [r2]
+ vld1.8 {q6-q7}, [ip]
+
+ // o0 = i0 ^ (x0 + s0)
+ vadd.i32 q0, q0, q8
+ veor q0, q0, q4
+
+ // o1 = i1 ^ (x1 + s1)
+ vadd.i32 q1, q1, q9
+ veor q1, q1, q5
+
+ // o2 = i2 ^ (x2 + s2)
+ vadd.i32 q2, q2, q10
+ veor q2, q2, q6
+
+ // o3 = i3 ^ (x3 + s3)
+ vadd.i32 q3, q3, q11
+ veor q3, q3, q7
+
+ add ip, r1, #0x20
+ vst1.8 {q0-q1}, [r1]
+ vst1.8 {q2-q3}, [ip]
+
+ pop {pc}
+ENDPROC(chacha_block_xor_neon)
+
+ENTRY(hchacha_block_neon)
+ // r0: Input state matrix, s
+ // r1: output (8 32-bit words)
+ // r2: nrounds
+ push {lr}
+
+ vld1.32 {q0-q1}, [r0]!
+ vld1.32 {q2-q3}, [r0]
+
+ mov r3, r2
+ bl chacha_permute
+
+ vst1.32 {q0}, [r1]!
+ vst1.32 {q3}, [r1]
+
+ pop {pc}
+ENDPROC(hchacha_block_neon)
+
+ .align 4
+.Lctrinc: .word 0, 1, 2, 3
+.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6
+
+ .align 5
+ENTRY(chacha_4block_xor_neon)
+ push {r4-r5}
+ mov r4, sp // preserve the stack pointer
+ sub ip, sp, #0x20 // allocate a 32 byte buffer
+ bic ip, ip, #0x1f // aligned to 32 bytes
+ mov sp, ip
+
+ // r0: Input state matrix, s
+ // r1: 4 data blocks output, o
+ // r2: 4 data blocks input, i
+ // r3: nrounds
+
+ //
+ // This function encrypts four consecutive ChaCha blocks by loading
+ // the state matrix in NEON registers four times. The algorithm performs
+ // each operation on the corresponding word of each state matrix, hence
+ // requires no word shuffling. The words are re-interleaved before the
+ // final addition of the original state and the XORing step.
+ //
+
+ // x0..15[0-3] = s0..15[0-3]
+ add ip, r0, #0x20
+ vld1.32 {q0-q1}, [r0]
+ vld1.32 {q2-q3}, [ip]
+
+ adr r5, .Lctrinc
+ vdup.32 q15, d7[1]
+ vdup.32 q14, d7[0]
+ vld1.32 {q4}, [r5, :128]
+ vdup.32 q13, d6[1]
+ vdup.32 q12, d6[0]
+ vdup.32 q11, d5[1]
+ vdup.32 q10, d5[0]
+ vadd.u32 q12, q12, q4 // x12 += counter values 0-3
+ vdup.32 q9, d4[1]
+ vdup.32 q8, d4[0]
+ vdup.32 q7, d3[1]
+ vdup.32 q6, d3[0]
+ vdup.32 q5, d2[1]
+ vdup.32 q4, d2[0]
+ vdup.32 q3, d1[1]
+ vdup.32 q2, d1[0]
+ vdup.32 q1, d0[1]
+ vdup.32 q0, d0[0]
+
+ adr ip, .Lrol8_table
+ b 1f
+
+.Ldoubleround4:
+ vld1.32 {q8-q9}, [sp, :256]
+1:
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vadd.i32 q0, q0, q4
+ vadd.i32 q1, q1, q5
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+
+ veor q12, q12, q0
+ veor q13, q13, q1
+ veor q14, q14, q2
+ veor q15, q15, q3
+
+ vrev32.16 q12, q12
+ vrev32.16 q13, q13
+ vrev32.16 q14, q14
+ vrev32.16 q15, q15
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i32 q10, q10, q14
+ vadd.i32 q11, q11, q15
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q4, q8
+ veor q9, q5, q9
+ vshl.u32 q4, q8, #12
+ vshl.u32 q5, q9, #12
+ vsri.u32 q4, q8, #20
+ vsri.u32 q5, q9, #20
+
+ veor q8, q6, q10
+ veor q9, q7, q11
+ vshl.u32 q6, q8, #12
+ vshl.u32 q7, q9, #12
+ vsri.u32 q6, q8, #20
+ vsri.u32 q7, q9, #20
+
+ // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
+ vadd.i32 q0, q0, q4
+ vadd.i32 q1, q1, q5
+ vadd.i32 q2, q2, q6
+ vadd.i32 q3, q3, q7
+
+ veor q12, q12, q0
+ veor q13, q13, q1
+ veor q14, q14, q2
+ veor q15, q15, q3
+
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vadd.i32 q8, q8, q12
+ vadd.i32 q9, q9, q13
+ vadd.i32 q10, q10, q14
+ vadd.i32 q11, q11, q15
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q4, q8
+ veor q9, q5, q9
+ vshl.u32 q4, q8, #7
+ vshl.u32 q5, q9, #7
+ vsri.u32 q4, q8, #25
+ vsri.u32 q5, q9, #25
+
+ veor q8, q6, q10
+ veor q9, q7, q11
+ vshl.u32 q6, q8, #7
+ vshl.u32 q7, q9, #7
+ vsri.u32 q6, q8, #25
+ vsri.u32 q7, q9, #25
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vadd.i32 q0, q0, q5
+ vadd.i32 q1, q1, q6
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q4
+
+ veor q15, q15, q0
+ veor q12, q12, q1
+ veor q13, q13, q2
+ veor q14, q14, q3
+
+ vrev32.16 q15, q15
+ vrev32.16 q12, q12
+ vrev32.16 q13, q13
+ vrev32.16 q14, q14
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vadd.i32 q10, q10, q15
+ vadd.i32 q11, q11, q12
+ vadd.i32 q8, q8, q13
+ vadd.i32 q9, q9, q14
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q7, q8
+ veor q9, q4, q9
+ vshl.u32 q7, q8, #12
+ vshl.u32 q4, q9, #12
+ vsri.u32 q7, q8, #20
+ vsri.u32 q4, q9, #20
+
+ veor q8, q5, q10
+ veor q9, q6, q11
+ vshl.u32 q5, q8, #12
+ vshl.u32 q6, q9, #12
+ vsri.u32 q5, q8, #20
+ vsri.u32 q6, q9, #20
+
+ // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
+ vadd.i32 q0, q0, q5
+ vadd.i32 q1, q1, q6
+ vadd.i32 q2, q2, q7
+ vadd.i32 q3, q3, q4
+
+ veor q15, q15, q0
+ veor q12, q12, q1
+ veor q13, q13, q2
+ veor q14, q14, q3
+
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
+
+ vld1.32 {q8-q9}, [sp, :256]
+
+ // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vadd.i32 q10, q10, q15
+ vadd.i32 q11, q11, q12
+ vadd.i32 q8, q8, q13
+ vadd.i32 q9, q9, q14
+
+ vst1.32 {q8-q9}, [sp, :256]
+
+ veor q8, q7, q8
+ veor q9, q4, q9
+ vshl.u32 q7, q8, #7
+ vshl.u32 q4, q9, #7
+ vsri.u32 q7, q8, #25
+ vsri.u32 q4, q9, #25
+
+ veor q8, q5, q10
+ veor q9, q6, q11
+ vshl.u32 q5, q8, #7
+ vshl.u32 q6, q9, #7
+ vsri.u32 q5, q8, #25
+ vsri.u32 q6, q9, #25
+
+ subs r3, r3, #2
+ bne .Ldoubleround4
+
+ // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
+ // x8..9[0-3] are on the stack.
+
+ // Re-interleave the words in the first two rows of each block (x0..7).
+ // Also add the counter values 0-3 to x12[0-3].
+ vld1.32 {q8}, [r5, :128] // load counter values 0-3
+ vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
+ vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
+ vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
+ vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7)
+ vadd.u32 q12, q8 // x12 += counter values 0-3
+ vswp d1, d4
+ vswp d3, d6
+ vld1.32 {q8-q9}, [r0]! // load s0..7
+ vswp d9, d12
+ vswp d11, d14
+
+ // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
+ // after XORing the first 32 bytes.
+ vswp q1, q4
+
+ // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
+
+ // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
+ vadd.u32 q0, q0, q8
+ vadd.u32 q2, q2, q8
+ vadd.u32 q4, q4, q8
+ vadd.u32 q3, q3, q8
+
+ // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
+ vadd.u32 q1, q1, q9
+ vadd.u32 q6, q6, q9
+ vadd.u32 q5, q5, q9
+ vadd.u32 q7, q7, q9
+
+ // XOR first 32 bytes using keystream from first two rows of first block
+ vld1.8 {q8-q9}, [r2]!
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vst1.8 {q8-q9}, [r1]!
+
+ // Re-interleave the words in the last two rows of each block (x8..15).
+ vld1.32 {q8-q9}, [sp, :256]
+ vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
+ vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
+ vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
+ vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11)
+ vld1.32 {q0-q1}, [r0] // load s8..15
+ vswp d25, d28
+ vswp d27, d30
+ vswp d17, d20
+ vswp d19, d22
+
+ // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
+
+ // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
+ vadd.u32 q8, q8, q0
+ vadd.u32 q10, q10, q0
+ vadd.u32 q9, q9, q0
+ vadd.u32 q11, q11, q0
+
+ // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
+ vadd.u32 q12, q12, q1
+ vadd.u32 q14, q14, q1
+ vadd.u32 q13, q13, q1
+ vadd.u32 q15, q15, q1
+
+ // XOR the rest of the data with the keystream
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q8
+ veor q1, q1, q12
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q2
+ veor q1, q1, q6
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q10
+ veor q1, q1, q14
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q4
+ veor q1, q1, q5
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q9
+ veor q1, q1, q13
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]!
+ veor q0, q0, q3
+ veor q1, q1, q7
+ vst1.8 {q0-q1}, [r1]!
+
+ vld1.8 {q0-q1}, [r2]
+ mov sp, r4 // restore original stack pointer
+ veor q0, q0, q11
+ veor q1, q1, q15
+ vst1.8 {q0-q1}, [r1]
+
+ pop {r4-r5}
+ bx lr
+ENDPROC(chacha_4block_xor_neon)
diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c
new file mode 100644
index 0000000..a8e9b53
--- /dev/null
+++ b/arch/arm/crypto/chacha-neon-glue.c
@@ -0,0 +1,202 @@
+/*
+ * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+ int nrounds);
+asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+ int nrounds);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ u8 buf[CHACHA_BLOCK_SIZE];
+
+ while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+ chacha_4block_xor_neon(state, dst, src, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 4;
+ src += CHACHA_BLOCK_SIZE * 4;
+ dst += CHACHA_BLOCK_SIZE * 4;
+ state[12] += 4;
+ }
+ while (bytes >= CHACHA_BLOCK_SIZE) {
+ chacha_block_xor_neon(state, dst, src, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE;
+ src += CHACHA_BLOCK_SIZE;
+ dst += CHACHA_BLOCK_SIZE;
+ state[12]++;
+ }
+ if (bytes) {
+ memcpy(buf, src, bytes);
+ chacha_block_xor_neon(state, buf, buf, nrounds);
+ memcpy(dst, buf, bytes);
+ }
+}
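
The loop above consumes data greedily: 256-byte four-block passes first, then
64-byte single blocks, then at most one partial block bounced through 'buf'.
A small sketch of the resulting call counts (illustrative only):

    /* With CHACHA_BLOCK_SIZE == 64: bytes = 500 gives one 4-block call,
     * three 1-block calls, and a 52-byte bounce-buffer tail. */
    static void chacha_call_counts(unsigned int bytes,
                                   unsigned int *four_blk,
                                   unsigned int *one_blk,
                                   unsigned int *tail)
    {
            *four_blk = bytes / (4 * 64);
            bytes %= 4 * 64;
            *one_blk = bytes / 64;
            *tail = bytes % 64;
    }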
+
+static int chacha_neon_stream_xor(struct skcipher_request *req,
+ const struct chacha_ctx *ctx, const u8 *iv)
+{
+ struct skcipher_walk walk;
+ u32 state[16];
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ crypto_chacha_init(state, ctx, iv);
+
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, walk.stride);
+
+ kernel_neon_begin();
+ chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes, ctx->nrounds);
+ kernel_neon_end();
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+
+ return err;
+}
+
+static int chacha_neon(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+ return crypto_chacha_crypt(req);
+
+ return chacha_neon_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha_neon(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct chacha_ctx subctx;
+ u32 state[16];
+ u8 real_iv[16];
+
+ if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
+ return crypto_xchacha_crypt(req);
+
+ crypto_chacha_init(state, ctx, req->iv);
+
+ kernel_neon_begin();
+ hchacha_block_neon(state, subctx.key, ctx->nrounds);
+ kernel_neon_end();
+ subctx.nrounds = ctx->nrounds;
+
+ memcpy(&real_iv[0], req->iv + 24, 8);
+ memcpy(&real_iv[8], req->iv + 16, 8);
+ return chacha_neon_stream_xor(req, &subctx, real_iv);
+}
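
As I read xchacha_neon, the 32-byte XChaCha IV splits as: bytes 0..15 feed
hchacha_block_neon to derive the subkey, bytes 24..31 become the initial
64-bit block counter, and bytes 16..23 the remaining nonce. A sketch of the
real_iv construction under that assumption:

    #include <string.h>

    typedef unsigned char u8;

    /* real_iv[0..7] land in ChaCha state words 12-13 (block counter),
     * real_iv[8..15] in words 14-15 (nonce). */
    static void xchacha_build_real_iv(u8 real_iv[16], const u8 iv[32])
    {
            memcpy(&real_iv[0], iv + 24, 8);   /* counter words first */
            memcpy(&real_iv[8], iv + 16, 8);   /* then the nonce tail */
    }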
+
+static struct skcipher_alg algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = chacha_neon,
+ .decrypt = chacha_neon,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = xchacha_neon,
+ .decrypt = xchacha_neon,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha12_setkey,
+ .encrypt = xchacha_neon,
+ .decrypt = xchacha_neon,
+ }
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+ if (!(elf_hwcap & HWCAP_NEON))
+ return -ENODEV;
+
+ return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-neon");
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
deleted file mode 100644
index 451a849..0000000
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ /dev/null
@@ -1,521 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-
- .text
- .fpu neon
- .align 5
-
-ENTRY(chacha20_block_xor_neon)
- // r0: Input state matrix, s
- // r1: 1 data block output, o
- // r2: 1 data block input, i
-
- //
- // This function encrypts one ChaCha20 block by loading the state matrix
- // in four NEON registers. It performs matrix operations on four words in
- // parallel, but requires shuffling to rearrange the words after each
- // round.
- //
-
- // x0..3 = s0..3
- add ip, r0, #0x20
- vld1.32 {q0-q1}, [r0]
- vld1.32 {q2-q3}, [ip]
-
- vmov q8, q0
- vmov q9, q1
- vmov q10, q2
- vmov q11, q3
-
- mov r3, #10
-
-.Ldoubleround:
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vrev32.16 q3, q3
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #12
- vsri.u32 q1, q4, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vadd.i32 q0, q0, q1
- veor q4, q3, q0
- vshl.u32 q3, q4, #8
- vsri.u32 q3, q4, #24
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #7
- vsri.u32 q1, q4, #25
-
- // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vext.8 q1, q1, q1, #4
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vext.8 q2, q2, q2, #8
- // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vext.8 q3, q3, q3, #12
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vadd.i32 q0, q0, q1
- veor q3, q3, q0
- vrev32.16 q3, q3
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #12
- vsri.u32 q1, q4, #20
-
- // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vadd.i32 q0, q0, q1
- veor q4, q3, q0
- vshl.u32 q3, q4, #8
- vsri.u32 q3, q4, #24
-
- // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vadd.i32 q2, q2, q3
- veor q4, q1, q2
- vshl.u32 q1, q4, #7
- vsri.u32 q1, q4, #25
-
- // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vext.8 q1, q1, q1, #12
- // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vext.8 q2, q2, q2, #8
- // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vext.8 q3, q3, q3, #4
-
- subs r3, r3, #1
- bne .Ldoubleround
-
- add ip, r2, #0x20
- vld1.8 {q4-q5}, [r2]
- vld1.8 {q6-q7}, [ip]
-
- // o0 = i0 ^ (x0 + s0)
- vadd.i32 q0, q0, q8
- veor q0, q0, q4
-
- // o1 = i1 ^ (x1 + s1)
- vadd.i32 q1, q1, q9
- veor q1, q1, q5
-
- // o2 = i2 ^ (x2 + s2)
- vadd.i32 q2, q2, q10
- veor q2, q2, q6
-
- // o3 = i3 ^ (x3 + s3)
- vadd.i32 q3, q3, q11
- veor q3, q3, q7
-
- add ip, r1, #0x20
- vst1.8 {q0-q1}, [r1]
- vst1.8 {q2-q3}, [ip]
-
- bx lr
-ENDPROC(chacha20_block_xor_neon)
-
- .align 5
-ENTRY(chacha20_4block_xor_neon)
- push {r4-r6, lr}
- mov ip, sp // preserve the stack pointer
- sub r3, sp, #0x20 // allocate a 32 byte buffer
- bic r3, r3, #0x1f // aligned to 32 bytes
- mov sp, r3
-
- // r0: Input state matrix, s
- // r1: 4 data blocks output, o
- // r2: 4 data blocks input, i
-
- //
- // This function encrypts four consecutive ChaCha20 blocks by loading
- // the state matrix in NEON registers four times. The algorithm performs
- // each operation on the corresponding word of each state matrix, hence
- // requires no word shuffling. For the final XORing step we transpose the
- // matrix by interleaving 32- and then 64-bit words, which allows us to
- // do XOR in NEON registers.
- //
-
- // x0..15[0-3] = s0..3[0..3]
- add r3, r0, #0x20
- vld1.32 {q0-q1}, [r0]
- vld1.32 {q2-q3}, [r3]
-
- adr r3, CTRINC
- vdup.32 q15, d7[1]
- vdup.32 q14, d7[0]
- vld1.32 {q11}, [r3, :128]
- vdup.32 q13, d6[1]
- vdup.32 q12, d6[0]
- vadd.i32 q12, q12, q11 // x12 += counter values 0-3
- vdup.32 q11, d5[1]
- vdup.32 q10, d5[0]
- vdup.32 q9, d4[1]
- vdup.32 q8, d4[0]
- vdup.32 q7, d3[1]
- vdup.32 q6, d3[0]
- vdup.32 q5, d2[1]
- vdup.32 q4, d2[0]
- vdup.32 q3, d1[1]
- vdup.32 q2, d1[0]
- vdup.32 q1, d0[1]
- vdup.32 q0, d0[0]
-
- mov r3, #10
-
-.Ldoubleround4:
- // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vadd.i32 q0, q0, q4
- vadd.i32 q1, q1, q5
- vadd.i32 q2, q2, q6
- vadd.i32 q3, q3, q7
-
- veor q12, q12, q0
- veor q13, q13, q1
- veor q14, q14, q2
- veor q15, q15, q3
-
- vrev32.16 q12, q12
- vrev32.16 q13, q13
- vrev32.16 q14, q14
- vrev32.16 q15, q15
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vadd.i32 q8, q8, q12
- vadd.i32 q9, q9, q13
- vadd.i32 q10, q10, q14
- vadd.i32 q11, q11, q15
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q4, q8
- veor q9, q5, q9
- vshl.u32 q4, q8, #12
- vshl.u32 q5, q9, #12
- vsri.u32 q4, q8, #20
- vsri.u32 q5, q9, #20
-
- veor q8, q6, q10
- veor q9, q7, q11
- vshl.u32 q6, q8, #12
- vshl.u32 q7, q9, #12
- vsri.u32 q6, q8, #20
- vsri.u32 q7, q9, #20
-
- // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vadd.i32 q0, q0, q4
- vadd.i32 q1, q1, q5
- vadd.i32 q2, q2, q6
- vadd.i32 q3, q3, q7
-
- veor q8, q12, q0
- veor q9, q13, q1
- vshl.u32 q12, q8, #8
- vshl.u32 q13, q9, #8
- vsri.u32 q12, q8, #24
- vsri.u32 q13, q9, #24
-
- veor q8, q14, q2
- veor q9, q15, q3
- vshl.u32 q14, q8, #8
- vshl.u32 q15, q9, #8
- vsri.u32 q14, q8, #24
- vsri.u32 q15, q9, #24
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vadd.i32 q8, q8, q12
- vadd.i32 q9, q9, q13
- vadd.i32 q10, q10, q14
- vadd.i32 q11, q11, q15
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q4, q8
- veor q9, q5, q9
- vshl.u32 q4, q8, #7
- vshl.u32 q5, q9, #7
- vsri.u32 q4, q8, #25
- vsri.u32 q5, q9, #25
-
- veor q8, q6, q10
- veor q9, q7, q11
- vshl.u32 q6, q8, #7
- vshl.u32 q7, q9, #7
- vsri.u32 q6, q8, #25
- vsri.u32 q7, q9, #25
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vadd.i32 q0, q0, q5
- vadd.i32 q1, q1, q6
- vadd.i32 q2, q2, q7
- vadd.i32 q3, q3, q4
-
- veor q15, q15, q0
- veor q12, q12, q1
- veor q13, q13, q2
- veor q14, q14, q3
-
- vrev32.16 q15, q15
- vrev32.16 q12, q12
- vrev32.16 q13, q13
- vrev32.16 q14, q14
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vadd.i32 q10, q10, q15
- vadd.i32 q11, q11, q12
- vadd.i32 q8, q8, q13
- vadd.i32 q9, q9, q14
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q7, q8
- veor q9, q4, q9
- vshl.u32 q7, q8, #12
- vshl.u32 q4, q9, #12
- vsri.u32 q7, q8, #20
- vsri.u32 q4, q9, #20
-
- veor q8, q5, q10
- veor q9, q6, q11
- vshl.u32 q5, q8, #12
- vshl.u32 q6, q9, #12
- vsri.u32 q5, q8, #20
- vsri.u32 q6, q9, #20
-
- // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vadd.i32 q0, q0, q5
- vadd.i32 q1, q1, q6
- vadd.i32 q2, q2, q7
- vadd.i32 q3, q3, q4
-
- veor q8, q15, q0
- veor q9, q12, q1
- vshl.u32 q15, q8, #8
- vshl.u32 q12, q9, #8
- vsri.u32 q15, q8, #24
- vsri.u32 q12, q9, #24
-
- veor q8, q13, q2
- veor q9, q14, q3
- vshl.u32 q13, q8, #8
- vshl.u32 q14, q9, #8
- vsri.u32 q13, q8, #24
- vsri.u32 q14, q9, #24
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vadd.i32 q10, q10, q15
- vadd.i32 q11, q11, q12
- vadd.i32 q8, q8, q13
- vadd.i32 q9, q9, q14
-
- vst1.32 {q8-q9}, [sp, :256]
-
- veor q8, q7, q8
- veor q9, q4, q9
- vshl.u32 q7, q8, #7
- vshl.u32 q4, q9, #7
- vsri.u32 q7, q8, #25
- vsri.u32 q4, q9, #25
-
- veor q8, q5, q10
- veor q9, q6, q11
- vshl.u32 q5, q8, #7
- vshl.u32 q6, q9, #7
- vsri.u32 q5, q8, #25
- vsri.u32 q6, q9, #25
-
- subs r3, r3, #1
- beq 0f
-
- vld1.32 {q8-q9}, [sp, :256]
- b .Ldoubleround4
-
- // x0[0-3] += s0[0]
- // x1[0-3] += s0[1]
- // x2[0-3] += s0[2]
- // x3[0-3] += s0[3]
-0: ldmia r0!, {r3-r6}
- vdup.32 q8, r3
- vdup.32 q9, r4
- vadd.i32 q0, q0, q8
- vadd.i32 q1, q1, q9
- vdup.32 q8, r5
- vdup.32 q9, r6
- vadd.i32 q2, q2, q8
- vadd.i32 q3, q3, q9
-
- // x4[0-3] += s1[0]
- // x5[0-3] += s1[1]
- // x6[0-3] += s1[2]
- // x7[0-3] += s1[3]
- ldmia r0!, {r3-r6}
- vdup.32 q8, r3
- vdup.32 q9, r4
- vadd.i32 q4, q4, q8
- vadd.i32 q5, q5, q9
- vdup.32 q8, r5
- vdup.32 q9, r6
- vadd.i32 q6, q6, q8
- vadd.i32 q7, q7, q9
-
- // interleave 32-bit words in state n, n+1
- vzip.32 q0, q1
- vzip.32 q2, q3
- vzip.32 q4, q5
- vzip.32 q6, q7
-
- // interleave 64-bit words in state n, n+2
- vswp d1, d4
- vswp d3, d6
- vswp d9, d12
- vswp d11, d14
-
- // xor with corresponding input, write to output
- vld1.8 {q8-q9}, [r2]!
- veor q8, q8, q0
- veor q9, q9, q4
- vst1.8 {q8-q9}, [r1]!
-
- vld1.32 {q8-q9}, [sp, :256]
-
- // x8[0-3] += s2[0]
- // x9[0-3] += s2[1]
- // x10[0-3] += s2[2]
- // x11[0-3] += s2[3]
- ldmia r0!, {r3-r6}
- vdup.32 q0, r3
- vdup.32 q4, r4
- vadd.i32 q8, q8, q0
- vadd.i32 q9, q9, q4
- vdup.32 q0, r5
- vdup.32 q4, r6
- vadd.i32 q10, q10, q0
- vadd.i32 q11, q11, q4
-
- // x12[0-3] += s3[0]
- // x13[0-3] += s3[1]
- // x14[0-3] += s3[2]
- // x15[0-3] += s3[3]
- ldmia r0!, {r3-r6}
- vdup.32 q0, r3
- vdup.32 q4, r4
- adr r3, CTRINC
- vadd.i32 q12, q12, q0
- vld1.32 {q0}, [r3, :128]
- vadd.i32 q13, q13, q4
- vadd.i32 q12, q12, q0 // x12 += counter values 0-3
-
- vdup.32 q0, r5
- vdup.32 q4, r6
- vadd.i32 q14, q14, q0
- vadd.i32 q15, q15, q4
-
- // interleave 32-bit words in state n, n+1
- vzip.32 q8, q9
- vzip.32 q10, q11
- vzip.32 q12, q13
- vzip.32 q14, q15
-
- // interleave 64-bit words in state n, n+2
- vswp d17, d20
- vswp d19, d22
- vswp d25, d28
- vswp d27, d30
-
- vmov q4, q1
-
- vld1.8 {q0-q1}, [r2]!
- veor q0, q0, q8
- veor q1, q1, q12
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- veor q0, q0, q2
- veor q1, q1, q6
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- veor q0, q0, q10
- veor q1, q1, q14
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- veor q0, q0, q4
- veor q1, q1, q5
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- veor q0, q0, q9
- veor q1, q1, q13
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]!
- veor q0, q0, q3
- veor q1, q1, q7
- vst1.8 {q0-q1}, [r1]!
-
- vld1.8 {q0-q1}, [r2]
- veor q0, q0, q11
- veor q1, q1, q15
- vst1.8 {q0-q1}, [r1]
-
- mov sp, ip
- pop {r4-r6, pc}
-ENDPROC(chacha20_4block_xor_neon)
-
- .align 4
-CTRINC: .word 0, 1, 2, 3
diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c
deleted file mode 100644
index 59a7be0..0000000
--- a/arch/arm/crypto/chacha20-neon-glue.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
-
-static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes)
-{
- u8 buf[CHACHA20_BLOCK_SIZE];
-
- while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
- chacha20_4block_xor_neon(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE * 4;
- src += CHACHA20_BLOCK_SIZE * 4;
- dst += CHACHA20_BLOCK_SIZE * 4;
- state[12] += 4;
- }
- while (bytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_block_xor_neon(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE;
- src += CHACHA20_BLOCK_SIZE;
- dst += CHACHA20_BLOCK_SIZE;
- state[12]++;
- }
- if (bytes) {
- memcpy(buf, src, bytes);
- chacha20_block_xor_neon(state, buf, buf);
- memcpy(dst, buf, bytes);
- }
-}
-
-static int chacha20_neon(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- u32 state[16];
- int err;
-
- if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
- return crypto_chacha20_crypt(req);
-
- err = skcipher_walk_virt(&walk, req, true);
-
- crypto_chacha20_init(state, ctx, walk.iv);
-
- kernel_neon_begin();
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
-
- if (nbytes < walk.total)
- nbytes = round_down(nbytes, walk.stride);
-
- chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes);
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- }
- kernel_neon_end();
-
- return err;
-}
-
-static struct skcipher_alg alg = {
- .base.cra_name = "chacha20",
- .base.cra_driver_name = "chacha20-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha20_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA20_KEY_SIZE,
- .max_keysize = CHACHA20_KEY_SIZE,
- .ivsize = CHACHA20_IV_SIZE,
- .chunksize = CHACHA20_BLOCK_SIZE,
- .walksize = 4 * CHACHA20_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
- .encrypt = chacha20_neon,
- .decrypt = chacha20_neon,
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
- if (!(elf_hwcap & HWCAP_NEON))
- return -ENODEV;
-
- return crypto_register_skcipher(&alg);
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
- crypto_unregister_skcipher(&alg);
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c
index 96e62ec..9559249 100644
--- a/arch/arm/crypto/crc32-ce-glue.c
+++ b/arch/arm/crypto/crc32-ce-glue.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
*
* Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/cpufeature.h>
@@ -16,6 +13,7 @@
#include <linux/string.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
@@ -113,7 +111,7 @@
u32 *crc = shash_desc_ctx(desc);
unsigned int l;
- if (may_use_simd()) {
+ if (crypto_simd_usable()) {
if ((u32)data % SCALE_F) {
l = min_t(u32, length, SCALE_F - ((u32)data % SCALE_F));
@@ -147,7 +145,7 @@
u32 *crc = shash_desc_ctx(desc);
unsigned int l;
- if (may_use_simd()) {
+ if (crypto_simd_usable()) {
if ((u32)data % SCALE_F) {
l = min_t(u32, length, SCALE_F - ((u32)data % SCALE_F));
@@ -236,7 +234,7 @@
ARRAY_SIZE(crc32_pmull_algs));
}
-static const struct cpu_feature crc32_cpu_feature[] = {
+static const struct cpu_feature __maybe_unused crc32_cpu_feature[] = {
{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
};
MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
index ce45ba0..86be258 100644
--- a/arch/arm/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -2,12 +2,14 @@
// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//
+// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
@@ -54,19 +56,11 @@
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
-// Function API:
-// UINT16 crc_t10dif_pcl(
-// UINT16 init_crc, //initial CRC value, 16 bits
-// const unsigned char *buf, //buffer pointer to calculate CRC on
-// UINT64 len //buffer length in bytes (64-bit data)
-// );
-//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
-//
#include <linux/linkage.h>
#include <asm/assembler.h>
@@ -78,13 +72,14 @@
#endif
.text
+ .arch armv7-a
.fpu crypto-neon-fp-armv8
- arg1_low32 .req r0
- arg2 .req r1
- arg3 .req r2
+ init_crc .req r0
+ buf .req r1
+ len .req r2
- qzr .req q13
+ fold_consts_ptr .req ip
q0l .req d0
q0h .req d1
@@ -102,82 +97,35 @@
q6h .req d13
q7l .req d14
q7h .req d15
+ q8l .req d16
+ q8h .req d17
+ q9l .req d18
+ q9h .req d19
+ q10l .req d20
+ q10h .req d21
+ q11l .req d22
+ q11h .req d23
+ q12l .req d24
+ q12h .req d25
-ENTRY(crc_t10dif_pmull)
- vmov.i8 qzr, #0 // init zero register
+ FOLD_CONSTS .req q10
+ FOLD_CONST_L .req q10l
+ FOLD_CONST_H .req q10h
- // adjust the 16-bit initial_crc value, scale it to 32 bits
- lsl arg1_low32, arg1_low32, #16
+ // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+ // into reg1, reg2.
+ .macro fold_32_bytes, reg1, reg2
+ vld1.64 {q11-q12}, [buf]!
- // check if smaller than 256
- cmp arg3, #256
+ vmull.p64 q8, \reg1\()h, FOLD_CONST_H
+ vmull.p64 \reg1, \reg1\()l, FOLD_CONST_L
+ vmull.p64 q9, \reg2\()h, FOLD_CONST_H
+ vmull.p64 \reg2, \reg2\()l, FOLD_CONST_L
- // for sizes less than 128, we can't fold 64B at a time...
- blt _less_than_128
-
- // load the initial crc value
- // crc value does not need to be byte-reflected, but it needs
- // to be moved to the high part of the register.
- // because data will be byte-reflected and will align with
- // initial crc at correct place.
- vmov s0, arg1_low32 // initial crc
- vext.8 q10, qzr, q0, #4
-
- // receive the initial 64B data, xor the initial crc value
- vld1.64 {q0-q1}, [arg2, :128]!
- vld1.64 {q2-q3}, [arg2, :128]!
- vld1.64 {q4-q5}, [arg2, :128]!
- vld1.64 {q6-q7}, [arg2, :128]!
-CPU_LE( vrev64.8 q0, q0 )
-CPU_LE( vrev64.8 q1, q1 )
-CPU_LE( vrev64.8 q2, q2 )
-CPU_LE( vrev64.8 q3, q3 )
-CPU_LE( vrev64.8 q4, q4 )
-CPU_LE( vrev64.8 q5, q5 )
-CPU_LE( vrev64.8 q6, q6 )
-CPU_LE( vrev64.8 q7, q7 )
-
- vswp d0, d1
- vswp d2, d3
- vswp d4, d5
- vswp d6, d7
- vswp d8, d9
- vswp d10, d11
- vswp d12, d13
- vswp d14, d15
-
- // XOR the initial_crc value
- veor.8 q0, q0, q10
-
- adr ip, rk3
- vld1.64 {q10}, [ip, :128] // xmm10 has rk3 and rk4
-
- //
- // we subtract 256 instead of 128 to save one instruction from the loop
- //
- sub arg3, arg3, #256
-
- // at this section of the code, there is 64*x+y (0<=y<64) bytes of
- // buffer. The _fold_64_B_loop will fold 64B at a time
- // until we have 64+y Bytes of buffer
-
-
- // fold 64B at a time. This section of the code folds 4 vector
- // registers in parallel
-_fold_64_B_loop:
-
- .macro fold64, reg1, reg2
- vld1.64 {q11-q12}, [arg2, :128]!
-
- vmull.p64 q8, \reg1\()h, d21
- vmull.p64 \reg1, \reg1\()l, d20
- vmull.p64 q9, \reg2\()h, d21
- vmull.p64 \reg2, \reg2\()l, d20
-
-CPU_LE( vrev64.8 q11, q11 )
-CPU_LE( vrev64.8 q12, q12 )
- vswp d22, d23
- vswp d24, d25
+CPU_LE( vrev64.8 q11, q11 )
+CPU_LE( vrev64.8 q12, q12 )
+ vswp q11l, q11h
+ vswp q12l, q12h
veor.8 \reg1, \reg1, q8
veor.8 \reg2, \reg2, q9
@@ -185,242 +133,248 @@
veor.8 \reg2, \reg2, q12
.endm
- fold64 q0, q1
- fold64 q2, q3
- fold64 q4, q5
- fold64 q6, q7
-
- subs arg3, arg3, #128
-
- // check if there is another 64B in the buffer to be able to fold
- bge _fold_64_B_loop
-
- // at this point, the buffer pointer is pointing at the last y Bytes
- // of the buffer the 64B of folded data is in 4 of the vector
- // registers: v0, v1, v2, v3
-
- // fold the 8 vector registers to 1 vector register with different
- // constants
-
- adr ip, rk9
- vld1.64 {q10}, [ip, :128]!
-
- .macro fold16, reg, rk
- vmull.p64 q8, \reg\()l, d20
- vmull.p64 \reg, \reg\()h, d21
- .ifnb \rk
- vld1.64 {q10}, [ip, :128]!
+ // Fold src_reg into dst_reg, optionally loading the next fold constants
+ .macro fold_16_bytes, src_reg, dst_reg, load_next_consts
+ vmull.p64 q8, \src_reg\()l, FOLD_CONST_L
+ vmull.p64 \src_reg, \src_reg\()h, FOLD_CONST_H
+ .ifnb \load_next_consts
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
.endif
- veor.8 q7, q7, q8
- veor.8 q7, q7, \reg
+ veor.8 \dst_reg, \dst_reg, q8
+ veor.8 \dst_reg, \dst_reg, \src_reg
.endm
- fold16 q0, rk11
- fold16 q1, rk13
- fold16 q2, rk15
- fold16 q3, rk17
- fold16 q4, rk19
- fold16 q5, rk1
- fold16 q6
+ .macro __adrl, out, sym
+ movw \out, #:lower16:\sym
+ movt \out, #:upper16:\sym
+ .endm
- // instead of 64, we add 48 to the loop counter to save 1 instruction
- // from the loop instead of a cmp instruction, we use the negative
- // flag with the jl instruction
- adds arg3, arg3, #(128-16)
- blt _final_reduction_for_128
+//
+// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+ENTRY(crc_t10dif_pmull)
- // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
- // and the rest is in memory. We can fold 16 bytes at a time if y>=16
- // continue folding 16B at a time
+ // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp len, #256
+ blt .Lless_than_256_bytes
-_16B_reduction_loop:
- vmull.p64 q8, d14, d20
- vmull.p64 q7, d15, d21
+ __adrl fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+ // Load the first 128 data bytes. Byte swapping is necessary to make
+ // the bit order match the polynomial coefficient order.
+ vld1.64 {q0-q1}, [buf]!
+ vld1.64 {q2-q3}, [buf]!
+ vld1.64 {q4-q5}, [buf]!
+ vld1.64 {q6-q7}, [buf]!
+CPU_LE( vrev64.8 q0, q0 )
+CPU_LE( vrev64.8 q1, q1 )
+CPU_LE( vrev64.8 q2, q2 )
+CPU_LE( vrev64.8 q3, q3 )
+CPU_LE( vrev64.8 q4, q4 )
+CPU_LE( vrev64.8 q5, q5 )
+CPU_LE( vrev64.8 q6, q6 )
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q0l, q0h
+ vswp q1l, q1h
+ vswp q2l, q2h
+ vswp q3l, q3h
+ vswp q4l, q4h
+ vswp q5l, q5h
+ vswp q6l, q6h
+ vswp q7l, q7h
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ vmov.i8 q8h, #0
+ vmov.u16 q8h[3], init_crc
+ veor q0h, q0h, q8h
+
+ // Load the constants for folding across 128 bytes.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+ // Subtract 128 for the 128 data bytes just consumed. Subtract another
+ // 128 to simplify the termination condition of the following loop.
+ sub len, len, #256
+
+ // While >= 128 data bytes remain (not counting q0-q7), fold the 128
+ // bytes q0-q7 into them, storing the result back into q0-q7.
+.Lfold_128_bytes_loop:
+ fold_32_bytes q0, q1
+ fold_32_bytes q2, q3
+ fold_32_bytes q4, q5
+ fold_32_bytes q6, q7
+ subs len, len, #128
+ bge .Lfold_128_bytes_loop
+
+ // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
+
+ // Fold across 64 bytes.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+ fold_16_bytes q0, q4
+ fold_16_bytes q1, q5
+ fold_16_bytes q2, q6
+ fold_16_bytes q3, q7, 1
+ // Fold across 32 bytes.
+ fold_16_bytes q4, q6
+ fold_16_bytes q5, q7, 1
+ // Fold across 16 bytes.
+ fold_16_bytes q6, q7
+
+ // Add 128 to get the correct number of data bytes remaining in 0...127
+ // (not counting q7), following the previous extra subtraction by 128.
+ // Then subtract 16 to simplify the termination condition of the
+ // following loop.
+ adds len, len, #(128-16)
+
+ // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
+ // into them, storing the result back into q7.
+ blt .Lfold_16_bytes_loop_done
+.Lfold_16_bytes_loop:
+ vmull.p64 q8, q7l, FOLD_CONST_L
+ vmull.p64 q7, q7h, FOLD_CONST_H
veor.8 q7, q7, q8
-
- vld1.64 {q0}, [arg2, :128]!
-CPU_LE( vrev64.8 q0, q0 )
- vswp d0, d1
+ vld1.64 {q0}, [buf]!
+CPU_LE( vrev64.8 q0, q0 )
+ vswp q0l, q0h
veor.8 q7, q7, q0
- subs arg3, arg3, #16
+ subs len, len, #16
+ bge .Lfold_16_bytes_loop
- // instead of a cmp instruction, we utilize the flags with the
- // jge instruction equivalent of: cmp arg3, 16-16
- // check if there is any more 16B in the buffer to be able to fold
- bge _16B_reduction_loop
+.Lfold_16_bytes_loop_done:
+ // Add 16 to get the correct number of data bytes remaining in 0...15
+ // (not counting q7), following the previous extra subtraction by 16.
+ adds len, len, #16
+ beq .Lreduce_final_16_bytes
- // now we have 16+z bytes left to reduce, where 0<= z < 16.
- // first, we reduce the data in the xmm7 register
+.Lhandle_partial_segment:
+ // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+ // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
+ // do this without needing a fold constant for each possible 'len',
+ // redivide the bytes into a first chunk of 'len' bytes and a second
+ // chunk of 16 bytes, then fold the first chunk into the second.
-_final_reduction_for_128:
- // check if any more data to fold. If not, compute the CRC of
- // the final 128 bits
- adds arg3, arg3, #16
- beq _128_done
+ // q0 = last 16 original data bytes
+ add buf, buf, len
+ sub buf, buf, #16
+ vld1.64 {q0}, [buf]
+CPU_LE( vrev64.8 q0, q0 )
+ vswp q0l, q0h
- // here we are getting data that is less than 16 bytes.
- // since we know that there was data before the pointer, we can
- // offset the input pointer before the actual point, to receive
- // exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
- add arg2, arg2, arg3
- sub arg2, arg2, #16
- vld1.64 {q1}, [arg2]
-CPU_LE( vrev64.8 q1, q1 )
- vswp d2, d3
+ // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
+ __adrl r3, .Lbyteshift_table + 16
+ sub r3, r3, len
+ vld1.8 {q2}, [r3]
+ vtbl.8 q1l, {q7l-q7h}, q2l
+ vtbl.8 q1h, {q7l-q7h}, q2h
- // get rid of the extra data that was loaded before
- // load the shift constant
- adr ip, tbl_shf_table + 16
- sub ip, ip, arg3
- vld1.8 {q0}, [ip]
+ // q3 = first chunk: q7 right-shifted by '16-len' bytes.
+ vmov.i8 q3, #0x80
+ veor.8 q2, q2, q3
+ vtbl.8 q3l, {q7l-q7h}, q2l
+ vtbl.8 q3h, {q7l-q7h}, q2h
- // shift v2 to the left by arg3 bytes
- vtbl.8 d4, {d14-d15}, d0
- vtbl.8 d5, {d14-d15}, d1
+ // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+ vshr.s8 q2, q2, #7
- // shift v7 to the right by 16-arg3 bytes
- vmov.i8 q9, #0x80
- veor.8 q0, q0, q9
- vtbl.8 d18, {d14-d15}, d0
- vtbl.8 d19, {d14-d15}, d1
+ // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
+ // then '16-len' bytes from q1 (high-order bytes).
+ vbsl.8 q2, q1, q0
- // blend
- vshr.s8 q0, q0, #7 // convert to 8-bit mask
- vbsl.8 q0, q2, q1
-
- // fold 16 Bytes
- vmull.p64 q8, d18, d20
- vmull.p64 q7, d19, d21
- veor.8 q7, q7, q8
+ // Fold the first chunk into the second chunk, storing the result in q7.
+ vmull.p64 q0, q3l, FOLD_CONST_L
+ vmull.p64 q7, q3h, FOLD_CONST_H
veor.8 q7, q7, q0
+ veor.8 q7, q7, q2
-_128_done:
- // compute crc of a 128-bit value
- vldr d20, rk5
- vldr d21, rk6 // rk5 and rk6 in xmm10
+.Lreduce_final_16_bytes:
+ // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
- // 64b fold
- vext.8 q0, qzr, q7, #8
- vmull.p64 q7, d15, d20
- veor.8 q7, q7, q0
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
- // 32b fold
- vext.8 q0, q7, qzr, #12
- vmov s31, s3
- vmull.p64 q0, d0, d21
- veor.8 q7, q0, q7
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ // whose low 48 bits are 0.
+ vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x))
+ veor.8 q0h, q0h, q7l // + low bits * x^64
- // barrett reduction
-_barrett:
- vldr d20, rk7
- vldr d21, rk8
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ vmov.i8 q1, #0
+ vmov s4, s3 // extract high 32 bits
+ vmov s3, s5 // zero high 32 bits
+ vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x))
+ veor.8 q0, q0, q1 // + low bits
- vmull.p64 q0, d15, d20
- vext.8 q0, qzr, q0, #12
- vmull.p64 q0, d1, d21
- vext.8 q0, qzr, q0, #12
- veor.8 q7, q7, q0
- vmov r0, s29
+ // Load G(x) and floor(x^48 / G(x)).
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]
-_cleanup:
- // scale the result back to 16 bits
- lsr r0, r0, #16
+ // Use Barrett reduction to compute the final CRC value.
+ vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x))
+ vshr.u64 q1l, q1l, #32 // /= x^32
+ vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x)
+ vshr.u64 q0l, q0l, #48
+ veor.8 q0l, q0l, q1l // + low 16 nonzero bits
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.
+
+ vmov.u16 r0, q0l[0]
bx lr
-_less_than_128:
- teq arg3, #0
- beq _cleanup
+.Lless_than_256_bytes:
+ // Checksumming a buffer of length 16...255 bytes
- vmov.i8 q0, #0
- vmov s3, arg1_low32 // get the initial crc value
+ __adrl fold_consts_ptr, .Lfold_across_16_bytes_consts
- vld1.64 {q7}, [arg2, :128]!
-CPU_LE( vrev64.8 q7, q7 )
- vswp d14, d15
- veor.8 q7, q7, q0
+ // Load the first 16 data bytes.
+ vld1.64 {q7}, [buf]!
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q7l, q7h
- cmp arg3, #16
- beq _128_done // exactly 16 left
- blt _less_than_16_left
+ // XOR the first 16 data *bits* with the initial CRC value.
+ vmov.i8 q0h, #0
+ vmov.u16 q0h[3], init_crc
+ veor.8 q7h, q7h, q0h
- // now if there is, load the constants
- vldr d20, rk1
- vldr d21, rk2 // rk1 and rk2 in xmm10
+ // Load the fold-across-16-bytes constants.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
- // check if there is enough buffer to be able to fold 16B at a time
- subs arg3, arg3, #32
- addlt arg3, arg3, #16
- blt _get_last_two_regs
- b _16B_reduction_loop
-
-_less_than_16_left:
- // shl r9, 4
- adr ip, tbl_shf_table + 16
- sub ip, ip, arg3
- vld1.8 {q0}, [ip]
- vmov.i8 q9, #0x80
- veor.8 q0, q0, q9
- vtbl.8 d18, {d14-d15}, d0
- vtbl.8 d15, {d14-d15}, d1
- vmov d14, d18
- b _128_done
+ cmp len, #16
+ beq .Lreduce_final_16_bytes // len == 16
+ subs len, len, #32
+ addlt len, len, #16
+ blt .Lhandle_partial_segment // 17 <= len <= 31
+ b .Lfold_16_bytes_loop // 32 <= len <= 255
ENDPROC(crc_t10dif_pmull)
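
The fold constants that follow are residues of powers of x modulo
G(x) = 0x18bb7. For context, a bit-at-a-time scalar CRC-T10DIF that the
folded NEON result must agree with (a reference sketch, not the kernel's
crc_t10dif_generic):

    #include <stdint.h>
    #include <stddef.h>

    static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf,
                                   size_t len)
    {
            size_t i;
            int j;

            for (i = 0; i < len; i++) {
                    crc ^= (uint16_t)buf[i] << 8;   /* MSB-first */
                    for (j = 0; j < 8; j++)
                            crc = (crc & 0x8000) ?
                                  (uint16_t)(crc << 1) ^ 0x8bb7 :
                                  (uint16_t)(crc << 1);
            }
            return crc;
    }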
-// precomputed constants
-// these constants are precomputed from the poly:
-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
+ .section ".rodata", "a"
.align 4
-// Q = 0x18BB70000
-// rk1 = 2^(32*3) mod Q << 32
-// rk2 = 2^(32*5) mod Q << 32
-// rk3 = 2^(32*15) mod Q << 32
-// rk4 = 2^(32*17) mod Q << 32
-// rk5 = 2^(32*3) mod Q << 32
-// rk6 = 2^(32*2) mod Q << 32
-// rk7 = floor(2^64/Q)
-// rk8 = Q
-rk3: .quad 0x9d9d000000000000
-rk4: .quad 0x7cf5000000000000
-rk5: .quad 0x2d56000000000000
-rk6: .quad 0x1368000000000000
-rk7: .quad 0x00000001f65a57f8
-rk8: .quad 0x000000018bb70000
-rk9: .quad 0xceae000000000000
-rk10: .quad 0xbfd6000000000000
-rk11: .quad 0x1e16000000000000
-rk12: .quad 0x713c000000000000
-rk13: .quad 0xf7f9000000000000
-rk14: .quad 0x80a6000000000000
-rk15: .quad 0x044c000000000000
-rk16: .quad 0xe658000000000000
-rk17: .quad 0xad18000000000000
-rk18: .quad 0xa497000000000000
-rk19: .quad 0x6ee3000000000000
-rk20: .quad 0xe7b5000000000000
-rk1: .quad 0x2d56000000000000
-rk2: .quad 0x06df000000000000
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 // x^(8*128) mod G(x)
+ .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 // x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d // x^(2*128) mod G(x)
+ .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 // x^(1*128) mod G(x)
+ .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+ .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 // G(x)
+ .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
-tbl_shf_table:
-// use these values for shift constants for the tbl/tbx instruction
-// different alignments result in values as shown:
-// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
-
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
.byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
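Editorial note (not part of the patch): the new constant table stores remainders of the form x^N mod G(x) for G(x) = 0x18bb7, the CRC-T10DIF polynomial with its implicit x^16 term; per the comments above, the final-fold pair is additionally scaled by x^48 and the last pair is the Barrett-reduction pair. A minimal user-space sketch of how such remainders could be derived with plain shift-and-reduce GF(2) arithmetic is shown below; the name xpow_mod_g is illustrative only, and the lane layout the assembly expects is not reproduced here.

	/* Sketch: compute x^n mod G(x) over GF(2) for the CRC-T10DIF polynomial. */
	#include <stdint.h>
	#include <stdio.h>

	static uint32_t xpow_mod_g(unsigned int n)
	{
		uint32_t r = 1;			/* the polynomial "1", i.e. x^0 */

		while (n--) {
			r <<= 1;		/* multiply by x */
			if (r & 0x10000)	/* degree 16 reached: reduce mod G(x) */
				r ^= 0x18bb7;
		}
		return r;			/* 16-bit remainder */
	}

	int main(void)
	{
		/* e.g. the fold-across-16-bytes pair: x^(1*128) and x^(1*128+64) */
		printf("0x%04x 0x%04x\n", xpow_mod_g(128), xpow_mod_g(192));
		return 0;
	}
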
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
index d428355..e9191a8 100644
--- a/arch/arm/crypto/crct10dif-ce-glue.c
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
*
* Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/crc-t10dif.h>
@@ -15,13 +12,14 @@
#include <linux/string.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <asm/neon.h>
#include <asm/simd.h>
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u32 len);
+asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
static int crct10dif_init(struct shash_desc *desc)
{
@@ -35,26 +33,15 @@
unsigned int length)
{
u16 *crc = shash_desc_ctx(desc);
- unsigned int l;
- if (!may_use_simd()) {
- *crc = crc_t10dif_generic(*crc, data, length);
+ if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull(*crc, data, length);
+ kernel_neon_end();
} else {
- if (unlikely((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE)) {
- l = min_t(u32, length, CRC_T10DIF_PMULL_CHUNK_SIZE -
- ((u32)data % CRC_T10DIF_PMULL_CHUNK_SIZE));
-
- *crc = crc_t10dif_generic(*crc, data, l);
-
- length -= l;
- data += l;
- }
- if (length > 0) {
- kernel_neon_begin();
- *crc = crc_t10dif_pmull(*crc, data, length);
- kernel_neon_end();
- }
+ *crc = crc_t10dif_generic(*crc, data, length);
}
+
return 0;
}
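Editorial note (not part of the patch): with the simplified update path above, a request is either handed whole to the PMULL routine or falls back to the generic table-driven CRC; callers see no difference. As a hedged illustration of how this driver is ultimately exercised, a one-shot digest could be computed through the shash API roughly as follows (the function name crct10dif_digest_sketch is hypothetical). In-kernel users normally reach the same code via crc_t10dif() in lib/, which wraps the "crct10dif" shash.

	#include <crypto/hash.h>
	#include <linux/err.h>

	/* Illustrative only: one-shot CRC-T10DIF through the shash API. */
	static int crct10dif_digest_sketch(const u8 *data, unsigned int len,
					   u8 out[2])
	{
		struct crypto_shash *tfm;
		int err;

		tfm = crypto_alloc_shash("crct10dif", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		{
			SHASH_DESC_ON_STACK(desc, tfm);

			desc->tfm = tfm;
			err = crypto_shash_digest(desc, data, len, out);
			shash_desc_zero(desc);
		}

		crypto_free_shash(tfm);
		return err;
	}
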
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index 2f78c10..c47fe81 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
*
* Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
*/
#include <linux/linkage.h>
@@ -63,6 +60,33 @@
k48 .req d31
SHASH2_p64 .req d31
+ HH .req q10
+ HH3 .req q11
+ HH4 .req q12
+ HH34 .req q13
+
+ HH_L .req d20
+ HH_H .req d21
+ HH3_L .req d22
+ HH3_H .req d23
+ HH4_L .req d24
+ HH4_H .req d25
+ HH34_L .req d26
+ HH34_H .req d27
+ SHASH2_H .req d29
+
+ XL2 .req q5
+ XM2 .req q6
+ XH2 .req q7
+ T3 .req q8
+
+ XL2_L .req d10
+ XL2_H .req d11
+ XM2_L .req d12
+ XM2_H .req d13
+ T3_L .req d16
+ T3_H .req d17
+
.text
.fpu crypto-neon-fp-armv8
@@ -175,12 +199,77 @@
beq 0f
vld1.64 {T1}, [ip]
teq r0, #0
- b 1f
+ b 3f
-0: vld1.64 {T1}, [r2]!
+0: .ifc \pn, p64
+ tst r0, #3 // skip until #blocks is a
+ bne 2f // round multiple of 4
+
+ vld1.8 {XL2-XM2}, [r2]!
+1: vld1.8 {T3-T2}, [r2]!
+ vrev64.8 XL2, XL2
+ vrev64.8 XM2, XM2
+
+ subs r0, r0, #4
+
+ vext.8 T1, XL2, XL2, #8
+ veor XL2_H, XL2_H, XL_L
+ veor XL, XL, T1
+
+ vrev64.8 T3, T3
+ vrev64.8 T1, T2
+
+ vmull.p64 XH, HH4_H, XL_H // a1 * b1
+ veor XL2_H, XL2_H, XL_H
+ vmull.p64 XL, HH4_L, XL_L // a0 * b0
+ vmull.p64 XM, HH34_H, XL2_H // (a1 + a0)(b1 + b0)
+
+ vmull.p64 XH2, HH3_H, XM2_L // a1 * b1
+ veor XM2_L, XM2_L, XM2_H
+ vmull.p64 XL2, HH3_L, XM2_H // a0 * b0
+ vmull.p64 XM2, HH34_L, XM2_L // (a1 + a0)(b1 + b0)
+
+ veor XH, XH, XH2
+ veor XL, XL, XL2
+ veor XM, XM, XM2
+
+ vmull.p64 XH2, HH_H, T3_L // a1 * b1
+ veor T3_L, T3_L, T3_H
+ vmull.p64 XL2, HH_L, T3_H // a0 * b0
+ vmull.p64 XM2, SHASH2_H, T3_L // (a1 + a0)(b1 + b0)
+
+ veor XH, XH, XH2
+ veor XL, XL, XL2
+ veor XM, XM, XM2
+
+ vmull.p64 XH2, SHASH_H, T1_L // a1 * b1
+ veor T1_L, T1_L, T1_H
+ vmull.p64 XL2, SHASH_L, T1_H // a0 * b0
+ vmull.p64 XM2, SHASH2_p64, T1_L // (a1 + a0)(b1 + b0)
+
+ veor XH, XH, XH2
+ veor XL, XL, XL2
+ veor XM, XM, XM2
+
+ beq 4f
+
+ vld1.8 {XL2-XM2}, [r2]!
+
+ veor T1, XL, XH
+ veor XM, XM, T1
+
+ __pmull_reduce_p64
+
+ veor T1, T1, XH
+ veor XL, XL, T1
+
+ b 1b
+ .endif
+
+2: vld1.64 {T1}, [r2]!
subs r0, r0, #1
-1: /* multiply XL by SHASH in GF(2^128) */
+3: /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
vrev64.8 T1, T1
#endif
@@ -193,7 +282,7 @@
__pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
__pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0)
- veor T1, XL, XH
+4: veor T1, XL, XH
veor XM, XM, T1
__pmull_reduce_\pn
@@ -212,8 +301,14 @@
* struct ghash_key const *k, const char *head)
*/
ENTRY(pmull_ghash_update_p64)
- vld1.64 {SHASH}, [r3]
+ vld1.64 {SHASH}, [r3]!
+ vld1.64 {HH}, [r3]!
+ vld1.64 {HH3-HH4}, [r3]
+
veor SHASH2_p64, SHASH_L, SHASH_H
+ veor SHASH2_H, HH_L, HH_H
+ veor HH34_L, HH3_L, HH3_H
+ veor HH34_H, HH4_L, HH4_H
vmov.i8 MASK, #0xe1
vshl.u64 MASK, MASK, #57
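Editorial note (not part of the patch): the wider key schedule loaded above (H, H^2, H^3, H^4) lets the p64 path fold four blocks per iteration with a single reduction at the end of each pass. The identity it relies on can be sketched with the generic GF(2^128) helpers that the glue code below also uses for its fallback; the helper name ghash_4blk_sketch is hypothetical.

	#include <crypto/b128ops.h>
	#include <crypto/gf128mul.h>

	/*
	 * Illustrative only: folding four GHASH blocks at once.
	 *   Y' = (Y ^ B0)*H^4 ^ B1*H^3 ^ B2*H^2 ^ B3*H
	 * gives the same result as applying Y = (Y ^ Bi) * H block by block.
	 */
	static void ghash_4blk_sketch(be128 *Y, const be128 B[4], const be128 *H,
				      const be128 *H2, const be128 *H3,
				      const be128 *H4)
	{
		be128 t0 = *Y, t1 = B[1], t2 = B[2], t3 = B[3];

		be128_xor(&t0, &t0, &B[0]);
		gf128mul_lle(&t0, H4);
		gf128mul_lle(&t1, H3);
		gf128mul_lle(&t2, H2);
		gf128mul_lle(&t3, H);

		be128_xor(Y, &t0, &t1);
		be128_xor(Y, Y, &t2);
		be128_xor(Y, Y, &t3);
	}
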
diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c
index 8930fc4..c691077 100644
--- a/arch/arm/crypto/ghash-ce-glue.c
+++ b/arch/arm/crypto/ghash-ce-glue.c
@@ -1,25 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Accelerated GHASH implementation with ARMv8 vmull.p64 instructions.
*
- * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
+ * Copyright (C) 2015 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
+#include <crypto/b128ops.h>
#include <crypto/cryptd.h>
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <crypto/gf128mul.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
-MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions");
+MODULE_DESCRIPTION("GHASH hash function using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("ghash");
@@ -28,8 +27,12 @@
#define GHASH_DIGEST_SIZE 16
struct ghash_key {
- u64 a;
- u64 b;
+ u64 h[2];
+ u64 h2[2];
+ u64 h3[2];
+ u64 h4[2];
+
+ be128 k;
};
struct ghash_desc_ctx {
@@ -62,6 +65,36 @@
return 0;
}
+static void ghash_do_update(int blocks, u64 dg[], const char *src,
+ struct ghash_key *key, const char *head)
+{
+ if (likely(crypto_simd_usable())) {
+ kernel_neon_begin();
+ pmull_ghash_update(blocks, dg, src, key, head);
+ kernel_neon_end();
+ } else {
+ be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
+
+ do {
+ const u8 *in = src;
+
+ if (head) {
+ in = head;
+ blocks++;
+ head = NULL;
+ } else {
+ src += GHASH_BLOCK_SIZE;
+ }
+
+ crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
+ gf128mul_lle(&dst, &key->k);
+ } while (--blocks);
+
+ dg[0] = be64_to_cpu(dst.b);
+ dg[1] = be64_to_cpu(dst.a);
+ }
+}
+
static int ghash_update(struct shash_desc *desc, const u8 *src,
unsigned int len)
{
@@ -85,10 +118,8 @@
blocks = len / GHASH_BLOCK_SIZE;
len %= GHASH_BLOCK_SIZE;
- kernel_neon_begin();
- pmull_ghash_update(blocks, ctx->digest, src, key,
- partial ? ctx->buf : NULL);
- kernel_neon_end();
+ ghash_do_update(blocks, ctx->digest, src, key,
+ partial ? ctx->buf : NULL);
src += blocks * GHASH_BLOCK_SIZE;
partial = 0;
}
@@ -106,9 +137,7 @@
struct ghash_key *key = crypto_shash_ctx(desc->tfm);
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
- kernel_neon_begin();
- pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
- kernel_neon_end();
+ ghash_do_update(1, ctx->digest, ctx->buf, key, NULL);
}
put_unaligned_be64(ctx->digest[1], dst);
put_unaligned_be64(ctx->digest[0], dst + 8);
@@ -117,26 +146,41 @@
return 0;
}
+static void ghash_reflect(u64 h[], const be128 *k)
+{
+ u64 carry = be64_to_cpu(k->a) >> 63;
+
+ h[0] = (be64_to_cpu(k->b) << 1) | carry;
+ h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63);
+
+ if (carry)
+ h[1] ^= 0xc200000000000000UL;
+}
+
static int ghash_setkey(struct crypto_shash *tfm,
const u8 *inkey, unsigned int keylen)
{
struct ghash_key *key = crypto_shash_ctx(tfm);
- u64 a, b;
+ be128 h;
if (keylen != GHASH_BLOCK_SIZE) {
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
- /* perform multiplication by 'x' in GF(2^128) */
- b = get_unaligned_be64(inkey);
- a = get_unaligned_be64(inkey + 8);
+ /* needed for the fallback */
+ memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
+ ghash_reflect(key->h, &key->k);
- key->a = (a << 1) | (b >> 63);
- key->b = (b << 1) | (a >> 63);
+ h = key->k;
+ gf128mul_lle(&h, &key->k);
+ ghash_reflect(key->h2, &h);
- if (b >> 63)
- key->b ^= 0xc200000000000000UL;
+ gf128mul_lle(&h, &key->k);
+ ghash_reflect(key->h3, &h);
+
+ gf128mul_lle(&h, &key->k);
+ ghash_reflect(key->h4, &h);
return 0;
}
@@ -148,15 +192,13 @@
.final = ghash_final,
.setkey = ghash_setkey,
.descsize = sizeof(struct ghash_desc_ctx),
- .base = {
- .cra_name = "__ghash",
- .cra_driver_name = "__driver-ghash-ce",
- .cra_priority = 0,
- .cra_flags = CRYPTO_ALG_INTERNAL,
- .cra_blocksize = GHASH_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct ghash_key),
- .cra_module = THIS_MODULE,
- },
+
+ .base.cra_name = "ghash",
+ .base.cra_driver_name = "ghash-ce-sync",
+ .base.cra_priority = 300 - 1,
+ .base.cra_blocksize = GHASH_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct ghash_key),
+ .base.cra_module = THIS_MODULE,
};
static int ghash_async_init(struct ahash_request *req)
@@ -169,7 +211,6 @@
struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
desc->tfm = child;
- desc->flags = req->base.flags;
return crypto_shash_init(desc);
}
@@ -180,7 +221,7 @@
struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
- if (!may_use_simd() ||
+ if (!crypto_simd_usable() ||
(in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
memcpy(cryptd_req, req, sizeof(*req));
ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
@@ -198,7 +239,7 @@
struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
- if (!may_use_simd() ||
+ if (!crypto_simd_usable() ||
(in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
memcpy(cryptd_req, req, sizeof(*req));
ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
@@ -216,7 +257,7 @@
struct ahash_request *cryptd_req = ahash_request_ctx(req);
struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
- if (!may_use_simd() ||
+ if (!crypto_simd_usable() ||
(in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
memcpy(cryptd_req, req, sizeof(*req));
ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
@@ -226,7 +267,6 @@
struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
desc->tfm = child;
- desc->flags = req->base.flags;
return shash_ahash_digest(req, desc);
}
}
@@ -239,7 +279,6 @@
struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
desc->tfm = cryptd_ahash_child(ctx->cryptd_tfm);
- desc->flags = req->base.flags;
return crypto_shash_import(desc, in);
}
@@ -274,9 +313,7 @@
struct cryptd_ahash *cryptd_tfm;
struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
- cryptd_tfm = cryptd_alloc_ahash("__driver-ghash-ce",
- CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
+ cryptd_tfm = cryptd_alloc_ahash("ghash-ce-sync", 0, 0);
if (IS_ERR(cryptd_tfm))
return PTR_ERR(cryptd_tfm);
ctx->cryptd_tfm = cryptd_tfm;
diff --git a/arch/arm/crypto/nh-neon-core.S b/arch/arm/crypto/nh-neon-core.S
new file mode 100644
index 0000000..434d80a
--- /dev/null
+++ b/arch/arm/crypto/nh-neon-core.S
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, NEON accelerated version
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+ .text
+ .fpu neon
+
+ KEY .req r0
+ MESSAGE .req r1
+ MESSAGE_LEN .req r2
+ HASH .req r3
+
+ PASS0_SUMS .req q0
+ PASS0_SUM_A .req d0
+ PASS0_SUM_B .req d1
+ PASS1_SUMS .req q1
+ PASS1_SUM_A .req d2
+ PASS1_SUM_B .req d3
+ PASS2_SUMS .req q2
+ PASS2_SUM_A .req d4
+ PASS2_SUM_B .req d5
+ PASS3_SUMS .req q3
+ PASS3_SUM_A .req d6
+ PASS3_SUM_B .req d7
+ K0 .req q4
+ K1 .req q5
+ K2 .req q6
+ K3 .req q7
+ T0 .req q8
+ T0_L .req d16
+ T0_H .req d17
+ T1 .req q9
+ T1_L .req d18
+ T1_H .req d19
+ T2 .req q10
+ T2_L .req d20
+ T2_H .req d21
+ T3 .req q11
+ T3_L .req d22
+ T3_H .req d23
+
+.macro _nh_stride k0, k1, k2, k3
+
+ // Load next message stride
+ vld1.8 {T3}, [MESSAGE]!
+
+ // Load next key stride
+ vld1.32 {\k3}, [KEY]!
+
+ // Add message words to key words
+ vadd.u32 T0, T3, \k0
+ vadd.u32 T1, T3, \k1
+ vadd.u32 T2, T3, \k2
+ vadd.u32 T3, T3, \k3
+
+ // Multiply 32x32 => 64 and accumulate
+ vmlal.u32 PASS0_SUMS, T0_L, T0_H
+ vmlal.u32 PASS1_SUMS, T1_L, T1_H
+ vmlal.u32 PASS2_SUMS, T2_L, T2_H
+ vmlal.u32 PASS3_SUMS, T3_L, T3_H
+.endm
+
+/*
+ * void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ * u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_neon)
+
+ vld1.32 {K0,K1}, [KEY]!
+ vmov.u64 PASS0_SUMS, #0
+ vmov.u64 PASS1_SUMS, #0
+ vld1.32 {K2}, [KEY]!
+ vmov.u64 PASS2_SUMS, #0
+ vmov.u64 PASS3_SUMS, #0
+
+ subs MESSAGE_LEN, MESSAGE_LEN, #64
+ blt .Lloop4_done
+.Lloop4:
+ _nh_stride K0, K1, K2, K3
+ _nh_stride K1, K2, K3, K0
+ _nh_stride K2, K3, K0, K1
+ _nh_stride K3, K0, K1, K2
+ subs MESSAGE_LEN, MESSAGE_LEN, #64
+ bge .Lloop4
+
+.Lloop4_done:
+ ands MESSAGE_LEN, MESSAGE_LEN, #63
+ beq .Ldone
+ _nh_stride K0, K1, K2, K3
+
+ subs MESSAGE_LEN, MESSAGE_LEN, #16
+ beq .Ldone
+ _nh_stride K1, K2, K3, K0
+
+ subs MESSAGE_LEN, MESSAGE_LEN, #16
+ beq .Ldone
+ _nh_stride K2, K3, K0, K1
+
+.Ldone:
+ // Sum the accumulators for each pass, then store the sums to 'hash'
+ vadd.u64 T0_L, PASS0_SUM_A, PASS0_SUM_B
+ vadd.u64 T0_H, PASS1_SUM_A, PASS1_SUM_B
+ vadd.u64 T1_L, PASS2_SUM_A, PASS2_SUM_B
+ vadd.u64 T1_H, PASS3_SUM_A, PASS3_SUM_B
+ vst1.8 {T0-T1}, [HASH]
+ bx lr
+ENDPROC(nh_neon)
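Editorial note (not part of the patch): per 16-byte message stride, the routine above adds the stride to four shifted key strides and accumulates the widened lane products, mirroring the generic NH implementation in crypto/nhpoly1305.c. A portable sketch of that computation is shown below; the names nh_sketch and load_le32 are illustrative, and the final memcpy assumes a little-endian host, matching the __le64 output the glue code expects.

	#include <stdint.h>
	#include <string.h>

	static uint32_t load_le32(const uint8_t *p)
	{
		return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
		       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
	}

	/* Sketch of NH: 4 passes, 16-byte strides, message_len % 16 == 0. */
	static void nh_sketch(const uint32_t *key, const uint8_t *message,
			      size_t message_len, uint64_t hash[4])
	{
		uint64_t sums[4] = { 0, 0, 0, 0 };

		while (message_len) {
			uint32_t m0 = load_le32(message + 0);
			uint32_t m1 = load_le32(message + 4);
			uint32_t m2 = load_le32(message + 8);
			uint32_t m3 = load_le32(message + 12);
			int pass;

			for (pass = 0; pass < 4; pass++) {
				const uint32_t *k = key + 4 * pass;

				sums[pass] += (uint64_t)(uint32_t)(m0 + k[0]) *
					      (uint32_t)(m2 + k[2]);
				sums[pass] += (uint64_t)(uint32_t)(m1 + k[1]) *
					      (uint32_t)(m3 + k[3]);
			}
			key += 4;	/* the key advances one 16-byte stride */
			message += 16;
			message_len -= 16;
		}
		memcpy(hash, sums, sizeof(sums));	/* little-endian host assumed */
	}
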
diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c
new file mode 100644
index 0000000..ae5aefc
--- /dev/null
+++ b/arch/arm/crypto/nhpoly1305-neon-glue.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (NEON accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+
+asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_neon(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES])
+{
+ nh_neon(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_neon_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ if (srclen < 64 || !crypto_simd_usable())
+ return crypto_nhpoly1305_update(desc, src, srclen);
+
+ do {
+ unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+ kernel_neon_begin();
+ crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
+ kernel_neon_end();
+ src += n;
+ srclen -= n;
+ } while (srclen);
+ return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-neon",
+ .base.cra_priority = 200,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = nhpoly1305_neon_update,
+ .final = crypto_nhpoly1305_final,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ if (!(elf_hwcap & HWCAP_NEON))
+ return -ENODEV;
+
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-neon");
diff --git a/arch/arm/crypto/sha1-armv7-neon.S b/arch/arm/crypto/sha1-armv7-neon.S
index 2468fad..28d816a 100644
--- a/arch/arm/crypto/sha1-armv7-neon.S
+++ b/arch/arm/crypto/sha1-armv7-neon.S
@@ -1,11 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
*
* Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
*/
#include <linux/linkage.h>
diff --git a/arch/arm/crypto/sha1-ce-core.S b/arch/arm/crypto/sha1-ce-core.S
index b623f51..49a74a4 100644
--- a/arch/arm/crypto/sha1-ce-core.S
+++ b/arch/arm/crypto/sha1-ce-core.S
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
*
* Copyright (C) 2015 Linaro Ltd.
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c
index b732522..e79b1fb 100644
--- a/arch/arm/crypto/sha1-ce-glue.c
+++ b/arch/arm/crypto/sha1-ce-glue.c
@@ -1,14 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
*
* Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha1_base.h>
#include <linux/cpufeature.h>
@@ -33,7 +31,7 @@
{
struct sha1_state *sctx = shash_desc_ctx(desc);
- if (!may_use_simd() ||
+ if (!crypto_simd_usable() ||
(sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
return sha1_update_arm(desc, data, len);
@@ -47,7 +45,7 @@
static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- if (!may_use_simd())
+ if (!crypto_simd_usable())
return sha1_finup_arm(desc, data, len, out);
kernel_neon_begin();
diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c
index 98ab823..c80b0eb 100644
--- a/arch/arm/crypto/sha1_glue.c
+++ b/arch/arm/crypto/sha1_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Cryptographic API.
* Glue code for the SHA1 Secure Hash Algorithm assembler implementation
@@ -8,12 +9,6 @@
* Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
* Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
* Copyright (c) Mathias Krause <minipli@googlemail.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
*/
#include <crypto/internal/hash.h>
diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c
index d15e0ea..2c36273 100644
--- a/arch/arm/crypto/sha1_neon_glue.c
+++ b/arch/arm/crypto/sha1_neon_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
* ARM NEON instructions.
@@ -10,15 +11,10 @@
* Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
* Copyright (c) Mathias Krause <minipli@googlemail.com>
* Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
*/
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -39,7 +35,7 @@
{
struct sha1_state *sctx = shash_desc_ctx(desc);
- if (!may_use_simd() ||
+ if (!crypto_simd_usable() ||
(sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
return sha1_update_arm(desc, data, len);
@@ -54,7 +50,7 @@
static int sha1_neon_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- if (!may_use_simd())
+ if (!crypto_simd_usable())
return sha1_finup_arm(desc, data, len, out);
kernel_neon_begin();
diff --git a/arch/arm/crypto/sha2-ce-core.S b/arch/arm/crypto/sha2-ce-core.S
index 87ec11a..4ad5175 100644
--- a/arch/arm/crypto/sha2-ce-core.S
+++ b/arch/arm/crypto/sha2-ce-core.S
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* sha2-ce-core.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
*
* Copyright (C) 2015 Linaro Ltd.
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
index 1211a5c..87f0b62 100644
--- a/arch/arm/crypto/sha2-ce-glue.c
+++ b/arch/arm/crypto/sha2-ce-glue.c
@@ -1,14 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
*
* Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha256_base.h>
#include <linux/cpufeature.h>
@@ -34,7 +32,7 @@
{
struct sha256_state *sctx = shash_desc_ctx(desc);
- if (!may_use_simd() ||
+ if (!crypto_simd_usable() ||
(sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
return crypto_sha256_arm_update(desc, data, len);
@@ -49,7 +47,7 @@
static int sha2_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- if (!may_use_simd())
+ if (!crypto_simd_usable())
return crypto_sha256_arm_finup(desc, data, len, out);
kernel_neon_begin();
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
index b9ec440..a03cf4d 100644
--- a/arch/arm/crypto/sha256-armv4.pl
+++ b/arch/arm/crypto/sha256-armv4.pl
@@ -212,10 +212,11 @@
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
+.Lsha256_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
#else
- adr r3,sha256_block_data_order
+ adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped
index 3b58300..054aae0 100644
--- a/arch/arm/crypto/sha256-core.S_shipped
+++ b/arch/arm/crypto/sha256-core.S_shipped
@@ -93,10 +93,11 @@
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
+.Lsha256_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
#else
- adr r3,sha256_block_data_order
+ adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
index 0ae900e..215497f 100644
--- a/arch/arm/crypto/sha256_glue.c
+++ b/arch/arm/crypto/sha256_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue code for the SHA256 Secure Hash Algorithm assembly implementation
* using optimized ARM assembler and NEON instructions.
@@ -7,12 +8,6 @@
* This file is based on sha256_ssse3_glue.c:
* Copyright (C) 2013 Intel Corporation
* Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
*/
#include <crypto/internal/hash.h>
@@ -44,7 +39,7 @@
}
EXPORT_SYMBOL(crypto_sha256_arm_update);
-static int sha256_final(struct shash_desc *desc, u8 *out)
+static int crypto_sha256_arm_final(struct shash_desc *desc, u8 *out)
{
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha256_block_data_order);
@@ -56,7 +51,7 @@
{
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha256_block_data_order);
- return sha256_final(desc, out);
+ return crypto_sha256_arm_final(desc, out);
}
EXPORT_SYMBOL(crypto_sha256_arm_finup);
@@ -64,7 +59,7 @@
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = crypto_sha256_arm_update,
- .final = sha256_final,
+ .final = crypto_sha256_arm_final,
.finup = crypto_sha256_arm_finup,
.descsize = sizeof(struct sha256_state),
.base = {
@@ -78,7 +73,7 @@
.digestsize = SHA224_DIGEST_SIZE,
.init = sha224_base_init,
.update = crypto_sha256_arm_update,
- .final = sha256_final,
+ .final = crypto_sha256_arm_final,
.finup = crypto_sha256_arm_finup,
.descsize = sizeof(struct sha256_state),
.base = {
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
index 1d82c6c..38645e4 100644
--- a/arch/arm/crypto/sha256_neon_glue.c
+++ b/arch/arm/crypto/sha256_neon_glue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue code for the SHA256 Secure Hash Algorithm assembly implementation
* using NEON instructions.
@@ -6,15 +7,10 @@
*
* This file is based on sha512_neon_glue.c:
* Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
*/
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <linux/cryptohash.h>
#include <linux/types.h>
#include <linux/string.h>
@@ -29,12 +25,12 @@
asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data,
unsigned int num_blks);
-static int sha256_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+static int crypto_sha256_neon_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
struct sha256_state *sctx = shash_desc_ctx(desc);
- if (!may_use_simd() ||
+ if (!crypto_simd_usable() ||
(sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
return crypto_sha256_arm_update(desc, data, len);
@@ -46,10 +42,10 @@
return 0;
}
-static int sha256_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
+static int crypto_sha256_neon_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
{
- if (!may_use_simd())
+ if (!crypto_simd_usable())
return crypto_sha256_arm_finup(desc, data, len, out);
kernel_neon_begin();
@@ -63,17 +59,17 @@
return sha256_base_finish(desc, out);
}
-static int sha256_final(struct shash_desc *desc, u8 *out)
+static int crypto_sha256_neon_final(struct shash_desc *desc, u8 *out)
{
- return sha256_finup(desc, NULL, 0, out);
+ return crypto_sha256_neon_finup(desc, NULL, 0, out);
}
struct shash_alg sha256_neon_algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
- .update = sha256_update,
- .final = sha256_final,
- .finup = sha256_finup,
+ .update = crypto_sha256_neon_update,
+ .final = crypto_sha256_neon_final,
+ .finup = crypto_sha256_neon_finup,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha256",
@@ -85,9 +81,9 @@
}, {
.digestsize = SHA224_DIGEST_SIZE,
.init = sha224_base_init,
- .update = sha256_update,
- .final = sha256_final,
- .finup = sha256_finup,
+ .update = crypto_sha256_neon_update,
+ .final = crypto_sha256_neon_final,
+ .finup = crypto_sha256_neon_finup,
.descsize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha224",
diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl
index fb5d150..788c17b 100644
--- a/arch/arm/crypto/sha512-armv4.pl
+++ b/arch/arm/crypto/sha512-armv4.pl
@@ -274,10 +274,11 @@
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
+.Lsha512_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha512_block_data_order
#else
- adr r3,sha512_block_data_order
+ adr r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
diff --git a/arch/arm/crypto/sha512-core.S_shipped b/arch/arm/crypto/sha512-core.S_shipped
index b1c334a..710ea30 100644
--- a/arch/arm/crypto/sha512-core.S_shipped
+++ b/arch/arm/crypto/sha512-core.S_shipped
@@ -141,10 +141,11 @@
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
+.Lsha512_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha512_block_data_order
#else
- adr r3,sha512_block_data_order
+ adr r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
diff --git a/arch/arm/crypto/sha512-glue.c b/arch/arm/crypto/sha512-glue.c
index 86540cd..8775aa4 100644
--- a/arch/arm/crypto/sha512-glue.c
+++ b/arch/arm/crypto/sha512-glue.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* sha512-glue.c - accelerated SHA-384/512 for ARM
*
* Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <crypto/internal/hash.h>
@@ -37,7 +34,7 @@
(sha512_block_fn *)sha512_block_data_order);
}
-int sha512_arm_final(struct shash_desc *desc, u8 *out)
+static int sha512_arm_final(struct shash_desc *desc, u8 *out)
{
sha512_base_do_finalize(desc,
(sha512_block_fn *)sha512_block_data_order);
diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c
index 8a5642b..96cb944 100644
--- a/arch/arm/crypto/sha512-neon-glue.c
+++ b/arch/arm/crypto/sha512-neon-glue.c
@@ -1,14 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* sha512-neon-glue.c - accelerated SHA-384/512 for ARM NEON
*
* Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha512_base.h>
#include <linux/crypto.h>
@@ -30,7 +28,7 @@
{
struct sha512_state *sctx = shash_desc_ctx(desc);
- if (!may_use_simd() ||
+ if (!crypto_simd_usable() ||
(sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
return sha512_arm_update(desc, data, len);
@@ -45,7 +43,7 @@
static int sha512_neon_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- if (!may_use_simd())
+ if (!crypto_simd_usable())
return sha512_arm_finup(desc, data, len, out);
kernel_neon_begin();