/*
 * Armv8-A Cryptographic Extension support functions for Aarch64
 *
 * Copyright The Mbed TLS Contributors
 * SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8_A in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8_A
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8_A) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Re-consider above after https://reviews.llvm.org/D131064 merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` is included by common.h, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES 1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#endif /* defined(__clang__) && (__clang_major__ >= 4) */
40
Jerry Yu49231312023-01-10 16:57:21 +080041#include <string.h>
42#include "common.h"
43
44#if defined(MBEDTLS_AESCE_C)
45
46#include "aesce.h"
47
Dave Rodgman410ad442023-11-28 13:42:17 +000048#if defined(MBEDTLS_AESCE_HAVE_CODE)
Jerry Yu49231312023-01-10 16:57:21 +080049
/* Compiler version checks. */
#if defined(__clang__)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
#   endif
#elif defined(__GNUC__)
#   if __GNUC__ < 6
#       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
#   endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone has verified that,
 * please update this and the documentation of `MBEDTLS_AESCE_C` in
 * `mbedtls_config.h`. */
#   if _MSC_VER < 1929
#       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
#   endif
#elif defined(__ARMCC_VERSION)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone has verified that, please update this and the documentation of
 * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
#   endif
#endif

#if !defined(MBEDTLS_HAVE_NEON_INTRINSICS)
#error "Target does not support NEON instructions"
#endif

#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__ARMCOMPILER_VERSION)
#       if __ARMCOMPILER_VERSION <= 6090000
#           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
#       else
#           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#           define MBEDTLS_POP_TARGET_PRAGMA
#       endif
#   elif defined(__clang__)
#       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       pragma GCC push_options
#       pragma GCC target ("+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(_MSC_VER)
#       error "Required feature(__ARM_FEATURE_AES) is not enabled."
#   endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <sys/auxv.h>
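/* Older libc headers may not define all of these HWCAP bits; the fallback
 * values below are the (stable) Linux hwcap bit positions for these features. */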
#if !defined(HWCAP_NEON)
#define HWCAP_NEON (1 << 12)
#endif
#if !defined(HWCAP2_AES)
#define HWCAP2_AES (1 << 0)
#endif
#if !defined(HWCAP_AES)
#define HWCAP_AES (1 << 3)
#endif
#if !defined(HWCAP_ASIMD)
#define HWCAP_ASIMD (1 << 1)
#endif

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid many calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char, so it cannot
     * be subject to non-atomic updates.
     * It is possible that we end up setting the result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval  = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval  & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */
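
/* Note: when MBEDTLS_AES_USE_HARDWARE_ONLY is defined, the run-time detection
 * above is compiled out and the AESCE code path is selected unconditionally
 * at build time. */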

/* Single round of AESCE encryption */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2        AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
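
/* Keeping each AESE (vaeseq_u8) immediately followed by its AESMC (vaesmcq_u8)
 * matters for performance: many Armv8-A cores can fuse an adjacent AESE/AESMC
 * pair operating on the same register into a single macro-operation. */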

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in an atomic instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately which has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2        AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
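
/* As with encryption, AESD (vaesdq_u8) followed directly by AESIMC
 * (vaesimcq_u8) allows the pair to be fused on cores that support it. */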

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
#endif

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
    if (mode == MBEDTLS_AES_DECRYPT) {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    } else
#else
    (void) mode;
#endif
    {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}

/*
 * Compute decryption round keys from encryption round keys
 */
#if !defined(MBEDTLS_BLOCK_CIPHER_NO_DECRYPT)
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));

}
#endif

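/* FIPS 197 RotWord: with the round-key words held little-endian in memory,
 * rotating the 32-bit value right by 8 maps bytes [b0,b1,b2,b3] to
 * [b1,b2,b3,b0]; e.g. 0x0d0c0b0a (bytes 0a 0b 0c 0d) becomes 0x0a0d0c0b
 * (bytes 0b 0c 0d 0a). */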
static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Because the input word is
     * duplicated into all four columns of the state, ShiftRows has no effect
     * here, so lane 0 of the result is exactly SubWord(in). */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
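    /* For reference, the FIPS 197 Section 5.2 recurrence computed below is:
     *     for i in [Nk, Nb*(Nr+1)):
     *         temp = w[i-1]
     *         if i mod Nk == 0:                 temp = SubWord(RotWord(temp)) ^ Rcon[i/Nk]
     *         else if Nk > 6 and i mod Nk == 4: temp = SubWord(temp)
     *         w[i] = w[i-Nk] ^ temp
     * The loop below produces one group of Nk words per iteration. */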
    const uint32_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (size_t) (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}
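
/* Sketch of intended use (illustrative, not a definition of the aes.c code):
 * when AESCE support is compiled in and detected at run time, the generic AES
 * layer is expected to forward key setup to mbedtls_aesce_setkey_enc() /
 * mbedtls_aesce_inverse_key() and single-block ECB operations to
 * mbedtls_aesce_crypt_ecb(), falling back to the portable C implementation
 * otherwise. */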

#if defined(MBEDTLS_GCM_C)

#if defined(MBEDTLS_ARCH_IS_ARM32)

#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types which are listed in
 * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1).
 * These are only required for GCM.
 */
#define vreinterpretq_u64_p64(a) ((uint64x2_t) a)

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

/* This is set to cause some more missing intrinsics to be defined below */
#define COMMON_MISSING_INTRINSICS

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm  %[p], { r2-r5 } \n\t"
         "rbit r2, r2          \n\t"
         "rev  r2, r2          \n\t"
         "rbit r3, r3          \n\t"
         "rev  r3, r3          \n\t"
         "rbit r4, r4          \n\t"
         "rev  r4, r4          \n\t"
         "rbit r5, r5          \n\t"
         "rev  r5, r5          \n\t"
         "stm  %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}

#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */

#if defined(MBEDTLS_COMPILER_IS_GCC) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define COMMON_MISSING_INTRINSICS
#endif /* MBEDTLS_COMPILER_IS_GCC && __GNUC__ == 5 */


#if defined(COMMON_MISSING_INTRINSICS)

/* Missing intrinsics common to both GCC 5, and Clang on 32-bit */

#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    uint64x1_t r = vget_low_u64(vreinterpretq_u64_p64(a));
    return (poly64x1_t) r;

}

#endif /* COMMON_MISSING_INTRINSICS */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(MBEDTLS_COMPILER_IS_GCC)
/* GCC reports an incompatible type error without the cast: GCC treats
 * poly64_t and poly64x1_t as different types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` with the cast, while Clang accepts
 * the code with or without it and treats poly64_t and poly64x1_t as the same
 * type, so no cast is used here. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif /* MBEDTLS_COMPILER_IS_GCC */

static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{

    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(a)),
            (poly64_t) vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128b polynomial multiplication on blocks in GF(2^128) defined by
 * `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64b->128b polynomial multipliers, so we need four 64b
 * multiplies to implement one 128b multiplication.
 *
 * `poly_mult_128` performs the polynomial multiplication and outputs the 256b
 * result, represented as three 128b values as a code-size optimization.
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high 128b   |
 * | ret.val[1] |   :m2:m1:00 | middle 128b |
 * | ret.val[2] |   :  :l1:l0 | low 128b    |
 */
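/* For reference, the schoolbook 64-bit split of the product computed below:
 * with a = a1*z^64 + a0 and b = b1*z^64 + b0 (carry-less polynomials),
 *     a*b = (a1*b1)*z^128 + (a1*b0 + a0*b1)*z^64 + (a0*b0)
 * which is exactly the h (high), m (middle) and l (low) parts built here. */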
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);   /*  :  :l1:l0 = a0*b0 */
    c = vextq_u8(b, b, 8); /*  :c1:c0 = b0:b1 */
    d = pmull_high(a, c);  /*  :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);   /*  :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);    /*  :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as
 * f(z) = z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known approach
 * is to consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write
 * the 256-bit operand to be reduced as a(z) = h(z)z^128 + l(z) ≡ h(z)r(z) + l(z).
 * That is, we simply multiply the higher part of the operand by r(z) and add
 * it to l(z). If the result is still larger than 128 bits, we reduce again.
 */
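/* Note: the MODULO constant built below from 0x87 is r(z) itself:
 * 0x87 = 0b10000111, i.e. bits 7, 2, 1 and 0 set, encoding z^7 + z^2 + z + 1. */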
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* Use 'asm' as an optimisation barrier to prevent the compiler from
     * loading MODULO from memory; this is for GNU C compatible compilers.
     */
    asm volatile ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00 */
    m = input.val[1];          /*   :m2:m1:00 */
    l = input.val[2];          /*   :  :l1:l0 */
    c = pmull_high(h, MODULO); /*   :c2:c1:00 = reduction of h3 */
    d = pmull_low(h, MODULO);  /*   :  :d1:d0 = reduction of h2 */
    e = veorq_u8(c, m);        /*   :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*   :  :f1:f0 = reduction of e2 */
    g = vextq_u8(ZERO, e, 8);  /*   :  :g1:00 = e1:00 */
    n = veorq_u8(d, l);        /*   :  :n1:n0 = d1:d0 + l1:l0 */
    o = veorq_u8(n, f);        /*       o1:o0 = f1:f0 + n1:n0 */
    return veorq_u8(o, g);     /*             = o1:o0 + g1:00 */
}

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}
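
/* The vrbitq_u8 calls above handle GCM's bit-reflected representation: in
 * GHASH the coefficient of x^(8k+j) is bit (7-j) of byte k. Reversing the
 * bits of each byte, combined with the little-endian vld1q_u8 load, yields a
 * value whose bit i is the coefficient of x^i, so the carry-less multiply and
 * the reduction by x^128 + x^7 + x^2 + x + 1 work in natural bit order; the
 * result is reflected back before storing. */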

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_AESCE_HAVE_CODE */

#endif /* MBEDTLS_AESCE_C */