/*
 * Armv8-A Cryptographic Extension support functions for Aarch64
 *
 * Copyright The Mbed TLS Contributors
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8 in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Re-consider above after https://reviews.llvm.org/D131064 merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` could be included by any header file, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES 1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif
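/* Note (added for clarity, not in the original source): when this file is
 * instead built with an AES-capable architecture flag, e.g.
 * -march=armv8-a+crypto, clang predefines __ARM_FEATURE_CRYPTO itself and the
 * block above is skipped entirely. */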

#endif /* defined(__clang__) && (__clang_major__ >= 4) */

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_ARCH_IS_ARMV8) && defined(__ARM_NEON)

/* Compiler version checks. */
#if defined(__clang__)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
#   endif
#elif defined(__GNUC__)
#   if __GNUC__ < 6
#       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
#   endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone verified that,
 * please update this and the documentation of `MBEDTLS_AESCE_C` in
 * `mbedtls_config.h`. */
#   if _MSC_VER < 1929
#       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
#   endif
#elif defined(__ARMCC_VERSION)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone verified that, please update this and the documentation of
 * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
#   endif
#endif

#ifdef __ARM_NEON
#include <arm_neon.h>

#if defined(MBEDTLS_ARCH_IS_ARM32)
#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types which are
 * listed in
 * [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1) */

#ifndef vreinterpretq_p64_u8
#define vreinterpretq_p64_u8 (poly64x2_t)
#endif
#ifndef vreinterpretq_u8_p128
#define vreinterpretq_u8_p128 (uint8x16_t)
#endif
#ifndef vreinterpretq_u64_p64
#define vreinterpretq_u64_p64 (uint64x2_t)
#endif

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]": [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    return (poly64x1_t) vget_low_u64(vreinterpretq_u64_p64(a));
}

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm %[p], { r2-r5 } \n\t"
         "rbit r2, r2 \n\t"
         "rev r2, r2 \n\t"
         "rbit r3, r3 \n\t"
         "rev r3, r3 \n\t"
         "rbit r4, r4 \n\t"
         "rev r4, r4 \n\t"
         "rbit r5, r5 \n\t"
         "rev r5, r5 \n\t"
         "stm %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}

#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */

#else
#error "Target does not support NEON instructions"
#endif

#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__ARMCOMPILER_VERSION)
#       if __ARMCOMPILER_VERSION <= 6090000
#           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
#       else
#           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#           define MBEDTLS_POP_TARGET_PRAGMA
#       endif
#   elif defined(__clang__)
#       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       pragma GCC push_options
#       pragma GCC target ("+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(_MSC_VER)
#       error "Required feature(__ARM_FEATURE_AES) is not enabled."
#   endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */
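/* Note (added for clarity, not in the original source): the pragmas above
 * enable the AES extension for just the functions in this translation unit,
 * roughly equivalent to appending "+crypto" to -march for GCC, or to marking
 * each function with __attribute__((target("aes"))) for clang/armclang. */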

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <asm/hwcap.h>
#include <sys/auxv.h>

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid many calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char, so it cannot
     * be vulnerable to non-atomic updates.
     * It is possible that we could end up setting result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval  = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */

/* Single round of AESCE encryption */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
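
/* Note (added for clarity, not in the original source): in the two block
 * functions below, rounds == 10/12/14 corresponds to AES-128/192/256; the
 * gotos jump past the extra rounds that shorter keys do not need, so the
 * unrolled sequence is shared by all three key sizes. */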

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in an atomic instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately which has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
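
/* Note (added for clarity, not in the original source): because InvMixColumns
 * is linear over GF(2^8), InvMixColumns(state ^ roundkey) equals
 * InvMixColumns(state) ^ InvMixColumns(roundkey); applying InvMixColumns to
 * the round keys in advance (see mbedtls_aesce_inverse_key() below) is what
 * makes this reordered, "equivalent inverse cipher" form (FIPS-197, section
 * 5.3.5) produce the standard result. */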

static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

    if (mode == MBEDTLS_AES_ENCRYPT) {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    } else {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}

/*
 * Compute decryption round keys from encryption round keys
 */
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));

}

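/* Helpers for the key schedule below (added comments, not in the original
 * source): these play the role of RotWord() and SubWord() from FIPS-197
 * section 5.2. As a uint32_t operation, aes_rot_word() is a rotate right by
 * 8 bits, e.g. aes_rot_word(0x0c0d0e0f) == 0x0f0c0d0e, and aes_sub_word()
 * applies the AES S-box to each byte via vaeseq_u8 with an all-zero first
 * operand. */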
static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
     * the correct result as ShiftRows doesn't change the first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
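    /* Example (added for clarity, not in the original source): for a 128-bit
     * key, Nk = 4 and Nr = 10, so the expanded key is Nb*(Nr+1) = 44 words
     * (176 bytes); for a 256-bit key, Nk = 8 and Nr = 14, giving 60 words
     * (240 bytes). */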
    const uint32_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}

#if defined(MBEDTLS_GCM_C)

#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)
static inline poly64_t vget_low_p64(poly64x2_t __a)
{
    uint64x2_t tmp = (uint64x2_t) (__a);
    uint64x1_t lo = vcreate_u64(vgetq_lane_u64(tmp, 0));
    return (poly64_t) (lo);
}
#endif /* !__clang__ && __GNUC__ && __GNUC__ == 5 */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(__GNUC__) && !defined(__clang__)
/* GCC reports an incompatible type error without the cast: it treats poly64_t
 * and poly64x1_t as different types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` with the cast, and Clang accepts it
 * with or without the cast; there poly64_t and poly64x1_t appear to be the
 * same type, so no cast is needed. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif
static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{

    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            vget_low_p64(vreinterpretq_p64_u8(a)),
            vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128b polynomial multiplication on blocks in GF(2^128) defined by
 * `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64b->128b polynomial multipliers, so we need four 64b
 * multiplies to generate a 128b product.
 *
 * `poly_mult_128` performs the polynomial multiplication; its 256b output is
 * represented by three 128b values as a code-size optimization.
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high   128b |
 * | ret.val[1] |   :m2:m1:00 | middle 128b |
 * | ret.val[2] |   :  :l1:l0 | low    128b |
 */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);   /*   :  :l1:l0 = a0*b0 */
    c = vextq_u8(b, b, 8); /*      :c1:c0 = b0:b1 */
    d = pmull_high(a, c);  /*   :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);   /*   :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);    /*   :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as f(z) =
 * z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known approach is to
 * consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write the 256-bit
 * operand to be reduced as a(z) = h(z)z^128 + l(z) ≡ h(z)r(z) + l(z). That is, we
 * simply multiply the higher part of the operand by r(z) and add it to l(z). If
 * the result is still larger than 128 bits, we reduce again.
 */
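/* Note (added for clarity, not in the original source): the constant 0x87 used
 * below encodes r(z) = z^7 + z^2 + z + 1 (bits 7, 2, 1 and 0 set). The
 * vrbitq_u8() calls in mbedtls_aesce_gcm_mult() reverse the bit order of the
 * GHASH operands first, so the reduction here can work in this natural
 * polynomial bit order. */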
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* Use 'asm' as an optimisation barrier to prevent loading MODULO from
     * memory. This is for GNUC-compatible compilers.
     */
    asm ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00 */
    m = input.val[1];          /*   :m2:m1:00 */
    l = input.val[2];          /*   :  :l1:l0 */
    c = pmull_high(h, MODULO); /*   :c2:c1:00 = reduction of h3 */
    d = pmull_low(h, MODULO);  /*   :  :d1:d0 = reduction of h2 */
    e = veorq_u8(c, m);        /*   :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*   :  :f1:f0 = reduction of e2 */
    g = vextq_u8(ZERO, e, 8);  /*   :  :g1:00 = e1:00 */
    n = veorq_u8(d, l);        /*   :  :n1:n0 = d1:d0 + l1:l0 */
    o = veorq_u8(n, f);        /*       o1:o0 = f1:f0 + n1:n0 */
    return veorq_u8(o, g);     /*             = o1:o0 + g1:00 */
}

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_ARCH_IS_ARMV8 */

#endif /* MBEDTLS_AESCE_C */