/*
 * Armv8-A Cryptographic Extension support functions for Aarch64
 *
 * Copyright The Mbed TLS Contributors
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8 in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Reconsider the above after https://reviews.llvm.org/D131064 is merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` could be included by any header file, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES 1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#endif /* defined(__clang__) && (__clang_major__ >= 4) */

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_ARCH_IS_ARMV8) && defined(__ARM_NEON)

/* Compiler version checks. */
#if defined(__clang__)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__clang_major__ < 11)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on 32-bit Arm or Thumb is 11.0."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__clang_major__ < 4)
#       error "Minimum version of Clang for MBEDTLS_AESCE_C on aarch64 is 4.0."
#   endif
#elif defined(__GNUC__)
#   if __GNUC__ < 6
#       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
#   endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone verifies that,
 * please update this check and the documentation of `MBEDTLS_AESCE_C` in
 * `mbedtls_config.h`. */
#   if _MSC_VER < 1929
#       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
#   endif
#elif defined(__ARMCC_VERSION)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone verifies that, please update this check and the documentation of
 * `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
#   endif
#endif

#ifdef __ARM_NEON
#include <arm_neon.h>

#if defined(MBEDTLS_ARCH_IS_ARM32)
#if defined(__clang__)
/* On Clang for A32/T32, work around some missing intrinsics and types which are
 * listed in [ACLE](https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#polynomial-1). */

#define vreinterpretq_p64_u8 (poly64x2_t)
#define vreinterpretq_u8_p128 (uint8x16_t)
#define vreinterpretq_u64_p64 (uint64x2_t)

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}
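/* Note: vmull.p64 is a carry-less (polynomial) multiplication over GF(2)[x].
 * As a small worked example, multiplying the polynomials x+1 and x+1
 * (operands 0x3 and 0x3) yields x^2 + 1 (result 0x5), because the two cross
 * terms cancel modulo 2. */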

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    return (poly64x1_t) vget_low_u64(vreinterpretq_u64_p64(a));
}

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide
     * an equivalent non-Neon implementation. Reverse bit order in each
     * byte with 4x rbit, rev. */
    asm ("ldm %[p], { r2-r5 } \n\t"
         "rbit r2, r2 \n\t"
         "rev r2, r2 \n\t"
         "rbit r3, r3 \n\t"
         "rev r3, r3 \n\t"
         "rbit r4, r4 \n\t"
         "rev r4, r4 \n\t"
         "rbit r5, r5 \n\t"
         "rev r5, r5 \n\t"
         "stm %[p], { r2-r5 } \n\t"
         :
         /* Output: 16 bytes of memory pointed to by &x */
         "+m" (*(uint8_t(*)[16]) &x)
         :
         [p] "r" (&x)
         :
         "r2", "r3", "r4", "r5"
         );
    return x;
}
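/* Worked example: bit-reversing the byte 0x01 (binary 00000001) gives 0x80
 * (binary 10000000); doing this for all 16 bytes matches the behaviour of the
 * A64 vrbitq_u8 intrinsic emulated above. */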

#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */

#else
#error "Target does not support NEON instructions"
#endif

#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__ARMCOMPILER_VERSION)
#       if __ARMCOMPILER_VERSION <= 6090000
#           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
#       else
#           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#           define MBEDTLS_POP_TARGET_PRAGMA
#       endif
#   elif defined(__clang__)
#       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       pragma GCC push_options
#       pragma GCC target ("+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(_MSC_VER)
#       error "Required feature(__ARM_FEATURE_AES) is not enabled."
#   endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <asm/hwcap.h>
#include <sys/auxv.h>

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid many calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char, so it cannot
     * be subject to torn, non-atomic updates.
     * It is possible that we could end up setting the result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */

/* Single round of AESCE encryption */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
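/* Note: on Armv8-A, AESE (vaeseq_u8) performs AddRoundKey, SubBytes and
 * ShiftRows in a single instruction, and AESMC (vaesmcq_u8) performs
 * MixColumns, so each expansion of AESCE_ENCRYPT_ROUND is one full AES
 * encryption round. */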

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}
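/* As an illustration, for rounds == 10 (AES-128) the unrolled code above is
 * roughly equivalent to:
 *
 *     for (int i = 0; i < 9; i++) {
 *         block = vaesmcq_u8(vaeseq_u8(block, vld1q_u8(keys + 16 * i)));
 *     }
 *     block = vaeseq_u8(block, vld1q_u8(keys + 144));
 *     block = veorq_u8(block, vld1q_u8(keys + 160));
 */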

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, SubBytes, ShiftRows
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in an atomic instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately; this has been done when the
 * decryption round keys were calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
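/* The linearity argument above amounts to the identity
 *
 *     InvMixColumns(state XOR round_key) ==
 *         InvMixColumns(state) XOR InvMixColumns(round_key),
 *
 * which is why pre-applying InvMixColumns to the stored decryption round keys
 * lets AESD and AESIMC be chained in this order. */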

static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

    if (mode == MBEDTLS_AES_ENCRYPT) {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    } else {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}

/*
 * Compute decryption round keys from encryption round keys
 */
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));

}
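/* For example, with nr == 10 (AES-128) the code above produces:
 *     invkey[  0.. 15] = fwdkey[160..175]              (last round key)
 *     invkey[ 16.. 31] = InvMixColumns(fwdkey[144..159])
 *     ...
 *     invkey[144..159] = InvMixColumns(fwdkey[ 16.. 31])
 *     invkey[160..175] = fwdkey[  0.. 15]              (original key)
 */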

static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
     * the correct result as ShiftRows doesn't change the first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}
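/* Worked examples for the two helpers above:
 *     aes_rot_word(0x01020304) == 0x04010203  (rotate right by one byte)
 *     aes_sub_word(0x00000000) == 0x63636363  (the AES S-box maps 0x00 to 0x63)
 */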

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
    const uint32_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}
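/* Putting the constants above together: a 128-bit key has Nk = 4 and
 * Nr = Nk + 6 = 10, so Nb * (Nr + 1) = 44 words (176 bytes) of round keys are
 * written; a 192-bit key yields 52 words (208 bytes) and a 256-bit key
 * 60 words (240 bytes). */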

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}

#if defined(MBEDTLS_GCM_C)

#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)
static inline poly64_t vget_low_p64(poly64x2_t __a)
{
    uint64x2_t tmp = (uint64x2_t) (__a);
    uint64x1_t lo = vcreate_u64(vgetq_lane_u64(tmp, 0));
    return (poly64_t) (lo);
}
#endif /* !__clang__ && __GNUC__ && __GNUC__ == 5 */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(__GNUC__) && !defined(__clang__)
/* GCC reports an incompatible type error without the cast: it treats poly64_t
 * and poly64x1_t as distinct types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` with the cast, and Clang accepts it
 * with or without the cast, so the cast is omitted here. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif
static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            vget_low_p64(vreinterpretq_p64_u8(a)),
            vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128b polynomial multiplication on blocks in GF(2^128) defined by
 * `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64b->128b polynomial multipliers, so we need four 64b
 * multiplies to implement one 128b multiplication.
 *
 * `poly_mult_128` performs the polynomial multiplication and outputs a 256b
 * result represented by three 128b vectors (a layout chosen to keep the code
 * small).
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high 128b   |
 * | ret.val[1] |  :m2:m1:00  | middle 128b |
 * | ret.val[2] |  :  :l1:l0  | low 128b    |
 */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);   /*  : :l1:l0   = a0*b0 */
    c = vextq_u8(b, b, 8); /*  :c1:c0     = b0:b1 */
    d = pmull_high(a, c);  /*  :d2:d1:00  = a1*b0 */
    e = pmull_low(a, c);   /*  :e2:e1:00  = a0*b1 */
    m = veorq_u8(d, e);    /*  :m2:m1:00  = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}
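/* poly_mult_128() is the schoolbook expansion of a 128x128-bit carry-less
 * multiplication in terms of the 64-bit halves a = a1:a0 and b = b1:b0:
 *
 *     a * b = (a1*b1)*x^128 + (a1*b0 + a0*b1)*x^64 + (a0*b0)
 *           =      h        +         m            +    l
 *
 * with the middle term m assembled from the two cross products d and e. */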

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as f(z) =
 * z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known approach is to
 * consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write the 256-bit
 * operand to be reduced as a(z) = h(z)*z^128 + l(z) ≡ h(z)*r(z) + l(z). That is, we
 * simply multiply the higher part of the operand by r(z) and add it to l(z). If
 * the result is still larger than 128 bits, we reduce again.
 */
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* Use 'asm' as an optimisation barrier to prevent loading MODULO from
     * memory. This is for GNUC-compatible compilers.
     */
    asm ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00 */
    m = input.val[1];          /*  :m2:m1:00 */
    l = input.val[2];          /*  : :l1:l0 */
    c = pmull_high(h, MODULO); /*  :c2:c1:00 = reduction of h3 */
    d = pmull_low(h, MODULO);  /*  : :d1:d0 = reduction of h2 */
    e = veorq_u8(c, m);        /*  :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*  : :f1:f0 = reduction of e2 */
    g = vextq_u8(ZERO, e, 8);  /*  : :g1:00 = e1:00 */
    n = veorq_u8(d, l);        /*  : :n1:n0 = d1:d0 + l1:l0 */
    o = veorq_u8(n, f);        /*     o1:o0 = f1:f0 + n1:n0 */
    return veorq_u8(o, g);     /*           = o1:o0 + g1:00 */
}

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}
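/* The vrbitq_u8() conversions account for GCM's reflected bit order: within
 * each byte, GCM places the lowest-degree coefficient in the most significant
 * bit, which is the reverse of what the carry-less multiply expects.
 *
 * A GHASH computation would typically use this primitive to fold each 16-byte
 * block X_i into an accumulator Y as Y = (Y xor X_i) * H. A minimal sketch,
 * with illustrative buffer names y, x_i and h (the real GHASH loop lives
 * elsewhere, e.g. in gcm.c):
 *
 *     for (size_t i = 0; i < 16; i++) {
 *         y[i] ^= x_i[i];
 *     }
 *     mbedtls_aesce_gcm_mult(y, y, h);
 */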

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_ARCH_IS_ARMV8 */

#endif /* MBEDTLS_AESCE_C */