/*
 * Armv8-A Cryptographic Extension support functions for AArch64
 *
 * Copyright The Mbed TLS Contributors
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#if defined(__clang__) && (__clang_major__ >= 4)

/* Ideally, we would simply use MBEDTLS_ARCH_IS_ARMV8 in the following #if,
 * but that is defined by build_info.h, and we need this block to happen first. */
#if defined(__ARM_ARCH)
#if __ARM_ARCH >= 8
#define MBEDTLS_AESCE_ARCH_IS_ARMV8
#endif
#endif

#if defined(MBEDTLS_AESCE_ARCH_IS_ARMV8) && !defined(__ARM_FEATURE_CRYPTO)
/* TODO: Re-consider the above after https://reviews.llvm.org/D131064 is merged.
 *
 * The intrinsic declarations are guarded by predefined ACLE macros in clang:
 * these are normally only enabled by the -march option on the command line.
 * By defining the macros ourselves we gain access to those declarations without
 * requiring -march on the command line.
 *
 * `arm_neon.h` could be included by any header file, so we put these defines
 * at the top of this file, before any includes.
 */
#define __ARM_FEATURE_CRYPTO 1
/* See: https://arm-software.github.io/acle/main/acle.html#cryptographic-extensions
 *
 * `__ARM_FEATURE_CRYPTO` is deprecated, but we need to continue to specify it
 * for older compilers.
 */
#define __ARM_FEATURE_AES 1
#define MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG
#endif

#endif /* defined(__clang__) && (__clang_major__ >= 4) */

#include <string.h>
#include "common.h"

#if defined(MBEDTLS_AESCE_C)

#include "aesce.h"

#if defined(MBEDTLS_ARCH_IS_ARMV8) && defined(__ARM_NEON)

/* Compiler version checks. */
#if defined(__clang__)
#   if __clang_major__ < 4
#       error "Minimum version of Clang for MBEDTLS_AESCE_C is 4.0."
#   endif
#elif defined(__GNUC__)
#   if __GNUC__ < 6
#       error "Minimum version of GCC for MBEDTLS_AESCE_C is 6.0."
#   endif
#elif defined(_MSC_VER)
/* TODO: We haven't verified MSVC from 1920 to 1928. If someone has verified
 * that, please update this check and the documentation of `MBEDTLS_AESCE_C`
 * in `mbedtls_config.h`. */
#   if _MSC_VER < 1929
#       error "Minimum version of MSVC for MBEDTLS_AESCE_C is 2019 version 16.11.2."
#   endif
#elif defined(__ARMCC_VERSION)
#   if defined(MBEDTLS_ARCH_IS_ARM32) && (__ARMCC_VERSION < 6200002)
/* TODO: We haven't verified armclang for 32-bit Arm/Thumb prior to 6.20.
 * If someone has verified that, please update this check and the
 * documentation of `MBEDTLS_AESCE_C` in `mbedtls_config.h`. */
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on 32-bit Arm is 6.20."
#   elif defined(MBEDTLS_ARCH_IS_ARM64) && (__ARMCC_VERSION < 6060000)
#       error "Minimum version of armclang for MBEDTLS_AESCE_C on aarch64 is 6.6."
#   endif
#endif

#ifdef __ARM_NEON
#include <arm_neon.h>

#if defined(MBEDTLS_ARCH_IS_ARM32)
#if defined(__clang__)
/* On clang for A32/T32, work around some missing intrinsics and types */

#ifndef vreinterpretq_p64_u8
#define vreinterpretq_p64_u8 (poly64x2_t)
#endif
#ifndef vreinterpretq_u8_p128
#define vreinterpretq_u8_p128 (uint8x16_t)
#endif
#ifndef vreinterpretq_u64_p64
#define vreinterpretq_u64_p64 (uint64x2_t)
#endif

typedef uint8x16_t poly128_t;

static inline poly128_t vmull_p64(poly64_t a, poly64_t b)
{
    poly128_t r;
    asm ("vmull.p64 %[r], %[a], %[b]" : [r] "=w" (r) : [a] "w" (a), [b] "w" (b) :);
    return r;
}

static inline poly64x1_t vget_low_p64(poly64x2_t a)
{
    return (poly64x1_t) vget_low_u64(vreinterpretq_u64_p64(a));
}

static inline poly128_t vmull_high_p64(poly64x2_t a, poly64x2_t b)
{
    return vmull_p64((poly64_t) (vget_high_u64((uint64x2_t) a)),
                     (poly64_t) (vget_high_u64((uint64x2_t) b)));
}

#endif /* defined(__clang__) */

static inline uint8x16_t vrbitq_u8(uint8x16_t x)
{
    /* There is no vrbitq_u8 instruction in A32/T32, so provide an equivalent
     * non-NEON implementation: reverse the bit order within each byte using
     * four rbit/rev pairs. rbit reverses the bit order of the whole word
     * (which also swaps the bytes); rev then restores the byte order, leaving
     * each byte's bits reversed in place. */
133 asm ("ldm %[p], { r2-r5 } \n\t"
134 "rbit r2, r2 \n\t"
135 "rev r2, r2 \n\t"
136 "rbit r3, r3 \n\t"
137 "rev r3, r3 \n\t"
138 "rbit r4, r4 \n\t"
139 "rev r4, r4 \n\t"
140 "rbit r5, r5 \n\t"
141 "rev r5, r5 \n\t"
142 "stm %[p], { r2-r5 } \n\t"
143 :
144 /* Output: 16 bytes of memory pointed to by &x */
145 "+m" (*(uint8_t(*)[16]) &x)
146 :
147 [p] "r" (&x)
148 :
149 "r2", "r3", "r4", "r5"
150 );
151 return x;
152}
153
154#endif /* defined(MBEDTLS_ARCH_IS_ARM32) */
155
Jerry Yu6b00f5a2023-05-04 16:30:21 +0800156#else
157#error "Target does not support NEON instructions"
158#endif
Jerry Yu08933d32023-04-27 18:28:00 +0800159
#if !(defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_AES)) || \
    defined(MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG)
#   if defined(__ARMCOMPILER_VERSION)
#       if __ARMCOMPILER_VERSION <= 6090000
#           error "Must use minimum -march=armv8-a+crypto for MBEDTLS_AESCE_C"
#       else
#           pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#           define MBEDTLS_POP_TARGET_PRAGMA
#       endif
#   elif defined(__clang__)
#       pragma clang attribute push (__attribute__((target("aes"))), apply_to=function)
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(__GNUC__)
#       pragma GCC push_options
#       pragma GCC target ("+crypto")
#       define MBEDTLS_POP_TARGET_PRAGMA
#   elif defined(_MSC_VER)
#       error "Required feature(__ARM_FEATURE_AES) is not enabled."
#   endif
#endif /* !(__ARM_FEATURE_CRYPTO || __ARM_FEATURE_AES) ||
          MBEDTLS_ENABLE_ARM_CRYPTO_EXTENSIONS_COMPILER_FLAG */

#if defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)

#include <asm/hwcap.h>
#include <sys/auxv.h>

signed char mbedtls_aesce_has_support_result = -1;

#if !defined(MBEDTLS_AES_USE_HARDWARE_ONLY)
/*
 * AES instruction support detection routine
 */
int mbedtls_aesce_has_support_impl(void)
{
    /* To avoid many calls to getauxval, cache the result. This is
     * thread-safe, because we store the result in a char, so it cannot
     * be subject to non-atomic (torn) updates.
     * It is possible that we could end up setting the result more than
     * once, but that is harmless.
     */
    if (mbedtls_aesce_has_support_result == -1) {
#if defined(MBEDTLS_ARCH_IS_ARM32)
        unsigned long auxval  = getauxval(AT_HWCAP);
        unsigned long auxval2 = getauxval(AT_HWCAP2);
        if (((auxval  & HWCAP_NEON) == HWCAP_NEON) &&
            ((auxval2 & HWCAP2_AES) == HWCAP2_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#else
        unsigned long auxval = getauxval(AT_HWCAP);
        if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) ==
            (HWCAP_ASIMD | HWCAP_AES)) {
            mbedtls_aesce_has_support_result = 1;
        } else {
            mbedtls_aesce_has_support_result = 0;
        }
#endif
    }
    return mbedtls_aesce_has_support_result;
}
#endif

#endif /* defined(__linux__) && !defined(MBEDTLS_AES_USE_HARDWARE_ONLY) */

/* Single round of AESCE encryption */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
/* Two rounds of AESCE encryption */
#define AESCE_ENCRYPT_ROUND_X2        AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
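
/* Note: vaeseq_u8 performs AddRoundKey, SubBytes and ShiftRows in a single
 * instruction and vaesmcq_u8 performs MixColumns, so AESCE_ENCRYPT_ROUND is
 * one full AES encryption round (see the decryption macros below for the
 * corresponding inverse operations). */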

MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_ENCRYPT_ROUND_X2;
rounds_12:
    AESCE_ENCRYPT_ROUND_X2;
rounds_10:
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND_X2;
    AESCE_ENCRYPT_ROUND;

    /* AES AddRoundKey for the previous round.
     * SubBytes, ShiftRows for the final round. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Final round: no MixColumns */

    /* Final AddRoundKey */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/* Single round of AESCE decryption
 *
 * AES AddRoundKey, inverse ShiftRows, inverse SubBytes
 *
 *      block = vaesdq_u8(block, vld1q_u8(keys));
 *
 * AES inverse MixColumns for the next round.
 *
 * This means that we switch the order of the inverse AddRoundKey and
 * inverse MixColumns operations. We have to do this as AddRoundKey is
 * done in a single instruction together with the inverses of SubBytes
 * and ShiftRows.
 *
 * It works because MixColumns is a linear operation over GF(2^8) and
 * AddRoundKey is an exclusive or, which is equivalent to addition over
 * GF(2^8). (The inverse of MixColumns needs to be applied to the
 * affected round keys separately; this is done when the decryption
 * round keys are calculated.)
 *
 *      block = vaesimcq_u8(block);
 */
#define AESCE_DECRYPT_ROUND                   \
    block = vaesdq_u8(block, vld1q_u8(keys)); \
    block = vaesimcq_u8(block);               \
    keys += 16
/* Two rounds of AESCE decryption */
#define AESCE_DECRYPT_ROUND_X2        AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
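
/* The swap described above relies on inverse MixColumns being linear over
 * GF(2^8): for any state s and round key k,
 *
 *     vaesimcq_u8(veorq_u8(s, k)) == veorq_u8(vaesimcq_u8(s), vaesimcq_u8(k))
 *
 * The vaesimcq_u8(k) terms are exactly what mbedtls_aesce_inverse_key()
 * pre-computes further down when deriving the decryption round keys. */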

static uint8x16_t aesce_decrypt_block(uint8x16_t block,
                                      unsigned char *keys,
                                      int rounds)
{
    /* 10, 12 or 14 rounds. Unroll loop. */
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    AESCE_DECRYPT_ROUND_X2;
rounds_12:
    AESCE_DECRYPT_ROUND_X2;
rounds_10:
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND_X2;
    AESCE_DECRYPT_ROUND;

    /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
     * last full round. */
    block = vaesdq_u8(block, vld1q_u8(keys));
    keys += 16;

    /* Inverse AddRoundKey for inverting the initial round key addition. */
    block = veorq_u8(block, vld1q_u8(keys));

    return block;
}

/*
 * AES-ECB block en(de)cryption
 */
int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx,
                            int mode,
                            const unsigned char input[16],
                            unsigned char output[16])
{
    uint8x16_t block = vld1q_u8(&input[0]);
    unsigned char *keys = (unsigned char *) (ctx->buf + ctx->rk_offset);

    if (mode == MBEDTLS_AES_ENCRYPT) {
        block = aesce_encrypt_block(block, keys, ctx->nr);
    } else {
        block = aesce_decrypt_block(block, keys, ctx->nr);
    }
    vst1q_u8(&output[0], block);

    return 0;
}
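
/* Illustrative use of the single-block primitive above (a hypothetical direct
 * caller, assuming the context was set up with mbedtls_aes_setkey_enc() and
 * that runtime support has been confirmed):
 *
 *     unsigned char in[16] = { 0 }, out[16];
 *     mbedtls_aesce_crypt_ecb(&ctx, MBEDTLS_AES_ENCRYPT, in, out);
 *
 * Within the library this function is normally reached through the AES code
 * path in aes.c rather than called directly. */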

/*
 * Compute decryption round keys from encryption round keys
 */
void mbedtls_aesce_inverse_key(unsigned char *invkey,
                               const unsigned char *fwdkey,
                               int nr)
{
    int i, j;
    j = nr;
    vst1q_u8(invkey, vld1q_u8(fwdkey + j * 16));
    for (i = 1, j--; j > 0; i++, j--) {
        vst1q_u8(invkey + i * 16,
                 vaesimcq_u8(vld1q_u8(fwdkey + j * 16)));
    }
    vst1q_u8(invkey + i * 16, vld1q_u8(fwdkey + j * 16));
}
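
/* The first and last forward round keys are copied unchanged, while every
 * round key in between has vaesimcq_u8 (inverse MixColumns) applied to it.
 * This matches the "equivalent inverse cipher" key schedule of FIPS 197
 * (Section 5.3.5) and is what lets aesce_decrypt_block() above run
 * vaesdq_u8 and vaesimcq_u8 back to back in each round. */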

static inline uint32_t aes_rot_word(uint32_t word)
{
    return (word << (32 - 8)) | (word >> 8);
}

static inline uint32_t aes_sub_word(uint32_t in)
{
    uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(in));
    uint8x16_t zero = vdupq_n_u8(0);

    /* vaeseq_u8 does both SubBytes and ShiftRows. Taking the first row yields
     * the correct result as ShiftRows doesn't change the first row. */
    v = vaeseq_u8(zero, v);
    return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
}
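
/* Quick sanity check: aes_sub_word(0x00000000) returns 0x63636363, since the
 * AES S-box maps 0x00 to 0x63. */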

/*
 * Key expansion function
 */
static void aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             const size_t key_bit_length)
{
    static uint8_t const rcon[] = { 0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36 };
    /* See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.197.pdf
     *   - Section 5, Nr = Nk + 6
     *   - Section 5.2, the length of round keys is Nb*(Nr+1)
     */
    const uint32_t key_len_in_words = key_bit_length / 32;  /* Nk */
    const size_t round_key_len_in_words = 4;                /* Nb */
    const size_t rounds_needed = key_len_in_words + 6;      /* Nr */
    const size_t round_keys_len_in_words =
        round_key_len_in_words * (rounds_needed + 1);       /* Nb*(Nr+1) */
    const uint32_t *rko_end = (uint32_t *) rk + round_keys_len_in_words;

    memcpy(rk, key, key_len_in_words * 4);

    for (uint32_t *rki = (uint32_t *) rk;
         rki + key_len_in_words < rko_end;
         rki += key_len_in_words) {

        size_t iteration = (rki - (uint32_t *) rk) / key_len_in_words;
        uint32_t *rko;
        rko = rki + key_len_in_words;
        rko[0] = aes_rot_word(aes_sub_word(rki[key_len_in_words - 1]));
        rko[0] ^= rcon[iteration] ^ rki[0];
        rko[1] = rko[0] ^ rki[1];
        rko[2] = rko[1] ^ rki[2];
        rko[3] = rko[2] ^ rki[3];
        if (rko + key_len_in_words > rko_end) {
            /* Do not write overflow words. */
            continue;
        }
#if !defined(MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH)
        switch (key_bit_length) {
            case 128:
                break;
            case 192:
                rko[4] = rko[3] ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                break;
            case 256:
                rko[4] = aes_sub_word(rko[3]) ^ rki[4];
                rko[5] = rko[4] ^ rki[5];
                rko[6] = rko[5] ^ rki[6];
                rko[7] = rko[6] ^ rki[7];
                break;
        }
#endif /* !MBEDTLS_AES_ONLY_128_BIT_KEY_LENGTH */
    }
}
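
/* For reference, expanding the FIPS 197 Appendix A.1 example key
 * 2b7e1516 28aed2a6 abf71588 09cf4f3c with the routine above yields round-1
 * key bytes beginning a0 fa fe 17, which makes a convenient self-check of
 * the schedule. */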

/*
 * Key expansion, wrapper
 */
int mbedtls_aesce_setkey_enc(unsigned char *rk,
                             const unsigned char *key,
                             size_t bits)
{
    switch (bits) {
        case 128:
        case 192:
        case 256:
            aesce_setkey_enc(rk, key, bits);
            break;
        default:
            return MBEDTLS_ERR_AES_INVALID_KEY_LENGTH;
    }

    return 0;
}

#if defined(MBEDTLS_GCM_C)

#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 5
/* Some intrinsics are not available for GCC 5.X. */
#define vreinterpretq_p64_u8(a) ((poly64x2_t) a)
#define vreinterpretq_u8_p128(a) ((uint8x16_t) a)
static inline poly64_t vget_low_p64(poly64x2_t __a)
{
    uint64x2_t tmp = (uint64x2_t) (__a);
    uint64x1_t lo = vcreate_u64(vgetq_lane_u64(tmp, 0));
    return (poly64_t) (lo);
}
#endif /* !__clang__ && __GNUC__ && __GNUC__ == 5 */

/* vmull_p64/vmull_high_p64 wrappers.
 *
 * Older compilers miss some intrinsic functions for `poly*_t`. We use
 * uint8x16_t and uint8x16x3_t as input/output parameters.
 */
#if defined(__GNUC__) && !defined(__clang__)
/* GCC reports an incompatible type error without the cast: it treats
 * poly64_t and poly64x1_t as distinct types, unlike MSVC and Clang. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64((poly64_t) a, (poly64_t) b)
#else
/* MSVC reports `error C2440: 'type cast'` when the cast is present, and
 * Clang accepts the call with or without it, so no cast is used here. */
#define MBEDTLS_VMULL_P64(a, b) vmull_p64(a, b)
#endif
static inline uint8x16_t pmull_low(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        MBEDTLS_VMULL_P64(
            vget_low_p64(vreinterpretq_p64_u8(a)),
            vget_low_p64(vreinterpretq_p64_u8(b))
            ));
}

static inline uint8x16_t pmull_high(uint8x16_t a, uint8x16_t b)
{
    return vreinterpretq_u8_p128(
        vmull_high_p64(vreinterpretq_p64_u8(a),
                       vreinterpretq_p64_u8(b)));
}

/* GHASH does 128-bit polynomial multiplication of blocks in GF(2^128) defined
 * by `x^128 + x^7 + x^2 + x + 1`.
 *
 * Arm64 only has 64b->128b polynomial multipliers, so we need four 64-bit
 * multiplies to generate the full 256-bit product.
 *
 * `poly_mult_128` performs the polynomial multiplication and outputs the
 * 256-bit result represented by three 128-bit vectors (a layout chosen to
 * keep the code size down).
 *
 * Output layout:
 * |            |             |             |
 * |------------|-------------|-------------|
 * | ret.val[0] | h3:h2:00:00 | high 128b   |
 * | ret.val[1] |   :m2:m1:00 | middle 128b |
 * | ret.val[2] |   :  :l1:l0 | low 128b    |
 */
static inline uint8x16x3_t poly_mult_128(uint8x16_t a, uint8x16_t b)
{
    uint8x16x3_t ret;
    uint8x16_t h, m, l; /* retval high/middle/low */
    uint8x16_t c, d, e;

    h = pmull_high(a, b);  /* h3:h2:00:00 = a1*b1 */
    l = pmull_low(a, b);   /*   :  :l1:l0 = a0*b0 */
    c = vextq_u8(b, b, 8); /*   :c1:c0    = b0:b1 */
    d = pmull_high(a, c);  /*   :d2:d1:00 = a1*b0 */
    e = pmull_low(a, c);   /*   :e2:e1:00 = a0*b1 */
    m = veorq_u8(d, e);    /*   :m2:m1:00 = d + e */

    ret.val[0] = h;
    ret.val[1] = m;
    ret.val[2] = l;
    return ret;
}

/*
 * Modulo reduction.
 *
 * See: https://www.researchgate.net/publication/285612706_Implementing_GCM_on_ARMv8
 *
 * Section 4.3
 *
 * Modular reduction is slightly more complex. Write the GCM modulus as
 * f(z) = z^128 + r(z), where r(z) = z^7 + z^2 + z + 1. The well-known approach
 * is to consider that z^128 ≡ r(z) (mod z^128 + r(z)), allowing us to write
 * the 256-bit operand to be reduced as a(z) = h(z)z^128 + l(z) ≡ h(z)r(z) + l(z).
 * That is, we simply multiply the higher part of the operand by r(z) and add
 * it to l(z). If the result is still larger than 128 bits, we reduce again.
 */
static inline uint8x16_t poly_mult_reduce(uint8x16x3_t input)
{
    uint8x16_t const ZERO = vdupq_n_u8(0);

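    /* 0x87 below is the byte encoding of r(z) = z^7 + z^2 + z + 1, the low
     * part of the GCM modulus f(z) = z^128 + r(z) described above. */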
    uint64x2_t r = vreinterpretq_u64_u8(vdupq_n_u8(0x87));
#if defined(__GNUC__)
    /* Use 'asm' as an optimisation barrier to prevent the compiler from
     * loading MODULO from memory; this is for GNUC-compatible compilers. */
    asm ("" : "+w" (r));
#endif
    uint8x16_t const MODULO = vreinterpretq_u8_u64(vshrq_n_u64(r, 64 - 8));
    uint8x16_t h, m, l; /* input high/middle/low 128b */
    uint8x16_t c, d, e, f, g, n, o;
    h = input.val[0];          /* h3:h2:00:00                       */
    m = input.val[1];          /*   :m2:m1:00                       */
    l = input.val[2];          /*   :  :l1:l0                       */
    c = pmull_high(h, MODULO); /*   :c2:c1:00 = reduction of h3     */
    d = pmull_low(h, MODULO);  /*   :  :d1:d0 = reduction of h2     */
    e = veorq_u8(c, m);        /*   :e2:e1:00 = m2:m1:00 + c2:c1:00 */
    f = pmull_high(e, MODULO); /*   :  :f1:f0 = reduction of e2     */
    g = vextq_u8(ZERO, e, 8);  /*   :  :g1:00 = e1:00               */
    n = veorq_u8(d, l);        /*   :  :n1:n0 = d1:d0 + l1:l0       */
    o = veorq_u8(n, f);        /*       o1:o0 = f1:f0 + n1:n0       */
    return veorq_u8(o, g);     /*             = o1:o0 + g1:00       */
}

/*
 * GCM multiplication: c = a times b in GF(2^128)
 */
void mbedtls_aesce_gcm_mult(unsigned char c[16],
                            const unsigned char a[16],
                            const unsigned char b[16])
{
    uint8x16_t va, vb, vc;
    va = vrbitq_u8(vld1q_u8(&a[0]));
    vb = vrbitq_u8(vld1q_u8(&b[0]));
    vc = vrbitq_u8(poly_mult_reduce(poly_mult_128(va, vb)));
    vst1q_u8(&c[0], vc);
}
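
/* Note: GCM's GHASH convention numbers the bits within each byte in the
 * opposite order to the one assumed by the pmull-based helpers above, so the
 * operands and the result are passed through vrbitq_u8 to convert between the
 * two representations. */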

#endif /* MBEDTLS_GCM_C */

#if defined(MBEDTLS_POP_TARGET_PRAGMA)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#undef MBEDTLS_POP_TARGET_PRAGMA
#endif

#endif /* MBEDTLS_ARCH_IS_ARMV8 && __ARM_NEON */

#endif /* MBEDTLS_AESCE_C */