Merge pull request #7624 from daverodgman/aes-perf
AES perf improvements
diff --git a/ChangeLog.d/aes-perf.txt b/ChangeLog.d/aes-perf.txt
new file mode 100644
index 0000000..ca2ced9
--- /dev/null
+++ b/ChangeLog.d/aes-perf.txt
@@ -0,0 +1,4 @@
+Features
+ * AES performance improvements on 64-bit architectures. Uplift
+ varies by platform, toolchain, optimisation flags and mode,
+ in the 0 - 84% range. Aarch64, gcc and GCM/XTS benefit the most.
diff --git a/library/aes.c b/library/aes.c
index 3efe930..0a61d1b 100644
--- a/library/aes.c
+++ b/library/aes.c
@@ -1039,6 +1039,24 @@
}
#if defined(MBEDTLS_CIPHER_MODE_CBC)
+
+#if defined(__ARM_NEON) && defined(__aarch64__)
+/* XOR the 16-byte blocks a and b into r as two 64-bit words rather than via the NEON
+ * implementation of mbedtls_xor. Because CBC needs each result for the next block, and
+ * moving that data out of NEON registers has a cost, this is faster on aarch64 (for 32-bit
+ * arm, NEON should be faster). Arguments are evaluated twice: pass side-effect-free values. */
+#define CBC_XOR_16(r, a, b) do {                                          \
+        mbedtls_put_unaligned_uint64((r),                                 \
+                                     mbedtls_get_unaligned_uint64((a)) ^  \
+                                     mbedtls_get_unaligned_uint64((b)));  \
+        mbedtls_put_unaligned_uint64((r) + 8,                             \
+                                     mbedtls_get_unaligned_uint64((a) + 8) ^ \
+                                     mbedtls_get_unaligned_uint64((b) + 8)); \
+} while (0)
+#else
+#define CBC_XOR_16(r, a, b) mbedtls_xor((r), (a), (b), 16)
+#endif
+
/*
* AES-CBC buffer encryption/decryption
*/
@@ -1072,6 +1090,8 @@
}
#endif
+ const unsigned char *ivp = iv;
+
if (mode == MBEDTLS_AES_DECRYPT) {
while (length > 0) {
memcpy(temp, input, 16);
@@ -1079,8 +1099,7 @@
if (ret != 0) {
goto exit;
}
-
- mbedtls_xor(output, output, iv, 16);
+ CBC_XOR_16(output, output, iv);
memcpy(iv, temp, 16);
@@ -1090,18 +1109,19 @@
}
} else {
while (length > 0) {
- mbedtls_xor(output, input, iv, 16);
+ CBC_XOR_16(output, input, ivp);
ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output);
if (ret != 0) {
goto exit;
}
- memcpy(iv, output, 16);
+ ivp = output;
input += 16;
output += 16;
length -= 16;
}
+ memcpy(iv, ivp, 16);
}
ret = 0;
@@ -1176,7 +1196,7 @@
}
while (blocks--) {
- if (leftover && (mode == MBEDTLS_AES_DECRYPT) && blocks == 0) {
+ if (MBEDTLS_UNLIKELY(leftover && (mode == MBEDTLS_AES_DECRYPT) && blocks == 0)) {
/* We are on the last block in a decrypt operation that has
* leftover bytes, so we need to use the next tweak for this block,
* and this tweak for the leftover bytes. Save the current tweak for
diff --git a/library/common.h b/library/common.h
index eb159a7..b48a1fc 100644
--- a/library/common.h
+++ b/library/common.h
@@ -31,6 +31,10 @@
#include <stdint.h>
#include <stddef.h>
+#if defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif /* __ARM_NEON */
+
/** Helper to define a function as static except when building invasive tests.
*
* If a function is only used inside its own source file and should be
@@ -125,11 +129,26 @@
{
size_t i = 0;
#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
+#if defined(__ARM_NEON)
+ for (; (i + 16) <= n; i += 16) {
+ uint8x16_t v1 = vld1q_u8(a + i);
+ uint8x16_t v2 = vld1q_u8(b + i);
+ uint8x16_t x = veorq_u8(v1, v2);
+ vst1q_u8(r + i, x);
+ }
+#elif defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__)
+ /* This codepath probably only makes sense on architectures with 64-bit registers */
+ for (; (i + 8) <= n; i += 8) {
+ uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i);
+ mbedtls_put_unaligned_uint64(r + i, x);
+ }
+#else
for (; (i + 4) <= n; i += 4) {
uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
mbedtls_put_unaligned_uint32(r + i, x);
}
#endif
+#endif
for (; i < n; i++) {
r[i] = a[i] ^ b[i];
}
@@ -164,4 +183,16 @@
#define MBEDTLS_STATIC_ASSERT(expr, msg)
#endif
+/* Branch hints: x is normalised with !! (never truncated to long); both macros yield 0 or 1. */
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_expect)
+#define MBEDTLS_LIKELY(x)       __builtin_expect(!!(x), 1)
+#define MBEDTLS_UNLIKELY(x)     __builtin_expect(!!(x), 0)
+#endif
+#endif
+#if !defined(MBEDTLS_LIKELY)
+#define MBEDTLS_LIKELY(x)       (!!(x))
+#define MBEDTLS_UNLIKELY(x)     (!!(x))
+#endif
+
#endif /* MBEDTLS_LIBRARY_COMMON_H */
diff --git a/tests/suites/test_suite_common.data b/tests/suites/test_suite_common.data
index 500852d..bd2c413 100644
--- a/tests/suites/test_suite_common.data
+++ b/tests/suites/test_suite_common.data
@@ -18,3 +18,45 @@
Block xor, length 16
mbedtls_xor:16
+
+Block xor, length 64
+mbedtls_xor:64
+
+Block xor, length 256
+mbedtls_xor:256
+
+Block xor, length 257
+mbedtls_xor:257
+
+Block xor, length 16+8
+mbedtls_xor:24
+
+Block xor, length 16+8+4
+mbedtls_xor:28
+
+Block xor, length 16+8+4+1
+mbedtls_xor:29
+
+Block xor, length 16+8+1
+mbedtls_xor:25
+
+Block xor, length 16+4
+mbedtls_xor:20
+
+Block xor, length 16+4+1
+mbedtls_xor:21
+
+Block xor, length 16+1
+mbedtls_xor:17
+
+Block xor, length 8+4
+mbedtls_xor:12
+
+Block xor, length 8+4+1
+mbedtls_xor:13
+
+Block xor, length 8+1
+mbedtls_xor:9
+
+Block xor, length 4+1
+mbedtls_xor:5