Add optimized bignum multiplication for Aarch64.
x0-x3 are skipped so that function parameters do not have to be moved.
MULADDC_INIT and MULADDC_STOP are mostly empty because it is more
efficient to keep everything in registers (and that should easily be
possible). I considered a MULADDC_HUIT implementation, but could not
think of something that would be more efficient than basically 8
consecutive MULADDC_CORE. You could combine the loads and stores, but
it's probably more efficient to interleave them with arithmetic,
depending on the specific microarchitecture. NEON can do a
64x64->128 bit multiplication (and optional accumulation) in one
instruction, but is not great at handling carries.
diff --git a/include/mbedtls/bn_mul.h b/include/mbedtls/bn_mul.h
index f7cb072..4200ad4 100644
--- a/include/mbedtls/bn_mul.h
+++ b/include/mbedtls/bn_mul.h
@@ -198,6 +198,30 @@
#endif /* AMD64 */
+#if defined(__aarch64__)
+/* AArch64: each MULADDC_CORE computes (c, *d) = *d + *s * b + c; d++, s++. */
+#define MULADDC_INIT \
+ asm(
+/* Operands: %0 = c, %1 = d, %2 = s, %3 = b; x4-x7 are scratch registers. */
+#define MULADDC_CORE \
+ "ldr x4, [%2], #8 \n\t" /* x4 = *s, then s += 8 bytes */ \
+ "ldr x5, [%1] \n\t" /* x5 = *d */ \
+ "mul x6, x4, %3 \n\t" /* x6 = low 64 bits of x4 * b */ \
+ "umulh x7, x4, %3 \n\t" /* x7 = high 64 bits of x4 * b */ \
+ "adds x5, x5, x6 \n\t" /* x5 += low half, sets carry */ \
+ "adc x7, x7, xzr \n\t" /* fold that carry into high half */ \
+ "adds x5, x5, %0 \n\t" /* x5 += carry-in c, sets carry */ \
+ "adc %0, x7, xzr \n\t" /* c = high half + carry (carry-out) */ \
+ "str x5, [%1], #8 \n\t" /* *d = x5, then d += 8 bytes */
+/* c, d, s are "+r": the asm reads and updates them; "memory": *s, *d. */
+#define MULADDC_STOP \
+ : "+r" (c), "+r" (d), "+r" (s) \
+ : "r" (b) \
+ : "x4", "x5", "x6", "x7", "cc", "memory" \
+ );
+
+#endif /* __aarch64__ */
+
#if defined(__mc68020__) || defined(__mcpu32__)
#define MULADDC_INIT \