Implements AES and GCM with ARMv8 Crypto Extensions
A compact patch that provides AES and GCM implementations that utilize the
ARMv8 Crypto Extensions. The config flag is MBEDTLS_ARMV8CE_AES_C, which
is disabled by default as we don't do runtime checking for the feature.
The new implementation lives in armv8ce_aes.c.
Provides similar functionality to https://github.com/ARMmbed/mbedtls/pull/432
Thanks to Barry O'Rourke and others for that contribtion.
Tested on a Cortex A53 device and QEMU. On a midrange phone the real AES-GCM
throughput increases about 4x, while raw AES speed is up to 10x faster.
When cross-compiling, you want to set something like:
export CC='aarch64-linux-gnu-gcc'
export CFLAGS='-Ofast -march=armv8-a+crypto'
scripts/config.pl set MBEDTLS_ARMV8CE_AES_C
QEMU seems to also need
export LDFLAGS='-static'
Then run normal make or cmake etc.
diff --git a/library/armv8ce_aes.c b/library/armv8ce_aes.c
new file mode 100644
index 0000000..de04d1b
--- /dev/null
+++ b/library/armv8ce_aes.c
@@ -0,0 +1,143 @@
+/*
+ * ARMv8 Cryptography Extensions -- Optimized code for AES and GCM
+ *
+ * Copyright (C) 2006-2017, ARM Limited, All Rights Reserved
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * This file is part of mbed TLS (https://tls.mbed.org)
+ */
+
+#if !defined(MBEDTLS_CONFIG_FILE)
+#include "mbedtls/config.h"
+#else
+#include MBEDTLS_CONFIG_FILE
+#endif
+
+#if defined(MBEDTLS_ARMV8CE_AES_C)
+
+#include <arm_neon.h>
+#include "mbedtls/armv8ce_aes.h"
+
+#ifndef asm
+#define asm __asm
+#endif
+
+/*
+ * [ARMv8 Crypto Extensions] AES-ECB block en(de)cryption
+ */
+
+#if defined(MBEDTLS_AES_C)
+
+int mbedtls_armv8ce_aes_crypt_ecb( mbedtls_aes_context *ctx,
+ int mode,
+ const unsigned char input[16],
+ unsigned char output[16] )
+{
+ unsigned int i;
+ const uint8_t *rk;
+ uint8x16_t x, k;
+
+ x = vld1q_u8( input ); // input block
+ rk = (const uint8_t *) ctx->rk; // round keys
+
+ if( mode == MBEDTLS_AES_ENCRYPT )
+ {
+ for( i = ctx->nr - 1; i ; i-- ) // encryption loop
+ {
+ k = vld1q_u8( rk );
+ rk += 16;
+ x = vaeseq_u8( x, k );
+ x = vaesmcq_u8( x );
+ }
+ k = vld1q_u8( rk );
+ rk += 16;
+ x = vaeseq_u8( x, k );
+ }
+ else
+ {
+ for( i = ctx->nr - 1; i ; i-- ) // decryption loop
+ {
+ k = vld1q_u8( rk );
+ rk += 16;
+ x = vaesdq_u8( x, k );
+ x = vaesimcq_u8( x );
+ }
+ k = vld1q_u8( rk );
+ rk += 16;
+ x = vaesdq_u8( x, k );
+ }
+
+ k = vld1q_u8( rk ); // final key just XORed
+ x = veorq_u8( x, k );
+ vst1q_u8( output, x ); // write out
+
+ return ( 0 );
+}
+
+#endif /* MBEDTLS_AES_C */
+
+
+/*
+ * [ARMv8 Crypto Extensions] Multiply in GF(2^128) for GCM
+ */
+
+#if defined(MBEDTLS_GCM_C)
+
+void mbedtls_armv8ce_gcm_mult( unsigned char c[16],
+ const unsigned char a[16],
+ const unsigned char b[16] )
+{
+ // GCM's GF(2^128) polynomial basis is x^128 + x^7 + x^2 + x + 1
+ const uint64x2_t base = { 0, 0x86 }; // note missing LS bit
+
+ register uint8x16_t vc asm( "v0" ); // named registers
+ register uint8x16_t va asm( "v1" ); // (to avoid conflict)
+ register uint8x16_t vb asm( "v2" );
+ register uint64x2_t vp asm( "v3" );
+
+ va = vld1q_u8( a ); // load inputs
+ vb = vld1q_u8( b );
+ vp = base;
+
+ asm (
+ "rbit %1.16b, %1.16b \n\t" // reverse bit order
+ "rbit %2.16b, %2.16b \n\t"
+ "pmull2 %0.1q, %1.2d, %2.2d \n\t" // v0 = a.hi * b.hi
+ "pmull2 v4.1q, %0.2d, %3.2d \n\t" // mul v0 by x^64, reduce
+ "ext %0.16b, %0.16b, %0.16b, #8 \n\t"
+ "eor %0.16b, %0.16b, v4.16b \n\t"
+ "ext v5.16b, %2.16b, %2.16b, #8 \n\t" // (swap hi and lo in b)
+ "pmull v4.1q, %1.1d, v5.1d \n\t" // v0 ^= a.lo * b.hi
+ "eor %0.16b, %0.16b, v4.16b \n\t"
+ "pmull2 v4.1q, %1.2d, v5.2d \n\t" // v0 ^= a.hi * b.lo
+ "eor %0.16b, %0.16b, v4.16b \n\t"
+ "pmull2 v4.1q, %0.2d, %3.2d \n\t" // mul v0 by x^64, reduce
+ "ext %0.16b, %0.16b, %0.16b, #8 \n\t"
+ "eor %0.16b, %0.16b, v4.16b \n\t"
+ "pmull v4.1q, %1.1d, %2.1d \n\t" // v0 ^= a.lo * b.lo
+ "eor %0.16b, %0.16b, v4.16b \n\t"
+ "rbit %0.16b, %0.16b \n\t" // reverse bits for output
+ : "=w" (vc) // q0: output
+ : "w" (va), "w" (vb), "w" (vp) // q1, q2: input
+ : "v4", "v5" // q4, q5: clobbered
+ );
+
+ vst1q_u8( c, vc ); // write out
+}
+
+#endif /* MBEDTLS_GCM_C */
+
+#endif /* MBEDTLS_ARMV8CE_AES_C */
+