AESNI: add implementation with intrinsics

As of this commit, to use the intrinsics for MBEDTLS_AESNI_C:

* With MSVC, the intrinsics implementation should be used by default.
* With Clang, build with `clang -maes -mpclmul` or equivalent.
* With GCC, build with `gcc -mpclmul -msse2` or equivalent.

In particular, for now, with a GCC-like compiler, when building specifically
for a target that supports both the AES and PCLMULQDQ (carry-less
multiplication, used by GCM) instructions, the old implementation using
assembly is selected.

This method for platform selection will likely be improved in the future.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
diff --git a/library/aes.c b/library/aes.c
index a81332d..36aa7f2 100644
--- a/library/aes.c
+++ b/library/aes.c
@@ -552,6 +552,14 @@
 
 #if defined(MBEDTLS_AESNI_HAVE_CODE)
     if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) {
+        /* The intrinsics-based implementation needs 16-byte alignment
+         * for the round key array. */
+        unsigned delta = (uintptr_t) ctx->buf & 0x0000000f;
+        size_t rk_offset = 0;
+        if (delta != 0) {
+            rk_offset = 4 - delta / 4; // 16 bytes = 4 uint32_t
+        }
+        ctx->rk = RK = ctx->buf + rk_offset;
         return mbedtls_aesni_setkey_enc((unsigned char *) ctx->rk, key, keybits);
     }
 #endif
@@ -665,6 +673,17 @@
         goto exit;
     }
 #endif
+#if defined(MBEDTLS_AESNI_HAVE_CODE)
+    if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) {
+        /* The intrinsics-based implementation needs 16-byte alignment
+         * for the round key array. */
+        unsigned delta = (uintptr_t) ctx->buf & 0x0000000f;
+        if (delta != 0) {
+            size_t rk_offset = 4 - delta / 4; // 16 bytes = 4 uint32_t
+            ctx->rk = RK = ctx->buf + rk_offset;
+        }
+    }
+#endif
 
     SK = cty.rk + cty.nr * 4;