AESNI: add implementation with intrinsics

As of this commit, to use the intrinsics for MBEDTLS_AESNI_C:

* With MSVC, this should be the default.
* With Clang, build with `clang -maes -mpclmul` or equivalent.
* With GCC, build with `gcc -mpclmul -msse2` or equivalent.

In particular, for now, with a GCC-like compiler, when building specifically
for a target that supports both the AES and CLMUL (carry-less multiplication,
used by GCM) instructions, the old implementation using assembly is selected.

This method for platform selection will likely be improved in the future.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
diff --git a/library/aes.c b/library/aes.c
index e41810a..69d4ead 100644
--- a/library/aes.c
+++ b/library/aes.c
@@ -543,6 +543,13 @@
 
 #if defined(MBEDTLS_AESNI_HAVE_CODE)
     if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) {
+        /* The intrinsics-based implementation needs 16-byte alignment
+         * for the round key array. */
+        unsigned delta = (uintptr_t) ctx->buf & 0x0000000f;
+        if (delta != 0) {
+            ctx->rk_offset = 4 - delta / 4; // 16 bytes = 4 uint32_t
+        }
+        RK = ctx->buf + ctx->rk_offset;
         return mbedtls_aesni_setkey_enc((unsigned char *) RK, key, keybits);
     }
 #endif
@@ -644,6 +651,16 @@
         ctx->rk_offset = MBEDTLS_PADLOCK_ALIGN16(ctx->buf) - ctx->buf;
     }
 #endif
+#if defined(MBEDTLS_AESNI_HAVE_CODE)
+    if (mbedtls_aesni_has_support(MBEDTLS_AESNI_AES)) {
+        /* The intrinsics-based implementation needs 16-byte alignment
+         * for the round key array. */
+        unsigned delta = (uintptr_t) ctx->buf & 0x0000000f;
+        if (delta != 0) {
+            ctx->rk_offset = 4 - delta / 4; // 16 bytes = 4 uint32_t
+        }
+    }
+#endif
     RK = ctx->buf + ctx->rk_offset;
 
     /* Also checks keybits */