Merge pull request #6832 from daverodgman/fast-unaligned-ct

Improve efficiency of some constant time functions
diff --git a/include/mbedtls/mbedtls_config.h b/include/mbedtls/mbedtls_config.h
index 11c3139..2a2c039 100644
--- a/include/mbedtls/mbedtls_config.h
+++ b/include/mbedtls/mbedtls_config.h
@@ -48,8 +48,11 @@
  * Requires support for asm() in compiler.
  *
  * Used in:
+ *      library/aesni.h
  *      library/aria.c
  *      library/bn_mul.h
+ *      library/constant_time.c
+ *      library/padlock.h
  *
  * Required by:
  *      MBEDTLS_AESNI_C
diff --git a/library/aesni.c b/library/aesni.c
index d4abb4d..f6b304d 100644
--- a/library/aesni.c
+++ b/library/aesni.c
@@ -37,12 +37,6 @@
 
 #include <string.h>
 
-/* *INDENT-OFF* */
-#ifndef asm
-#define asm __asm
-#endif
-/* *INDENT-ON* */
-
 #if defined(MBEDTLS_HAVE_X86_64)
 
 /*
diff --git a/library/alignment.h b/library/alignment.h
index bfc965e..aa09ff8 100644
--- a/library/alignment.h
+++ b/library/alignment.h
@@ -29,6 +29,23 @@
 
 #include "mbedtls/build_info.h"
 
+/*
+ * Define MBEDTLS_EFFICIENT_UNALIGNED_ACCESS for architectures where unaligned memory
+ * accesses are known to be efficient.
+ *
+ * All functions defined here will behave correctly regardless, but might be less
+ * efficient when this is not defined.
+ */
+#if defined(__ARM_FEATURE_UNALIGNED) \
+    || defined(__i386__) || defined(__amd64__) || defined(__x86_64__)
+/*
+ * __ARM_FEATURE_UNALIGNED is defined where appropriate by armcc, gcc 7, clang 9
+ * (and later versions) for Arm v7 and later; all x86 platforms should have
+ * efficient unaligned access.
+ */
+#define MBEDTLS_EFFICIENT_UNALIGNED_ACCESS
+#endif
+
 /**
  * Read the unsigned 16 bits integer from the given address, which need not
  * be aligned.
diff --git a/library/bn_mul.h b/library/bn_mul.h
index 307c241..ab59fbd 100644
--- a/library/bn_mul.h
+++ b/library/bn_mul.h
@@ -83,10 +83,6 @@
 /* *INDENT-OFF* */
 #if defined(MBEDTLS_HAVE_ASM)
 
-#ifndef asm
-#define asm __asm
-#endif
-
 /* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm */
 #if defined(__GNUC__) && \
     ( !defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 )
diff --git a/library/common.h b/library/common.h
index fd3ddba..46af79f 100644
--- a/library/common.h
+++ b/library/common.h
@@ -122,11 +122,13 @@
  */
 inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n)
 {
-    size_t i;
-    for (i = 0; (i + 4) <= n; i += 4) {
+    size_t i = 0;
+#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
+    for (; (i + 4) <= n; i += 4) {
         uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
         mbedtls_put_unaligned_uint32(r + i, x);
     }
+#endif
     for (; i < n; i++) {
         r[i] = a[i] ^ b[i];
     }
@@ -140,4 +142,11 @@
 #define /*no-check-names*/ __func__ __FUNCTION__
 #endif
 
+/* Define `asm` for compilers which don't define it. */
+/* *INDENT-OFF* */
+#ifndef asm
+#define asm __asm__
+#endif
+/* *INDENT-ON* */
+
 #endif /* MBEDTLS_LIBRARY_COMMON_H */
diff --git a/library/constant_time.c b/library/constant_time.c
index 442eb0e..7f4d509 100644
--- a/library/constant_time.c
+++ b/library/constant_time.c
@@ -47,16 +47,63 @@
 
 #include <string.h>
 
+/*
+ * Define MBEDTLS_EFFICIENT_UNALIGNED_VOLATILE_ACCESS where assembly is present to
+ * perform fast unaligned access to volatile data.
+ *
+ * This is needed because mbedtls_get_unaligned_uintXX etc don't support volatile
+ * memory accesses.
+ *
+ * Some of these definitions could be moved into alignment.h but for now they are
+ * only used here.
+ */
+#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) && defined(MBEDTLS_HAVE_ASM)
+#if defined(__arm__) || defined(__thumb__) || defined(__thumb2__) || defined(__aarch64__)
+#define MBEDTLS_EFFICIENT_UNALIGNED_VOLATILE_ACCESS
+#endif
+#endif
+
+#if defined(MBEDTLS_EFFICIENT_UNALIGNED_VOLATILE_ACCESS)
+static inline uint32_t mbedtls_get_unaligned_volatile_uint32(volatile const unsigned char *p)
+{
+    /* This is UB, even where it's safe:
+     *    return *((volatile uint32_t*)p);
+     * so instead the same thing is expressed in assembly below.
+     */
+    uint32_t r;
+#if defined(__arm__) || defined(__thumb__) || defined(__thumb2__)
+    asm ("ldr %0, [%1]" : "=r" (r) : "r" (p) :);
+#elif defined(__aarch64__)
+    asm ("ldr %w0, [%1]" : "=r" (r) : "r" (p) :);
+#endif
+    return r;
+}
+#endif /* MBEDTLS_EFFICIENT_UNALIGNED_VOLATILE_ACCESS */
+
 int mbedtls_ct_memcmp(const void *a,
                       const void *b,
                       size_t n)
 {
-    size_t i;
+    size_t i = 0;
+    /*
+     * `A` and `B` are cast to volatile to ensure that the compiler
+     * generates code that always fully reads both buffers.
+     * Otherwise it could generate a test to exit early if `diff` has all
+     * bits set early in the loop.
+     */
     volatile const unsigned char *A = (volatile const unsigned char *) a;
     volatile const unsigned char *B = (volatile const unsigned char *) b;
-    volatile unsigned char diff = 0;
+    uint32_t diff = 0;
 
-    for (i = 0; i < n; i++) {
+#if defined(MBEDTLS_EFFICIENT_UNALIGNED_VOLATILE_ACCESS)
+    for (; (i + 4) <= n; i += 4) {
+        uint32_t x = mbedtls_get_unaligned_volatile_uint32(A + i);
+        uint32_t y = mbedtls_get_unaligned_volatile_uint32(B + i);
+        diff |= x ^ y;
+    }
+#endif
+
+    for (; i < n; i++) {
         /* Read volatile data in order before computing diff.
          * This avoids IAR compiler warning:
          * 'the order of volatile accesses is undefined ..' */
@@ -414,10 +461,22 @@
 {
     /* mask = c1 == c2 ? 0xff : 0x00 */
     const size_t equal = mbedtls_ct_size_bool_eq(c1, c2);
-    const unsigned char mask = (unsigned char) mbedtls_ct_size_mask(equal);
 
     /* dest[i] = c1 == c2 ? src[i] : dest[i] */
-    for (size_t i = 0; i < len; i++) {
+    size_t i = 0;
+#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
+    const uint32_t mask32 = (uint32_t) mbedtls_ct_size_mask(equal);
+    const unsigned char mask = (unsigned char) mask32 & 0xff;
+
+    for (; (i + 4) <= len; i += 4) {
+        uint32_t a = mbedtls_get_unaligned_uint32(src  + i) &  mask32;
+        uint32_t b = mbedtls_get_unaligned_uint32(dest + i) & ~mask32;
+        mbedtls_put_unaligned_uint32(dest + i, a | b);
+    }
+#else
+    const unsigned char mask = (unsigned char) mbedtls_ct_size_mask(equal);
+#endif /* MBEDTLS_EFFICIENT_UNALIGNED_ACCESS */
+    for (; i < len; i++) {
         dest[i] = (src[i] & mask) | (dest[i] & ~mask);
     }
 }
diff --git a/library/padlock.c b/library/padlock.c
index b6c6919..f42c40f 100644
--- a/library/padlock.c
+++ b/library/padlock.c
@@ -31,12 +31,6 @@
 
 #include <string.h>
 
-/* *INDENT-OFF* */
-#ifndef asm
-#define asm __asm
-#endif
-/* *INDENT-ON* */
-
 #if defined(MBEDTLS_HAVE_X86)
 
 /*
diff --git a/library/sha256.c b/library/sha256.c
index 16fd20d..cb09a71 100644
--- a/library/sha256.c
+++ b/library/sha256.c
@@ -89,12 +89,6 @@
 #include <signal.h>
 #include <setjmp.h>
 
-/* *INDENT-OFF* */
-#ifndef asm
-#define asm __asm__
-#endif
-/* *INDENT-ON* */
-
 static jmp_buf return_from_sigill;
 
 /*
diff --git a/library/sha512.c b/library/sha512.c
index 0ea6421..efcbed4 100644
--- a/library/sha512.c
+++ b/library/sha512.c
@@ -104,12 +104,6 @@
 #include <signal.h>
 #include <setjmp.h>
 
-/* *INDENT-OFF* */
-#ifndef asm
-#define asm __asm__
-#endif
-/* *INDENT-ON* */
-
 static jmp_buf return_from_sigill;
 
 /*
@@ -300,12 +294,6 @@
 #  define mbedtls_internal_sha512_process_a64_crypto      mbedtls_internal_sha512_process
 #endif
 
-/* *INDENT-OFF* */
-#ifndef asm
-#define asm __asm__
-#endif
-/* *INDENT-ON* */
-
 /* Accelerated SHA-512 implementation originally written by Simon Tatham for PuTTY,
  * under the MIT licence; dual-licensed as Apache 2 with his kind permission.
  */
diff --git a/tests/suites/test_suite_alignment.function b/tests/suites/test_suite_alignment.function
index 6c98f23..f670331 100644
--- a/tests/suites/test_suite_alignment.function
+++ b/tests/suites/test_suite_alignment.function
@@ -6,7 +6,6 @@
 #if defined(__clang__)
 #pragma clang diagnostic ignored "-Wunreachable-code"
 #endif
-#include <stdio.h>
 
 /*
  * Convert a string of the form "abcd" (case-insensitive) to a uint64_t.
diff --git a/tests/suites/test_suite_constant_time.data b/tests/suites/test_suite_constant_time.data
index 4504aa4..91a25fa 100644
--- a/tests/suites/test_suite_constant_time.data
+++ b/tests/suites/test_suite_constant_time.data
@@ -9,3 +9,129 @@
 # we could get this with 255-bytes plaintext and untruncated SHA-384
 Constant-flow memcpy from offset: large
 ssl_cf_memcpy_offset:100:339:48
+
+mbedtls_ct_memcmp NULL
+mbedtls_ct_memcmp_null
+
+mbedtls_ct_memcmp len 1
+mbedtls_ct_memcmp:-1:1:0
+
+mbedtls_ct_memcmp len 3
+mbedtls_ct_memcmp:-1:3:0
+
+mbedtls_ct_memcmp len 4
+mbedtls_ct_memcmp:-1:4:0
+
+mbedtls_ct_memcmp len 5
+mbedtls_ct_memcmp:-1:5:0
+
+mbedtls_ct_memcmp len 15
+mbedtls_ct_memcmp:-1:15:0
+
+mbedtls_ct_memcmp len 16
+mbedtls_ct_memcmp:-1:16:0
+
+mbedtls_ct_memcmp len 17
+mbedtls_ct_memcmp:-1:17:0
+
+mbedtls_ct_memcmp len 1 different
+mbedtls_ct_memcmp:0:1:0
+
+mbedtls_ct_memcmp len 17 different
+mbedtls_ct_memcmp:0:17:0
+
+mbedtls_ct_memcmp len 17 different 1
+mbedtls_ct_memcmp:1:17:0
+
+mbedtls_ct_memcmp len 17 different 4
+mbedtls_ct_memcmp:4:17:0
+
+mbedtls_ct_memcmp len 17 different 10
+mbedtls_ct_memcmp:10:17:0
+
+mbedtls_ct_memcmp len 17 different 16
+mbedtls_ct_memcmp:16:17:0
+
+mbedtls_ct_memcmp len 1 offset 1 different
+mbedtls_ct_memcmp:0:1:1
+
+mbedtls_ct_memcmp len 17 offset 1 different
+mbedtls_ct_memcmp:0:17:1
+
+mbedtls_ct_memcmp len 17 offset 1 different 1
+mbedtls_ct_memcmp:1:17:1
+
+mbedtls_ct_memcmp len 17 offset 1 different 5
+mbedtls_ct_memcmp:5:17:1
+
+mbedtls_ct_memcmp len 1 offset 1
+mbedtls_ct_memcmp:-1:1:1
+
+mbedtls_ct_memcmp len 1 offset 2
+mbedtls_ct_memcmp:-1:1:2
+
+mbedtls_ct_memcmp len 1 offset 3
+mbedtls_ct_memcmp:-1:1:3
+
+mbedtls_ct_memcmp len 5 offset 1
+mbedtls_ct_memcmp:-1:5:1
+
+mbedtls_ct_memcmp len 5 offset 2
+mbedtls_ct_memcmp:-1:5:2
+
+mbedtls_ct_memcmp len 5 offset 3
+mbedtls_ct_memcmp:-1:5:3
+
+mbedtls_ct_memcmp len 17 offset 1
+mbedtls_ct_memcmp:-1:17:1
+
+mbedtls_ct_memcmp len 17 offset 2
+mbedtls_ct_memcmp:-1:17:2
+
+mbedtls_ct_memcmp len 17 offset 3
+mbedtls_ct_memcmp:-1:17:3
+
+mbedtls_ct_memcpy_if_eq len 1 offset 0
+mbedtls_ct_memcpy_if_eq:1:1:0
+
+mbedtls_ct_memcpy_if_eq len 1 offset 1
+mbedtls_ct_memcpy_if_eq:1:1:1
+
+mbedtls_ct_memcpy_if_eq len 4 offset 0
+mbedtls_ct_memcpy_if_eq:1:1:0
+
+mbedtls_ct_memcpy_if_eq len 4 offset 1
+mbedtls_ct_memcpy_if_eq:1:1:1
+
+mbedtls_ct_memcpy_if_eq len 4 offset 2
+mbedtls_ct_memcpy_if_eq:1:1:2
+
+mbedtls_ct_memcpy_if_eq len 4 offset 3
+mbedtls_ct_memcpy_if_eq:1:1:3
+
+mbedtls_ct_memcpy_if_eq len 15 offset 0
+mbedtls_ct_memcpy_if_eq:1:15:0
+
+mbedtls_ct_memcpy_if_eq len 15 offset 1
+mbedtls_ct_memcpy_if_eq:1:15:1
+
+mbedtls_ct_memcpy_if_eq len 16 offset 0
+mbedtls_ct_memcpy_if_eq:1:16:0
+
+mbedtls_ct_memcpy_if_eq len 16 offset 1
+mbedtls_ct_memcpy_if_eq:1:16:1
+
+mbedtls_ct_memcpy_if_eq len 17 offset 0
+mbedtls_ct_memcpy_if_eq:1:17:0
+
+mbedtls_ct_memcpy_if_eq len 17 offset 1
+mbedtls_ct_memcpy_if_eq:1:17:1
+
+mbedtls_ct_memcpy_if_eq len 0 not eq
+mbedtls_ct_memcpy_if_eq:0:17:0
+
+mbedtls_ct_memcpy_if_eq len 5 offset 1 not eq
+mbedtls_ct_memcpy_if_eq:0:5:1
+
+mbedtls_ct_memcpy_if_eq len 17 offset 3 not eq
+mbedtls_ct_memcpy_if_eq:0:17:3
diff --git a/tests/suites/test_suite_constant_time.function b/tests/suites/test_suite_constant_time.function
index a40149a..14dc8ae 100644
--- a/tests/suites/test_suite_constant_time.function
+++ b/tests/suites/test_suite_constant_time.function
@@ -15,6 +15,108 @@
 #include <test/constant_flow.h>
 /* END_HEADER */
 
+/* BEGIN_CASE */
+void mbedtls_ct_memcmp_null()
+{
+    uint32_t x;
+    TEST_ASSERT(mbedtls_ct_memcmp(&x, NULL, 0) == 0);
+    TEST_ASSERT(mbedtls_ct_memcmp(NULL, &x, 0) == 0);
+    TEST_ASSERT(mbedtls_ct_memcmp(NULL, NULL, 0) == 0);
+}
+/* END_CASE */
+
+/* BEGIN_CASE */
+void mbedtls_ct_memcmp(int same, int size, int offset)
+{
+    uint8_t *a = NULL, *b = NULL;
+    ASSERT_ALLOC(a, size + offset);
+    ASSERT_ALLOC(b, size + offset);
+
+    TEST_CF_SECRET(a + offset, size);
+    TEST_CF_SECRET(b + offset, size);
+
+    /* Construct data that matches, if same == -1, otherwise
+     * same gives the number of bytes (after the initial offset)
+     * that will match; after that it will differ.
+     */
+    for (int i = 0; i < size + offset; i++) {
+        a[i] = i & 0xff;
+        if (same == -1 || (i - offset) < same) {
+            b[i] = a[i];
+        } else {
+            b[i] = (i + 1) & 0xff;
+        }
+    }
+
+    int reference = memcmp(a + offset, b + offset, size);
+    int actual = mbedtls_ct_memcmp(a + offset, b + offset, size);
+    TEST_CF_PUBLIC(a + offset, size);
+    TEST_CF_PUBLIC(b + offset, size);
+
+    if (same == -1 || same >= size) {
+        TEST_ASSERT(reference == 0);
+        TEST_ASSERT(actual == 0);
+    } else {
+        TEST_ASSERT(reference != 0);
+        TEST_ASSERT(actual != 0);
+    }
+exit:
+    mbedtls_free(a);
+    mbedtls_free(b);
+}
+/* END_CASE */
+
+/* BEGIN_CASE depends_on:MBEDTLS_SSL_SOME_SUITES_USE_MAC */
+void mbedtls_ct_memcpy_if_eq(int eq, int size, int offset)
+{
+    uint8_t *src = NULL, *result = NULL, *expected = NULL;
+    ASSERT_ALLOC(src, size + offset);
+    ASSERT_ALLOC(result, size + offset);
+    ASSERT_ALLOC(expected, size + offset);
+
+    for (int i = 0; i < size + offset; i++) {
+        src[i]    = 1;
+        result[i] = 0xff;
+        expected[i] = eq ? 1 : 0xff;
+    }
+
+    int one, secret_eq;
+    TEST_CF_SECRET(&one, sizeof(one));
+    TEST_CF_SECRET(&secret_eq,  sizeof(secret_eq));
+    one = 1;
+    secret_eq = eq;
+
+    mbedtls_ct_memcpy_if_eq(result + offset, src, size, secret_eq, one);
+
+    TEST_CF_PUBLIC(&one, sizeof(one));
+    TEST_CF_PUBLIC(&secret_eq, sizeof(secret_eq));
+
+    ASSERT_COMPARE(expected, size, result + offset, size);
+
+    for (int i = 0; i < size + offset; i++) {
+        src[i]    = 1;
+        result[i] = 0xff;
+        expected[i] = eq ? 1 : 0xff;
+    }
+
+    TEST_CF_SECRET(&one, sizeof(one));
+    TEST_CF_SECRET(&secret_eq,  sizeof(secret_eq));
+    one = 1;
+    secret_eq = eq;
+
+    mbedtls_ct_memcpy_if_eq(result, src + offset, size, secret_eq, one);
+
+    TEST_CF_PUBLIC(&one, sizeof(one));
+    TEST_CF_PUBLIC(&secret_eq, sizeof(secret_eq));
+
+    ASSERT_COMPARE(expected, size, result, size);
+exit:
+    mbedtls_free(src);
+    mbedtls_free(result);
+    mbedtls_free(expected);
+}
+/* END_CASE */
+
 /* BEGIN_CASE depends_on:MBEDTLS_SSL_SOME_SUITES_USE_TLS_CBC:MBEDTLS_TEST_HOOKS */
 void ssl_cf_memcpy_offset(int offset_min, int offset_max, int len)
 {