Merge pull request #6751 from ZachFleck42/development

Fix typo in `library/entropy.c`
diff --git a/ChangeLog b/ChangeLog
index 80b8617..cb277dc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -3,7 +3,7 @@
 = Mbed TLS 3.2.1 branch released 2022-07-12
 
 Bugfix
-   *  Re-add missing generated file library/ssl_debug_helpers_generated.c
+   *  Re-add missing generated file library/psa_crypto_driver_wrappers.c
 
 = Mbed TLS 3.2.0 branch released 2022-07-11
 
diff --git a/ChangeLog.d/alignment-perf.txt b/ChangeLog.d/alignment-perf.txt
new file mode 100644
index 0000000..7a8e6fb
--- /dev/null
+++ b/ChangeLog.d/alignment-perf.txt
@@ -0,0 +1,8 @@
+Features
+   * General performance improvements by accessing multiple bytes at a time.
+     Fixes #1666.
+   * Improvements to use of unaligned and byte-swapped memory, reducing code
+     size and improving performance (depending on compiler and target
+     architecture).
+Changes
+   * Mixed-endian systems are explicitly not supported any more.
diff --git a/README.md b/README.md
index 1a4edb0..8a23bd2 100644
--- a/README.md
+++ b/README.md
@@ -261,6 +261,7 @@
 - Signed integers must be represented using two's complement.
 - `int` and `size_t` must be at least 32 bits wide.
 - The types `uint8_t`, `uint16_t`, `uint32_t` and their signed equivalents must be available.
+- Mixed-endian platforms are not supported.
 
 PSA cryptography API
 --------------------
diff --git a/library/aes.c b/library/aes.c
index 319d9bb..56dc5cf 100644
--- a/library/aes.c
+++ b/library/aes.c
@@ -978,7 +978,6 @@
                     const unsigned char *input,
                     unsigned char *output )
 {
-    int i;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char temp[16];
 
@@ -1009,8 +1008,7 @@
             if( ret != 0 )
                 goto exit;
 
-            for( i = 0; i < 16; i++ )
-                output[i] = (unsigned char)( output[i] ^ iv[i] );
+            mbedtls_xor( output, output, iv, 16 );
 
             memcpy( iv, temp, 16 );
 
@@ -1023,8 +1021,7 @@
     {
         while( length > 0 )
         {
-            for( i = 0; i < 16; i++ )
-                output[i] = (unsigned char)( input[i] ^ iv[i] );
+            mbedtls_xor( output, input, iv, 16 );
 
             ret = mbedtls_aes_crypt_ecb( ctx, mode, output, output );
             if( ret != 0 )
@@ -1106,8 +1103,6 @@
 
     while( blocks-- )
     {
-        size_t i;
-
         if( leftover && ( mode == MBEDTLS_AES_DECRYPT ) && blocks == 0 )
         {
             /* We are on the last block in a decrypt operation that has
@@ -1119,15 +1114,13 @@
             mbedtls_gf128mul_x_ble( tweak, tweak );
         }
 
-        for( i = 0; i < 16; i++ )
-            tmp[i] = input[i] ^ tweak[i];
+        mbedtls_xor( tmp, input, tweak, 16 );
 
         ret = mbedtls_aes_crypt_ecb( &ctx->crypt, mode, tmp, tmp );
         if( ret != 0 )
             return( ret );
 
-        for( i = 0; i < 16; i++ )
-            output[i] = tmp[i] ^ tweak[i];
+        mbedtls_xor( output, tmp, tweak, 16 );
 
         /* Update the tweak for the next block. */
         mbedtls_gf128mul_x_ble( tweak, tweak );
@@ -1148,19 +1141,18 @@
         unsigned char *prev_output = output - 16;
 
         /* Copy ciphertext bytes from the previous block to our output for each
-         * byte of ciphertext we won't steal. At the same time, copy the
-         * remainder of the input for this final round (since the loop bounds
-         * are the same). */
+         * byte of ciphertext we won't steal. */
         for( i = 0; i < leftover; i++ )
         {
             output[i] = prev_output[i];
-            tmp[i] = input[i] ^ t[i];
         }
 
+        /* Copy the remainder of the input for this final round. */
+        mbedtls_xor( tmp, input, t, leftover );
+
         /* Copy ciphertext bytes from the previous block for input in this
          * round. */
-        for( ; i < 16; i++ )
-            tmp[i] = prev_output[i] ^ t[i];
+        mbedtls_xor( tmp + i, prev_output + i, t + i, 16 - i );
 
         ret = mbedtls_aes_crypt_ecb( &ctx->crypt, mode, tmp, tmp );
         if( ret != 0 )
@@ -1168,8 +1160,7 @@
 
         /* Write the result back to the previous block, overriding the previous
          * output we copied. */
-        for( i = 0; i < 16; i++ )
-            prev_output[i] = tmp[i] ^ t[i];
+        mbedtls_xor( prev_output, tmp, t, 16 );
     }
 
     return( 0 );
diff --git a/library/alignment.h b/library/alignment.h
new file mode 100644
index 0000000..3c5fa23
--- /dev/null
+++ b/library/alignment.h
@@ -0,0 +1,494 @@
+/**
+ * \file alignment.h
+ *
+ * \brief Utility code for dealing with unaligned memory accesses
+ */
+/*
+ *  Copyright The Mbed TLS Contributors
+ *  SPDX-License-Identifier: Apache-2.0
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License"); you may
+ *  not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#ifndef MBEDTLS_LIBRARY_ALIGNMENT_H
+#define MBEDTLS_LIBRARY_ALIGNMENT_H
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "mbedtls/build_info.h"
+
+/**
+ * Read the unsigned 16 bits integer from the given address, which need not
+ * be aligned.
+ *
+ * \param   p pointer to 2 bytes of data
+ * \return  Data at the given address
+ */
+inline uint16_t mbedtls_get_unaligned_uint16( const void *p )
+{
+    uint16_t r;
+    memcpy( &r, p, sizeof( r ) );
+    return r;
+}
+
+/**
+ * Write the unsigned 16 bits integer to the given address, which need not
+ * be aligned.
+ *
+ * \param   p pointer to 2 bytes of data
+ * \param   x data to write
+ */
+inline void mbedtls_put_unaligned_uint16( void *p, uint16_t x )
+{
+    memcpy( p, &x, sizeof( x ) );
+}
+
+/**
+ * Read the unsigned 32 bits integer from the given address, which need not
+ * be aligned.
+ *
+ * \param   p pointer to 4 bytes of data
+ * \return  Data at the given address
+ */
+inline uint32_t mbedtls_get_unaligned_uint32( const void *p )
+{
+    uint32_t r;
+    memcpy( &r, p, sizeof( r ) );
+    return r;
+}
+
+/**
+ * Write the unsigned 32 bits integer to the given address, which need not
+ * be aligned.
+ *
+ * \param   p pointer to 4 bytes of data
+ * \param   x data to write
+ */
+inline void mbedtls_put_unaligned_uint32( void *p, uint32_t x )
+{
+    memcpy( p, &x, sizeof( x ) );
+}
+
+/**
+ * Read the unsigned 64 bits integer from the given address, which need not
+ * be aligned.
+ *
+ * \param   p pointer to 8 bytes of data
+ * \return  Data at the given address
+ */
+inline uint64_t mbedtls_get_unaligned_uint64( const void *p )
+{
+    uint64_t r;
+    memcpy( &r, p, sizeof( r ) );
+    return r;
+}
+
+/**
+ * Write the unsigned 64 bits integer to the given address, which need not
+ * be aligned.
+ *
+ * \param   p pointer to 8 bytes of data
+ * \param   x data to write
+ */
+inline void mbedtls_put_unaligned_uint64( void *p, uint64_t x )
+{
+    memcpy( p, &x, sizeof( x ) );
+}
+
+/** Byte Reading Macros
+ *
+ * Given a multi-byte integer \p x, MBEDTLS_BYTE_n retrieves the n-th
+ * byte from x, where byte 0 is the least significant byte.
+ */
+#define MBEDTLS_BYTE_0( x ) ( (uint8_t) (   ( x )         & 0xff ) )
+#define MBEDTLS_BYTE_1( x ) ( (uint8_t) ( ( ( x ) >> 8  ) & 0xff ) )
+#define MBEDTLS_BYTE_2( x ) ( (uint8_t) ( ( ( x ) >> 16 ) & 0xff ) )
+#define MBEDTLS_BYTE_3( x ) ( (uint8_t) ( ( ( x ) >> 24 ) & 0xff ) )
+#define MBEDTLS_BYTE_4( x ) ( (uint8_t) ( ( ( x ) >> 32 ) & 0xff ) )
+#define MBEDTLS_BYTE_5( x ) ( (uint8_t) ( ( ( x ) >> 40 ) & 0xff ) )
+#define MBEDTLS_BYTE_6( x ) ( (uint8_t) ( ( ( x ) >> 48 ) & 0xff ) )
+#define MBEDTLS_BYTE_7( x ) ( (uint8_t) ( ( ( x ) >> 56 ) & 0xff ) )
+
+/*
+ * Detect GCC built-in byteswap routines
+ */
+#if defined(__GNUC__) && defined(__GNUC_PREREQ)
+#if __GNUC_PREREQ(4,8)
+#define MBEDTLS_BSWAP16 __builtin_bswap16
+#endif /* __GNUC_PREREQ(4,8) */
+#if __GNUC_PREREQ(4,3)
+#define MBEDTLS_BSWAP32 __builtin_bswap32
+#define MBEDTLS_BSWAP64 __builtin_bswap64
+#endif /* __GNUC_PREREQ(4,3) */
+#endif /* defined(__GNUC__) && defined(__GNUC_PREREQ) */
+
+/*
+ * Detect Clang built-in byteswap routines
+ */
+#if defined(__clang__) && defined(__has_builtin)
+#if __has_builtin(__builtin_bswap16)
+#define MBEDTLS_BSWAP16 __builtin_bswap16
+#endif /* __has_builtin(__builtin_bswap16) */
+#if __has_builtin(__builtin_bswap32)
+#define MBEDTLS_BSWAP32 __builtin_bswap32
+#endif /* __has_builtin(__builtin_bswap32) */
+#if __has_builtin(__builtin_bswap64)
+#define MBEDTLS_BSWAP64 __builtin_bswap64
+#endif /* __has_builtin(__builtin_bswap64) */
+#endif /* defined(__clang__) && defined(__has_builtin) */
+
+/*
+ * Detect MSVC built-in byteswap routines
+ */
+#if defined(_MSC_VER)
+#define MBEDTLS_BSWAP16 _byteswap_ushort
+#define MBEDTLS_BSWAP32 _byteswap_ulong
+#define MBEDTLS_BSWAP64 _byteswap_uint64
+#endif /* defined(_MSC_VER) */
+
+/* Detect armcc built-in byteswap routine */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 410000)
+#define MBEDTLS_BSWAP32 __rev
+#endif
+
+/*
+ * Where compiler built-ins are not present, fall back to C code that the
+ * compiler may be able to detect and transform into the relevant bswap or
+ * similar instruction.
+ */
+#if !defined(MBEDTLS_BSWAP16)
+static inline uint16_t mbedtls_bswap16( uint16_t x ) {
+    return
+         ( x & 0x00ff ) << 8 |
+         ( x & 0xff00 ) >> 8;
+}
+#define MBEDTLS_BSWAP16 mbedtls_bswap16
+#endif /* !defined(MBEDTLS_BSWAP16) */
+
+#if !defined(MBEDTLS_BSWAP32)
+static inline uint32_t mbedtls_bswap32( uint32_t x ) {
+    return
+         ( x & 0x000000ff ) << 24 |
+         ( x & 0x0000ff00 ) <<  8 |
+         ( x & 0x00ff0000 ) >>  8 |
+         ( x & 0xff000000 ) >> 24;
+}
+#define MBEDTLS_BSWAP32 mbedtls_bswap32
+#endif /* !defined(MBEDTLS_BSWAP32) */
+
+#if !defined(MBEDTLS_BSWAP64)
+static inline uint64_t mbedtls_bswap64( uint64_t x ) {
+    return
+         ( x & 0x00000000000000ff ) << 56 |
+         ( x & 0x000000000000ff00 ) << 40 |
+         ( x & 0x0000000000ff0000 ) << 24 |
+         ( x & 0x00000000ff000000 ) <<  8 |
+         ( x & 0x000000ff00000000 ) >>  8 |
+         ( x & 0x0000ff0000000000 ) >> 24 |
+         ( x & 0x00ff000000000000 ) >> 40 |
+         ( x & 0xff00000000000000 ) >> 56;
+}
+#define MBEDTLS_BSWAP64 mbedtls_bswap64
+#endif /* !defined(MBEDTLS_BSWAP64) */
+
+#if !defined(__BYTE_ORDER__)
+static const uint16_t mbedtls_byte_order_detector = { 0x100 };
+#define MBEDTLS_IS_BIG_ENDIAN (*((unsigned char *) (&mbedtls_byte_order_detector)) == 0x01)
+#else
+#define MBEDTLS_IS_BIG_ENDIAN ((__BYTE_ORDER__) == (__ORDER_BIG_ENDIAN__))
+#endif /* !defined(__BYTE_ORDER__) */
+
+/**
+ * Get the unsigned 32 bits integer corresponding to four bytes in
+ * big-endian order (MSB first).
+ *
+ * \param   data    Base address of the memory to get the four bytes from.
+ * \param   offset  Offset from \p data of the first and most significant
+ *                  byte of the four bytes to build the 32 bits unsigned
+ *                  integer from.
+ */
+#define MBEDTLS_GET_UINT32_BE( data, offset )                              \
+    ( ( MBEDTLS_IS_BIG_ENDIAN )                                            \
+        ? mbedtls_get_unaligned_uint32((data) + (offset))                  \
+        : MBEDTLS_BSWAP32(mbedtls_get_unaligned_uint32((data) + (offset))) \
+    )
+
+/**
+ * Put in memory a 32 bits unsigned integer in big-endian order.
+ *
+ * \param   n       32 bits unsigned integer to put in memory.
+ * \param   data    Base address of the memory where to put the 32
+ *                  bits unsigned integer in.
+ * \param   offset  Offset from \p data where to put the most significant
+ *                  byte of the 32 bits unsigned integer \p n.
+ */
+#define MBEDTLS_PUT_UINT32_BE( n, data, offset )                             \
+{                                                                            \
+    if ( MBEDTLS_IS_BIG_ENDIAN )                                             \
+    {                                                                        \
+        mbedtls_put_unaligned_uint32((data) + (offset), (uint32_t)(n));      \
+    }                                                                        \
+    else                                                                     \
+    {                                                                        \
+        mbedtls_put_unaligned_uint32((data) + (offset), MBEDTLS_BSWAP32((uint32_t)(n))); \
+    }                                                                        \
+}
+
+/**
+ * Get the unsigned 32 bits integer corresponding to four bytes in
+ * little-endian order (LSB first).
+ *
+ * \param   data    Base address of the memory to get the four bytes from.
+ * \param   offset  Offset from \p data of the first and least significant
+ *                  byte of the four bytes to build the 32 bits unsigned
+ *                  integer from.
+ */
+#define MBEDTLS_GET_UINT32_LE( data, offset )                              \
+    ( ( MBEDTLS_IS_BIG_ENDIAN )                                            \
+        ? MBEDTLS_BSWAP32(mbedtls_get_unaligned_uint32((data) + (offset))) \
+        : mbedtls_get_unaligned_uint32((data) + (offset))                  \
+    )
+
+
+/**
+ * Put in memory a 32 bits unsigned integer in little-endian order.
+ *
+ * \param   n       32 bits unsigned integer to put in memory.
+ * \param   data    Base address of the memory where to put the 32
+ *                  bits unsigned integer in.
+ * \param   offset  Offset from \p data where to put the least significant
+ *                  byte of the 32 bits unsigned integer \p n.
+ */
+#define MBEDTLS_PUT_UINT32_LE( n, data, offset )                             \
+{                                                                            \
+    if ( MBEDTLS_IS_BIG_ENDIAN )                                             \
+    {                                                                        \
+        mbedtls_put_unaligned_uint32((data) + (offset), MBEDTLS_BSWAP32((uint32_t)(n))); \
+    }                                                                        \
+    else                                                                     \
+    {                                                                        \
+        mbedtls_put_unaligned_uint32((data) + (offset), ((uint32_t)(n)));      \
+    }                                                                        \
+}
+
+/**
+ * Get the unsigned 16 bits integer corresponding to two bytes in
+ * little-endian order (LSB first).
+ *
+ * \param   data    Base address of the memory to get the two bytes from.
+ * \param   offset  Offset from \p data of the first and least significant
+ *                  byte of the two bytes to build the 16 bits unsigned
+ *                  integer from.
+ */
+#define MBEDTLS_GET_UINT16_LE( data, offset )                              \
+    ( ( MBEDTLS_IS_BIG_ENDIAN )                                            \
+        ? MBEDTLS_BSWAP16(mbedtls_get_unaligned_uint16((data) + (offset))) \
+        : mbedtls_get_unaligned_uint16((data) + (offset))                  \
+    )
+
+/**
+ * Put in memory a 16 bits unsigned integer in little-endian order.
+ *
+ * \param   n       16 bits unsigned integer to put in memory.
+ * \param   data    Base address of the memory where to put the 16
+ *                  bits unsigned integer in.
+ * \param   offset  Offset from \p data where to put the least significant
+ *                  byte of the 16 bits unsigned integer \p n.
+ */
+#define MBEDTLS_PUT_UINT16_LE( n, data, offset )                             \
+{                                                                            \
+    if ( MBEDTLS_IS_BIG_ENDIAN )                                             \
+    {                                                                        \
+        mbedtls_put_unaligned_uint16((data) + (offset), MBEDTLS_BSWAP16((uint16_t)(n))); \
+    }                                                                        \
+    else                                                                     \
+    {                                                                        \
+        mbedtls_put_unaligned_uint16((data) + (offset), (uint16_t)(n));      \
+    }                                                                        \
+}
+
+/**
+ * Get the unsigned 16 bits integer corresponding to two bytes in
+ * big-endian order (MSB first).
+ *
+ * \param   data    Base address of the memory to get the two bytes from.
+ * \param   offset  Offset from \p data of the first and most significant
+ *                  byte of the two bytes to build the 16 bits unsigned
+ *                  integer from.
+ */
+#define MBEDTLS_GET_UINT16_BE( data, offset )                              \
+    ( ( MBEDTLS_IS_BIG_ENDIAN )                                            \
+        ? mbedtls_get_unaligned_uint16((data) + (offset))                  \
+        : MBEDTLS_BSWAP16(mbedtls_get_unaligned_uint16((data) + (offset))) \
+    )
+
+/**
+ * Put in memory a 16 bits unsigned integer in big-endian order.
+ *
+ * \param   n       16 bits unsigned integer to put in memory.
+ * \param   data    Base address of the memory where to put the 16
+ *                  bits unsigned integer in.
+ * \param   offset  Offset from \p data where to put the most significant
+ *                  byte of the 16 bits unsigned integer \p n.
+ */
+#define MBEDTLS_PUT_UINT16_BE( n, data, offset )                             \
+{                                                                            \
+    if ( MBEDTLS_IS_BIG_ENDIAN )                                             \
+    {                                                                        \
+        mbedtls_put_unaligned_uint16((data) + (offset), (uint16_t)(n));      \
+    }                                                                        \
+    else                                                                     \
+    {                                                                        \
+        mbedtls_put_unaligned_uint16((data) + (offset), MBEDTLS_BSWAP16((uint16_t)(n))); \
+    }                                                                        \
+}
+
+/**
+ * Get the unsigned 24 bits integer corresponding to three bytes in
+ * big-endian order (MSB first).
+ *
+ * \param   data    Base address of the memory to get the three bytes from.
+ * \param   offset  Offset from \p data of the first and most significant
+ *                  byte of the three bytes to build the 24 bits unsigned
+ *                  integer from.
+ */
+#define MBEDTLS_GET_UINT24_BE( data , offset )                  \
+    (                                                           \
+          ( (uint32_t) ( data )[( offset )    ] << 16 )         \
+        | ( (uint32_t) ( data )[( offset ) + 1] << 8  )         \
+        | ( (uint32_t) ( data )[( offset ) + 2]       )         \
+    )
+
+/**
+ * Put in memory a 24 bits unsigned integer in big-endian order.
+ *
+ * \param   n       24 bits unsigned integer to put in memory.
+ * \param   data    Base address of the memory where to put the 24
+ *                  bits unsigned integer in.
+ * \param   offset  Offset from \p data where to put the most significant
+ *                  byte of the 24 bits unsigned integer \p n.
+ */
+#define MBEDTLS_PUT_UINT24_BE( n, data, offset )                \
+{                                                               \
+    ( data )[( offset )    ] = MBEDTLS_BYTE_2( n );             \
+    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
+    ( data )[( offset ) + 2] = MBEDTLS_BYTE_0( n );             \
+}
+
+/**
+ * Get the unsigned 24 bits integer corresponding to three bytes in
+ * little-endian order (LSB first).
+ *
+ * \param   data    Base address of the memory to get the three bytes from.
+ * \param   offset  Offset from \p data of the first and least significant
+ *                  byte of the three bytes to build the 24 bits unsigned
+ *                  integer from.
+ */
+#define MBEDTLS_GET_UINT24_LE( data, offset )                   \
+    (                                                           \
+          ( (uint32_t) ( data )[( offset )    ]       )         \
+        | ( (uint32_t) ( data )[( offset ) + 1] <<  8 )         \
+        | ( (uint32_t) ( data )[( offset ) + 2] << 16 )         \
+    )
+
+/**
+ * Put in memory a 24 bits unsigned integer in little-endian order.
+ *
+ * \param   n       24 bits unsigned integer to put in memory.
+ * \param   data    Base address of the memory where to put the 24
+ *                  bits unsigned integer in.
+ * \param   offset  Offset from \p data where to put the least significant
+ *                  byte of the 24 bits unsigned integer \p n.
+ */
+#define MBEDTLS_PUT_UINT24_LE( n, data, offset )                \
+{                                                               \
+    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
+    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
+    ( data )[( offset ) + 2] = MBEDTLS_BYTE_2( n );             \
+}
+
+/**
+ * Get the unsigned 64 bits integer corresponding to eight bytes in
+ * big-endian order (MSB first).
+ *
+ * \param   data    Base address of the memory to get the eight bytes from.
+ * \param   offset  Offset from \p data of the first and most significant
+ *                  byte of the eight bytes to build the 64 bits unsigned
+ *                  integer from.
+ */
+#define MBEDTLS_GET_UINT64_BE( data, offset )                              \
+    ( ( MBEDTLS_IS_BIG_ENDIAN )                                            \
+        ? mbedtls_get_unaligned_uint64((data) + (offset))                  \
+        : MBEDTLS_BSWAP64(mbedtls_get_unaligned_uint64((data) + (offset))) \
+    )
+
+/**
+ * Put in memory a 64 bits unsigned integer in big-endian order.
+ *
+ * \param   n       64 bits unsigned integer to put in memory.
+ * \param   data    Base address of the memory where to put the 64
+ *                  bits unsigned integer in.
+ * \param   offset  Offset from \p data where to put the most significant
+ *                  byte of the 64 bits unsigned integer \p n.
+ */
+#define MBEDTLS_PUT_UINT64_BE( n, data, offset )                             \
+{                                                                            \
+    if ( MBEDTLS_IS_BIG_ENDIAN )                                             \
+    {                                                                        \
+        mbedtls_put_unaligned_uint64((data) + (offset), (uint64_t)(n));      \
+    }                                                                        \
+    else                                                                     \
+    {                                                                        \
+        mbedtls_put_unaligned_uint64((data) + (offset), MBEDTLS_BSWAP64((uint64_t)(n))); \
+    }                                                                        \
+}
+
+/**
+ * Get the unsigned 64 bits integer corresponding to eight bytes in
+ * little-endian order (LSB first).
+ *
+ * \param   data    Base address of the memory to get the eight bytes from.
+ * \param   offset  Offset from \p data of the first and least significant
+ *                  byte of the eight bytes to build the 64 bits unsigned
+ *                  integer from.
+ */
+#define MBEDTLS_GET_UINT64_LE( data, offset )                              \
+    ( ( MBEDTLS_IS_BIG_ENDIAN )                                            \
+        ? MBEDTLS_BSWAP64(mbedtls_get_unaligned_uint64((data) + (offset))) \
+        : mbedtls_get_unaligned_uint64((data) + (offset))                  \
+    )
+
+/**
+ * Put in memory a 64 bits unsigned integer in little-endian order.
+ *
+ * \param   n       64 bits unsigned integer to put in memory.
+ * \param   data    Base address of the memory where to put the 64
+ *                  bits unsigned integer in.
+ * \param   offset  Offset from \p data where to put the least significant
+ *                  byte of the 64 bits unsigned integer \p n.
+ */
+#define MBEDTLS_PUT_UINT64_LE( n, data, offset )                             \
+{                                                                            \
+    if ( MBEDTLS_IS_BIG_ENDIAN )                                             \
+    {                                                                        \
+        mbedtls_put_unaligned_uint64((data) + (offset), MBEDTLS_BSWAP64((uint64_t)(n))); \
+    }                                                                        \
+    else                                                                     \
+    {                                                                        \
+        mbedtls_put_unaligned_uint64((data) + (offset), (uint64_t)(n));      \
+    }                                                                        \
+}
+
+#endif /* MBEDTLS_LIBRARY_ALIGNMENT_H */
diff --git a/library/aria.c b/library/aria.c
index 5e52eea..517e10a 100644
--- a/library/aria.c
+++ b/library/aria.c
@@ -98,47 +98,8 @@
  * modify byte order: ( A B C D ) -> ( D C B A ), i.e. change endianness
  *
  * This is submatrix P3 in [1] Appendix B.1
- *
- * Some compilers fail to translate this to a single instruction,
- * so let's provide asm versions for common platforms with C fallback.
  */
-#if defined(MBEDTLS_HAVE_ASM)
-#if defined(__arm__) /* rev available from v6 up */
-/* armcc5 --gnu defines __GNUC__ but doesn't support GNU's extended asm */
-#if defined(__GNUC__) && \
-    ( !defined(__ARMCC_VERSION) || __ARMCC_VERSION >= 6000000 ) && \
-    __ARM_ARCH >= 6
-static inline uint32_t aria_p3( uint32_t x )
-{
-    uint32_t r;
-    __asm( "rev %0, %1" : "=l" (r) : "l" (x) );
-    return( r );
-}
-#define ARIA_P3 aria_p3
-#elif defined(__ARMCC_VERSION) && __ARMCC_VERSION < 6000000 && \
-    ( __TARGET_ARCH_ARM >= 6 || __TARGET_ARCH_THUMB >= 3 )
-static inline uint32_t aria_p3( uint32_t x )
-{
-    uint32_t r;
-    __asm( "rev r, x" );
-    return( r );
-}
-#define ARIA_P3 aria_p3
-#endif
-#endif /* arm */
-#if defined(__GNUC__) && \
-    defined(__i386__) || defined(__amd64__) || defined( __x86_64__)
-static inline uint32_t aria_p3( uint32_t x )
-{
-    __asm( "bswap %0" : "=r" (x) : "0" (x) );
-    return( x );
-}
-#define ARIA_P3 aria_p3
-#endif /* x86 gnuc */
-#endif /* MBEDTLS_HAVE_ASM && GNUC */
-#if !defined(ARIA_P3)
-#define ARIA_P3(x) ARIA_P2( ARIA_P1 ( x ) )
-#endif
+#define ARIA_P3(x) MBEDTLS_BSWAP32(x)
 
 /*
  * ARIA Affine Transform
@@ -583,7 +544,6 @@
                             const unsigned char *input,
                             unsigned char *output )
 {
-    int i;
     unsigned char temp[MBEDTLS_ARIA_BLOCKSIZE];
 
     ARIA_VALIDATE_RET( ctx != NULL );
@@ -603,8 +563,7 @@
             memcpy( temp, input, MBEDTLS_ARIA_BLOCKSIZE );
             mbedtls_aria_crypt_ecb( ctx, input, output );
 
-            for( i = 0; i < MBEDTLS_ARIA_BLOCKSIZE; i++ )
-                output[i] = (unsigned char)( output[i] ^ iv[i] );
+            mbedtls_xor( output, output, iv, MBEDTLS_ARIA_BLOCKSIZE );
 
             memcpy( iv, temp, MBEDTLS_ARIA_BLOCKSIZE );
 
@@ -617,8 +576,7 @@
     {
         while( length > 0 )
         {
-            for( i = 0; i < MBEDTLS_ARIA_BLOCKSIZE; i++ )
-                output[i] = (unsigned char)( input[i] ^ iv[i] );
+            mbedtls_xor( output, input, iv, MBEDTLS_ARIA_BLOCKSIZE );
 
             mbedtls_aria_crypt_ecb( ctx, output, output );
             memcpy( iv, output, MBEDTLS_ARIA_BLOCKSIZE );
diff --git a/library/bignum_core.c b/library/bignum_core.c
index 30fc661..1ce8457 100644
--- a/library/bignum_core.c
+++ b/library/bignum_core.c
@@ -83,45 +83,25 @@
 
 static mbedtls_mpi_uint mpi_bigendian_to_host( mbedtls_mpi_uint a )
 {
-#if defined(__BYTE_ORDER__)
-
-/* Nothing to do on bigendian systems. */
-#if ( __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ )
-    return( a );
-#endif /* __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ */
-
-#if ( __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ )
-
-/* For GCC and Clang, have builtins for byte swapping. */
-#if defined(__GNUC__) && defined(__GNUC_PREREQ)
-#if __GNUC_PREREQ(4,3)
-#define have_bswap
-#endif
-#endif
-
-#if defined(__clang__) && defined(__has_builtin)
-#if __has_builtin(__builtin_bswap32)  &&                 \
-    __has_builtin(__builtin_bswap64)
-#define have_bswap
-#endif
-#endif
-
-#if defined(have_bswap)
-    /* The compiler is hopefully able to statically evaluate this! */
-    switch( sizeof(mbedtls_mpi_uint) )
+    if ( MBEDTLS_IS_BIG_ENDIAN )
     {
-        case 4:
-            return( __builtin_bswap32(a) );
-        case 8:
-            return( __builtin_bswap64(a) );
+        /* Nothing to do on bigendian systems. */
+        return( a );
     }
-#endif
-#endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
-#endif /* __BYTE_ORDER__ */
+    else
+    {
+        switch( sizeof(mbedtls_mpi_uint) )
+        {
+            case 4:
+                return (mbedtls_mpi_uint) MBEDTLS_BSWAP32( (uint32_t)a );
+            case 8:
+                return (mbedtls_mpi_uint) MBEDTLS_BSWAP64( (uint64_t)a );
+        }
 
-    /* Fall back to C-based reordering if we don't know the byte order
-     * or we couldn't use a compiler-specific builtin. */
-    return( mpi_bigendian_to_host_c( a ) );
+        /* Fall back to C-based reordering if we don't know the byte order
+        * or we couldn't use a compiler-specific builtin. */
+        return( mpi_bigendian_to_host_c( a ) );
+    }
 }
 
 void mbedtls_mpi_core_bigendian_to_host( mbedtls_mpi_uint *A,
diff --git a/library/bignum_core.h b/library/bignum_core.h
index add7fee..b7af4d0 100644
--- a/library/bignum_core.h
+++ b/library/bignum_core.h
@@ -517,6 +517,9 @@
  * \brief            Perform a modular exponentiation with secret exponent:
  *                   X = A^E mod N, where \p A is already in Montgomery form.
  *
+ * \p X may be aliased to \p A, but not to \p RR or \p E, even if \p E_limbs ==
+ * \p AN_limbs.
+ *
  * \param[out] X     The destination MPI, as a little endian array of length
  *                   \p AN_limbs.
  * \param[in] A      The base MPI, as a little endian array of length \p AN_limbs.
diff --git a/library/bignum_mod_raw.c b/library/bignum_mod_raw.c
index 22e56b7..266d915 100644
--- a/library/bignum_mod_raw.c
+++ b/library/bignum_mod_raw.c
@@ -124,6 +124,37 @@
 
 /* BEGIN MERGE SLOT 3 */
 
+size_t mbedtls_mpi_mod_raw_inv_prime_working_limbs( size_t AN_limbs )
+{
+    /* mbedtls_mpi_mod_raw_inv_prime() needs a temporary for the exponent,
+     * which will be the same size as the modulus and input (AN_limbs),
+     * and additional space to pass to mbedtls_mpi_core_exp_mod(). */
+    return( AN_limbs +
+            mbedtls_mpi_core_exp_mod_working_limbs( AN_limbs, AN_limbs ) );
+}
+
+void mbedtls_mpi_mod_raw_inv_prime( mbedtls_mpi_uint *X,
+                                    const mbedtls_mpi_uint *A,
+                                    const mbedtls_mpi_uint *N,
+                                    size_t AN_limbs,
+                                    const mbedtls_mpi_uint *RR,
+                                    mbedtls_mpi_uint *T )
+{
+    /* Inversion by power: g^|G| = 1 => g^(-1) = g^(|G|-1), and
+     *                       |G| = N - 1, so we want
+     *                 g^(|G|-1) = g^(N - 2)
+     */
+
+    /* Use the first AN_limbs of T to hold N - 2 */
+    mbedtls_mpi_uint *Nminus2 = T;
+    (void) mbedtls_mpi_core_sub_int( Nminus2, N, 2, AN_limbs );
+
+    /* Rest of T is given to exp_mod for its working space */
+    mbedtls_mpi_core_exp_mod( X,
+                              A, N, AN_limbs, Nminus2, AN_limbs,
+                              RR, T + AN_limbs );
+}
+
 /* END MERGE SLOT 3 */
 
 /* BEGIN MERGE SLOT 4 */
diff --git a/library/bignum_mod_raw.h b/library/bignum_mod_raw.h
index d7b6dd1..698119e 100644
--- a/library/bignum_mod_raw.h
+++ b/library/bignum_mod_raw.h
@@ -174,6 +174,51 @@
 
 /* BEGIN MERGE SLOT 3 */
 
+/**
+ * \brief          Returns the number of limbs of working memory required for
+ *                 a call to `mbedtls_mpi_mod_raw_inv_prime()`.
+ *
+ * \param AN_limbs The number of limbs in the input `A` and the modulus `N`
+ *                 (they must be the same size) that will be given to
+ *                 `mbedtls_mpi_mod_raw_inv_prime()`.
+ *
+ * \return         The number of limbs of working memory required by
+ *                 `mbedtls_mpi_mod_raw_inv_prime()`.
+ */
+size_t mbedtls_mpi_mod_raw_inv_prime_working_limbs( size_t AN_limbs );
+
+/**
+ * \brief Perform fixed-width modular inversion of a Montgomery-form MPI with
+ *        respect to a modulus \p N that must be prime.
+ *
+ * \p X may be aliased to \p A, but not to \p N or \p RR.
+ *
+ * \param[out] X     The modular inverse of \p A with respect to \p N.
+ *                   Will be in Montgomery form.
+ * \param[in] A      The number to calculate the modular inverse of.
+ *                   Must be in Montgomery form. Must not be 0.
+ * \param[in] N      The modulus, as a little-endian array of length \p AN_limbs.
+ *                   Must be prime.
+ * \param AN_limbs   The number of limbs in \p A, \p N and \p RR.
+ * \param[in] RR     The precomputed residue of 2^{2*biL} modulo N, as a little-
+ *                   endian array of length \p AN_limbs.
+ * \param[in,out] T  Temporary storage of at least the number of limbs returned
+ *                   by `mbedtls_mpi_mod_raw_inv_prime_working_limbs()`.
+ *                   Its initial content is unused and its final content is
+ *                   indeterminate.
+ *                   It must not alias or otherwise overlap any of the other
+ *                   parameters.
+ *                   It is up to the caller to zeroize \p T when it is no
+ *                   longer needed, and before freeing it if it was dynamically
+ *                   allocated.
+ */
+void mbedtls_mpi_mod_raw_inv_prime( mbedtls_mpi_uint *X,
+                                    const mbedtls_mpi_uint *A,
+                                    const mbedtls_mpi_uint *N,
+                                    size_t AN_limbs,
+                                    const mbedtls_mpi_uint *RR,
+                                    mbedtls_mpi_uint *T );
+
 /* END MERGE SLOT 3 */
 
 /* BEGIN MERGE SLOT 4 */
diff --git a/library/camellia.c b/library/camellia.c
index 5dd6c56..6e781c7 100644
--- a/library/camellia.c
+++ b/library/camellia.c
@@ -526,7 +526,6 @@
                                 const unsigned char *input,
                                 unsigned char *output )
 {
-    int i;
     unsigned char temp[16];
     if( mode != MBEDTLS_CAMELLIA_ENCRYPT && mode != MBEDTLS_CAMELLIA_DECRYPT )
         return MBEDTLS_ERR_CAMELLIA_BAD_INPUT_DATA;
@@ -541,8 +540,7 @@
             memcpy( temp, input, 16 );
             mbedtls_camellia_crypt_ecb( ctx, mode, input, output );
 
-            for( i = 0; i < 16; i++ )
-                output[i] = (unsigned char)( output[i] ^ iv[i] );
+            mbedtls_xor( output, output, iv, 16 );
 
             memcpy( iv, temp, 16 );
 
@@ -555,8 +553,7 @@
     {
         while( length > 0 )
         {
-            for( i = 0; i < 16; i++ )
-                output[i] = (unsigned char)( input[i] ^ iv[i] );
+            mbedtls_xor( output, input, iv, 16 );
 
             mbedtls_camellia_crypt_ecb( ctx, mode, output, output );
             memcpy( iv, output, 16 );
diff --git a/library/ccm.c b/library/ccm.c
index 675783e..065eb60 100644
--- a/library/ccm.c
+++ b/library/ccm.c
@@ -112,7 +112,6 @@
                               const unsigned char *input,
                               unsigned char *output )
 {
-    size_t i;
     size_t olen = 0;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char tmp_buf[16] = {0};
@@ -125,8 +124,7 @@
         return ret;
     }
 
-    for( i = 0; i < use_len; i++ )
-        output[i] = input[i] ^ tmp_buf[offset + i];
+    mbedtls_xor( output, input, tmp_buf + offset, use_len );
 
     mbedtls_platform_zeroize(tmp_buf, sizeof(tmp_buf));
     return ret;
@@ -269,7 +267,6 @@
                            size_t add_len )
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    unsigned char i;
     size_t olen, use_len, offset;
 
     if( ctx->state & CCM_STATE__ERROR )
@@ -310,8 +307,7 @@
             if( use_len > add_len )
                 use_len = add_len;
 
-            for( i = 0; i < use_len; i++ )
-                ctx->y[i + offset] ^= add[i];
+            mbedtls_xor( ctx->y + offset, ctx->y + offset, add, use_len );
 
             ctx->processed += use_len;
             add_len -= use_len;
@@ -381,8 +377,7 @@
         if( ctx->mode == MBEDTLS_CCM_ENCRYPT || \
             ctx->mode == MBEDTLS_CCM_STAR_ENCRYPT )
         {
-            for( i = 0; i < use_len; i++ )
-                ctx->y[i + offset] ^= input[i];
+            mbedtls_xor( ctx->y + offset, ctx->y + offset, input, use_len );
 
             if( use_len + offset == 16 || ctx->processed == ctx->plaintext_len )
             {
@@ -411,8 +406,7 @@
             if( ret != 0 )
                 goto exit;
 
-            for( i = 0; i < use_len; i++ )
-                ctx->y[i + offset] ^= local_output[i];
+            mbedtls_xor( ctx->y + offset, ctx->y + offset, local_output, use_len );
 
             memcpy( output, local_output, use_len );
             mbedtls_platform_zeroize( local_output, 16 );
diff --git a/library/chacha20.c b/library/chacha20.c
index 85d7461..d17c58c 100644
--- a/library/chacha20.c
+++ b/library/chacha20.c
@@ -217,7 +217,6 @@
                               unsigned char *output )
 {
     size_t offset = 0U;
-    size_t i;
 
     /* Use leftover keystream bytes, if available */
     while( size > 0U && ctx->keystream_bytes_used < CHACHA20_BLOCK_SIZE_BYTES )
@@ -237,17 +236,7 @@
         chacha20_block( ctx->state, ctx->keystream8 );
         ctx->state[CHACHA20_CTR_INDEX]++;
 
-        for( i = 0U; i < 64U; i += 8U )
-        {
-            output[offset + i  ] = input[offset + i  ] ^ ctx->keystream8[i  ];
-            output[offset + i+1] = input[offset + i+1] ^ ctx->keystream8[i+1];
-            output[offset + i+2] = input[offset + i+2] ^ ctx->keystream8[i+2];
-            output[offset + i+3] = input[offset + i+3] ^ ctx->keystream8[i+3];
-            output[offset + i+4] = input[offset + i+4] ^ ctx->keystream8[i+4];
-            output[offset + i+5] = input[offset + i+5] ^ ctx->keystream8[i+5];
-            output[offset + i+6] = input[offset + i+6] ^ ctx->keystream8[i+6];
-            output[offset + i+7] = input[offset + i+7] ^ ctx->keystream8[i+7];
-        }
+        mbedtls_xor( output + offset, input + offset, ctx->keystream8, 64U );
 
         offset += CHACHA20_BLOCK_SIZE_BYTES;
         size   -= CHACHA20_BLOCK_SIZE_BYTES;
@@ -260,10 +249,7 @@
         chacha20_block( ctx->state, ctx->keystream8 );
         ctx->state[CHACHA20_CTR_INDEX]++;
 
-        for( i = 0U; i < size; i++)
-        {
-            output[offset + i] = input[offset + i] ^ ctx->keystream8[i];
-        }
+        mbedtls_xor( output + offset, input + offset, ctx->keystream8, size );
 
         ctx->keystream_bytes_used = size;
 
diff --git a/library/cmac.c b/library/cmac.c
index 3cc49d1..9870856 100644
--- a/library/cmac.c
+++ b/library/cmac.c
@@ -148,15 +148,6 @@
 #endif /* !defined(MBEDTLS_CMAC_ALT) || defined(MBEDTLS_SELF_TEST) */
 
 #if !defined(MBEDTLS_CMAC_ALT)
-static void cmac_xor_block( unsigned char *output, const unsigned char *input1,
-                            const unsigned char *input2,
-                            const size_t block_size )
-{
-    size_t idx;
-
-    for( idx = 0; idx < block_size; idx++ )
-        output[ idx ] = input1[ idx ] ^ input2[ idx ];
-}
 
 /*
  * Create padded last block from (partial) last block.
@@ -247,7 +238,7 @@
                 input,
                 block_size - cmac_ctx->unprocessed_len );
 
-        cmac_xor_block( state, cmac_ctx->unprocessed_block, state, block_size );
+        mbedtls_xor( state, cmac_ctx->unprocessed_block, state, block_size );
 
         if( ( ret = mbedtls_cipher_update( ctx, state, block_size, state,
                                            &olen ) ) != 0 )
@@ -267,7 +258,7 @@
      * final partial or complete block */
     for( j = 1; j < n; j++ )
     {
-        cmac_xor_block( state, input, state, block_size );
+        mbedtls_xor( state, input, state, block_size );
 
         if( ( ret = mbedtls_cipher_update( ctx, state, block_size, state,
                                            &olen ) ) != 0 )
@@ -319,16 +310,16 @@
     if( cmac_ctx->unprocessed_len < block_size )
     {
         cmac_pad( M_last, block_size, last_block, cmac_ctx->unprocessed_len );
-        cmac_xor_block( M_last, M_last, K2, block_size );
+        mbedtls_xor( M_last, M_last, K2, block_size );
     }
     else
     {
         /* Last block is complete block */
-        cmac_xor_block( M_last, last_block, K1, block_size );
+        mbedtls_xor( M_last, last_block, K1, block_size );
     }
 
 
-    cmac_xor_block( state, M_last, state, block_size );
+    mbedtls_xor( state, M_last, state, block_size );
     if( ( ret = mbedtls_cipher_update( ctx, state, block_size, state,
                                        &olen ) ) != 0 )
     {
diff --git a/library/common.h b/library/common.h
index 25d5294..9d3b8fe 100644
--- a/library/common.h
+++ b/library/common.h
@@ -24,9 +24,11 @@
 #define MBEDTLS_LIBRARY_COMMON_H
 
 #include "mbedtls/build_info.h"
+#include "alignment.h"
 
 #include <stddef.h>
 #include <stdint.h>
+#include <stddef.h>
 
 /** Helper to define a function as static except when building invasive tests.
  *
@@ -107,327 +109,30 @@
     return( p == NULL ? NULL : p + n );
 }
 
-/** Byte Reading Macros
- *
- * Given a multi-byte integer \p x, MBEDTLS_BYTE_n retrieves the n-th
- * byte from x, where byte 0 is the least significant byte.
- */
-#define MBEDTLS_BYTE_0( x ) ( (uint8_t) (   ( x )         & 0xff ) )
-#define MBEDTLS_BYTE_1( x ) ( (uint8_t) ( ( ( x ) >> 8  ) & 0xff ) )
-#define MBEDTLS_BYTE_2( x ) ( (uint8_t) ( ( ( x ) >> 16 ) & 0xff ) )
-#define MBEDTLS_BYTE_3( x ) ( (uint8_t) ( ( ( x ) >> 24 ) & 0xff ) )
-#define MBEDTLS_BYTE_4( x ) ( (uint8_t) ( ( ( x ) >> 32 ) & 0xff ) )
-#define MBEDTLS_BYTE_5( x ) ( (uint8_t) ( ( ( x ) >> 40 ) & 0xff ) )
-#define MBEDTLS_BYTE_6( x ) ( (uint8_t) ( ( ( x ) >> 48 ) & 0xff ) )
-#define MBEDTLS_BYTE_7( x ) ( (uint8_t) ( ( ( x ) >> 56 ) & 0xff ) )
-
 /**
- * Get the unsigned 32 bits integer corresponding to four bytes in
- * big-endian order (MSB first).
+ * Perform a fast block XOR operation, such that
+ * r[i] = a[i] ^ b[i] where 0 <= i < n
  *
- * \param   data    Base address of the memory to get the four bytes from.
- * \param   offset  Offset from \p data of the first and most significant
- *                  byte of the four bytes to build the 32 bits unsigned
- *                  integer from.
+ * \param   r Pointer to result (buffer of at least \p n bytes). \p r
+ *            may be equal to either \p a or \p b, but behaviour when
+ *            it overlaps in other ways is undefined.
+ * \param   a Pointer to input (buffer of at least \p n bytes)
+ * \param   b Pointer to input (buffer of at least \p n bytes)
+ * \param   n Number of bytes to process.
  */
-#ifndef MBEDTLS_GET_UINT32_BE
-#define MBEDTLS_GET_UINT32_BE( data , offset )                  \
-    (                                                           \
-          ( (uint32_t) ( data )[( offset )    ] << 24 )         \
-        | ( (uint32_t) ( data )[( offset ) + 1] << 16 )         \
-        | ( (uint32_t) ( data )[( offset ) + 2] <<  8 )         \
-        | ( (uint32_t) ( data )[( offset ) + 3]       )         \
-    )
-#endif
-
-/**
- * Put in memory a 32 bits unsigned integer in big-endian order.
- *
- * \param   n       32 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 32
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the most significant
- *                  byte of the 32 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT32_BE
-#define MBEDTLS_PUT_UINT32_BE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_3( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_2( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 3] = MBEDTLS_BYTE_0( n );             \
+inline void mbedtls_xor( unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n )
+{
+    size_t i;
+    for ( i = 0; ( i + 4 ) <= n; i += 4 )
+    {
+        uint32_t x = mbedtls_get_unaligned_uint32( a + i ) ^ mbedtls_get_unaligned_uint32( b + i );
+        mbedtls_put_unaligned_uint32( r + i, x );
+    }
+    for ( ; i < n; i++ )
+    {
+        r[i] = a[i] ^ b[i];
+    }
 }
-#endif
-
-/**
- * Get the unsigned 32 bits integer corresponding to four bytes in
- * little-endian order (LSB first).
- *
- * \param   data    Base address of the memory to get the four bytes from.
- * \param   offset  Offset from \p data of the first and least significant
- *                  byte of the four bytes to build the 32 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT32_LE
-#define MBEDTLS_GET_UINT32_LE( data, offset )                   \
-    (                                                           \
-          ( (uint32_t) ( data )[( offset )    ]       )         \
-        | ( (uint32_t) ( data )[( offset ) + 1] <<  8 )         \
-        | ( (uint32_t) ( data )[( offset ) + 2] << 16 )         \
-        | ( (uint32_t) ( data )[( offset ) + 3] << 24 )         \
-    )
-#endif
-
-/**
- * Put in memory a 32 bits unsigned integer in little-endian order.
- *
- * \param   n       32 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 32
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the least significant
- *                  byte of the 32 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT32_LE
-#define MBEDTLS_PUT_UINT32_LE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_2( n );             \
-    ( data )[( offset ) + 3] = MBEDTLS_BYTE_3( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 16 bits integer corresponding to two bytes in
- * little-endian order (LSB first).
- *
- * \param   data    Base address of the memory to get the two bytes from.
- * \param   offset  Offset from \p data of the first and least significant
- *                  byte of the two bytes to build the 16 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT16_LE
-#define MBEDTLS_GET_UINT16_LE( data, offset )                   \
-    (                                                           \
-          ( (uint16_t) ( data )[( offset )    ]       )         \
-        | ( (uint16_t) ( data )[( offset ) + 1] <<  8 )         \
-    )
-#endif
-
-/**
- * Put in memory a 16 bits unsigned integer in little-endian order.
- *
- * \param   n       16 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 16
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the least significant
- *                  byte of the 16 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT16_LE
-#define MBEDTLS_PUT_UINT16_LE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 16 bits integer corresponding to two bytes in
- * big-endian order (MSB first).
- *
- * \param   data    Base address of the memory to get the two bytes from.
- * \param   offset  Offset from \p data of the first and most significant
- *                  byte of the two bytes to build the 16 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT16_BE
-#define MBEDTLS_GET_UINT16_BE( data, offset )                   \
-    (                                                           \
-          ( (uint16_t) ( data )[( offset )    ] << 8 )          \
-        | ( (uint16_t) ( data )[( offset ) + 1]      )          \
-    )
-#endif
-
-/**
- * Put in memory a 16 bits unsigned integer in big-endian order.
- *
- * \param   n       16 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 16
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the most significant
- *                  byte of the 16 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT16_BE
-#define MBEDTLS_PUT_UINT16_BE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_0( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 24 bits integer corresponding to three bytes in
- * big-endian order (MSB first).
- *
- * \param   data    Base address of the memory to get the three bytes from.
- * \param   offset  Offset from \p data of the first and most significant
- *                  byte of the three bytes to build the 24 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT24_BE
-#define MBEDTLS_GET_UINT24_BE( data , offset )                  \
-    (                                                           \
-          ( (uint32_t) ( data )[( offset )    ] << 16 )         \
-        | ( (uint32_t) ( data )[( offset ) + 1] << 8  )         \
-        | ( (uint32_t) ( data )[( offset ) + 2]       )         \
-    )
-#endif
-
-/**
- * Put in memory a 24 bits unsigned integer in big-endian order.
- *
- * \param   n       24 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 24
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the most significant
- *                  byte of the 24 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT24_BE
-#define MBEDTLS_PUT_UINT24_BE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_2( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_0( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 24 bits integer corresponding to three bytes in
- * little-endian order (LSB first).
- *
- * \param   data    Base address of the memory to get the three bytes from.
- * \param   offset  Offset from \p data of the first and least significant
- *                  byte of the three bytes to build the 24 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT24_LE
-#define MBEDTLS_GET_UINT24_LE( data, offset )                   \
-    (                                                           \
-          ( (uint32_t) ( data )[( offset )    ]       )         \
-        | ( (uint32_t) ( data )[( offset ) + 1] <<  8 )         \
-        | ( (uint32_t) ( data )[( offset ) + 2] << 16 )         \
-    )
-#endif
-
-/**
- * Put in memory a 24 bits unsigned integer in little-endian order.
- *
- * \param   n       24 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 24
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the least significant
- *                  byte of the 24 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT24_LE
-#define MBEDTLS_PUT_UINT24_LE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_2( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 64 bits integer corresponding to eight bytes in
- * big-endian order (MSB first).
- *
- * \param   data    Base address of the memory to get the eight bytes from.
- * \param   offset  Offset from \p data of the first and most significant
- *                  byte of the eight bytes to build the 64 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT64_BE
-#define MBEDTLS_GET_UINT64_BE( data, offset )                   \
-    (                                                           \
-          ( (uint64_t) ( data )[( offset )    ] << 56 )         \
-        | ( (uint64_t) ( data )[( offset ) + 1] << 48 )         \
-        | ( (uint64_t) ( data )[( offset ) + 2] << 40 )         \
-        | ( (uint64_t) ( data )[( offset ) + 3] << 32 )         \
-        | ( (uint64_t) ( data )[( offset ) + 4] << 24 )         \
-        | ( (uint64_t) ( data )[( offset ) + 5] << 16 )         \
-        | ( (uint64_t) ( data )[( offset ) + 6] <<  8 )         \
-        | ( (uint64_t) ( data )[( offset ) + 7]       )         \
-    )
-#endif
-
-/**
- * Put in memory a 64 bits unsigned integer in big-endian order.
- *
- * \param   n       64 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 64
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the most significant
- *                  byte of the 64 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT64_BE
-#define MBEDTLS_PUT_UINT64_BE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_7( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_6( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_5( n );             \
-    ( data )[( offset ) + 3] = MBEDTLS_BYTE_4( n );             \
-    ( data )[( offset ) + 4] = MBEDTLS_BYTE_3( n );             \
-    ( data )[( offset ) + 5] = MBEDTLS_BYTE_2( n );             \
-    ( data )[( offset ) + 6] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 7] = MBEDTLS_BYTE_0( n );             \
-}
-#endif
-
-/**
- * Get the unsigned 64 bits integer corresponding to eight bytes in
- * little-endian order (LSB first).
- *
- * \param   data    Base address of the memory to get the eight bytes from.
- * \param   offset  Offset from \p data of the first and least significant
- *                  byte of the eight bytes to build the 64 bits unsigned
- *                  integer from.
- */
-#ifndef MBEDTLS_GET_UINT64_LE
-#define MBEDTLS_GET_UINT64_LE( data, offset )                   \
-    (                                                           \
-          ( (uint64_t) ( data )[( offset ) + 7] << 56 )         \
-        | ( (uint64_t) ( data )[( offset ) + 6] << 48 )         \
-        | ( (uint64_t) ( data )[( offset ) + 5] << 40 )         \
-        | ( (uint64_t) ( data )[( offset ) + 4] << 32 )         \
-        | ( (uint64_t) ( data )[( offset ) + 3] << 24 )         \
-        | ( (uint64_t) ( data )[( offset ) + 2] << 16 )         \
-        | ( (uint64_t) ( data )[( offset ) + 1] <<  8 )         \
-        | ( (uint64_t) ( data )[( offset )    ]       )         \
-    )
-#endif
-
-/**
- * Put in memory a 64 bits unsigned integer in little-endian order.
- *
- * \param   n       64 bits unsigned integer to put in memory.
- * \param   data    Base address of the memory where to put the 64
- *                  bits unsigned integer in.
- * \param   offset  Offset from \p data where to put the least significant
- *                  byte of the 64 bits unsigned integer \p n.
- */
-#ifndef MBEDTLS_PUT_UINT64_LE
-#define MBEDTLS_PUT_UINT64_LE( n, data, offset )                \
-{                                                               \
-    ( data )[( offset )    ] = MBEDTLS_BYTE_0( n );             \
-    ( data )[( offset ) + 1] = MBEDTLS_BYTE_1( n );             \
-    ( data )[( offset ) + 2] = MBEDTLS_BYTE_2( n );             \
-    ( data )[( offset ) + 3] = MBEDTLS_BYTE_3( n );             \
-    ( data )[( offset ) + 4] = MBEDTLS_BYTE_4( n );             \
-    ( data )[( offset ) + 5] = MBEDTLS_BYTE_5( n );             \
-    ( data )[( offset ) + 6] = MBEDTLS_BYTE_6( n );             \
-    ( data )[( offset ) + 7] = MBEDTLS_BYTE_7( n );             \
-}
-#endif
 
 /* Fix MSVC C99 compatible issue
  *      MSVC support __func__ from visual studio 2015( 1900 )
diff --git a/library/ctr_drbg.c b/library/ctr_drbg.c
index 71c48af..f5c5e7b 100644
--- a/library/ctr_drbg.c
+++ b/library/ctr_drbg.c
@@ -174,8 +174,7 @@
 
         while( use_len > 0 )
         {
-            for( i = 0; i < MBEDTLS_CTR_DRBG_BLOCKSIZE; i++ )
-                chain[i] ^= p[i];
+            mbedtls_xor( chain, chain, p, MBEDTLS_CTR_DRBG_BLOCKSIZE );
             p += MBEDTLS_CTR_DRBG_BLOCKSIZE;
             use_len -= ( use_len >= MBEDTLS_CTR_DRBG_BLOCKSIZE ) ?
                        MBEDTLS_CTR_DRBG_BLOCKSIZE : use_len;
diff --git a/library/des.c b/library/des.c
index 65f5681..c56d4d4 100644
--- a/library/des.c
+++ b/library/des.c
@@ -635,7 +635,6 @@
                     const unsigned char *input,
                     unsigned char *output )
 {
-    int i;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char temp[8];
 
@@ -646,8 +645,7 @@
     {
         while( length > 0 )
         {
-            for( i = 0; i < 8; i++ )
-                output[i] = (unsigned char)( input[i] ^ iv[i] );
+            mbedtls_xor( output, input, iv, 8 );
 
             ret = mbedtls_des_crypt_ecb( ctx, output, output );
             if( ret != 0 )
@@ -668,8 +666,7 @@
             if( ret != 0 )
                 goto exit;
 
-            for( i = 0; i < 8; i++ )
-                output[i] = (unsigned char)( output[i] ^ iv[i] );
+            mbedtls_xor( output, output, iv, 8 );
 
             memcpy( iv, temp, 8 );
 
@@ -741,7 +738,6 @@
                      const unsigned char *input,
                      unsigned char *output )
 {
-    int i;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char temp[8];
 
@@ -752,8 +748,7 @@
     {
         while( length > 0 )
         {
-            for( i = 0; i < 8; i++ )
-                output[i] = (unsigned char)( input[i] ^ iv[i] );
+            mbedtls_xor( output, input, iv, 8 );
 
             ret = mbedtls_des3_crypt_ecb( ctx, output, output );
             if( ret != 0 )
@@ -774,8 +769,7 @@
             if( ret != 0 )
                 goto exit;
 
-            for( i = 0; i < 8; i++ )
-                output[i] = (unsigned char)( output[i] ^ iv[i] );
+            mbedtls_xor( output, output, iv, 8 );
 
             memcpy( iv, temp, 8 );
 
diff --git a/library/gcm.c b/library/gcm.c
index f004a73c..0178b5b 100644
--- a/library/gcm.c
+++ b/library/gcm.c
@@ -235,7 +235,6 @@
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char work_buf[16];
-    size_t i;
     const unsigned char *p;
     size_t use_len, olen = 0;
     uint64_t iv_bits;
@@ -268,8 +267,7 @@
         {
             use_len = ( iv_len < 16 ) ? iv_len : 16;
 
-            for( i = 0; i < use_len; i++ )
-                ctx->y[i] ^= p[i];
+            mbedtls_xor( ctx->y, ctx->y, p, use_len );
 
             gcm_mult( ctx, ctx->y, ctx->y );
 
@@ -277,8 +275,7 @@
             p += use_len;
         }
 
-        for( i = 0; i < 16; i++ )
-            ctx->y[i] ^= work_buf[i];
+        mbedtls_xor( ctx->y, ctx->y, work_buf, 16);
 
         gcm_mult( ctx, ctx->y, ctx->y );
     }
@@ -313,7 +310,7 @@
                            const unsigned char *add, size_t add_len )
 {
     const unsigned char *p;
-    size_t use_len, i, offset;
+    size_t use_len, offset;
 
     /* IV is limited to 2^64 bits, so 2^61 bytes */
     if( (uint64_t) add_len >> 61 != 0 )
@@ -328,8 +325,7 @@
         if( use_len > add_len )
             use_len = add_len;
 
-        for( i = 0; i < use_len; i++ )
-            ctx->buf[i+offset] ^= p[i];
+        mbedtls_xor( ctx->buf + offset, ctx->buf + offset, p, use_len );
 
         if( offset + use_len == 16 )
             gcm_mult( ctx, ctx->buf, ctx->buf );
@@ -343,8 +339,7 @@
 
     while( add_len >= 16 )
     {
-        for( i = 0; i < 16; i++ )
-            ctx->buf[i] ^= p[i];
+        mbedtls_xor( ctx->buf, ctx->buf, p, 16 );
 
         gcm_mult( ctx, ctx->buf, ctx->buf );
 
@@ -354,8 +349,7 @@
 
     if( add_len > 0 )
     {
-        for( i = 0; i < add_len; i++ )
-            ctx->buf[i] ^= p[i];
+        mbedtls_xor( ctx->buf, ctx->buf, p, add_len );
     }
 
     return( 0 );
@@ -378,7 +372,6 @@
                      const unsigned char *input,
                      unsigned char *output )
 {
-    size_t i;
     size_t olen = 0;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
 
@@ -389,14 +382,12 @@
         return( ret );
     }
 
-    for( i = 0; i < use_len; i++ )
-    {
-        if( ctx->mode == MBEDTLS_GCM_DECRYPT )
-            ctx->buf[offset + i] ^= input[i];
-        output[i] = ectr[offset + i] ^ input[i];
-        if( ctx->mode == MBEDTLS_GCM_ENCRYPT )
-            ctx->buf[offset + i] ^= output[i];
-    }
+    if( ctx->mode == MBEDTLS_GCM_DECRYPT )
+        mbedtls_xor( ctx->buf + offset, ctx->buf + offset, input, use_len );
+    mbedtls_xor( output, ectr + offset, input, use_len );
+    if( ctx->mode == MBEDTLS_GCM_ENCRYPT )
+        mbedtls_xor( ctx->buf + offset, ctx->buf + offset, output, use_len );
+
     return( 0 );
 }
 
@@ -489,7 +480,6 @@
                         unsigned char *tag, size_t tag_len )
 {
     unsigned char work_buf[16];
-    size_t i;
     uint64_t orig_len;
     uint64_t orig_add_len;
 
@@ -524,13 +514,11 @@
         MBEDTLS_PUT_UINT32_BE( ( orig_len     >> 32 ), work_buf, 8  );
         MBEDTLS_PUT_UINT32_BE( ( orig_len           ), work_buf, 12 );
 
-        for( i = 0; i < 16; i++ )
-            ctx->buf[i] ^= work_buf[i];
+        mbedtls_xor( ctx->buf, ctx->buf, work_buf, 16 );
 
         gcm_mult( ctx, ctx->buf, ctx->buf );
 
-        for( i = 0; i < tag_len; i++ )
-            tag[i] ^= ctx->buf[i];
+        mbedtls_xor( tag, tag, ctx->buf, tag_len );
     }
 
     return( 0 );
diff --git a/library/md.c b/library/md.c
index 8efcf10..9c161a5 100644
--- a/library/md.c
+++ b/library/md.c
@@ -633,7 +633,6 @@
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     unsigned char sum[MBEDTLS_MD_MAX_SIZE];
     unsigned char *ipad, *opad;
-    size_t i;
 
     if( ctx == NULL || ctx->md_info == NULL || ctx->hmac_ctx == NULL )
         return( MBEDTLS_ERR_MD_BAD_INPUT_DATA );
@@ -657,11 +656,8 @@
     memset( ipad, 0x36, ctx->md_info->block_size );
     memset( opad, 0x5C, ctx->md_info->block_size );
 
-    for( i = 0; i < keylen; i++ )
-    {
-        ipad[i] = (unsigned char)( ipad[i] ^ key[i] );
-        opad[i] = (unsigned char)( opad[i] ^ key[i] );
-    }
+    mbedtls_xor( ipad, ipad, key, keylen );
+    mbedtls_xor( opad, opad, key, keylen );
 
     if( ( ret = mbedtls_md_starts( ctx ) ) != 0 )
         goto cleanup;
diff --git a/library/pkcs5.c b/library/pkcs5.c
index ac5945a..1e3b17e 100644
--- a/library/pkcs5.c
+++ b/library/pkcs5.c
@@ -211,7 +211,6 @@
                               uint32_t key_length, unsigned char *output )
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    int j;
     unsigned int i;
     unsigned char md1[MBEDTLS_MD_MAX_SIZE];
     unsigned char work[MBEDTLS_MD_MAX_SIZE];
@@ -263,8 +262,7 @@
 
             // U1 xor U2
             //
-            for( j = 0; j < md_size; j++ )
-                work[j] ^= md1[j];
+            mbedtls_xor( work, work, md1, md_size );
         }
 
         use_len = ( key_length < md_size ) ? key_length : md_size;
@@ -324,7 +322,6 @@
     mbedtls_md_free( &md_ctx );
     return( ret );
 #else
-    int j;
     unsigned int i;
     unsigned char md1[PSA_HASH_MAX_SIZE];
     unsigned char work[PSA_HASH_MAX_SIZE];
@@ -396,8 +393,7 @@
 
             // U1 xor U2
             //
-            for( j = 0; j < md_size; j++ )
-                work[j] ^= md1[j];
+            mbedtls_xor( work, work, md1, md_size );
         }
 
         use_len = ( key_length < md_size ) ? key_length : md_size;
diff --git a/library/platform_util.c b/library/platform_util.c
index 916a7f4..2b674f6 100644
--- a/library/platform_util.c
+++ b/library/platform_util.c
@@ -143,3 +143,20 @@
 void (*mbedtls_test_hook_test_fail)( const char *, int, const char *);
 #endif /* MBEDTLS_TEST_HOOKS */
 
+/*
+ * Provide external definitions of some inline functions so that the compiler
+ * has the option to not inline them
+ */
+extern inline void mbedtls_xor( unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n );
+
+extern inline uint16_t mbedtls_get_unaligned_uint16( const void *p );
+
+extern inline void mbedtls_put_unaligned_uint16( void *p, uint16_t x );
+
+extern inline uint32_t mbedtls_get_unaligned_uint32( const void *p );
+
+extern inline void mbedtls_put_unaligned_uint32( void *p, uint32_t x );
+
+extern inline uint64_t mbedtls_get_unaligned_uint64( const void *p );
+
+extern inline void mbedtls_put_unaligned_uint64( void *p, uint64_t x );
diff --git a/library/ssl_msg.c b/library/ssl_msg.c
index e4d50db..753998e 100644
--- a/library/ssl_msg.c
+++ b/library/ssl_msg.c
@@ -669,15 +669,12 @@
                                     unsigned char const *dynamic_iv,
                                     size_t dynamic_iv_len )
 {
-    size_t i;
-
     /* Start with Fixed IV || 0 */
     memset( dst_iv, 0, dst_iv_len );
     memcpy( dst_iv, fixed_iv, fixed_iv_len );
 
     dst_iv += dst_iv_len - dynamic_iv_len;
-    for( i = 0; i < dynamic_iv_len; i++ )
-        dst_iv[i] ^= dynamic_iv[i];
+    mbedtls_xor( dst_iv, dst_iv, dynamic_iv, dynamic_iv_len );
 }
 #endif /* MBEDTLS_GCM_C || MBEDTLS_CCM_C || MBEDTLS_CHACHAPOLY_C */
 
diff --git a/scripts/mbedtls_dev/bignum_common.py b/scripts/mbedtls_dev/bignum_common.py
index 3ff8b2f..0339b1a 100644
--- a/scripts/mbedtls_dev/bignum_common.py
+++ b/scripts/mbedtls_dev/bignum_common.py
@@ -99,6 +99,7 @@
     limb_sizes = [32, 64] # type: List[int]
     arities = [1, 2]
     arity = 2
+    suffix = False   # for arity = 1, symbol can be prefix (default) or suffix
 
     def __init__(self, val_a: str, val_b: str = "0", bits_in_limb: int = 32) -> None:
         self.val_a = val_a
@@ -170,7 +171,8 @@
         """
         if not self.case_description:
             if self.arity == 1:
-                self.case_description = "{} {:x}".format(
+                format_string = "{1:x} {0}" if self.suffix else "{0} {1:x}"
+                self.case_description = format_string.format(
                     self.symbol, self.int_a
                 )
             elif self.arity == 2:
diff --git a/scripts/mbedtls_dev/bignum_data.py b/scripts/mbedtls_dev/bignum_data.py
index e6ed300..9658933 100644
--- a/scripts/mbedtls_dev/bignum_data.py
+++ b/scripts/mbedtls_dev/bignum_data.py
@@ -90,8 +90,8 @@
                               "4708d9893a973000b54a23020fc5b043d6e4a51519d9c9cc"
                               "52d32377e78131c1")
 
-# Adding 192 bit and 1024 bit numbers because these are the shortest required
-# for ECC and RSA respectively.
+# Adding 192 bit and 1024 bit numbers because these are the shortest required
+# for ECC and RSA respectively.
 INPUTS_DEFAULT = [
         "0", "1", # corner cases
         "2", "3", # small primes
@@ -110,13 +110,21 @@
 # supported for now.
 MODULI_DEFAULT = [
         "53", # safe prime
-        "45", # non-prime
+        "45", # non-prime
         SAFE_PRIME_192_BIT_SEED_1,  # safe prime
         RANDOM_192_BIT_SEED_2_NO4,  # not a prime
         SAFE_PRIME_1024_BIT_SEED_3, # safe prime
         RANDOM_1024_BIT_SEED_4_NO5, # not a prime
         ]
 
+# Some functions, e.g. mbedtls_mpi_mod_raw_inv_prime(), only support prime moduli.
+ONLY_PRIME_MODULI = [
+        "53", # safe prime
+        "8ac72304057392b5",     # 9999999997777777333 (longer, not safe, prime)
+        SAFE_PRIME_192_BIT_SEED_1,  # safe prime
+        SAFE_PRIME_1024_BIT_SEED_3, # safe prime
+        ]
+
 def __gen_safe_prime(bits, seed):
     '''
     Generate a safe prime.
diff --git a/scripts/mbedtls_dev/bignum_mod_raw.py b/scripts/mbedtls_dev/bignum_mod_raw.py
index d05479a..0486426 100644
--- a/scripts/mbedtls_dev/bignum_mod_raw.py
+++ b/scripts/mbedtls_dev/bignum_mod_raw.py
@@ -18,6 +18,7 @@
 
 from . import test_data_generation
 from . import bignum_common
+from .bignum_data import ONLY_PRIME_MODULI
 
 class BignumModRawTarget(test_data_generation.BaseTarget):
     #pylint: disable=abstract-method, too-few-public-methods
@@ -53,6 +54,34 @@
 
 # BEGIN MERGE SLOT 3
 
+class BignumModRawInvPrime(bignum_common.ModOperationCommon,
+                           BignumModRawTarget):
+    """Test cases for bignum mpi_mod_raw_inv_prime()."""
+    moduli = ONLY_PRIME_MODULI
+    symbol = "^ -1"
+    test_function = "mpi_mod_raw_inv_prime"
+    test_name = "mbedtls_mpi_mod_raw_inv_prime (Montgomery form only)"
+    input_style = "fixed"
+    arity = 1
+    suffix = True
+
+    @property
+    def is_valid(self) -> bool:
+        return self.int_a > 0 and self.int_a < self.int_n
+
+    @property
+    def arg_a(self) -> str:
+        # Input has to be given in Montgomery form
+        mont_a = self.to_montgomery(self.int_a)
+        return self.format_arg('{:x}'.format(mont_a))
+
+    def result(self) -> List[str]:
+        result = bignum_common.invmod(self.int_a, self.int_n)
+        if result < 0:
+            result += self.int_n
+        mont_result = self.to_montgomery(result)
+        return [self.format_result(mont_result)]
+
 # END MERGE SLOT 3
 
 # BEGIN MERGE SLOT 4
diff --git a/tests/scripts/all.sh b/tests/scripts/all.sh
index deb5257..db46b03 100755
--- a/tests/scripts/all.sh
+++ b/tests/scripts/all.sh
@@ -185,7 +185,8 @@
     export CTEST_OUTPUT_ON_FAILURE=1
 
     # CFLAGS and LDFLAGS for Asan builds that don't use CMake
-    ASAN_CFLAGS='-Werror -Wall -Wextra -fsanitize=address,undefined -fno-sanitize-recover=all'
+    # default to -O2, use -Ox _after_ this if you want another level
+    ASAN_CFLAGS='-O2 -Werror -fsanitize=address,undefined -fno-sanitize-recover=all'
 
     # Gather the list of available components. These are the functions
     # defined in this script whose name starts with "component_".
@@ -2208,11 +2209,16 @@
     msg "test: MBEDTLS_PSA_CRYPTO_CONFIG with accelerated hash and USE_PSA"
     make test
 
+    # This is mostly useful so that we can later compare outcome files with
+    # the reference config in analyze_outcomes.py, to check that the
+    # dependency declarations in ssl-opt.sh and in TLS code are correct.
     msg "test: ssl-opt.sh, MBEDTLS_PSA_CRYPTO_CONFIG with accelerated hash and USE_PSA"
     tests/ssl-opt.sh
 
-    msg "test: compat.sh, MBEDTLS_PSA_CRYPTO_CONFIG without accelerated hash and USE_PSA"
-    tests/compat.sh
+    # This is to make sure all ciphersuites are exercised, but we don't need
+    # interop testing (besides, we already got some from ssl-opt.sh).
+    msg "test: compat.sh, MBEDTLS_PSA_CRYPTO_CONFIG with accelerated hash and USE_PSA"
+    tests/compat.sh -p mbedTLS -V YES
 }
 
 # This component provides reference configuration for test_psa_crypto_config_accel_hash_use_psa
diff --git a/tests/suites/test_suite_alignment.data b/tests/suites/test_suite_alignment.data
new file mode 100644
index 0000000..8c0c21d
--- /dev/null
+++ b/tests/suites/test_suite_alignment.data
@@ -0,0 +1,119 @@
+Aligned 16-bit access
+mbedtls_unaligned_access:16:0
+
+Aligned 32-bit access
+mbedtls_unaligned_access:32:0
+
+Aligned 64-bit access
+mbedtls_unaligned_access:64:0
+
+Unaligned 16-bit access offset=1
+mbedtls_unaligned_access:16:1
+
+Unaligned 32-bit access offset=1
+mbedtls_unaligned_access:32:1
+
+Unaligned 64-bit access offset=1
+mbedtls_unaligned_access:64:1
+
+Unaligned 16-bit access offset=4
+mbedtls_unaligned_access:16:4
+
+Unaligned 32-bit access offset=4
+mbedtls_unaligned_access:32:4
+
+Unaligned 64-bit access offset=4
+mbedtls_unaligned_access:64:4
+
+Unaligned 16-bit access offset=7
+mbedtls_unaligned_access:16:7
+
+Unaligned 32-bit access offset=7
+mbedtls_unaligned_access:32:7
+
+Unaligned 64-bit access offset=7
+mbedtls_unaligned_access:64:7
+
+Unaligned 16-bit access offset=8
+mbedtls_unaligned_access:16:8
+
+Unaligned 32-bit access offset=8
+mbedtls_unaligned_access:32:8
+
+Unaligned 64-bit access offset=8
+mbedtls_unaligned_access:64:8
+
+Byteswap 16
+mbedtls_byteswap:"0100":16:"0001"
+
+Byteswap 16 with truncation
+mbedtls_byteswap:"0706050403020100":16:"0001"
+
+Byteswap 16 all-zero
+mbedtls_byteswap:"0000":16:"0000"
+
+Byteswap 16 all-ones
+mbedtls_byteswap:"ffffffffffffffff":16:"ffff"
+
+Byteswap 32
+mbedtls_byteswap:"03020100":32:"00010203"
+
+Byteswap 32 with truncation
+mbedtls_byteswap:"0706050403020100":32:"00010203"
+
+Byteswap 32 all-zero
+mbedtls_byteswap:"00000000":32:"00000000"
+
+Byteswap 32 all-ones
+mbedtls_byteswap:"ffffffffffffffff":32:"ffffffff"
+
+Byteswap 64
+mbedtls_byteswap:"0706050403020100":64:"01020304050607"
+
+Byteswap 64 all-zero
+mbedtls_byteswap:"0000000000000000":64:"0000000000000000"
+
+Byteswap 64 all-ones
+mbedtls_byteswap:"ffffffffffffffff":64:"ffffffffffffffff"
+
+Get individual bytes
+get_byte
+
+Endian-aware unaligned 16-bit BE offset=0
+unaligned_access_endian_aware:16:0:1
+
+Endian-aware unaligned 16-bit BE offset=3
+unaligned_access_endian_aware:16:3:1
+
+Endian-aware unaligned 16-bit LE offset=0
+unaligned_access_endian_aware:16:0:0
+
+Endian-aware unaligned 16-bit LE offset=3
+unaligned_access_endian_aware:16:3:0
+
+Endian-aware unaligned 32-bit BE offset=0
+unaligned_access_endian_aware:32:0:1
+
+Endian-aware unaligned 32-bit BE offset=3
+unaligned_access_endian_aware:32:3:1
+
+Endian-aware unaligned 32-bit LE offset=0
+unaligned_access_endian_aware:32:0:0
+
+Endian-aware unaligned 32-bit LE offset=3
+unaligned_access_endian_aware:32:3:0
+
+Endian-aware unaligned 64-bit BE offset=0
+unaligned_access_endian_aware:64:0:1
+
+Endian-aware unaligned 64-bit BE offset=3
+unaligned_access_endian_aware:64:3:1
+
+Endian-aware unaligned 64-bit LE offset=0
+unaligned_access_endian_aware:64:0:0
+
+Endian-aware unaligned 64-bit LE offset=3
+unaligned_access_endian_aware:64:3:0
+
+Big-endian check
+mbedtls_is_big_endian
diff --git a/tests/suites/test_suite_alignment.function b/tests/suites/test_suite_alignment.function
new file mode 100644
index 0000000..06c5668
--- /dev/null
+++ b/tests/suites/test_suite_alignment.function
@@ -0,0 +1,407 @@
+/* BEGIN_HEADER */
+#include "../library/alignment.h"
+
+#include <stdint.h>
+
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wunreachable-code"
+#endif
+#include <stdio.h>
+
+/*
+ * Convert a string of the form "abcd" (case-insensitive) to a uint64_t.
+ */
+int parse_hex_string( char* hex_string, uint64_t *result )
+{
+    uint8_t raw[8];
+    size_t olen;
+    if ( mbedtls_test_unhexify(raw, sizeof(raw), hex_string, &olen) != 0 ) return 0;
+    *result = 0;
+    for ( size_t i = 0; i < olen; i++ )
+    {
+        if ( MBEDTLS_IS_BIG_ENDIAN ) {
+            *result |= ((uint64_t)raw[i]) << ( i * 8 );
+        }
+        else
+        {
+            *result |= ((uint64_t)raw[i]) << ( (olen - i - 1) * 8 );
+        }
+    }
+    return 1;
+}
+
+/* END_HEADER */
+
+/* BEGIN_CASE */
+void mbedtls_unaligned_access( int size, int offset )
+{
+    /* Define 64-bit aligned raw byte array */
+    uint64_t raw[2];
+
+    /* Populate with known data */
+    uint8_t *x = (uint8_t *) raw;
+    for ( size_t i = 0; i < sizeof(raw); i++ )
+        x[i] = (uint8_t)i;
+
+    TEST_ASSERT( size == 16 || size == 32 || size == 64 );
+
+    uint64_t r = 0;
+    switch ( size )
+    {
+        case 16:
+            r = mbedtls_get_unaligned_uint16( x + offset );
+            break;
+        case 32:
+            r = mbedtls_get_unaligned_uint32( x + offset );
+            break;
+        case 64:
+            r = mbedtls_get_unaligned_uint64( x + offset );
+            break;
+    }
+
+    /* Generate expected result */
+    uint64_t expected = 0;
+    for ( uint8_t i = 0; i < 8; i++ )
+    {
+        uint8_t shift;
+        if ( MBEDTLS_IS_BIG_ENDIAN )
+        {
+            /*
+            * Similar to little-endian case described below, but the shift needs
+            * to be inverted
+            */
+            shift = 7 - ( i * 8 );
+        } else {
+            /* example for offset == 1:
+            * expected = (( 1 + 0 ) << (0 * 8)) | (( 1 + 1 ) << (1 * 8)) | (( 1 + 2 ) << (2 * 8)))
+            *          = (1 << 0) | (2 << 8) | (3 << 16) ...
+            *          = 0x0807060504030201
+            * x = { 0, 1, 2, 3, ... }
+            * ie expected is the value that would be read from x on a LE system, when
+            * byte swapping is not performed
+            */
+            shift = i * 8;
+        }
+        uint64_t b = offset + i;
+        expected |= b << shift;
+    }
+
+    /* Mask out excess bits from expected result */
+    switch ( size )
+    {
+        case 16:
+            expected &= 0xffff;
+            break;
+        case 32:
+            expected &= 0xffffffff;
+            break;
+    }
+
+    TEST_EQUAL( r, expected );
+
+    /* Write sentinel to the part of the array we will testing writing to */
+    for ( size_t i = 0; i < (size_t) ( size / 8 ); i++ )
+    {
+        x[i + offset] = 0xff;
+    }
+    /*
+        * Write back to the array with mbedtls_put_unaligned_uint16 and validate
+        * that the array is unchanged as a result.
+        */
+    switch ( size )
+    {
+        case 16:
+            mbedtls_put_unaligned_uint16( x + offset, r );
+            break;
+        case 32:
+            mbedtls_put_unaligned_uint32( x + offset, r );
+            break;
+        case 64:
+            mbedtls_put_unaligned_uint64( x + offset, r );
+            break;
+    }
+    for ( size_t i = 0; i < sizeof(x); i++ )
+    {
+        TEST_EQUAL( x[i], (uint8_t)i );
+    }
+}
+/* END_CASE */
+
+/* BEGIN_CASE */
+void mbedtls_byteswap( char* input_str, int size, char *expected_str )
+{
+    uint64_t input, expected;
+    TEST_ASSERT( parse_hex_string( input_str, &input ) );
+    TEST_ASSERT( parse_hex_string( expected_str, &expected ) );
+
+    /* Check against expected result */
+    uint64_t r = 0;
+    switch ( size )
+    {
+        case 16:
+            r = MBEDTLS_BSWAP16( input );
+            break;
+        case 32:
+            r = MBEDTLS_BSWAP32( input );
+            break;
+        case 64:
+            r = MBEDTLS_BSWAP64( input );
+            break;
+        default:
+            TEST_ASSERT( ! "size must be 16, 32 or 64" );
+    }
+    TEST_EQUAL( r, expected );
+
+    /*
+     * Check byte by byte by extracting bytes from opposite ends of
+     * input and r.
+     */
+    for ( size_t i = 0; i < (size_t)( size / 8 ); i++ )
+    {
+        size_t s1 = i * 8;
+        size_t s2 = ( ( size / 8 - 1 ) - i ) * 8;
+        uint64_t a = ( input & ( (uint64_t)0xff << s1 ) ) >> s1;
+        uint64_t b = ( r & ( (uint64_t)0xff << s2 ) ) >> s2;
+        TEST_EQUAL( a, b );
+    }
+
+    /* Check BSWAP(BSWAP(x)) == x */
+    switch ( size )
+    {
+        case 16:
+            r = MBEDTLS_BSWAP16( r );
+            TEST_EQUAL( r, input & 0xffff );
+            break;
+        case 32:
+            r = MBEDTLS_BSWAP32( r );
+            TEST_EQUAL( r, input & 0xffffffff );
+            break;
+        case 64:
+            r = MBEDTLS_BSWAP64( r );
+            TEST_EQUAL( r, input );
+            break;
+    }
+}
+/* END_CASE */
+
+/* BEGIN_CASE */
+void get_byte()
+{
+    uint8_t data[16];
+
+    for ( size_t i = 0; i < sizeof(data); i++ )
+        data[i] = (uint8_t) i;
+
+    uint64_t u64 = 0x0706050403020100;
+    for ( size_t b = 0; b < 8 ; b++ )
+    {
+        uint8_t expected = b;
+        uint8_t actual = b + 1;
+        switch ( b )
+        {
+            case 0:
+                actual = MBEDTLS_BYTE_0( u64 );
+                break;
+            case 1:
+                actual = MBEDTLS_BYTE_1( u64 );
+                break;
+            case 2:
+                actual = MBEDTLS_BYTE_2( u64 );
+                break;
+            case 3:
+                actual = MBEDTLS_BYTE_3( u64 );
+                break;
+            case 4:
+                actual = MBEDTLS_BYTE_4( u64 );
+                break;
+            case 5:
+                actual = MBEDTLS_BYTE_5( u64 );
+                break;
+            case 6:
+                actual = MBEDTLS_BYTE_6( u64 );
+                break;
+            case 7:
+                actual = MBEDTLS_BYTE_7( u64 );
+                break;
+        }
+        TEST_EQUAL( actual, expected );
+    }
+
+    uint32_t u32 = 0x03020100;
+    for ( size_t b = 0; b < 4 ; b++ )
+    {
+        uint8_t expected = b;
+        uint8_t actual = b + 1;
+        switch ( b )
+        {
+            case 0:
+                actual = MBEDTLS_BYTE_0( u32 );
+                break;
+            case 1:
+                actual = MBEDTLS_BYTE_1( u32 );
+                break;
+            case 2:
+                actual = MBEDTLS_BYTE_2( u32 );
+                break;
+            case 3:
+                actual = MBEDTLS_BYTE_3( u32 );
+                break;
+        }
+        TEST_EQUAL( actual, expected );
+    }
+
+    uint16_t u16 = 0x0100;
+    for ( size_t b = 0; b < 2 ; b++ )
+    {
+        uint8_t expected = b;
+        uint8_t actual = b + 1;
+        switch ( b )
+        {
+            case 0:
+                actual = MBEDTLS_BYTE_0( u16 );
+                break;
+            case 1:
+                actual = MBEDTLS_BYTE_1( u16 );
+                break;
+        }
+        TEST_EQUAL( actual, expected );
+    }
+
+    uint8_t u8 = 0x01;
+    uint8_t actual = MBEDTLS_BYTE_0( u8 );
+    TEST_EQUAL( actual, u8 );
+}
+/* END_CASE */
+
+/* BEGIN_CASE */
+void unaligned_access_endian_aware(int size, int offset, int big_endian )
+{
+    TEST_ASSERT( size == 16 || size == 24 || size == 32 || size == 64 );
+    TEST_ASSERT( offset >= 0 && offset < 8 );
+
+    /* Define 64-bit aligned raw byte array */
+    uint64_t raw[2];
+    /* Populate with known data: x == { 0, 1, 2, ... } */
+    uint8_t *x = (uint8_t *) raw;
+    for ( size_t i = 0; i < sizeof(raw); i++ )
+        x[i] = (uint8_t) i;
+
+    uint64_t read = 0;
+    if ( big_endian )
+    {
+        switch ( size )
+        {
+            case 16:
+                read = MBEDTLS_GET_UINT16_BE( x, offset );
+                break;
+            case 24:
+                read = MBEDTLS_GET_UINT24_BE( x, offset );
+                break;
+            case 32:
+                read = MBEDTLS_GET_UINT32_BE( x, offset );
+                break;
+            case 64:
+                read = MBEDTLS_GET_UINT64_BE( x, offset );
+                break;
+        }
+    }
+    else
+    {
+        switch ( size )
+        {
+            case 16:
+                read = MBEDTLS_GET_UINT16_LE( x, offset );
+                break;
+            case 24:
+                read = MBEDTLS_GET_UINT24_LE( x, offset );
+                break;
+            case 32:
+                read = MBEDTLS_GET_UINT32_LE( x, offset );
+                break;
+            case 64:
+                read = MBEDTLS_GET_UINT64_LE( x, offset );
+                break;
+        }
+    }
+
+    /* Build up expected value byte by byte, in either big or little endian format */
+    uint64_t expected = 0;
+    for ( size_t i = 0; i < (size_t)(size / 8); i++ )
+    {
+        uint64_t b = x[i + offset];
+        uint8_t shift = (big_endian) ? (8 * ((size / 8 - 1) - i)) : (8 * i);
+        expected |= b << shift;
+    }
+
+    /* Verify read */
+    TEST_EQUAL( read, expected );
+
+    /* Test writing back to memory. First write sentiel */
+    for ( size_t i = 0; i < (size_t)(size / 8); i++ )
+    {
+        x[i + offset] = 0xff;
+    }
+    /* Overwrite sentinel with endian-aware write macro */
+    if ( big_endian )
+    {
+        switch ( size )
+        {
+            case 16:
+                MBEDTLS_PUT_UINT16_BE( read, x, offset );
+                break;
+            case 24:
+                MBEDTLS_PUT_UINT24_BE( read, x, offset );
+                break;
+            case 32:
+                MBEDTLS_PUT_UINT32_BE( read, x, offset );
+                break;
+            case 64:
+                MBEDTLS_PUT_UINT64_BE( read, x, offset );
+                break;
+        }
+    }
+    else
+    {
+        switch ( size )
+        {
+            case 16:
+                MBEDTLS_PUT_UINT16_LE( read, x, offset );
+                break;
+                case 24:
+                MBEDTLS_PUT_UINT24_LE( read, x, offset );
+                break;
+            case 32:
+                MBEDTLS_PUT_UINT32_LE( read, x, offset );
+                break;
+            case 64:
+                MBEDTLS_PUT_UINT64_LE( read, x, offset );
+                break;
+        }
+    }
+
+    /* Verify write - check memory is correct */
+    for ( size_t i = 0; i < sizeof(raw); i++ )
+        TEST_EQUAL( x[i], (uint8_t) i );
+}
+/* END_CASE */
+
+/* BEGIN_CASE */
+void mbedtls_is_big_endian()
+{
+    uint16_t check = 0x1234;
+    uint8_t* p = (uint8_t*) &check;
+
+    if ( MBEDTLS_IS_BIG_ENDIAN )
+    {
+        /* Big-endian: data stored MSB first, i.e. p == { 0x12, 0x34 } */
+        TEST_EQUAL( p[0], 0x12 );
+        TEST_EQUAL( p[1], 0x34 );
+    }
+    else
+    {
+        /* Little-endian: data stored LSB first, i.e. p == { 0x34, 0x12 } */
+        TEST_EQUAL( p[0], 0x34 );
+        TEST_EQUAL( p[1], 0x12 );
+    }
+}
+/* END_CASE */
diff --git a/tests/suites/test_suite_bignum_core.function b/tests/suites/test_suite_bignum_core.function
index b64127a..7bf03fb 100644
--- a/tests/suites/test_suite_bignum_core.function
+++ b/tests/suites/test_suite_bignum_core.function
@@ -1097,6 +1097,12 @@
 
     TEST_EQUAL( 0, memcmp( X, Y, N_limbs * sizeof( mbedtls_mpi_uint ) ) );
 
+    /* Check when output aliased to input */
+
+    mbedtls_mpi_core_exp_mod( A, A, N, N_limbs, E, E_limbs, R2, T );
+
+    TEST_EQUAL( 0, memcmp( X, A, N_limbs * sizeof( mbedtls_mpi_uint ) ) );
+
 exit:
     mbedtls_free( T );
     mbedtls_free( A );
diff --git a/tests/suites/test_suite_bignum_mod_raw.function b/tests/suites/test_suite_bignum_mod_raw.function
index c7decf0..5d23707 100644
--- a/tests/suites/test_suite_bignum_mod_raw.function
+++ b/tests/suites/test_suite_bignum_mod_raw.function
@@ -349,6 +349,75 @@
 
 /* BEGIN MERGE SLOT 3 */
 
+/* BEGIN_CASE */
+void mpi_mod_raw_inv_prime( char * input_N, char * input_A, char * input_X )
+{
+    mbedtls_mpi_uint *A = NULL;
+    mbedtls_mpi_uint *N = NULL;
+    mbedtls_mpi_uint *X = NULL;
+    size_t A_limbs, N_limbs, X_limbs;
+    mbedtls_mpi_uint *Y = NULL;
+    mbedtls_mpi_uint *T = NULL;
+    const mbedtls_mpi_uint *R2 = NULL;
+
+    /* Legacy MPIs for computing R2 */
+    mbedtls_mpi N_mpi;  /* gets set up manually, aliasing N, so no need to free */
+    mbedtls_mpi R2_mpi;
+    mbedtls_mpi_init( &R2_mpi );
+
+    TEST_EQUAL( 0, mbedtls_test_read_mpi_core( &A, &A_limbs, input_A ) );
+    TEST_EQUAL( 0, mbedtls_test_read_mpi_core( &N, &N_limbs, input_N ) );
+    TEST_EQUAL( 0, mbedtls_test_read_mpi_core( &X, &X_limbs, input_X ) );
+    ASSERT_ALLOC( Y, N_limbs );
+
+    TEST_EQUAL( A_limbs, N_limbs );
+    TEST_EQUAL( X_limbs, N_limbs );
+
+    N_mpi.s = 1;
+    N_mpi.p = N;
+    N_mpi.n = N_limbs;
+    TEST_EQUAL( 0, mbedtls_mpi_core_get_mont_r2_unsafe( &R2_mpi, &N_mpi ) );
+    TEST_EQUAL( 0, mbedtls_mpi_grow( &R2_mpi, N_limbs ) );
+    R2 = R2_mpi.p;
+
+    size_t working_limbs = mbedtls_mpi_mod_raw_inv_prime_working_limbs( N_limbs );
+
+    /* No point exactly duplicating the code in mbedtls_mpi_mod_raw_inv_prime_working_limbs()
+     * to see if the output is correct, but we can check that it's in a
+     * reasonable range.  The current calculation works out as
+     * `1 + N_limbs * (welem + 4)`, where welem is the number of elements in
+     * the window (1 << 1 up to 1 << 6).
+     */
+    size_t min_expected_working_limbs = 1 + N_limbs * 5;
+    size_t max_expected_working_limbs = 1 + N_limbs * 68;
+
+    TEST_LE_U( min_expected_working_limbs, working_limbs );
+    TEST_LE_U( working_limbs, max_expected_working_limbs );
+
+    ASSERT_ALLOC( T, working_limbs );
+
+    mbedtls_mpi_mod_raw_inv_prime( Y, A, N, N_limbs, R2, T );
+
+    TEST_EQUAL( 0, memcmp( X, Y, N_limbs * sizeof( mbedtls_mpi_uint ) ) );
+
+    /* Check when output aliased to input */
+
+    mbedtls_mpi_mod_raw_inv_prime( A, A, N, N_limbs, R2, T );
+
+    TEST_EQUAL( 0, memcmp( X, A, N_limbs * sizeof( mbedtls_mpi_uint ) ) );
+
+exit:
+    mbedtls_free( T );
+    mbedtls_free( A );
+    mbedtls_free( N );
+    mbedtls_free( X );
+    mbedtls_free( Y );
+    mbedtls_mpi_free( &R2_mpi );
+    // R2 doesn't need to be freed as it is only aliasing R2_mpi
+    // N_mpi doesn't need to be freed as it is only aliasing N
+}
+/* END_CASE */
+
 /* END MERGE SLOT 3 */
 
 /* BEGIN MERGE SLOT 4 */
diff --git a/tests/suites/test_suite_common.data b/tests/suites/test_suite_common.data
new file mode 100644
index 0000000..500852d
--- /dev/null
+++ b/tests/suites/test_suite_common.data
@@ -0,0 +1,20 @@
+Block xor, length 0
+mbedtls_xor:0
+
+Block xor, length 1
+mbedtls_xor:1
+
+Block xor, length 3
+mbedtls_xor:3
+
+Block xor, length 4
+mbedtls_xor:4
+
+Block xor, length 7
+mbedtls_xor:7
+
+Block xor, length 8
+mbedtls_xor:8
+
+Block xor, length 16
+mbedtls_xor:16
diff --git a/tests/suites/test_suite_common.function b/tests/suites/test_suite_common.function
new file mode 100644
index 0000000..4444a52
--- /dev/null
+++ b/tests/suites/test_suite_common.function
@@ -0,0 +1,90 @@
+/* BEGIN_HEADER */
+#include "../library/common.h"
+
+void fill_arrays( unsigned char *a, unsigned char *b, unsigned char *r1, unsigned char *r2, size_t n )
+{
+    for ( size_t i = 0; i < n; i++ )
+    {
+        a[i]  = (unsigned char) i * 3;
+        b[i]  = (unsigned char) i * 3 + 1;
+        r1[i] = (unsigned char) i * 3 + 2;
+        r2[i] = r1[i];
+    }
+}
+/* END_HEADER */
+
+/* BEGIN_CASE */
+void mbedtls_xor( int len )
+{
+    size_t n = (size_t) len;
+    unsigned char *a = NULL, *b = NULL, *r1 = NULL, *r2 = NULL;
+    ASSERT_ALLOC( a, n + 1 );
+    ASSERT_ALLOC( b, n + 1 );
+    ASSERT_ALLOC( r1, n + 1 );
+    ASSERT_ALLOC( r2, n + 1 );
+
+    /* Test non-overlapping */
+    fill_arrays( a, b, r1, r2, n );
+    for ( size_t i = 0; i < n; i++ )
+    {
+        r1[i] = a[i] ^ b[i];
+    }
+    mbedtls_xor( r2, a, b, n );
+    ASSERT_COMPARE( r1, n, r2, n );
+
+    /* Test r == a */
+    fill_arrays( a, b, r1, r2, n );
+    for ( size_t i = 0; i < n; i++ )
+    {
+        r1[i] = r1[i] ^ b[i];
+    }
+    mbedtls_xor( r2, r2, b, n );
+    ASSERT_COMPARE( r1, n, r2, n );
+
+    /* Test r == b */
+    fill_arrays( a, b, r1, r2, n );
+    for ( size_t i = 0; i < n; i++ )
+    {
+        r1[i] = a[i] ^ r1[i];
+    }
+    mbedtls_xor( r2, a, r2, n );
+    ASSERT_COMPARE( r1, n, r2, n );
+
+    /* Test a == b */
+    fill_arrays( a, b, r1, r2, n );
+    for ( size_t i = 0; i < n; i++ )
+    {
+        r1[i] = a[i] ^ a[i];
+    }
+    mbedtls_xor( r2, a, a, n );
+    ASSERT_COMPARE( r1, n, r2, n );
+
+    /* Test a == b == r */
+    fill_arrays( a, b, r1, r2, n );
+    for ( size_t i = 0; i < n; i++ )
+    {
+        r1[i] = r1[i] ^ r1[i];
+    }
+    mbedtls_xor( r2, r2, r2, n );
+    ASSERT_COMPARE( r1, n, r2, n );
+
+    /* Test non-word-aligned buffers, for all combinations of alignedness */
+    for ( int i = 0; i < 7; i++ )
+    {
+        int r_off = i & 1, a_off = (i & 2) >> 1, b_off = (i & 4) >> 2;
+        fill_arrays( a, b, r1, r2, n + 1 );
+
+        for ( size_t j = 0; j < n; j++ )
+        {
+            r1[j + r_off] = a[j + a_off] ^ b[j + b_off];
+        }
+        mbedtls_xor( r2 + r_off, a + a_off, b + b_off, n );
+        ASSERT_COMPARE( r1 + r_off, n, r2 + r_off, n );
+    }
+exit:
+    mbedtls_free( a );
+    mbedtls_free( b );
+    mbedtls_free( r1 );
+    mbedtls_free( r2 );
+}
+/* END_CASE */