Squashed commit upgrading to mbedtls-3.6.0

Squash merging branch import/mbedtls-3.6.0

0fc9291f4 ("libmbedtls: bignum: restore mbedtls_mpi_exp_mod() from v3.5.2")
0ef87b1e6 ("libmbedtls: reset minimum rsa key size")
70b079496 ("libmbedtls: adjust use of rsa pk_wrap API")
6cf76464f ("libmbedtls: allow inclusion of arm_neon.h")
27df5c911 ("libmbedtls: fix cipher_wrap.c for NIST AES Key Wrap mode")
aa584f9ed ("libmbedtls: fix cipher_wrap.c for chacha20 and chachapoly")
523ae957e ("libmbedtls: add fault mitigation in mbedtls_rsa_rsassa_pkcs1_v15_verify()")
30bdb1bbf ("libmbedtls: add fault mitigation in mbedtls_rsa_rsassa_pss_verify_ext()")
e45cdab62 ("libmbedtls: add SM2 curve")
d2fda4fc2 ("libmbedtls: fix no CRT issue")
ab0eb5515 ("libmbedtls: add interfaces in mbedtls for context memory operation")
7925a6f26 ("libmedtls: mpi_miller_rabin: increase count limit")
8eaf69279 ("libmbedtls: add mbedtls_mpi_init_mempool()")
12e83fc8d ("libmbedtls: make mbedtls_mpi_mont*() available")
f9e261da5 ("mbedtls: configure mbedtls to reach for config")
7b6f378d7 ("mbedtls: remove default include/mbedtls/config.h")
c16331743 ("Import mbedtls-3.6.0")

Signed-off-by: Tom Van Eyck <tom.vaneyck@kuleuven.be>
Acked-by: Jerome Forissier <jerome.forissier@linaro.org>
diff --git a/lib/libmbedtls/mbedtls/library/bignum.c b/lib/libmbedtls/mbedtls/library/bignum.c
index b6733b7..c022a61 100644
--- a/lib/libmbedtls/mbedtls/library/bignum.c
+++ b/lib/libmbedtls/mbedtls/library/bignum.c
@@ -2,19 +2,7 @@
  *  Multi-precision integer library
  *
  *  Copyright The Mbed TLS Contributors
- *  SPDX-License-Identifier: Apache-2.0
- *
- *  Licensed under the Apache License, Version 2.0 (the "License"); you may
- *  not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
+ *  SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
  */
 
 /*
@@ -50,30 +38,156 @@
 #include "mbedtls/platform.h"
 
 #include <mempool.h>
-#include <util.h>
-
-#define MPI_VALIDATE_RET(cond)                                       \
-    MBEDTLS_INTERNAL_VALIDATE_RET(cond, MBEDTLS_ERR_MPI_BAD_INPUT_DATA)
-#define MPI_VALIDATE(cond)                                           \
-    MBEDTLS_INTERNAL_VALIDATE(cond)
-
-#define MPI_SIZE_T_MAX  ((size_t) -1)   /* SIZE_T_MAX is not standard */
 
 void *mbedtls_mpi_mempool;
 
-/* Implementation that should never be optimized out by the compiler */
-static void mbedtls_mpi_zeroize(mbedtls_mpi_uint *v, size_t n)
+
+/*
+ * Conditionally select an MPI sign in constant time.
+ * (MPI sign is the field s in mbedtls_mpi. It is unsigned short and only 1 and -1 are valid
+ * values.)
+ */
+static inline signed short mbedtls_ct_mpi_sign_if(mbedtls_ct_condition_t cond,
+                                                  signed short sign1, signed short sign2)
 {
-    mbedtls_platform_zeroize(v, ciL * n);
+    return (signed short) mbedtls_ct_uint_if(cond, sign1 + 1, sign2 + 1) - 1;
 }
 
 /*
+ * Compare signed values in constant time
+ */
+int mbedtls_mpi_lt_mpi_ct(const mbedtls_mpi *X,
+                          const mbedtls_mpi *Y,
+                          unsigned *ret)
+{
+    mbedtls_ct_condition_t different_sign, X_is_negative, Y_is_negative, result;
+
+    if (X->n != Y->n) {
+        return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
+    }
+
+    /*
+     * Set N_is_negative to MBEDTLS_CT_FALSE if N >= 0, MBEDTLS_CT_TRUE if N < 0.
+     * We know that N->s == 1 if N >= 0 and N->s == -1 if N < 0.
+     */
+    X_is_negative = mbedtls_ct_bool((X->s & 2) >> 1);
+    Y_is_negative = mbedtls_ct_bool((Y->s & 2) >> 1);
+
+    /*
+     * If the signs are different, then the positive operand is the bigger.
+     * That is if X is negative (X_is_negative == 1), then X < Y is true and it
+     * is false if X is positive (X_is_negative == 0).
+     */
+    different_sign = mbedtls_ct_bool_ne(X_is_negative, Y_is_negative); // true if different sign
+    result = mbedtls_ct_bool_and(different_sign, X_is_negative);
+
+    /*
+     * Assuming signs are the same, compare X and Y. We switch the comparison
+     * order if they are negative so that we get the right result, regardles of
+     * sign.
+     */
+
+    /* This array is used to conditionally swap the pointers in const time */
+    void * const p[2] = { X->p, Y->p };
+    size_t i = mbedtls_ct_size_if_else_0(X_is_negative, 1);
+    mbedtls_ct_condition_t lt = mbedtls_mpi_core_lt_ct(p[i], p[i ^ 1], X->n);
+
+    /*
+     * Store in result iff the signs are the same (i.e., iff different_sign == false). If
+     * the signs differ, result has already been set, so we don't change it.
+     */
+    result = mbedtls_ct_bool_or(result,
+                                mbedtls_ct_bool_and(mbedtls_ct_bool_not(different_sign), lt));
+
+    *ret = mbedtls_ct_uint_if_else_0(result, 1);
+
+    return 0;
+}
+
+/*
+ * Conditionally assign X = Y, without leaking information
+ * about whether the assignment was made or not.
+ * (Leaking information about the respective sizes of X and Y is ok however.)
+ */
+#if defined(_MSC_VER) && defined(MBEDTLS_PLATFORM_IS_WINDOWS_ON_ARM64) && \
+    (_MSC_FULL_VER < 193131103)
+/*
+ * MSVC miscompiles this function if it's inlined prior to Visual Studio 2022 version 17.1. See:
+ * https://developercommunity.visualstudio.com/t/c-compiler-miscompiles-part-of-mbedtls-library-on/1646989
+ */
+__declspec(noinline)
+#endif
+int mbedtls_mpi_safe_cond_assign(mbedtls_mpi *X,
+                                 const mbedtls_mpi *Y,
+                                 unsigned char assign)
+{
+    int ret = 0;
+
+    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, Y->n));
+
+    {
+        mbedtls_ct_condition_t do_assign = mbedtls_ct_bool(assign);
+
+        X->s = mbedtls_ct_mpi_sign_if(do_assign, Y->s, X->s);
+
+        mbedtls_mpi_core_cond_assign(X->p, Y->p, Y->n, do_assign);
+
+        mbedtls_ct_condition_t do_not_assign = mbedtls_ct_bool_not(do_assign);
+        for (size_t i = Y->n; i < X->n; i++) {
+            X->p[i] = mbedtls_ct_mpi_uint_if_else_0(do_not_assign, X->p[i]);
+        }
+    }
+
+cleanup:
+    return ret;
+}
+
+/*
+ * Conditionally swap X and Y, without leaking information
+ * about whether the swap was made or not.
+ * Here it is not ok to simply swap the pointers, which would lead to
+ * different memory access patterns when X and Y are used afterwards.
+ */
+int mbedtls_mpi_safe_cond_swap(mbedtls_mpi *X,
+                               mbedtls_mpi *Y,
+                               unsigned char swap)
+{
+    int ret = 0;
+    int s;
+
+    if (X == Y) {
+        return 0;
+    }
+
+    mbedtls_ct_condition_t do_swap = mbedtls_ct_bool(swap);
+
+    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, Y->n));
+    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(Y, X->n));
+
+    s = X->s;
+    X->s = mbedtls_ct_mpi_sign_if(do_swap, Y->s, X->s);
+    Y->s = mbedtls_ct_mpi_sign_if(do_swap, s, Y->s);
+
+    mbedtls_mpi_core_cond_swap(X->p, Y->p, X->n, do_swap);
+
+cleanup:
+    return ret;
+}
+
+/* Implementation that should never be optimized out by the compiler */
+#define mbedtls_mpi_zeroize_and_free(v, n) mbedtls_zeroize_and_free(v, ciL * (n))
+
+/*
+ * Implementation that should never be optimized out by the compiler.
+ * Reintroduced to allow use of mempool.
+ */
+#define mbedtls_mpi_zeroize(v, n) mbedtls_platform_zeroize(v, ciL * (n))
+
+/*
  * Initialize one MPI
  */
 static void mpi_init(mbedtls_mpi *X, short use_mempool)
 {
-    MPI_VALIDATE(X != NULL);
-
     X->s = 1;
     X->use_mempool = use_mempool;
     X->n = 0;
@@ -100,11 +214,12 @@
     }
 
     if (X->p != NULL) {
-        mbedtls_mpi_zeroize(X->p, X->n);
-        if(X->use_mempool)
+        if(X->use_mempool) {
+            mbedtls_mpi_zeroize(X->p, X->n);
             mempool_free(mbedtls_mpi_mempool, X->p);
-        else
-        mbedtls_free(X->p);
+        } else {
+            mbedtls_mpi_zeroize_and_free(X->p, X->n);
+        }
     }
 
     X->s = 1;
@@ -118,7 +233,6 @@
 int mbedtls_mpi_grow(mbedtls_mpi *X, size_t nblimbs)
 {
     mbedtls_mpi_uint *p;
-    MPI_VALIDATE_RET(X != NULL);
 
     if (nblimbs > MBEDTLS_MPI_MAX_LIMBS) {
         return MBEDTLS_ERR_MPI_ALLOC_FAILED;
@@ -138,14 +252,18 @@
 
         if (X->p != NULL) {
             memcpy(p, X->p, X->n * ciL);
-            mbedtls_mpi_zeroize(X->p, X->n);
-            if (X->use_mempool)
+
+            if (X->use_mempool) {
+                mbedtls_mpi_zeroize(X->p, X->n);
                 mempool_free(mbedtls_mpi_mempool, X->p);
-            else
-                mbedtls_free(X->p);
+            } else {
+                mbedtls_mpi_zeroize_and_free(X->p, X->n);
+            }
         }
 
-        X->n = nblimbs;
+        /* nblimbs fits in n because we ensure that MBEDTLS_MPI_MAX_LIMBS
+         * fits, and we've checked that nblimbs <= MBEDTLS_MPI_MAX_LIMBS. */
+        X->n = (unsigned short) nblimbs;
         X->p = p;
     }
 
@@ -160,7 +278,6 @@
 {
     mbedtls_mpi_uint *p;
     size_t i;
-    MPI_VALIDATE_RET(X != NULL);
 
     if (nblimbs > MBEDTLS_MPI_MAX_LIMBS) {
         return MBEDTLS_ERR_MPI_ALLOC_FAILED;
@@ -195,14 +312,19 @@
 
     if (X->p != NULL) {
         memcpy(p, X->p, i * ciL);
-        mbedtls_mpi_zeroize(X->p, X->n);
-        if (X->use_mempool)
+
+        if (X->use_mempool) {
+            mbedtls_mpi_zeroize(X->p, X->n);
             mempool_free(mbedtls_mpi_mempool, X->p);
-        else
-            mbedtls_free(X->p);
+        }
+        else {
+            mbedtls_mpi_zeroize_and_free(X->p, X->n);
+        }
     }
 
-    X->n = i;
+    /* i fits in n because we ensure that MBEDTLS_MPI_MAX_LIMBS
+     * fits, and we've checked that i <= nblimbs <= MBEDTLS_MPI_MAX_LIMBS. */
+    X->n = (unsigned short) i;
     X->p = p;
 
     return 0;
@@ -230,15 +352,12 @@
  * This function is not constant-time. Leading zeros in Y may be removed.
  *
  * Ensure that X does not shrink. This is not guaranteed by the public API,
- * but some code in the bignum module relies on this property, for example
- * in mbedtls_mpi_exp_mod().
+ * but some code in the bignum module might still rely on this property.
  */
 int mbedtls_mpi_copy(mbedtls_mpi *X, const mbedtls_mpi *Y)
 {
     int ret = 0;
     size_t i;
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(Y != NULL);
 
     if (X == Y) {
         return 0;
@@ -280,8 +399,6 @@
 void mbedtls_mpi_swap(mbedtls_mpi *X, mbedtls_mpi *Y)
 {
     mbedtls_mpi T;
-    MPI_VALIDATE(X != NULL);
-    MPI_VALIDATE(Y != NULL);
 
     memcpy(&T,  X, sizeof(mbedtls_mpi));
     memcpy(X,  Y, sizeof(mbedtls_mpi));
@@ -300,19 +417,22 @@
     return (mbedtls_mpi_uint) 0 - (mbedtls_mpi_uint) z;
 }
 
+/* Convert x to a sign, i.e. to 1, if x is positive, or -1, if x is negative.
+ * This looks awkward but generates smaller code than (x < 0 ? -1 : 1) */
+#define TO_SIGN(x) ((mbedtls_mpi_sint) (((mbedtls_mpi_uint) x) >> (biL - 1)) * -2 + 1)
+
 /*
  * Set value from integer
  */
 int mbedtls_mpi_lset(mbedtls_mpi *X, mbedtls_mpi_sint z)
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    MPI_VALIDATE_RET(X != NULL);
 
     MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, 1));
     memset(X->p, 0, X->n * ciL);
 
     X->p[0] = mpi_sint_abs(z);
-    X->s    = (z < 0) ? -1 : 1;
+    X->s    = TO_SIGN(z);
 
 cleanup:
 
@@ -324,8 +444,6 @@
  */
 int mbedtls_mpi_get_bit(const mbedtls_mpi *X, size_t pos)
 {
-    MPI_VALIDATE_RET(X != NULL);
-
     if (X->n * biL <= pos) {
         return 0;
     }
@@ -341,7 +459,6 @@
     int ret = 0;
     size_t off = pos / biL;
     size_t idx = pos % biL;
-    MPI_VALIDATE_RET(X != NULL);
 
     if (val != 0 && val != 1) {
         return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
@@ -368,16 +485,34 @@
  */
 size_t mbedtls_mpi_lsb(const mbedtls_mpi *X)
 {
-    size_t i, j, count = 0;
-    MBEDTLS_INTERNAL_VALIDATE_RET(X != NULL, 0);
+    size_t i;
 
+#if defined(__has_builtin)
+#if (MBEDTLS_MPI_UINT_MAX == UINT_MAX) && __has_builtin(__builtin_ctz)
+    #define mbedtls_mpi_uint_ctz __builtin_ctz
+#elif (MBEDTLS_MPI_UINT_MAX == ULONG_MAX) && __has_builtin(__builtin_ctzl)
+    #define mbedtls_mpi_uint_ctz __builtin_ctzl
+#elif (MBEDTLS_MPI_UINT_MAX == ULLONG_MAX) && __has_builtin(__builtin_ctzll)
+    #define mbedtls_mpi_uint_ctz __builtin_ctzll
+#endif
+#endif
+
+#if defined(mbedtls_mpi_uint_ctz)
     for (i = 0; i < X->n; i++) {
-        for (j = 0; j < biL; j++, count++) {
+        if (X->p[i] != 0) {
+            return i * biL + mbedtls_mpi_uint_ctz(X->p[i]);
+        }
+    }
+#else
+    size_t count = 0;
+    for (i = 0; i < X->n; i++) {
+        for (size_t j = 0; j < biL; j++, count++) {
             if (((X->p[i] >> j) & 1) != 0) {
                 return count;
             }
         }
     }
+#endif
 
     return 0;
 }
@@ -432,8 +567,6 @@
     int sign = 1;
     mbedtls_mpi_uint d;
     mbedtls_mpi T;
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(s != NULL);
 
     if (radix < 2 || radix > 16) {
         return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
@@ -454,7 +587,7 @@
     slen = strlen(s);
 
     if (radix == 16) {
-        if (slen > MPI_SIZE_T_MAX >> 2) {
+        if (slen > SIZE_MAX >> 2) {
             return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
         }
 
@@ -536,9 +669,6 @@
     size_t n;
     char *p;
     mbedtls_mpi T;
-    MPI_VALIDATE_RET(X    != NULL);
-    MPI_VALIDATE_RET(olen != NULL);
-    MPI_VALIDATE_RET(buflen == 0 || buf != NULL);
 
     if (radix < 2 || radix > 16) {
         return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
@@ -604,7 +734,7 @@
     }
 
     *p++ = '\0';
-    *olen = p - buf;
+    *olen = (size_t) (p - buf);
 
 cleanup:
 
@@ -628,9 +758,6 @@
      */
     char s[MBEDTLS_MPI_RW_BUFFER_SIZE];
 
-    MPI_VALIDATE_RET(X   != NULL);
-    MPI_VALIDATE_RET(fin != NULL);
-
     if (radix < 2 || radix > 16) {
         return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
     }
@@ -674,7 +801,6 @@
      * newline characters and '\0'
      */
     char s[MBEDTLS_MPI_RW_BUFFER_SIZE];
-    MPI_VALIDATE_RET(X != NULL);
 
     if (radix < 2 || radix > 16) {
         return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
@@ -746,9 +872,6 @@
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     const size_t limbs = CHARS_TO_LIMBS(buflen);
 
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(buflen == 0 || buf != NULL);
-
     /* Ensure that target MPI has exactly the necessary number of limbs */
     MBEDTLS_MPI_CHK(mbedtls_mpi_resize_clear(X, limbs));
 
@@ -788,12 +911,7 @@
 int mbedtls_mpi_shift_l(mbedtls_mpi *X, size_t count)
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    size_t i, v0, t1;
-    mbedtls_mpi_uint r0 = 0, r1;
-    MPI_VALIDATE_RET(X != NULL);
-
-    v0 = count / (biL);
-    t1 = count & (biL - 1);
+    size_t i;
 
     i = mbedtls_mpi_bitlen(X) + count;
 
@@ -803,31 +921,7 @@
 
     ret = 0;
 
-    /*
-     * shift by count / limb_size
-     */
-    if (v0 > 0) {
-        for (i = X->n; i > v0; i--) {
-            X->p[i - 1] = X->p[i - v0 - 1];
-        }
-
-        for (; i > 0; i--) {
-            X->p[i - 1] = 0;
-        }
-    }
-
-    /*
-     * shift by count % limb_size
-     */
-    if (t1 > 0) {
-        for (i = v0; i < X->n; i++) {
-            r1 = X->p[i] >> (biL - t1);
-            X->p[i] <<= t1;
-            X->p[i] |= r0;
-            r0 = r1;
-        }
-    }
-
+    mbedtls_mpi_core_shift_l(X->p, X->n, count);
 cleanup:
 
     return ret;
@@ -838,7 +932,6 @@
  */
 int mbedtls_mpi_shift_r(mbedtls_mpi *X, size_t count)
 {
-    MPI_VALIDATE_RET(X != NULL);
     if (X->n != 0) {
         mbedtls_mpi_core_shift_r(X->p, X->n, count);
     }
@@ -851,8 +944,6 @@
 int mbedtls_mpi_cmp_abs(const mbedtls_mpi *X, const mbedtls_mpi *Y)
 {
     size_t i, j;
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(Y != NULL);
 
     for (i = X->n; i > 0; i--) {
         if (X->p[i - 1] != 0) {
@@ -866,9 +957,8 @@
         }
     }
 
-    if (i == 0 && j == 0) {
-        return 0;
-    }
+    /* If i == j == 0, i.e. abs(X) == abs(Y),
+     * we end up returning 0 at the end of the function. */
 
     if (i > j) {
         return 1;
@@ -895,8 +985,6 @@
 int mbedtls_mpi_cmp_mpi(const mbedtls_mpi *X, const mbedtls_mpi *Y)
 {
     size_t i, j;
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(Y != NULL);
 
     for (i = X->n; i > 0; i--) {
         if (X->p[i - 1] != 0) {
@@ -947,10 +1035,9 @@
 {
     mbedtls_mpi Y;
     mbedtls_mpi_uint p[1];
-    MPI_VALIDATE_RET(X != NULL);
 
     *p  = mpi_sint_abs(z);
-    Y.s = (z < 0) ? -1 : 1;
+    Y.s = TO_SIGN(z);
     Y.n = 1;
     Y.p = p;
 
@@ -964,9 +1051,8 @@
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     size_t j;
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(A != NULL);
-    MPI_VALIDATE_RET(B != NULL);
+    mbedtls_mpi_uint *p;
+    mbedtls_mpi_uint c;
 
     if (X == B) {
         const mbedtls_mpi *T = A; A = X; B = T;
@@ -997,9 +1083,9 @@
 
     /* j is the number of non-zero limbs of B. Add those to X. */
 
-    mbedtls_mpi_uint *p = X->p;
+    p = X->p;
 
-    mbedtls_mpi_uint c = mbedtls_mpi_core_add(p, p, B->p, j);
+    c = mbedtls_mpi_core_add(p, p, B->p, j);
 
     p += j;
 
@@ -1027,9 +1113,6 @@
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     size_t n;
     mbedtls_mpi_uint carry;
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(A != NULL);
-    MPI_VALIDATE_RET(B != NULL);
 
     for (n = B->n; n > 0; n--) {
         if (B->p[n - 1] != 0) {
@@ -1081,9 +1164,6 @@
                        int flip_B)
 {
     int ret, s;
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(A != NULL);
-    MPI_VALIDATE_RET(B != NULL);
 
     s = A->s;
     if (A->s * B->s * flip_B < 0) {
@@ -1132,11 +1212,9 @@
 {
     mbedtls_mpi B;
     mbedtls_mpi_uint p[1];
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(A != NULL);
 
     p[0] = mpi_sint_abs(b);
-    B.s = (b < 0) ? -1 : 1;
+    B.s = TO_SIGN(b);
     B.n = 1;
     B.p = p;
 
@@ -1150,11 +1228,9 @@
 {
     mbedtls_mpi B;
     mbedtls_mpi_uint p[1];
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(A != NULL);
 
     p[0] = mpi_sint_abs(b);
-    B.s = (b < 0) ? -1 : 1;
+    B.s = TO_SIGN(b);
     B.n = 1;
     B.p = p;
 
@@ -1170,11 +1246,9 @@
     size_t i, j;
     mbedtls_mpi TA, TB;
     int result_is_zero = 0;
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(A != NULL);
-    MPI_VALIDATE_RET(B != NULL);
 
-    mbedtls_mpi_init_mempool(&TA); mbedtls_mpi_init_mempool(&TB);
+    mbedtls_mpi_init_mempool(&TA);
+    mbedtls_mpi_init_mempool(&TB);
 
     if (X == A) {
         MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&TA, A)); A = &TA;
@@ -1204,13 +1278,7 @@
     MBEDTLS_MPI_CHK(mbedtls_mpi_grow(X, i + j));
     MBEDTLS_MPI_CHK(mbedtls_mpi_lset(X, 0));
 
-    for (size_t k = 0; k < j; k++) {
-        /* We know that there cannot be any carry-out since we're
-         * iterating from bottom to top. */
-        (void) mbedtls_mpi_core_mla(X->p + k, i + 1,
-                                    A->p, i,
-                                    B->p[k]);
-    }
+    mbedtls_mpi_core_mul(X->p, A->p, i, B->p, j);
 
     /* If the result is 0, we don't shortcut the operation, which reduces
      * but does not eliminate side channels leaking the zero-ness. We do
@@ -1234,9 +1302,6 @@
  */
 int mbedtls_mpi_mul_int(mbedtls_mpi *X, const mbedtls_mpi *A, mbedtls_mpi_uint b)
 {
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(A != NULL);
-
     size_t n = A->n;
     while (n > 0 && A->p[n - 1] == 0) {
         --n;
@@ -1382,8 +1447,6 @@
     size_t i, n, t, k;
     mbedtls_mpi X, Y, Z, T1, T2;
     mbedtls_mpi_uint TP2[3];
-    MPI_VALIDATE_RET(A != NULL);
-    MPI_VALIDATE_RET(B != NULL);
 
     if (mbedtls_mpi_cmp_int(B, 0) == 0) {
         return MBEDTLS_ERR_MPI_DIVISION_BY_ZERO;
@@ -1506,10 +1569,9 @@
 {
     mbedtls_mpi B;
     mbedtls_mpi_uint p[1];
-    MPI_VALIDATE_RET(A != NULL);
 
     p[0] = mpi_sint_abs(b);
-    B.s = (b < 0) ? -1 : 1;
+    B.s = TO_SIGN(b);
     B.n = 1;
     B.p = p;
 
@@ -1522,9 +1584,6 @@
 int mbedtls_mpi_mod_mpi(mbedtls_mpi *R, const mbedtls_mpi *A, const mbedtls_mpi *B)
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    MPI_VALIDATE_RET(R != NULL);
-    MPI_VALIDATE_RET(A != NULL);
-    MPI_VALIDATE_RET(B != NULL);
 
     if (mbedtls_mpi_cmp_int(B, 0) < 0) {
         return MBEDTLS_ERR_MPI_NEGATIVE_VALUE;
@@ -1552,8 +1611,6 @@
 {
     size_t i;
     mbedtls_mpi_uint x, y, z;
-    MPI_VALIDATE_RET(r != NULL);
-    MPI_VALIDATE_RET(A != NULL);
 
     if (b == 0) {
         return MBEDTLS_ERR_MPI_DIVISION_BY_ZERO;
@@ -1604,18 +1661,17 @@
     return 0;
 }
 
-static void mpi_montg_init(mbedtls_mpi_uint *mm, const mbedtls_mpi *N)
+/**
+ * \remark Replaced by our own because the original has been removed since
+ *         mbedtls v3.6.0.
+*/
+void mbedtls_mpi_montg_init(mbedtls_mpi_uint *mm, const mbedtls_mpi *N)
 {
     *mm = mbedtls_mpi_core_montmul_init(N->p);
 }
 
-void mbedtls_mpi_montg_init( mbedtls_mpi_uint *mm, const mbedtls_mpi *N )
-{
-	mpi_montg_init( mm, N );
-}
-
 /** Montgomery multiplication: A = A * B * R^-1 mod N  (HAC 14.36)
- *
+ * 
  * \param[in,out]   A   One of the numbers to multiply.
  *                      It must have at least as many limbs as N
  *                      (A->n >= N->n), and any limbs beyond n are ignored.
@@ -1634,27 +1690,25 @@
  *                      Its initial content is unused and
  *                      its final content is indeterminate.
  *                      It does not get reallocated.
+ * \remark Replaced by our own because the original has been removed since
+ *         mbedtls v3.6.0.
  */
-static void mpi_montmul(mbedtls_mpi *A, const mbedtls_mpi *B,
+void mbedtls_mpi_montmul(mbedtls_mpi *A, const mbedtls_mpi *B,
                         const mbedtls_mpi *N, mbedtls_mpi_uint mm,
                         mbedtls_mpi *T)
 {
     mbedtls_mpi_core_montmul(A->p, A->p, B->p, B->n, N->p, N->n, mm, T->p);
 }
 
-void mbedtls_mpi_montmul(mbedtls_mpi *A, const mbedtls_mpi *B,
-                         const mbedtls_mpi *N, mbedtls_mpi_uint mm,
-                         mbedtls_mpi *T )
-{
-    mpi_montmul( A, B, N, mm, T);
-}
-
-/*
+/**
  * Montgomery reduction: A = A * R^-1 mod N
  *
- * See mpi_montmul() regarding constraints and guarantees on the parameters.
+ * See mbedtls_mpi_montmul() regarding constraints and guarantees on the parameters.
+ * 
+ * \remark Replaced by our own because the original has been removed since
+ *         mbedtls v3.6.0.
  */
-static void mpi_montred(mbedtls_mpi *A, const mbedtls_mpi *N,
+void mbedtls_mpi_montred(mbedtls_mpi *A, const mbedtls_mpi *N,
                         mbedtls_mpi_uint mm, mbedtls_mpi *T)
 {
     mbedtls_mpi_uint z = 1;
@@ -1663,13 +1717,7 @@
     U.n = U.s = (int) z;
     U.p = &z;
 
-    mpi_montmul(A, &U, N, mm, T);
-}
-
-void mbedtls_mpi_montred(mbedtls_mpi *A, const mbedtls_mpi *N,
-                         mbedtls_mpi_uint mm, mbedtls_mpi *T)
-{
-    mpi_montred(A, N, mm, T);
+    mbedtls_mpi_montmul(A, &U, N, mm, T);
 }
 
 /**
@@ -1693,10 +1741,8 @@
 
     for (size_t i = 0; i < T_size; i++) {
         MBEDTLS_MPI_CHK(mbedtls_mpi_safe_cond_assign(R, &T[i],
-                                                     (unsigned char) mbedtls_ct_size_bool_eq(i,
-                                                                                             idx)));
+                                                     (unsigned char) mbedtls_ct_uint_eq(i, idx)));
     }
-
 cleanup:
     return ret;
 }
@@ -1714,16 +1760,9 @@
     size_t bufsize, nbits;
     size_t exponent_bits_in_window = 0;
     mbedtls_mpi_uint ei, mm, state;
-    mbedtls_mpi RR, T, WW, Apos;
-    mbedtls_mpi *W;
-    const size_t array_size_W = 2 << MBEDTLS_MPI_WINDOW_SIZE;
+    mbedtls_mpi RR, T, W[(size_t) 1 << MBEDTLS_MPI_WINDOW_SIZE], WW, Apos;
     int neg;
 
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(A != NULL);
-    MPI_VALIDATE_RET(E != NULL);
-    MPI_VALIDATE_RET(N != NULL);
-
     if (mbedtls_mpi_cmp_int(N, 0) <= 0 || (N->p[0] & 1) == 0) {
         return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
     }
@@ -1740,10 +1779,11 @@
     /*
      * Init temps and window size
      */
-    mpi_montg_init(&mm, N);
-    mbedtls_mpi_init_mempool(&RR); mbedtls_mpi_init(&T);
-    mbedtls_mpi_init_mempool(&Apos);
-    mbedtls_mpi_init_mempool(&WW);
+    mbedtls_mpi_montg_init(&mm, N);
+    mbedtls_mpi_init(&RR); mbedtls_mpi_init(&T);
+    mbedtls_mpi_init(&Apos);
+    mbedtls_mpi_init(&WW);
+    memset(W, 0, sizeof(W));
 
     i = mbedtls_mpi_bitlen(E);
 
@@ -1758,14 +1798,6 @@
 
     const size_t w_table_used_size = (size_t) 1 << window_bitsize;
 
-    W = mempool_alloc(mbedtls_mpi_mempool,
-                      sizeof( mbedtls_mpi ) * array_size_W);
-    if (W == NULL) {
-        ret = MBEDTLS_ERR_MPI_ALLOC_FAILED;
-        goto cleanup;
-    }
-    for (i = 0; i < array_size_W; i++)
-        mbedtls_mpi_init_mempool(W + i);
     /*
      * This function is not constant-trace: its memory accesses depend on the
      * exponent value. To defend against timing attacks, callers (such as RSA
@@ -1780,8 +1812,9 @@
      * and squarings. Firstly, when multiplying by an element of the window
      * W[i], we do a constant-trace table lookup to obfuscate i. This leaves
      * squarings as having a different memory access patterns from other
-     * multiplications. So secondly, we put the accumulator X in the table as
-     * well, and also do a constant-trace table lookup to multiply by X.
+     * multiplications. So secondly, we put the accumulator in the table as
+     * well, and also do a constant-trace table lookup to multiply by the
+     * accumulator which is W[x_index].
      *
      * This way, all multiplications take the form of a lookup-and-multiply.
      * The number of lookup-and-multiply operations inside each iteration of
@@ -1794,21 +1827,20 @@
      * observe both memory accesses and branches. However, branch prediction
      * exploitation typically requires many traces of execution over the same
      * data, which is defeated by randomized blinding.
-     *
-     * To achieve this, we make a copy of X and we use the table entry in each
-     * calculation from this point on.
      */
     const size_t x_index = 0;
-    mbedtls_mpi_copy(&W[x_index], X);
+    mbedtls_mpi_init(&W[x_index]);
 
     j = N->n + 1;
-    /* All W[i] and X must have at least N->n limbs for the mpi_montmul()
-     * and mpi_montred() calls later. Here we ensure that W[1] and X are
-     * large enough, and later we'll grow other W[i] to the same length.
-     * They must not be shrunk midway through this function!
+    /* All W[i] including the accumulator must have at least N->n limbs for
+     * the mbedtls_mpi_montmul() and mbedtls_mpi_montred() calls later.
+     * Here we ensure that
+     * W[1] and the accumulator W[x_index] are large enough. later we'll grow
+     * other W[i] to the same length. They must not be shrunk midway through
+     * this function!
      */
     MBEDTLS_MPI_CHK(mbedtls_mpi_grow(&W[x_index], j));
-
+    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(&W[1],  j));
     MBEDTLS_MPI_CHK(mbedtls_mpi_grow(&T, j * 2));
 
     /*
@@ -1836,8 +1868,6 @@
         memcpy(&RR, prec_RR, sizeof(mbedtls_mpi));
     }
 
-    MBEDTLS_MPI_CHK(mbedtls_mpi_grow(&W[1],  j));
-
     /*
      * W[1] = A * R^2 * R^-1 mod N = A * R mod N
      */
@@ -1845,7 +1875,7 @@
         MBEDTLS_MPI_CHK(mbedtls_mpi_mod_mpi(&W[1], A, N));
         /* This should be a no-op because W[1] is already that large before
          * mbedtls_mpi_mod_mpi(), but it's necessary to avoid an overflow
-         * in mpi_montmul() below, so let's make sure. */
+         * in mbedtls_mpi_montmul() below, so let's make sure. */
         MBEDTLS_MPI_CHK(mbedtls_mpi_grow(&W[1], N->n + 1));
     } else {
         MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&W[1], A));
@@ -1853,13 +1883,13 @@
 
     /* Note that this is safe because W[1] always has at least N->n limbs
      * (it grew above and was preserved by mbedtls_mpi_copy()). */
-    mpi_montmul(&W[1], &RR, N, mm, &T);
+    mbedtls_mpi_montmul(&W[1], &RR, N, mm, &T);
 
     /*
      * W[x_index] = R^2 * R^-1 mod N = R mod N
      */
     MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&W[x_index], &RR));
-    mpi_montred(&W[x_index], N, mm, &T);
+    mbedtls_mpi_montred(&W[x_index], N, mm, &T);
 
 
     if (window_bitsize > 1) {
@@ -1879,7 +1909,7 @@
         MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&W[j], &W[1]));
 
         for (i = 0; i < window_bitsize - 1; i++) {
-            mpi_montmul(&W[j], &W[j], N, mm, &T);
+            mbedtls_mpi_montmul(&W[j], &W[j], N, mm, &T);
         }
 
         /*
@@ -1889,7 +1919,7 @@
             MBEDTLS_MPI_CHK(mbedtls_mpi_grow(&W[i], N->n + 1));
             MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&W[i], &W[i - 1]));
 
-            mpi_montmul(&W[i], &W[1], N, mm, &T);
+            mbedtls_mpi_montmul(&W[i], &W[1], N, mm, &T);
         }
     }
 
@@ -1925,7 +1955,7 @@
              * out of window, square W[x_index]
              */
             MBEDTLS_MPI_CHK(mpi_select(&WW, W, w_table_used_size, x_index));
-            mpi_montmul(&W[x_index], &WW, N, mm, &T);
+            mbedtls_mpi_montmul(&W[x_index], &WW, N, mm, &T);
             continue;
         }
 
@@ -1944,7 +1974,7 @@
             for (i = 0; i < window_bitsize; i++) {
                 MBEDTLS_MPI_CHK(mpi_select(&WW, W, w_table_used_size,
                                            x_index));
-                mpi_montmul(&W[x_index], &WW, N, mm, &T);
+                mbedtls_mpi_montmul(&W[x_index], &WW, N, mm, &T);
             }
 
             /*
@@ -1952,7 +1982,7 @@
              */
             MBEDTLS_MPI_CHK(mpi_select(&WW, W, w_table_used_size,
                                        exponent_bits_in_window));
-            mpi_montmul(&W[x_index], &WW, N, mm, &T);
+            mbedtls_mpi_montmul(&W[x_index], &WW, N, mm, &T);
 
             state--;
             nbits = 0;
@@ -1965,20 +1995,20 @@
      */
     for (i = 0; i < nbits; i++) {
         MBEDTLS_MPI_CHK(mpi_select(&WW, W, w_table_used_size, x_index));
-        mpi_montmul(&W[x_index], &WW, N, mm, &T);
+        mbedtls_mpi_montmul(&W[x_index], &WW, N, mm, &T);
 
         exponent_bits_in_window <<= 1;
 
         if ((exponent_bits_in_window & ((size_t) 1 << window_bitsize)) != 0) {
             MBEDTLS_MPI_CHK(mpi_select(&WW, W, w_table_used_size, 1));
-            mpi_montmul(&W[x_index], &WW, N, mm, &T);
+            mbedtls_mpi_montmul(&W[x_index], &WW, N, mm, &T);
         }
     }
 
     /*
      * W[x_index] = A^E * R * R^-1 mod N = A^E mod N
      */
-    mpi_montred(&W[x_index], N, mm, &T);
+    mbedtls_mpi_montred(&W[x_index], N, mm, &T);
 
     if (neg && E->n != 0 && (E->p[0] & 1) != 0) {
         W[x_index].s = -1;
@@ -1988,15 +2018,18 @@
     /*
      * Load the result in the output variable.
      */
-    mbedtls_mpi_copy(X, &W[x_index]);
+    MBEDTLS_MPI_CHK(mbedtls_mpi_copy(X, &W[x_index]));
 
 cleanup:
 
-    if (W)
-        for (i = 0; i < array_size_W; i++)
-            mbedtls_mpi_free(W + i);
-    mempool_free(mbedtls_mpi_mempool , W);
+    /* The first bit of the sliding window is always 1 and therefore the first
+     * half of the table was unused. */
+    for (i = w_table_used_size/2; i < w_table_used_size; i++) {
+        mbedtls_mpi_free(&W[i]);
+    }
 
+    mbedtls_mpi_free(&W[x_index]);
+    mbedtls_mpi_free(&W[1]);
     mbedtls_mpi_free(&T);
     mbedtls_mpi_free(&Apos);
     mbedtls_mpi_free(&WW);
@@ -2017,10 +2050,6 @@
     size_t lz, lzt;
     mbedtls_mpi TA, TB;
 
-    MPI_VALIDATE_RET(G != NULL);
-    MPI_VALIDATE_RET(A != NULL);
-    MPI_VALIDATE_RET(B != NULL);
-
     mbedtls_mpi_init_mempool(&TA); mbedtls_mpi_init_mempool(&TB);
 
     MBEDTLS_MPI_CHK(mbedtls_mpi_copy(&TA, A));
@@ -2131,9 +2160,6 @@
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     const size_t limbs = CHARS_TO_LIMBS(size);
 
-    MPI_VALIDATE_RET(X     != NULL);
-    MPI_VALIDATE_RET(f_rng != NULL);
-
     /* Ensure that target MPI has exactly the necessary number of limbs */
     MBEDTLS_MPI_CHK(mbedtls_mpi_resize_clear(X, limbs));
     if (size == 0) {
@@ -2177,9 +2203,6 @@
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     mbedtls_mpi G, TA, TU, U1, U2, TB, TV, V1, V2;
-    MPI_VALIDATE_RET(X != NULL);
-    MPI_VALIDATE_RET(A != NULL);
-    MPI_VALIDATE_RET(N != NULL);
 
     if (mbedtls_mpi_cmp_int(N, 1) <= 0) {
         return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
@@ -2265,29 +2288,30 @@
 
 #if defined(MBEDTLS_GENPRIME)
 
-static const int small_prime[] =
-{
-    3,    5,    7,   11,   13,   17,   19,   23,
-    29,   31,   37,   41,   43,   47,   53,   59,
-    61,   67,   71,   73,   79,   83,   89,   97,
-    101,  103,  107,  109,  113,  127,  131,  137,
-    139,  149,  151,  157,  163,  167,  173,  179,
-    181,  191,  193,  197,  199,  211,  223,  227,
-    229,  233,  239,  241,  251,  257,  263,  269,
-    271,  277,  281,  283,  293,  307,  311,  313,
-    317,  331,  337,  347,  349,  353,  359,  367,
-    373,  379,  383,  389,  397,  401,  409,  419,
-    421,  431,  433,  439,  443,  449,  457,  461,
-    463,  467,  479,  487,  491,  499,  503,  509,
-    521,  523,  541,  547,  557,  563,  569,  571,
-    577,  587,  593,  599,  601,  607,  613,  617,
-    619,  631,  641,  643,  647,  653,  659,  661,
-    673,  677,  683,  691,  701,  709,  719,  727,
-    733,  739,  743,  751,  757,  761,  769,  773,
-    787,  797,  809,  811,  821,  823,  827,  829,
-    839,  853,  857,  859,  863,  877,  881,  883,
-    887,  907,  911,  919,  929,  937,  941,  947,
-    953,  967,  971,  977,  983,  991,  997, -103
+/* Gaps between primes, starting at 3. https://oeis.org/A001223 */
+static const unsigned char small_prime_gaps[] = {
+    2, 2, 4, 2, 4, 2, 4, 6,
+    2, 6, 4, 2, 4, 6, 6, 2,
+    6, 4, 2, 6, 4, 6, 8, 4,
+    2, 4, 2, 4, 14, 4, 6, 2,
+    10, 2, 6, 6, 4, 6, 6, 2,
+    10, 2, 4, 2, 12, 12, 4, 2,
+    4, 6, 2, 10, 6, 6, 6, 2,
+    6, 4, 2, 10, 14, 4, 2, 4,
+    14, 6, 10, 2, 4, 6, 8, 6,
+    6, 4, 6, 8, 4, 8, 10, 2,
+    10, 2, 6, 4, 6, 8, 4, 2,
+    4, 12, 8, 4, 8, 4, 6, 12,
+    2, 18, 6, 10, 6, 6, 2, 6,
+    10, 6, 6, 2, 6, 6, 4, 2,
+    12, 10, 2, 4, 6, 6, 2, 12,
+    4, 6, 8, 10, 8, 10, 8, 6,
+    6, 4, 8, 6, 4, 8, 4, 14,
+    10, 12, 2, 10, 2, 4, 2, 10,
+    14, 4, 2, 4, 14, 4, 2, 4,
+    20, 4, 8, 10, 8, 4, 6, 6,
+    14, 4, 6, 6, 8, 6, /*reaches 997*/
+    0 /* the last entry is effectively unused */
 };
 
 /*
@@ -2304,20 +2328,20 @@
     int ret = 0;
     size_t i;
     mbedtls_mpi_uint r;
+    unsigned p = 3; /* The first odd prime */
 
     if ((X->p[0] & 1) == 0) {
         return MBEDTLS_ERR_MPI_NOT_ACCEPTABLE;
     }
 
-    for (i = 0; small_prime[i] > 0; i++) {
-        if (mbedtls_mpi_cmp_int(X, small_prime[i]) <= 0) {
-            return 1;
-        }
-
-        MBEDTLS_MPI_CHK(mbedtls_mpi_mod_int(&r, X, small_prime[i]));
-
+    for (i = 0; i < sizeof(small_prime_gaps); p += small_prime_gaps[i], i++) {
+        MBEDTLS_MPI_CHK(mbedtls_mpi_mod_int(&r, X, p));
         if (r == 0) {
-            return MBEDTLS_ERR_MPI_NOT_ACCEPTABLE;
+            if (mbedtls_mpi_cmp_int(X, p) == 0) {
+                return 1;
+            } else {
+                return MBEDTLS_ERR_MPI_NOT_ACCEPTABLE;
+            }
         }
     }
 
@@ -2336,9 +2360,6 @@
     size_t i, j, k, s;
     mbedtls_mpi W, R, T, A, RR;
 
-    MPI_VALIDATE_RET(X     != NULL);
-    MPI_VALIDATE_RET(f_rng != NULL);
-
     mbedtls_mpi_init_mempool(&W); mbedtls_mpi_init_mempool(&R);
     mbedtls_mpi_init_mempool(&T); mbedtls_mpi_init_mempool(&A);
     mbedtls_mpi_init_mempool(&RR);
@@ -2426,8 +2447,6 @@
 {
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
     mbedtls_mpi XX;
-    MPI_VALIDATE_RET(X     != NULL);
-    MPI_VALIDATE_RET(f_rng != NULL);
 
     XX.s = 1;
     XX.n = X->n;
@@ -2477,9 +2496,6 @@
     mbedtls_mpi_uint r;
     mbedtls_mpi Y;
 
-    MPI_VALIDATE_RET(X     != NULL);
-    MPI_VALIDATE_RET(f_rng != NULL);
-
     if (nbits < 3 || nbits > MBEDTLS_MPI_MAX_BITS) {
         return MBEDTLS_ERR_MPI_BAD_INPUT_DATA;
     }