Add bignum_new.c starting with MPI_CORE(montmul) for Montgomery multiplication

Signed-off-by: Tom Cosgrove <tom.cosgrove@arm.com>
diff --git a/include/mbedtls/mbedtls_config.h b/include/mbedtls/mbedtls_config.h
index ad0e599..cfa5e46 100644
--- a/include/mbedtls/mbedtls_config.h
+++ b/include/mbedtls/mbedtls_config.h
@@ -2016,6 +2016,7 @@
  *          library/bignum_core.c
  *          library/bignum_mod.c
  *          library/bignum_mod_raw.c
+ *          library/bignum_new.c
  * Caller:  library/dhm.c
  *          library/ecp.c
  *          library/ecdsa.c
diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt
index 378cfb4..7e28df0 100644
--- a/library/CMakeLists.txt
+++ b/library/CMakeLists.txt
@@ -21,6 +21,7 @@
     bignum_core.c
     bignum_mod.c
     bignum_mod_raw.c
+    bignum_new.c
     camellia.c
     ccm.c
     chacha20.c
diff --git a/library/Makefile b/library/Makefile
index 85cea6b..041b5af 100644
--- a/library/Makefile
+++ b/library/Makefile
@@ -86,6 +86,7 @@
 	     bignum_core.o \
 	     bignum_mod.o \
 	     bignum_mod_raw.o \
+	     bignum_new.o \
 	     camellia.o \
 	     ccm.o \
 	     chacha20.o \
diff --git a/library/bignum_core.h b/library/bignum_core.h
index 1e159a7..62d0151 100644
--- a/library/bignum_core.h
+++ b/library/bignum_core.h
@@ -172,8 +172,107 @@
  *
  * \return c            The carry at the end of the operation.
  */
-mbedtls_mpi_uint mbedtls_mpi_core_mla( mbedtls_mpi_uint *d, size_t d_len ,
+mbedtls_mpi_uint mbedtls_mpi_core_mla( mbedtls_mpi_uint *d, size_t d_len,
                                        const mbedtls_mpi_uint *s, size_t s_len,
                                        mbedtls_mpi_uint b );
 
+#define MPI_CORE(func) mbedtls_mpi_core_ ## func ## _minimal /* MPI_CORE(x) -> mbedtls_mpi_core_x_minimal */
+
+/** Montgomery multiplication: X = A * B * R^-1 mod N  (HAC 14.36)
+ *
+ * \param[out]     X      The destination MPI, as a little-endian array of \p n limbs.
+ *                        On successful completion, X contains the result of
+ *                        the multiplication A * B * R^-1 mod N where
+ *                        R = (2^biL)^n.
+ * \param[in]      A      Little-endian representation of the first operand.
+ *                        Must have exactly \p n limbs.
+ * \param[in]      B      Little-endian representation of the second operand.
+ * \param[in]      B_len  The number of limbs in \p B.
+ * \param[in]      N      Little-endian representation of the modulus.
+ *                        This must be odd and have exactly \p n limbs.
+ * \param[in]      n      The number of limbs in \p X, \p A, \p N.
+ * \param          mm     The Montgomery constant for \p N: -N^-1 mod 2^biL.
+ *                        This can be calculated by `mpi_montg_init()`.
+ * \param[in,out]  T      Temporary storage of size at least 2*n+1 limbs.
+ *                        Its initial content is unused and
+ *                        its final content is indeterminate.
+ */
+void MPI_CORE(montmul)( mbedtls_mpi_uint *X,
+                        const mbedtls_mpi_uint *A,
+                        const mbedtls_mpi_uint *B, size_t B_len,
+                        const mbedtls_mpi_uint *N, size_t n,
+                        mbedtls_mpi_uint mm, mbedtls_mpi_uint *T );
+
+/**
+ * \brief Perform a known-size multiply accumulate operation
+ *
+ * Add \p b * \p s to \p d.
+ *
+ * \param[in,out] d     The pointer to the (little-endian) array
+ *                      representing the bignum to accumulate onto.
+ * \param d_len         The number of limbs of \p d. This must be
+ *                      at least \p s_len.
+ * \param[in] s         The pointer to the (little-endian) array
+ *                      representing the bignum to multiply with.
+ *                      This may be the same as \p d. Otherwise,
+ *                      it must be disjoint from \p d.
+ * \param s_len         The number of limbs of \p s.
+ * \param b             A scalar to multiply with.
+ *
+ * \return c            The carry at the end of the operation.
+ */
+mbedtls_mpi_uint MPI_CORE(mla)( mbedtls_mpi_uint *d, size_t d_len,
+                                const mbedtls_mpi_uint *s, size_t s_len,
+                                mbedtls_mpi_uint b );
+
+/**
+ * \brief Subtract two known-size large unsigned integers, returning the borrow.
+ *
+ * Calculate l - r where l and r have the same size.
+ * This function operates modulo (2^biL)^n and returns the borrow
+ * (1 if there was a wraparound, i.e. if `l < r`, and 0 otherwise).
+ *
+ * d may be aliased to l or r.
+ *
+ * \param[out] d        The result of the subtraction (little-endian limbs).
+ * \param[in] l         The left operand (little-endian limbs).
+ * \param[in] r         The right operand (little-endian limbs).
+ * \param n             Number of limbs of \p d, \p l and \p r.
+ *
+ * \return              1 if `l < r`.
+ *                      0 if `l >= r`.
+ */
+mbedtls_mpi_uint MPI_CORE(sub)( mbedtls_mpi_uint *d,
+                                const mbedtls_mpi_uint *l,
+                                const mbedtls_mpi_uint *r,
+                                size_t n );
+
+/**
+ * \brief Constant-time conditional addition of two known-size large unsigned
+ *        integers, returning the carry.
+ *
+ * Functionally equivalent to
+ *
+ * ```
+ * if( cond )
+ *    d += r;
+ * return carry;
+ * ```
+ *
+ * \param[in,out] d     The pointer to the (little-endian) array
+ *                      representing the bignum to accumulate onto.
+ * \param[in] r         The pointer to the (little-endian) array
+ *                      representing the bignum to conditionally add
+ *                      to \p d. This must be disjoint from \p d.
+ * \param n             Number of limbs of \p d and \p r.
+ * \param cond          Condition bit dictating whether addition should
+ *                      happen or not. This must be \c 0 or \c 1.
+ *
+ * \return              1 if `d + cond*r >= (2^{biL})^n`, 0 otherwise.
+ */
+mbedtls_mpi_uint MPI_CORE(add_if)( mbedtls_mpi_uint *d,
+                                   const mbedtls_mpi_uint *r,
+                                   size_t n,
+                                   unsigned cond );
+
 #endif /* MBEDTLS_BIGNUM_CORE_H */
diff --git a/library/bignum_new.c b/library/bignum_new.c
new file mode 100644
index 0000000..29c1212
--- /dev/null
+++ b/library/bignum_new.c
@@ -0,0 +1,126 @@
+/*
+ *  Multi-precision integer library
+ *
+ *  Copyright The Mbed TLS Contributors
+ *  SPDX-License-Identifier: Apache-2.0
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License"); you may
+ *  not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ *  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include "common.h"
+
+#if defined(MBEDTLS_BIGNUM_C)
+
+#include "mbedtls/bignum.h"
+#include "bignum_core.h"
+#include "bn_mul.h"
+
+#include <string.h>
+
+void MPI_CORE(montmul)( mbedtls_mpi_uint *X,
+                        const mbedtls_mpi_uint *A,
+                        const mbedtls_mpi_uint *B,
+                        size_t B_len,
+                        const mbedtls_mpi_uint *N,
+                        size_t n,
+                        mbedtls_mpi_uint mm,
+                        mbedtls_mpi_uint *T )
+{
+    memset( T, 0, (2*n+1)*ciL );   /* clear all 2*n+1 scratch limbs */
+    /* One pass per limb of A; T++ drops the low limb (the "/ 2^biL" step) */
+    for( size_t i = 0; i < n; i++, T++ )
+    {
+        mbedtls_mpi_uint u0, u1;
+        /* T = (T + u0*B + u1*N) / 2^biL */
+        u0 = A[i];
+        u1 = ( T[0] + u0 * B[0] ) * mm;   /* multiplier for N, derived via mm */
+
+        (void) MPI_CORE(mla)( T, n + 2, B, B_len, u0 );   /* T += u0 * B */
+        (void) MPI_CORE(mla)( T, n + 2, N, n, u1 );       /* T += u1 * N */
+    }
+    /* T now points n limbs into the scratch buffer; reduce it into X */
+    mbedtls_mpi_uint carry, borrow, fixup;
+
+    carry  = T[n];                             /* limb just above T[0..n-1] */
+    borrow = MPI_CORE(sub)( X, T, N, n );      /* X = T - N over n limbs */
+    fixup  = carry < borrow;                   /* 1 only if carry 0, borrow 1 */
+    (void) MPI_CORE(add_if)( X, N, n, fixup ); /* add N back to X if fixup */
+}
+
+mbedtls_mpi_uint MPI_CORE(mla)( mbedtls_mpi_uint *d, size_t d_len,
+                                const mbedtls_mpi_uint *s, size_t s_len,
+                                mbedtls_mpi_uint b )
+{
+    mbedtls_mpi_uint c = 0; /* carry */
+    if( d_len < s_len )     /* never process more limbs of s than d holds */
+        s_len = d_len;
+    size_t excess_len = d_len - s_len; /* limbs of d handled by the last loop */
+    size_t steps_x8 = s_len / 8;       /* number of 8-limb batches */
+    size_t steps_x1 = s_len & 7;       /* leftover limbs (s_len mod 8) */
+    /* NOTE(review): MULADDC_* presumably come from bn_mul.h - confirm there */
+    while( steps_x8-- )
+    {
+        MULADDC_X8_INIT
+        MULADDC_X8_CORE
+        MULADDC_X8_STOP
+    }
+
+    while( steps_x1-- )
+    {
+        MULADDC_X1_INIT
+        MULADDC_X1_CORE
+        MULADDC_X1_STOP
+    }
+    /* Add c into the next excess_len limbs of d, propagating any overflow */
+    while( excess_len-- )
+    {
+        *d += c; c = ( *d < c ); d++;
+    }
+
+    return( c );
+}
+
+mbedtls_mpi_uint MPI_CORE(sub)( mbedtls_mpi_uint *d,
+                                const mbedtls_mpi_uint *l,
+                                const mbedtls_mpi_uint *r,
+                                size_t n )
+{
+    mbedtls_mpi_uint c = 0, t, z; /* c: borrow passed between limbs */
+    /* Limb 0 is processed first, so the borrow moves towards higher indices */
+    for( size_t i = 0; i < n; i++ )
+    {
+        z = ( l[i] <  c );    t = l[i] - c;     /* take off incoming borrow */
+        c = ( t < r[i] ) + z; d[i] = t - r[i];  /* subtract r[i]; new borrow */
+    }
+
+    return( c );
+}
+
+mbedtls_mpi_uint MPI_CORE(add_if)( mbedtls_mpi_uint *d,
+                                   const mbedtls_mpi_uint *r,
+                                   size_t n,
+                                   unsigned cond )
+{
+    mbedtls_mpi_uint c = 0, t;   /* c: carry passed between limbs */
+    for( size_t i = 0; i < n; i++ )
+    {
+        mbedtls_mpi_uint add = cond * r[i];  /* r[i] if cond is 1, 0 if cond is 0 */
+        t  = c;
+        t += d[i]; c  = ( t < d[i] );   /* wrapped iff sum is below the addend */
+        t += add;  c += ( t < add  );
+        d[i] = t;
+    }
+    return( c );   /* carry out of the last limb processed */
+}
+
+#endif /* MBEDTLS_BIGNUM_C */