Optimise final 2 rounds

Final two rounds logic could be significantly simplified.

Signed-off-by: Paul Elliott <paul.elliott@arm.com>
diff --git a/library/ecp_curves.c b/library/ecp_curves.c
index 1c797d8..6ee3641 100644
--- a/library/ecp_curves.c
+++ b/library/ecp_curves.c
@@ -5487,7 +5487,7 @@
 MBEDTLS_STATIC_TESTABLE
 int mbedtls_ecp_mod_p448(mbedtls_mpi_uint *X, size_t X_limbs)
 {
-    size_t i, round;
+    size_t round;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
 
     if (X_limbs <= P448_WIDTH) {
@@ -5567,32 +5567,23 @@
      * A0 + A1 + B1 + (B0 + B1) * 2^224 = A0 + A1 + B0 * 2^224. */
     for (round = 0; round < 2; ++round) {
 
-        /* Q = A1 */
-        memset(Q, 0, (Q_limbs * ciL));
-        memcpy(Q, X + P448_WIDTH, ((Q_limbs - 1) * ciL));
+        /* M = A1 */
+        memset(M, 0, (M_limbs * ciL));
+        memcpy(M, X + P448_WIDTH, ((M_limbs - 1) * ciL));
 
         /* X = A0 */
         memset(X + P448_WIDTH, 0, ((M_limbs - 1) * ciL));
 
-        /* M = B0 */
-        memcpy(M, Q, (Q_limbs * ciL));
-        M[M_limbs - 1] = 0;
-
-        if (ciL > 4) {
-            M[P224_WIDTH_MIN] &= ((mbedtls_mpi_uint) -1) >> (P224_UNUSED_BITS);
-        }
-
-        /* M = B0 * 2^224
-         * Oversize M once again takes any carry. */
-        memmove((char *) M + P224_SIZE, M, P224_SIZE + ciL);
-        memset(M, 0, P224_SIZE);
-
         /* M = A1 + B0 * 2^224
-         * No need to have to call mbedtls_mpi_core_add() as as both bignums
-         * should be all zero except one non-colliding limb each. */
-        for (i = 0; i < (M_limbs - 1); ++i) {
-            M[i] = M[i] + Q[i];
-        }
+         * We know that only one limb of A1 will be non-zero and that it will be
+         * limb 0. We also know that B0 is the bottom 224 bits of A1 (which is
+         * then shifted up 224 bits), so, given M is currently A1 this turns
+         * into:
+         * M = M + (M << 224)
+         * As the single non-zero limb in B0 will be A1 limb 0 shifted up by 224
+         * bits, we can just move that into the right place, shifted up
+         * accordingly.*/
+        M[P224_WIDTH_MIN] = M[0] << (224 & (biL - 1));
 
         /* X = A0 + (A1 + B0 * 2^224) */
         (void) mbedtls_mpi_core_add(X, X, M, M_limbs);