Remove temporary allocation in mbedtls_mpi_sub_abs

When X == B, the code no longer needs an extra call to mbedtls_calloc,
provided that X->n >= A->n.  This reduces the number of calls to
mbedtls_calloc in test_suite_ecdsa by 52% and in test_suite_ecdh by 36%.

This commit does not update mpi_sub_hlp: in mpi_montmul, the same change
would translate to a performance decrease for RSA and DHM, since in all
cases there X == A.

Signed-off-by: Eric Adamson <eadamson@fastmail.fm>
diff --git a/library/bignum.c b/library/bignum.c
index d53aefd..63a656f 100644
--- a/library/bignum.c
+++ b/library/bignum.c
@@ -1353,9 +1353,7 @@
  */
 int mbedtls_mpi_sub_abs( mbedtls_mpi *X, const mbedtls_mpi *A, const mbedtls_mpi *B )
 {
-    mbedtls_mpi TB;
     int ret = MBEDTLS_ERR_ERROR_CORRUPTION_DETECTED;
-    size_t n;
     MPI_VALIDATE_RET( X != NULL );
     MPI_VALIDATE_RET( A != NULL );
     MPI_VALIDATE_RET( B != NULL );
@@ -1363,34 +1361,48 @@
     if( mbedtls_mpi_cmp_abs( A, B ) < 0 )
         return( MBEDTLS_ERR_MPI_NEGATIVE_VALUE );
 
-    mbedtls_mpi_init( &TB );
-
-    if( X == B )
-    {
-        MBEDTLS_MPI_CHK( mbedtls_mpi_copy( &TB, B ) );
-        B = &TB;
-    }
-
-    if( X != A )
-        MBEDTLS_MPI_CHK( mbedtls_mpi_copy( X, A ) );
-
     /*
      * X should always be positive as a result of unsigned subtractions.
      */
     X->s = 1;
-
     ret = 0;
 
-    for( n = B->n; n > 0; n-- )
-        if( B->p[n - 1] != 0 )
-            break;
+    MBEDTLS_MPI_CHK( mbedtls_mpi_grow( X, A->n ) );
 
-    mpi_sub_hlp( n, B->p, X->p );
+    size_t width = A->n > B->n ? B->n : A->n;
+    size_t i;
+    mbedtls_mpi_uint c = 0;
+
+    for( i = 0; i < width; i++ )
+    {
+        mbedtls_mpi_uint a = A->p[i];
+        mbedtls_mpi_uint b = B->p[i];
+        mbedtls_mpi_uint z = a < c;
+
+        X->p[i] = a - c;
+        c = ( X->p[i] <  b ) + z;
+        X->p[i] -= b;
+    }
+
+    for( ; i < A->n && c > 0; i++ )
+    {
+        mbedtls_mpi_uint a = A->p[i];
+        mbedtls_mpi_uint z = a < c;
+        X->p[i] = a - c;
+        c = z;
+    }
+
+    for( ; i < A->n; i++ )
+    {
+        X->p[i] = A->p[i];
+    }
+
+    for( ; i < X->n; i++ )
+    {
+        X->p[i] = 0;
+    }
 
 cleanup:
-
-    mbedtls_mpi_free( &TB );
-
     return( ret );
 }