Add 2-fold unrolled assembly for umaal-based multiplication

Signed-off-by: Hanno Becker <hanno.becker@arm.com>
diff --git a/library/bn_mul.h b/library/bn_mul.h
index 52eb70a..275be60 100644
--- a/library/bn_mul.h
+++ b/library/bn_mul.h
@@ -711,19 +711,57 @@
     defined (__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1)
 
 #define MULADDC_X1_INIT                            \
-    asm(
+    { /* scope for the scratch variables; the brace is closed by MULADDC_X1_STOP */ \
+        mbedtls_mpi_uint tmp_a, tmp_b; /* bound to asm operands [a] and [b] in MULADDC_X1_STOP */ \
+        asm volatile ( /* left open: MULADDC_X1_STOP supplies the operand lists and the closing paren */
 
-#define MULADDC_X1_CORE                         \
-            "ldr    r0, [%0], #4        \n\t"   \
-            "ldr    r1, [%1]            \n\t"   \
-            "umaal  r1, %2, %3, r0      \n\t"   \
-            "str    r1, [%1], #4        \n\t"
+#define MULADDC_X1_CORE /* one limb: carry:acc[0] = scalar * in[0] + acc[0] + carry */ \
+           ".p2align  2                                 \n\t" /* align to 2^2 = 4 bytes */ \
+            "ldr.w    %[a], [%[in]], #4                 \n\t" /* a = in[0], then in += 4 bytes */ \
+            "ldr.w    %[b], [%[acc]]                    \n\t" /* b = acc[0] */ \
+            "umaal    %[b], %[carry], %[scalar], %[a]   \n\t" /* carry:b = scalar * a + b + carry */ \
+            "str.w    %[b], [%[acc]], #4                \n\t" /* acc[0] = b, then acc += 4 bytes */
 
-#define MULADDC_X1_STOP                         \
-         : "=r" (s),  "=r" (d), "=r" (c)        \
-         : "r" (b), "0" (s), "1" (d), "2" (c)   \
-         : "r0", "r1", "memory"                 \
-         );
+#define MULADDC_X1_STOP /* operand lists; uses s, d, c, b from the expansion site */ \
+            : [a]      "=&r" (tmp_a), /* early-clobber scratch output */ \
+              [b]      "=&r" (tmp_b), /* early-clobber scratch output */ \
+              [in]     "+r"  (s), /* read-write: the asm advances this pointer */ \
+              [acc]    "+r"  (d), /* read-write: the asm advances this pointer */ \
+              [carry]  "+l"  (c) /* read-write; NOTE(review): "l" is presumably r0-r7 in Thumb state - confirm */ \
+            : [scalar] "r"   (b) /* input only */ \
+            : "memory" /* the asm template loads from and stores to memory */ \
+        ); /* closes the asm statement opened by MULADDC_X1_INIT */ \
+    } /* closes the scope opened by MULADDC_X1_INIT */
+
+#define MULADDC_X2_INIT /* 2-limb variant of MULADDC_X1_INIT */ \
+    { /* scope for the scratch variables; the brace is closed by MULADDC_X2_STOP */ \
+        mbedtls_mpi_uint tmp_a0, tmp_b0; /* bound to asm operands [a0] and [b0] in MULADDC_X2_STOP */ \
+        mbedtls_mpi_uint tmp_a1, tmp_b1; /* bound to asm operands [a1] and [b1] in MULADDC_X2_STOP */ \
+        asm volatile ( /* left open: MULADDC_X2_STOP supplies the operand lists and the closing paren */
+
+#define MULADDC_X2_CORE /* two limbs per expansion; carry chains from the first umaal into the second */ \
+           ".p2align  2                                   \n\t" /* align to 2^2 = 4 bytes */ \
+            "ldr.w    %[a0], [%[in]],  #+8                \n\t" /* a0 = in[0], then in += 8 bytes */ \
+            "ldr.w    %[b0], [%[acc]], #+8                \n\t" /* b0 = acc[0], then acc += 8 bytes */ \
+            "ldr.w    %[a1], [%[in],  #-4]                \n\t" /* a1 = second input limb (in is already advanced) */ \
+            "ldr.w    %[b1], [%[acc], #-4]                \n\t" /* b1 = second accumulator limb */ \
+            "umaal    %[b0], %[carry], %[scalar], %[a0]   \n\t" /* carry:b0 = scalar * a0 + b0 + carry */ \
+            "umaal    %[b1], %[carry], %[scalar], %[a1]   \n\t" /* carry:b1 = scalar * a1 + b1 + carry */ \
+            "str.w    %[b0], [%[acc], #-8]                \n\t" /* write back first limb; acc unchanged */ \
+            "str.w    %[b1], [%[acc], #-4]                \n\t" /* write back second limb; acc unchanged */
+
+#define MULADDC_X2_STOP /* operand lists; uses s, d, c, b from the expansion site */ \
+            : [a0]     "=&r" (tmp_a0), /* early-clobber scratch output */ \
+              [b0]     "=&r" (tmp_b0), /* early-clobber scratch output */ \
+              [a1]     "=&r" (tmp_a1), /* early-clobber scratch output */ \
+              [b1]     "=&r" (tmp_b1), /* early-clobber scratch output */ \
+              [in]     "+r"  (s), /* read-write: the asm advances this pointer */ \
+              [acc]    "+r"  (d), /* read-write: the asm advances this pointer */ \
+              [carry]  "+l"  (c) /* read-write; NOTE(review): "l" is presumably r0-r7 in Thumb state - confirm */ \
+            : [scalar] "r"   (b) /* input only */ \
+            : "memory" /* the asm template loads from and stores to memory */ \
+        ); /* closes the asm statement opened by MULADDC_X2_INIT */ \
+    } /* closes the scope opened by MULADDC_X2_INIT */
 
 #else