Improve PBKDF2 with CMAC performance by ~16%

cmac_multiply_by_u now shifts the block 32 bits at a time instead of
byte by byte, making it ~10x faster; AES-CMAC benchmarks improve by ~2%.

Signed-off-by: Dave Rodgman <dave.rodgman@arm.com>
diff --git a/library/cmac.c b/library/cmac.c
index f40cae2..f9f606f 100644
--- a/library/cmac.c
+++ b/library/cmac.c
@@ -58,7 +58,7 @@
     const unsigned char R_128 = 0x87;
     const unsigned char R_64 = 0x1B;
     unsigned char R_n, mask;
-    unsigned char overflow = 0x00;
+    uint32_t overflow = 0x00;
     int i;
 
     if (blocksize == MBEDTLS_AES_BLOCK_SIZE) {
@@ -69,9 +69,12 @@
         return MBEDTLS_ERR_CIPHER_BAD_INPUT_DATA;
     }
 
-    for (i = (int) blocksize - 1; i >= 0; i--) {
-        output[i] = input[i] << 1 | overflow;
-        overflow = input[i] >> 7;
+    for (i = (int) blocksize - 4; i >= 0; i -= 4) {
+        uint32_t i32 = MBEDTLS_GET_UINT32_BE(&input[i], 0);
+        uint32_t new_overflow = i32 >> 31;
+        i32 = (i32 << 1) | overflow;
+        MBEDTLS_PUT_UINT32_BE(i32, &output[i], 0);
+        overflow = new_overflow;
     }
 
     /* mask = ( input[0] >> 7 ) ? 0xff : 0x00