tinycrypt: Add ARM assembler optimisations
For ARM Compiler 5 only, provide assembler versions of vli_add, vli_sub,
vli_rshift1 and muladd.
Signed-off-by: Kevin Bracey <kevin.bracey@arm.com>
diff --git a/tinycrypt/ecc.c b/tinycrypt/ecc.c
index aeb6447..01db4e6 100644
--- a/tinycrypt/ecc.c
+++ b/tinycrypt/ecc.c
@@ -69,6 +69,16 @@
#include <string.h>
#include "mbedtls/platform_util.h"
+#ifdef __CC_ARM
+#pragma diag_suppress 667 // strict diagnostic: "asm" function is nonstandard
+#endif
+
+#if defined MBEDTLS_HAVE_ASM
+#ifndef asm
+#define asm __asm
+#endif
+#endif
+
/* Parameters for curve NIST P-256 aka secp256r1 */
const uECC_word_t curve_p[NUM_ECC_WORDS] = {
BYTES_TO_WORDS_8(FF, FF, FF, FF, FF, FF, FF, FF),
@@ -322,6 +332,62 @@
/* Computes result = left - right, returning borrow, in constant time.
* Can modify in place. */
+#if defined MBEDTLS_HAVE_ASM && defined __CC_ARM
+__asm uECC_word_t uECC_vli_sub(uECC_word_t *result, const uECC_word_t *left,
+ const uECC_word_t *right)
+{
+#if NUM_ECC_WORDS != 8
+#error adjust ARM assembly to handle NUM_ECC_WORDS != 8
+#endif
+#if defined __thumb__ && __TARGET_ARCH_THUMB < 4
+ PUSH {r4-r6,lr}
+ FRAME PUSH {r4-r6,lr}
+ LDMIA r1!,{r3,r4}
+ LDMIA r2!,{r5,r6}
+ SUBS r3,r5
+ SBCS r4,r6
+ STMIA r0!,{r3,r4}
+ LDMIA r1!,{r3,r4}
+ LDMIA r2!,{r5,r6}
+ SBCS r3,r5
+ SBCS r4,r6
+ STMIA r0!,{r3,r4}
+ LDMIA r1!,{r3,r4}
+ LDMIA r2!,{r5,r6}
+ SBCS r3,r5
+ SBCS r4,r6
+ STMIA r0!,{r3,r4}
+ LDMIA r1!,{r3,r4}
+ LDMIA r2!,{r5,r6}
+ SBCS r3,r5
+ SBCS r4,r6
+ STMIA r0!,{r3,r4}
+ SBCS r0,r0 // r0 := r0 - r0 - borrow = -borrow
+ RSBS r0,r0,#0 // r0 := borrow
+ POP {r4-r6,pc}
+#else
+ PUSH {r4-r8,lr}
+ FRAME PUSH {r4-r8,lr}
+ LDMIA r1!,{r3-r6}
+ LDMIA r2!,{r7,r8,r12,lr}
+ SUBS r3,r7
+ SBCS r4,r8
+ SBCS r5,r12
+ SBCS r6,lr
+ STMIA r0!,{r3-r6}
+ LDMIA r1!,{r3-r6}
+ LDMIA r2!,{r7,r8,r12,lr}
+ SBCS r3,r7
+ SBCS r4,r8
+ SBCS r5,r12
+ SBCS r6,lr
+ STMIA r0!,{r3-r6}
+ SBCS r0,r0 // r0 := r0 - r0 - borrow = -borrow
+ RSBS r0,r0,#0 // r0 := borrow
+ POP {r4-r8,pc}
+#endif
+}
+#else
uECC_word_t uECC_vli_sub(uECC_word_t *result, const uECC_word_t *left,
const uECC_word_t *right)
{
@@ -336,9 +402,66 @@
}
return borrow;
}
+#endif
/* Computes result = left + right, returning carry, in constant time.
* Can modify in place. */
+#if defined MBEDTLS_HAVE_ASM && defined __CC_ARM
+static __asm uECC_word_t uECC_vli_add(uECC_word_t *result, const uECC_word_t *left,
+ const uECC_word_t *right)
+{
+#if NUM_ECC_WORDS != 8
+#error adjust ARM assembly to handle NUM_ECC_WORDS != 8
+#endif
+#if defined __thumb__ && __TARGET_ARCH_THUMB < 4
+ PUSH {r4-r6,lr}
+ FRAME PUSH {r4-r6,lr}
+ LDMIA r1!,{r3,r4}
+ LDMIA r2!,{r5,r6}
+ ADDS r3,r5
+ ADCS r4,r6
+ STMIA r0!,{r3,r4}
+ LDMIA r1!,{r3,r4}
+ LDMIA r2!,{r5,r6}
+ ADCS r3,r5
+ ADCS r4,r6
+ STMIA r0!,{r3,r4}
+ LDMIA r1!,{r3,r4}
+ LDMIA r2!,{r5,r6}
+ ADCS r3,r5
+ ADCS r4,r6
+ STMIA r0!,{r3,r4}
+ LDMIA r1!,{r3,r4}
+ LDMIA r2!,{r5,r6}
+ ADCS r3,r5
+ ADCS r4,r6
+ STMIA r0!,{r3,r4}
+ MOVS r0,#0 // does not affect C flag
+ ADCS r0,r0 // r0 := 0 + 0 + C = carry
+ POP {r4-r6,pc}
+#else
+ PUSH {r4-r8,lr}
+ FRAME PUSH {r4-r8,lr}
+ LDMIA r1!,{r3-r6}
+ LDMIA r2!,{r7,r8,r12,lr}
+ ADDS r3,r7
+ ADCS r4,r8
+ ADCS r5,r12
+ ADCS r6,lr
+ STMIA r0!,{r3-r6}
+ LDMIA r1!,{r3-r6}
+ LDMIA r2!,{r7,r8,r12,lr}
+ ADCS r3,r7
+ ADCS r4,r8
+ ADCS r5,r12
+ ADCS r6,lr
+ STMIA r0!,{r3-r6}
+ MOVS r0,#0 // does not affect C flag
+ ADCS r0,r0 // r0 := 0 + 0 + C = carry
+ POP {r4-r8,pc}
+#endif
+}
+#else
static uECC_word_t uECC_vli_add(uECC_word_t *result, const uECC_word_t *left,
const uECC_word_t *right)
{
@@ -352,6 +475,7 @@
}
return carry;
}
+#endif
cmpresult_t uECC_vli_cmp(const uECC_word_t *left, const uECC_word_t *right)
{
@@ -362,6 +486,46 @@
}
/* Computes vli = vli >> 1. */
+#if defined MBEDTLS_HAVE_ASM && defined __CC_ARM
+static __asm void uECC_vli_rshift1(uECC_word_t *vli)
+{
+#if defined __thumb__ && __TARGET_ARCH_THUMB < 4
+// RRX instruction is not available, so although we
+// can use C flag, it's not that effective. Does at
+// least save one working register, meaning we don't need stack
+ MOVS r3,#0 // initial carry = 0
+ MOVS r2,#__cpp(4 * (NUM_ECC_WORDS - 1))
+01 LDR r1,[r0,r2]
+ LSRS r1,r1,#1 // r2 = word >> 1
+ ORRS r1,r3 // merge in the previous carry
+ STR r1,[r0,r2]
+ ADCS r3,r3 // put C into bottom bit of r3
+ LSLS r3,r3,#31 // shift it up to the top ready for next word
+ SUBS r2,r2,#4
+ BPL %B01
+ BX lr
+#else
+#if NUM_ECC_WORDS != 8
+#error adjust ARM assembly to handle NUM_ECC_WORDS != 8
+#endif
+// Smooth multiword operation, lots of 32-bit instructions
+ ADDS r0,#32
+ LDMDB r0,{r1-r3,ip}
+ LSRS ip,ip,#1
+ RRXS r3,r3
+ RRXS r2,r2
+ RRXS r1,r1
+ STMDB r0!,{r1-r3,ip}
+ LDMDB r0,{r1-r3,ip}
+ RRXS ip,ip
+ RRXS r3,r3
+ RRXS r2,r2
+ RRX r1,r1
+ STMDB r0!,{r1-r3,ip}
+ BX lr
+#endif
+}
+#else
static void uECC_vli_rshift1(uECC_word_t *vli)
{
uECC_word_t *end = vli;
@@ -374,6 +538,7 @@
carry = temp << (uECC_WORD_BITS - 1);
}
}
+#endif
/* Compute a * b + r, where r is a double-word with high-order word r1 and
* low-order word r0, and store the result in the same double-word (r1, r0),
@@ -385,6 +550,50 @@
* [out] r0, r1: low and high-order words of the result
* [out] r2: carry
*/
+#if defined MBEDTLS_HAVE_ASM && defined __CC_ARM
+static __asm void muladd(uECC_word_t a, uECC_word_t b, uECC_word_t *r0,
+ uECC_word_t *r1, uECC_word_t *r2)
+{
+#if defined __thumb__ && __TARGET_ARCH_THUMB < 4
+ IMPORT __ARM_common_ll_muluu
+ PRESERVE8
+ // have to save r4 to keep stack 8-byte aligned, but then
+ // may as well make use of it - helps a bit having it
+ PUSH {r2,r3,r4,lr}
+ FRAME SAVE {r4,lr},-8
+ FRAME ADDRESS sp,16
+ BL __ARM_common_ll_muluu
+ POP {r2,r3}
+ FRAME ADDRESS sp,8
+ LDR r4,[r2]
+ ADDS r4,r0
+ STR r4,[r2]
+ LDR r4,[r3]
+ ADCS r4,r1
+ STR r4,[r3]
+ LDR r3,[sp,#8]
+ LDR r4,[r3]
+ MOVS r0,#0
+ ADCS r4,r0
+ STR r4,[r3]
+ POP {r4,pc}
+#else
+ UMULL r0,r1,r0,r1
+ LDR ip,[r2]
+ ADDS r0,r0,ip
+ STR r0,[r2]
+ LDR r0,[r3]
+ ADCS r0,r1
+ STR r0,[r3]
+ LDR r3,[sp,#0]
+ LDR r0,[r3]
+ ADC r0,r0,#0
+ STR r0,[r3]
+ BX lr
+#endif
+
+}
+#else
static void muladd(uECC_word_t a, uECC_word_t b, uECC_word_t *r0,
uECC_word_t *r1, uECC_word_t *r2)
{
@@ -397,6 +606,7 @@
*r0 = (uECC_word_t)r01;
}
+#endif
/* State for implementing random delays in uECC_vli_mult_rnd().
*