Improve SCA CM AES performance
The SCA CM implementation caused an AES performance drop. For example,
AES-CCM-128 calculation speed dropped from 240 KB/s to 111 KB/s
(-54%). Similarly, AES-CBC-128 calculation speed dropped from
536 KB/s to 237 KB/s (-56%).
Use functions instead of macros to reduce code indirection and
therefore increase performance. With this change the performance is
163 KB/s for AES-CCM-128 (-32%) and 348 KB/s for AES-CBC-128 (-35%)
relative to the original AES implementation. When SCA countermeasures
are activated the performance is 122 KB/s for AES-CCM-128 (-49%) and
258 KB/s for AES-CBC-128 (-52%), again compared to the original AES
implementation.
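
A minimal sketch of the pattern applied in the diff below, assuming
nothing beyond standard C (ROUND_STEP and round_step are illustrative
names, not mbed TLS identifiers; the real aes_fround()/aes_rround()
operate on four 32-bit state words and the AES lookup tables): a round
macro that advances the round-key pointer becomes a static function
that returns the advanced pointer for the caller to store.

    #include <stdint.h>

    /* Macro version: expands at every call site and mutates R in place. */
    #define ROUND_STEP( R, X, Y )  do { (X) = *(R)++ ^ (Y); } while( 0 )

    /* Function version: a single copy of the code; the caller stores the
     * advanced round-key pointer returned by the function. */
    static uint32_t *round_step( uint32_t *R, uint32_t *X, uint32_t Y )
    {
        *X = *R++ ^ Y;
        return( R );
    }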
diff --git a/library/aes.c b/library/aes.c
index 4dacd64..9c5c169 100644
--- a/library/aes.c
+++ b/library/aes.c
@@ -928,106 +928,63 @@
#endif /* !MBEDTLS_AES_SETKEY_DEC_ALT */
-#define AES_FROUND(R,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \
- do \
- { \
- (X0) = *(R)++ ^ AES_FT0( ( (Y0) ) & 0xFF ) ^ \
- AES_FT1( ( (Y1) >> 8 ) & 0xFF ) ^ \
- AES_FT2( ( (Y2) >> 16 ) & 0xFF ) ^ \
- AES_FT3( ( (Y3) >> 24 ) & 0xFF ); \
- \
- (X1) = *(R)++ ^ AES_FT0( ( (Y1) ) & 0xFF ) ^ \
- AES_FT1( ( (Y2) >> 8 ) & 0xFF ) ^ \
- AES_FT2( ( (Y3) >> 16 ) & 0xFF ) ^ \
- AES_FT3( ( (Y0) >> 24 ) & 0xFF ); \
- \
- (X2) = *(R)++ ^ AES_FT0( ( (Y2) ) & 0xFF ) ^ \
- AES_FT1( ( (Y3) >> 8 ) & 0xFF ) ^ \
- AES_FT2( ( (Y0) >> 16 ) & 0xFF ) ^ \
- AES_FT3( ( (Y1) >> 24 ) & 0xFF ); \
- \
- (X3) = *(R)++ ^ AES_FT0( ( (Y3) ) & 0xFF ) ^ \
- AES_FT1( ( (Y0) >> 8 ) & 0xFF ) ^ \
- AES_FT2( ( (Y1) >> 16 ) & 0xFF ) ^ \
- AES_FT3( ( (Y2) >> 24 ) & 0xFF ); \
- } while( 0 )
-
-#define AES_FROUND_F(R,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \
- do { \
- (X0) = *(R)++ ^ ( (uint32_t) FSb[ ( (Y0) ) & 0xFF ] ) ^ \
- ( (uint32_t) FSb[ ( (Y1) >> 8 ) & 0xFF ] << 8 ) ^ \
- ( (uint32_t) FSb[ ( (Y2) >> 16 ) & 0xFF ] << 16 ) ^ \
- ( (uint32_t) FSb[ ( (Y3) >> 24 ) & 0xFF ] << 24 ); \
- \
- (X1) = *(R)++ ^ ( (uint32_t) FSb[ ( (Y1) ) & 0xFF ] ) ^ \
- ( (uint32_t) FSb[ ( (Y2) >> 8 ) & 0xFF ] << 8 ) ^ \
- ( (uint32_t) FSb[ ( (Y3) >> 16 ) & 0xFF ] << 16 ) ^ \
- ( (uint32_t) FSb[ ( (Y0) >> 24 ) & 0xFF ] << 24 ); \
- \
- (X2) = *(R)++ ^ ( (uint32_t) FSb[ ( (Y2) ) & 0xFF ] ) ^ \
- ( (uint32_t) FSb[ ( (Y3) >> 8 ) & 0xFF ] << 8 ) ^ \
- ( (uint32_t) FSb[ ( (Y0) >> 16 ) & 0xFF ] << 16 ) ^ \
- ( (uint32_t) FSb[ ( (Y1) >> 24 ) & 0xFF ] << 24 ); \
- \
- (X3) = *(R)++ ^ ( (uint32_t) FSb[ ( (Y3) ) & 0xFF ] ) ^ \
- ( (uint32_t) FSb[ ( (Y0) >> 8 ) & 0xFF ] << 8 ) ^ \
- ( (uint32_t) FSb[ ( (Y1) >> 16 ) & 0xFF ] << 16 ) ^ \
- ( (uint32_t) FSb[ ( (Y2) >> 24 ) & 0xFF ] << 24 ); \
- } while ( 0 )
-
-
-#define AES_RROUND(R,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \
- do \
- { \
- (X0) = *(R)++ ^ AES_RT0( ( (Y0) ) & 0xFF ) ^ \
- AES_RT1( ( (Y3) >> 8 ) & 0xFF ) ^ \
- AES_RT2( ( (Y2) >> 16 ) & 0xFF ) ^ \
- AES_RT3( ( (Y1) >> 24 ) & 0xFF ); \
- \
- (X1) = *(R)++ ^ AES_RT0( ( (Y1) ) & 0xFF ) ^ \
- AES_RT1( ( (Y0) >> 8 ) & 0xFF ) ^ \
- AES_RT2( ( (Y3) >> 16 ) & 0xFF ) ^ \
- AES_RT3( ( (Y2) >> 24 ) & 0xFF ); \
- \
- (X2) = *(R)++ ^ AES_RT0( ( (Y2) ) & 0xFF ) ^ \
- AES_RT1( ( (Y1) >> 8 ) & 0xFF ) ^ \
- AES_RT2( ( (Y0) >> 16 ) & 0xFF ) ^ \
- AES_RT3( ( (Y3) >> 24 ) & 0xFF ); \
- \
- (X3) = *(R)++ ^ AES_RT0( ( (Y3) ) & 0xFF ) ^ \
- AES_RT1( ( (Y2) >> 8 ) & 0xFF ) ^ \
- AES_RT2( ( (Y1) >> 16 ) & 0xFF ) ^ \
- AES_RT3( ( (Y0) >> 24 ) & 0xFF ); \
- } while( 0 )
-
-#define AES_RROUND_F(R,X0,X1,X2,X3,Y0,Y1,Y2,Y3) \
- do \
- { \
- (X0) = *(R)++ ^ ( (uint32_t) RSb[ ( (Y0) ) & 0xFF ] ) ^ \
- ( (uint32_t) RSb[ ( (Y3) >> 8 ) & 0xFF ] << 8 ) ^ \
- ( (uint32_t) RSb[ ( (Y2) >> 16 ) & 0xFF ] << 16 ) ^ \
- ( (uint32_t) RSb[ ( (Y1) >> 24 ) & 0xFF ] << 24 ); \
- \
- (X1) = *(R)++ ^ ( (uint32_t) RSb[ ( (Y1) ) & 0xFF ] ) ^ \
- ( (uint32_t) RSb[ ( (Y0) >> 8 ) & 0xFF ] << 8 ) ^ \
- ( (uint32_t) RSb[ ( (Y3) >> 16 ) & 0xFF ] << 16 ) ^ \
- ( (uint32_t) RSb[ ( (Y2) >> 24 ) & 0xFF ] << 24 ); \
- \
- (X2) = *(R)++ ^ ( (uint32_t) RSb[ ( (Y2) ) & 0xFF ] ) ^ \
- ( (uint32_t) RSb[ ( (Y1) >> 8 ) & 0xFF ] << 8 ) ^ \
- ( (uint32_t) RSb[ ( (Y0) >> 16 ) & 0xFF ] << 16 ) ^ \
- ( (uint32_t) RSb[ ( (Y3) >> 24 ) & 0xFF ] << 24 ); \
- \
- (X3) = *(R)++ ^ ( (uint32_t) RSb[ ( (Y3) ) & 0xFF ] ) ^ \
- ( (uint32_t) RSb[ ( (Y2) >> 8 ) & 0xFF ] << 8 ) ^ \
- ( (uint32_t) RSb[ ( (Y1) >> 16 ) & 0xFF ] << 16 ) ^ \
- ( (uint32_t) RSb[ ( (Y0) >> 24 ) & 0xFF ] << 24 ); \
- } while( 0 )
-
/*
* AES-ECB block encryption
*/
#if !defined(MBEDTLS_AES_ENCRYPT_ALT)
+
+static uint32_t *aes_fround( uint32_t *R,
+ uint32_t *X0, uint32_t *X1, uint32_t *X2, uint32_t *X3,
+ uint32_t Y0, uint32_t Y1, uint32_t Y2, uint32_t Y3 )
+{
+ *X0 = *R++ ^ AES_FT0( ( Y0 ) & 0xFF ) ^
+ AES_FT1( ( Y1 >> 8 ) & 0xFF ) ^
+ AES_FT2( ( Y2 >> 16 ) & 0xFF ) ^
+ AES_FT3( ( Y3 >> 24 ) & 0xFF );
+
+ *X1 = *R++ ^ AES_FT0( ( Y1 ) & 0xFF ) ^
+ AES_FT1( ( Y2 >> 8 ) & 0xFF ) ^
+ AES_FT2( ( Y3 >> 16 ) & 0xFF ) ^
+ AES_FT3( ( Y0 >> 24 ) & 0xFF );
+
+ *X2 = *R++ ^ AES_FT0( ( Y2 ) & 0xFF ) ^
+ AES_FT1( ( Y3 >> 8 ) & 0xFF ) ^
+ AES_FT2( ( Y0 >> 16 ) & 0xFF ) ^
+ AES_FT3( ( Y1 >> 24 ) & 0xFF );
+
+ *X3 = *R++ ^ AES_FT0( ( Y3 ) & 0xFF ) ^
+ AES_FT1( ( Y0 >> 8 ) & 0xFF ) ^
+ AES_FT2( ( Y1 >> 16 ) & 0xFF ) ^
+ AES_FT3( ( Y2 >> 24 ) & 0xFF );
+
+ return R;
+}
+
+static void aes_fround_final( uint32_t *R,
+ uint32_t *X0, uint32_t *X1, uint32_t *X2, uint32_t *X3,
+ uint32_t Y0, uint32_t Y1, uint32_t Y2, uint32_t Y3 )
+{
+ *X0 = *R++ ^ ( (uint32_t) FSb[ ( (Y0) ) & 0xFF ] ) ^
+ ( (uint32_t) FSb[ ( (Y1) >> 8 ) & 0xFF ] << 8 ) ^
+ ( (uint32_t) FSb[ ( (Y2) >> 16 ) & 0xFF ] << 16 ) ^
+ ( (uint32_t) FSb[ ( (Y3) >> 24 ) & 0xFF ] << 24 );
+
+ *X1 = *R++ ^ ( (uint32_t) FSb[ ( (Y1) ) & 0xFF ] ) ^
+ ( (uint32_t) FSb[ ( (Y2) >> 8 ) & 0xFF ] << 8 ) ^
+ ( (uint32_t) FSb[ ( (Y3) >> 16 ) & 0xFF ] << 16 ) ^
+ ( (uint32_t) FSb[ ( (Y0) >> 24 ) & 0xFF ] << 24 );
+
+ *X2 = *R++ ^ ( (uint32_t) FSb[ ( (Y2) ) & 0xFF ] ) ^
+ ( (uint32_t) FSb[ ( (Y3) >> 8 ) & 0xFF ] << 8 ) ^
+ ( (uint32_t) FSb[ ( (Y0) >> 16 ) & 0xFF ] << 16 ) ^
+ ( (uint32_t) FSb[ ( (Y1) >> 24 ) & 0xFF ] << 24 );
+
+ *X3 = *R++ ^ ( (uint32_t) FSb[ ( (Y3) ) & 0xFF ] ) ^
+ ( (uint32_t) FSb[ ( (Y0) >> 8 ) & 0xFF ] << 8 ) ^
+ ( (uint32_t) FSb[ ( (Y1) >> 16 ) & 0xFF ] << 16 ) ^
+ ( (uint32_t) FSb[ ( (Y2) >> 24 ) & 0xFF ] << 24 );
+}
+
int mbedtls_internal_aes_encrypt( mbedtls_aes_context *ctx,
const unsigned char input[16],
unsigned char output[16] )
@@ -1073,11 +1030,11 @@
aes_data_ptr = aes_data_table[round_ctrl_table[i] >> 4];
offset = round_ctrl_table[i] & 0x0f;
- AES_FROUND( aes_data_ptr->rk_ptr,
- aes_data_ptr->xy_values[0 + offset],
- aes_data_ptr->xy_values[1 + offset],
- aes_data_ptr->xy_values[2 + offset],
- aes_data_ptr->xy_values[3 + offset],
+ aes_data_ptr->rk_ptr = aes_fround( aes_data_ptr->rk_ptr,
+ &aes_data_ptr->xy_values[0 + offset],
+ &aes_data_ptr->xy_values[1 + offset],
+ &aes_data_ptr->xy_values[2 + offset],
+ &aes_data_ptr->xy_values[3 + offset],
aes_data_ptr->xy_values[4 - offset],
aes_data_ptr->xy_values[5 - offset],
aes_data_ptr->xy_values[6 - offset],
@@ -1087,11 +1044,11 @@
for ( j = 0; j < start_fin_loops; j++ )
{
aes_data_ptr = aes_data_table[round_ctrl_table[ i + j ] >> 4];
- AES_FROUND_F( aes_data_ptr->rk_ptr,
- aes_data_ptr->xy_values[0],
- aes_data_ptr->xy_values[1],
- aes_data_ptr->xy_values[2],
- aes_data_ptr->xy_values[3],
+ aes_fround_final( aes_data_ptr->rk_ptr,
+ &aes_data_ptr->xy_values[0],
+ &aes_data_ptr->xy_values[1],
+ &aes_data_ptr->xy_values[2],
+ &aes_data_ptr->xy_values[3],
aes_data_ptr->xy_values[4],
aes_data_ptr->xy_values[5],
aes_data_ptr->xy_values[6],
@@ -1122,6 +1079,58 @@
#if !defined(MBEDTLS_AES_DECRYPT_ALT)
#if !defined(MBEDTLS_AES_ONLY_ENCRYPT)
+
+static uint32_t *aes_rround( uint32_t *R,
+ uint32_t *X0, uint32_t *X1, uint32_t *X2, uint32_t *X3,
+ uint32_t Y0, uint32_t Y1, uint32_t Y2, uint32_t Y3 )
+{
+ *X0 = *R++ ^ AES_RT0( ( Y0 ) & 0xFF ) ^
+ AES_RT1( ( Y3 >> 8 ) & 0xFF ) ^
+ AES_RT2( ( Y2 >> 16 ) & 0xFF ) ^
+ AES_RT3( ( Y1 >> 24 ) & 0xFF );
+
+ *X1 = *R++ ^ AES_RT0( ( Y1 ) & 0xFF ) ^
+ AES_RT1( ( Y0 >> 8 ) & 0xFF ) ^
+ AES_RT2( ( Y3 >> 16 ) & 0xFF ) ^
+ AES_RT3( ( Y2 >> 24 ) & 0xFF );
+
+ *X2 = *R++ ^ AES_RT0( ( Y2 ) & 0xFF ) ^
+ AES_RT1( ( Y1 >> 8 ) & 0xFF ) ^
+ AES_RT2( ( Y0 >> 16 ) & 0xFF ) ^
+ AES_RT3( ( Y3 >> 24 ) & 0xFF );
+
+ *X3 = *R++ ^ AES_RT0( ( Y3 ) & 0xFF ) ^
+ AES_RT1( ( Y2 >> 8 ) & 0xFF ) ^
+ AES_RT2( ( Y1 >> 16 ) & 0xFF ) ^
+ AES_RT3( ( Y0 >> 24 ) & 0xFF );
+ return R;
+}
+
+static void aes_rround_final( uint32_t *R,
+ uint32_t *X0, uint32_t *X1, uint32_t *X2, uint32_t *X3,
+ uint32_t Y0, uint32_t Y1, uint32_t Y2, uint32_t Y3 )
+{
+ *X0 = *R++ ^ ( (uint32_t) RSb[ ( (Y0) ) & 0xFF ] ) ^
+ ( (uint32_t) RSb[ ( (Y3) >> 8 ) & 0xFF ] << 8 ) ^
+ ( (uint32_t) RSb[ ( (Y2) >> 16 ) & 0xFF ] << 16 ) ^
+ ( (uint32_t) RSb[ ( (Y1) >> 24 ) & 0xFF ] << 24 );
+
+ *X1 = *R++ ^ ( (uint32_t) RSb[ ( (Y1) ) & 0xFF ] ) ^
+ ( (uint32_t) RSb[ ( (Y0) >> 8 ) & 0xFF ] << 8 ) ^
+ ( (uint32_t) RSb[ ( (Y3) >> 16 ) & 0xFF ] << 16 ) ^
+ ( (uint32_t) RSb[ ( (Y2) >> 24 ) & 0xFF ] << 24 );
+
+ *X2 = *R++ ^ ( (uint32_t) RSb[ ( (Y2) ) & 0xFF ] ) ^
+ ( (uint32_t) RSb[ ( (Y1) >> 8 ) & 0xFF ] << 8 ) ^
+ ( (uint32_t) RSb[ ( (Y0) >> 16 ) & 0xFF ] << 16 ) ^
+ ( (uint32_t) RSb[ ( (Y3) >> 24 ) & 0xFF ] << 24 );
+
+ *X3 = *R++ ^ ( (uint32_t) RSb[ ( (Y3) ) & 0xFF ] ) ^
+ ( (uint32_t) RSb[ ( (Y2) >> 8 ) & 0xFF ] << 8 ) ^
+ ( (uint32_t) RSb[ ( (Y1) >> 16 ) & 0xFF ] << 16 ) ^
+ ( (uint32_t) RSb[ ( (Y0) >> 24 ) & 0xFF ] << 24 );
+}
+
int mbedtls_internal_aes_decrypt( mbedtls_aes_context *ctx,
const unsigned char input[16],
unsigned char output[16] )
@@ -1167,11 +1176,11 @@
aes_data_ptr = aes_data_table[round_ctrl_table[i] >> 4];
offset = round_ctrl_table[i] & 0x0f;
- AES_RROUND( aes_data_ptr->rk_ptr,
- aes_data_ptr->xy_values[0 + offset],
- aes_data_ptr->xy_values[1 + offset],
- aes_data_ptr->xy_values[2 + offset],
- aes_data_ptr->xy_values[3 + offset],
+ aes_data_ptr->rk_ptr = aes_rround( aes_data_ptr->rk_ptr,
+ &aes_data_ptr->xy_values[0 + offset],
+ &aes_data_ptr->xy_values[1 + offset],
+ &aes_data_ptr->xy_values[2 + offset],
+ &aes_data_ptr->xy_values[3 + offset],
aes_data_ptr->xy_values[4 - offset],
aes_data_ptr->xy_values[5 - offset],
aes_data_ptr->xy_values[6 - offset],
@@ -1181,11 +1190,11 @@
for ( j = 0; j < start_fin_loops; j++ )
{
aes_data_ptr = aes_data_table[round_ctrl_table[ i + j ] >> 4];
- AES_RROUND_F( aes_data_ptr->rk_ptr,
- aes_data_ptr->xy_values[0],
- aes_data_ptr->xy_values[1],
- aes_data_ptr->xy_values[2],
- aes_data_ptr->xy_values[3],
+ aes_rround_final( aes_data_ptr->rk_ptr,
+ &aes_data_ptr->xy_values[0],
+ &aes_data_ptr->xy_values[1],
+ &aes_data_ptr->xy_values[2],
+ &aes_data_ptr->xy_values[3],
aes_data_ptr->xy_values[4],
aes_data_ptr->xy_values[5],
aes_data_ptr->xy_values[6],