Base64 decoding: use ranges instead of tables
Instead of doing a constant-flow table lookup, which requires 128 memory
loads per lookup into a 128-entry table, do a range-based calculation.
This requires more CPU instructions per range, but there are only 5
ranges to check.
Experimentally, this is ~12x faster on my PC (based on
programs/x509/load_roots). The code is slightly smaller, too.
Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
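
For reference, the borrow trick used below can be exercised in isolation.
A minimal standalone sketch, not part of the patch; demo_mask, main and
the test characters are illustrative only:

#include <stdio.h>

/* Return 0xff if low <= c <= high, 0 otherwise, without branching.
 * Subtracting in unsigned arithmetic makes an out-of-range c wrap
 * around, and shifting by 8 exposes the borrow as a byte mask. */
static unsigned char demo_mask( unsigned char low, unsigned char high,
                                unsigned char c )
{
    unsigned low_mask = ( (unsigned) c - low ) >> 8;
    unsigned high_mask = ( (unsigned) c - high - 1 ) >> 8;
    return( ~low_mask & high_mask & 0xff );
}

int main( void )
{
    /* Prints "ff 00": 'M' is inside 'A'..'Z', 'm' is outside. */
    printf( "%02x %02x\n", demo_mask( 'A', 'Z', 'M' ),
            demo_mask( 'A', 'Z', 'm' ) );
    return( 0 );
}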
diff --git a/library/base64.c b/library/base64.c
index 8b818c8..7d9ddf9 100644
--- a/library/base64.c
+++ b/library/base64.c
@@ -46,23 +46,6 @@
'8', '9', '+', '/'
};
-static const unsigned char base64_dec_map[128] =
-{
- 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
- 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
- 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
- 127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
- 127, 127, 127, 62, 127, 127, 127, 63, 52, 53,
- 54, 55, 56, 57, 58, 59, 60, 61, 127, 127,
- 127, 127, 127, 127, 127, 0, 1, 2, 3, 4,
- 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
- 25, 127, 127, 127, 127, 127, 127, 26, 27, 28,
- 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
- 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
- 49, 50, 51, 127, 127, 127, 127, 127
-};
-
#define BASE64_SIZE_T_MAX ( (size_t) -1 ) /* SIZE_T_MAX is not standard */
/*
@@ -133,6 +116,22 @@
return result;
}
+/* Return 0xff if low <= c <= high, 0 otherwise.
+ *
+ * Constant flow with respect to c.
+ */
+static unsigned char mask_of_range( unsigned char low, unsigned char high,
+ unsigned char c )
+{
+ /* low_mask is 0 if low <= c, and has 0xff in its low byte if
+  * low > c. The unsigned cast keeps the subtraction and right shift
+  * well-defined when c is out of range. */
+ unsigned low_mask = ( (unsigned) c - low ) >> 8;
+ /* high_mask has 0xff in its low byte if c <= high, and is 0 if
+  * c > high. */
+ unsigned high_mask = ( (unsigned) c - high - 1 ) >> 8;
+ return( ~low_mask & high_mask & 0xff );
+}
+
/*
* Encode a buffer into base64 format
*/
@@ -211,6 +206,34 @@
return( 0 );
}
+/* Given a Base64 digit, return its value.
+ * If c is not a Base64 digit ('A'..'Z', 'a'..'z', '0'..'9', '+' or '/'),
+ * return -1.
+ *
+ * The implementation assumes that letters are consecutive (e.g. ASCII
+ * but not EBCDIC).
+ *
+ * The implementation is constant-flow (no branch or memory access depending
+ * on the value of c), unless the compiler replaces the arithmetic with
+ * branches or table lookups when optimizing.
+ */
+static signed char dec_value( unsigned char c )
+{
+ unsigned char val = 0;
+ /* For each range of digits, if c is in that range, OR val with
+  * the corresponding value. Since c can be in at most one range,
+  * at most one of these steps changes val. Use one plus the
+  * desired value so that val stays 0 if c is in none of the ranges. */
+ val |= mask_of_range( 'A', 'Z', c ) & ( c - 'A' + 0 + 1 );
+ val |= mask_of_range( 'a', 'z', c ) & ( c - 'a' + 26 + 1 );
+ val |= mask_of_range( '0', '9', c ) & ( c - '0' + 52 + 1 );
+ val |= mask_of_range( '+', '+', c ) & ( c - '+' + 62 + 1 );
+ val |= mask_of_range( '/', '/', c ) & ( c - '/' + 63 + 1 );
+ /* At this point, val is 0 if c is an invalid digit and v+1 if c is
+ * a digit with the value v. */
+ return( val - 1 );
+}
+
/*
* Decode a base64-formatted buffer
*/
@@ -220,7 +243,6 @@
size_t i, n;
uint32_t j, x;
unsigned char *p;
- unsigned char dec_map_lookup;
/* First pass: check for validity and get output length */
for( i = n = j = 0; i < slen; i++ )
@@ -260,8 +282,7 @@
{
if( j != 0 )
return( MBEDTLS_ERR_BASE64_INVALID_CHARACTER );
- dec_map_lookup = mbedtls_base64_table_lookup( base64_dec_map, sizeof( base64_dec_map ), src[i] );
- if( dec_map_lookup == 127 )
+ if( dec_value( src[i] ) < 0 )
return( MBEDTLS_ERR_BASE64_INVALID_CHARACTER );
}
n++;
@@ -298,8 +319,7 @@
}
else
{
- dec_map_lookup = mbedtls_base64_table_lookup( base64_dec_map, sizeof( base64_dec_map ), *src );
- x = ( x << 6 ) | ( dec_map_lookup & 0x3F );
+ x = ( x << 6 ) | ( dec_value( *src ) & 0x3F );
}
if( ++n == 4 )
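
For reference, the new function's contract can be checked with a minimal
sketch, not part of the patch; demo_dec_value is illustrative and assumes
the dec_value() definition added above:

#include <assert.h>

static void demo_dec_value( void )
{
    assert( dec_value( 'A' ) ==  0 );   /* first range starts at 0 */
    assert( dec_value( 'z' ) == 51 );   /* 'z' - 'a' + 26 */
    assert( dec_value( '9' ) == 61 );   /* '9' - '0' + 52 */
    assert( dec_value( '+' ) == 62 );
    assert( dec_value( '/' ) == 63 );
    assert( dec_value( '*' ) == -1 );   /* not a Base64 digit */
}

The & 0x3F in the second pass keeps each digit's contribution to x within
6 bits even if dec_value() were to return -1, although the first pass has
already rejected any such byte.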