Pack the iota round constants

This saves ~160 bytes of code size, at the cost of a bit of localized
complexity in the code. The impact on performance is measurable but small
(<5% observed on x86_64) and can go either way (there's a calculation vs
memory bandwidth compromise).

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
diff --git a/library/sha3.c b/library/sha3.c
index 5df08f9..27d495f 100644
--- a/library/sha3.c
+++ b/library/sha3.c
@@ -26,14 +26,35 @@
 
 #define XOR_BYTE 0x6
 
-static const uint64_t rc[24] = {
-    0x0000000000000001, 0x0000000000008082, 0x800000000000808a, 0x8000000080008000,
-    0x000000000000808b, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
-    0x000000000000008a, 0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
-    0x000000008000808b, 0x800000000000008b, 0x8000000000008089, 0x8000000000008003,
-    0x8000000000008002, 0x8000000000000080, 0x000000000000800a, 0x800000008000000a,
-    0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008,
+/* Precomputed masks for the iota transform.
+ *
+ * Each round uses a 64-bit mask value. In each mask values, only
+ * bits whose position is of the form 2^k-1 can be set, thus only
+ * 7 of 64 bits of the mask need to be known for each mask value.
+ *
+ * We use a compressed encoding of the mask where bits 63, 31 and 15
+ * are moved to bits 4-6. This allows us to make each mask value
+ * 1 byte rather than 8 bytes, saving 7*24 = 168 bytes of data (with
+ * perhaps a little variation due to alignment). Decompressing this
+ * requires a little code, but much less than the savings on the table.
+ *
+ * The impact on performance depends on the platform and compiler.
+ * There's a bit more computation, but less memory bandwidth. A quick
+ * benchmark on x86_64 shows a 7% speed improvement with GCC and a
+ * 5% speed penalty with Clang, compared to the naive uint64_t[24] table.
+ * YMMV.
+ */
+/* Helper macro to set the values of the higher bits in unused low positions */
+#define H(b63, b31, b15) (b63 << 6 | b31 << 5 | b15 << 4)
+static const uint8_t iota_r_packed[24] = {
+    H(0, 0, 0) | 0x01, H(0, 0, 1) | 0x82, H(1, 0, 1) | 0x8a, H(1, 1, 1) | 0x00,
+    H(0, 0, 1) | 0x8b, H(0, 1, 0) | 0x01, H(1, 1, 1) | 0x81, H(1, 0, 1) | 0x09,
+    H(0, 0, 0) | 0x8a, H(0, 0, 0) | 0x88, H(0, 1, 1) | 0x09, H(0, 1, 0) | 0x0a,
+    H(0, 1, 1) | 0x8b, H(1, 0, 0) | 0x8b, H(1, 0, 1) | 0x89, H(1, 0, 1) | 0x03,
+    H(1, 0, 1) | 0x02, H(1, 0, 0) | 0x80, H(0, 0, 1) | 0x0a, H(1, 1, 0) | 0x0a,
+    H(1, 1, 1) | 0x81, H(1, 0, 1) | 0x80, H(0, 1, 0) | 0x01, H(1, 1, 1) | 0x08,
 };
+#undef H
 
 static const uint8_t rho[24] = {
     1, 62, 28, 27, 36, 44,  6, 55, 20,
@@ -132,7 +153,11 @@
         s[24] ^= (~lane[0]) & lane[1];
 
         /* Iota */
-        s[0] ^= rc[round];
+        /* Decompress the round masks (see definition of rc) */
+        s[0] ^= ((iota_r_packed[round] & 0x40ull) << 57 |
+                 (iota_r_packed[round] & 0x20ull) << 26 |
+                 (iota_r_packed[round] & 0x10ull) << 11 |
+                 (iota_r_packed[round] & 0x8f));
     }
 }