Double perf for AES-XEX

As seen from the first benchmark run, AES-XEX was running poorly (even
slower than AES-CBC). This commit doubles the performance of the
current implementation by XORing each 16-byte block with the tweak as
two 64-bit words instead of byte by byte.
diff --git a/library/aes.c b/library/aes.c
index 1f21587..1c69c97 100644
--- a/library/aes.c
+++ b/library/aes.c
@@ -999,36 +999,45 @@
                     const unsigned char *input,
                     unsigned char *output )
 {
-    int i;
-    unsigned char t_buf[16];
-    unsigned char scratch[16];
+    union xex_buf128 {
+        uint8_t  u8[16];
+        uint64_t u64[2];
+    };
+
+    union xex_buf128 scratch;
+    union xex_buf128 t_buf;
+    union xex_buf128 *inbuf;
+    union xex_buf128 *outbuf;
+
+    inbuf = (union xex_buf128*)input;
+    outbuf = (union xex_buf128*)output;
 
     if( length % 16 )
         return( MBEDTLS_ERR_AES_INVALID_INPUT_LENGTH );
 
 
-    mbedtls_aes_crypt_ecb( tweak_ctx, MBEDTLS_AES_ENCRYPT, iv, t_buf );
+    mbedtls_aes_crypt_ecb( tweak_ctx, MBEDTLS_AES_ENCRYPT, iv, t_buf.u8 );
 
     goto first;
 
     do
     {
-        mbedtls_gf128mul_x_ble( t_buf, t_buf );
+        mbedtls_gf128mul_x_ble( t_buf.u8, t_buf.u8 );
 
 first:
         /* PP <- T xor P */
-        for( i = 0; i < 16; i++ )
-            scratch[i] = (unsigned char)( input[i] ^ t_buf[i] );
+        scratch.u64[0] = (uint64_t)( inbuf->u64[0] ^ t_buf.u64[0] );
+        scratch.u64[1] = (uint64_t)( inbuf->u64[1] ^ t_buf.u64[1] );
 
         /* CC <- E(Key2,PP) */
-        mbedtls_aes_crypt_ecb( crypt_ctx, mode, scratch, output );
+        mbedtls_aes_crypt_ecb( crypt_ctx, mode, scratch.u8, outbuf->u8 );
 
         /* C <- T xor CC */
-        for( i = 0; i < 16; i++ )
-            output[i] = (unsigned char)( output[i] ^ t_buf[i] );
+        outbuf->u64[0] = (uint64_t)( outbuf->u64[0] ^ t_buf.u64[0] );
+        outbuf->u64[1] = (uint64_t)( outbuf->u64[1] ^ t_buf.u64[1] );
 
-        input  += 16;
-        output += 16;
+        inbuf  += 1;
+        outbuf += 1;
         length -= 16;
     } while( length > 0 );