Make ecp_mod_p521 a bit faster
diff --git a/library/ecp.c b/library/ecp.c
index 78b05c4..33081a0 100644
--- a/library/ecp.c
+++ b/library/ecp.c
@@ -547,12 +547,12 @@
 /*
  * Size of p521 in terms of t_uint
  */
-#define P521_SIZE_INT   ( 521 / CHAR_BIT / sizeof( t_uint ) + 1 )
+#define P521_SIZE_INT   ( 521 / 8 / sizeof( t_uint ) + 1 )
 
 /*
  * Bits to keep in the most significant t_uint
  */
-#if defined(POLARSS_HAVE_INT8)
+#if defined(POLARSSL_HAVE_INT8)
 #define P521_MASK       0x01
 #else
 #define P521_MASK       0x01FF
@@ -560,26 +560,36 @@
 
 /*
  * Fast quasi-reduction modulo p521 (FIPS 186-3 D.2.5)
+ * Write N as A0 + 2^521 A1, return A0 + A1
  */
 static int ecp_mod_p521( mpi *N )
 {
     int ret;
-    t_uint Mp[P521_SIZE_INT];
+    size_t i;
     mpi M;
+    t_uint Mp[P521_SIZE_INT+1];
+    /* Worst case for the size of M is when t_uint is 16 bits wide:
+     * we need to hold bits 513 to 1056, which is 34 limbs, that is
+     * P521_SIZE_INT + 1. Otherwise P521_SIZE_INT is enough. */
 
     if( N->n < P521_SIZE_INT )
         return( 0 );
 
-    memset( Mp, 0, P521_SIZE_INT * sizeof( t_uint ) );
-    memcpy( Mp, N->p, P521_SIZE_INT * sizeof( t_uint ) );
-    Mp[P521_SIZE_INT - 1] &= P521_MASK;
-
+    /* M = A1 */
     M.s = 1;
-    M.n = P521_SIZE_INT;
+    M.n = N->n - ( P521_SIZE_INT - 1 );
+    if( M.n > P521_SIZE_INT + 1 )
+        M.n = P521_SIZE_INT + 1;
     M.p = Mp;
+    memcpy( Mp, N->p + P521_SIZE_INT - 1, M.n * sizeof( t_uint ) );
+    MPI_CHK( mpi_shift_r( &M, 521 % ( 8 * sizeof( t_uint ) ) ) );
 
-    MPI_CHK( mpi_shift_r( N, 521 ) );
+    /* N = A0 */
+    N->p[P521_SIZE_INT - 1] &= P521_MASK;
+    for( i = P521_SIZE_INT; i < N->n; i++ )
+        N->p[i] = 0;
 
+    /* N = A0 + A1 */
     MPI_CHK( mpi_add_abs( N, N, &M ) );
 
 cleanup:
diff --git a/tests/suites/test_suite_ecp.data b/tests/suites/test_suite_ecp.data
index 4748ff9..c8ed20f 100644
--- a/tests/suites/test_suite_ecp.data
+++ b/tests/suites/test_suite_ecp.data
@@ -273,6 +273,22 @@
 depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
 ecp_fast_mod:POLARSSL_ECP_DP_SECP192R1:"1AC2D6F96A2A425E9DD1776DD8368D4BBC86BF4964E79FEA713583BF948BBEFF0939F96FB19EC48C585BDA6A2D35C750"
 
+ECP mod p521 very small
+depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"01"
+
+ECP mod p521 small (522 bits)
+depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"030000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"
+
+ECP mod p521 readable
+depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"03FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"
+
+ECP mod p521 readable with carry
+depends_on:POLARSSL_ECP_DP_SECP521R1_ENABLED
+ecp_fast_mod:POLARSSL_ECP_DP_SECP521R1:"03FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001"
+
 ECP test vectors secp192r1 rfc 5114
 depends_on:POLARSSL_ECP_DP_SECP192R1_ENABLED
 ecp_test_vect:POLARSSL_ECP_DP_SECP192R1:"323FA3169D8E9C6593F59476BC142000AB5BE0E249C43426":"CD46489ECFD6C105E7B3D32566E2B122E249ABAADD870612":"68887B4877DF51DD4DC3D6FD11F0A26F8FD3844317916E9A":"631F95BB4A67632C9C476EEE9AB695AB240A0499307FCF62":"519A121680E0045466BA21DF2EEE47F5973B500577EF13D5":"FF613AB4D64CEE3A20875BDB10F953F6B30CA072C60AA57F":"AD420182633F8526BFE954ACDA376F05E5FF4F837F54FEBE":"4371545ED772A59741D0EDA32C671112B7FDDD51461FCF32"