CMSIS-DSP: Added Helium support to Complex Math Functions.
Added new test patterns for complex math functions.
Added new tests to test framework.
Improved error handling of test framework.
diff --git a/CMSIS/DSP/Source/CommonTables/arm_common_tables.c b/CMSIS/DSP/Source/CommonTables/arm_common_tables.c
index 7f7f79d..40de826 100644
--- a/CMSIS/DSP/Source/CommonTables/arm_common_tables.c
+++ b/CMSIS/DSP/Source/CommonTables/arm_common_tables.c
@@ -57213,4 +57213,129 @@
 };
 #endif /* defined(ARM_ALL_FAST_TABLES) */
 
+#if defined(ARM_MATH_MVEI)
+     #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
+const q31_t sqrtTable_Q31[256] = {    /* q31 table, built only under ARM_MATH_MVEI and the fast-sqrt table guard above */
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,    /* entries 0..63 are all zero */
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x5ffffffe, 0x7ffffffa, 0x5e88c596, 0x7a39e2ff,    /* entries 64..255: even and odd indices form two separate decreasing series */
+    0x5d2238d8, 0x74dfa6b1, 0x5bcb268e, 0x6fe69562,    /* NOTE(review): meaning of each series is set by the table user (presumably FAST_VSQRT_Q31) - confirm */
+    0x5a827998, 0x6b454dba, 0x5947373a, 0x66f39063,
+    0x58187bf8, 0x62ea1669, 0x56f578e6, 0x5f226e5f,
+    0x55dd714e, 0x5b96df40, 0x54cfb8b0, 0x58424fca,
+    0x53cbb102, 0x552031c7, 0x52d0c92c, 0x522c7048,
+    0x51de7bb2, 0x4f6360a9, 0x50f44d86, 0x4cc1b594,
+    0x5011cd0a, 0x4a4473f6, 0x4f36911a, 0x47e8e962,
+    0x4e62384e, 0x45aca3d0, 0x4d946838, 0x438d6a6a,
+    0x4cccccca, 0x41893746, 0x4c0b17c0, 0x3f9e31fa,
+    0x4b4f0022, 0x3dcaaac3, 0x4a9841ce, 0x3c0d1660,
+    0x49e69d14, 0x3a640a53, 0x4939d654, 0x38ce39a6,
+    0x4891b5b2, 0x374a720b, 0x47ee06ba, 0x35d7993c,
+    0x474e9830, 0x3474aacc, 0x46b33bc0, 0x3320b60c,
+    0x461bc5d2, 0x31dadc4f, 0x45880d4c, 0x30a24f3c,
+    0x44f7eb6a, 0x2f764f6a, 0x446b3b92, 0x2e562b0b,
+    0x43e1db32, 0x2d413cc9, 0x435ba98a, 0x2c36eab0,
+    0x42d887a4, 0x2b36a542, 0x42585824, 0x2a3fe69b,
+    0x41daff34, 0x295231b0, 0x41606266, 0x286d118d,
+    0x40e868a4, 0x279018c3, 0x4072fa10, 0x26bae0c8,
+    0x3ffffffe, 0x25ed0979, 0x3f8f64d2, 0x25263894,
+    0x3f2113f8, 0x24661958, 0x3eb4f9d8, 0x23ac5c10,
+    0x3e4b03ba, 0x22f8b5bb, 0x3de31fc4, 0x224adfba,
+    0x3d7d3cee, 0x21a2977a, 0x3d194ae8, 0x20ff9e30,
+    0x3cb73a24, 0x2061b89a, 0x3c56fbba, 0x1fc8aebb,
+    0x3bf88166, 0x1f344ba6, 0x3b9bbd80, 0x1ea45d4b,
+    0x3b40a2f0, 0x1e18b448, 0x3ae7252e, 0x1d9123b9,
+    0x3a8f382c, 0x1d0d8113, 0x3a38d062, 0x1c8da3fb,
+    0x39e3e2b6, 0x1c116628, 0x39906482, 0x1b98a33c,
+    0x393e4b8a, 0x1b2338aa, 0x38ed8df6, 0x1ab1059a,
+    0x389e2250, 0x1a41eace, 0x384fff7a, 0x19d5ca89,
+    0x38031cb2, 0x196c887b, 0x37b77184, 0x190609a9,
+    0x376cf5ce, 0x18a2345b, 0x3723a1ba, 0x1840f00a,
+    0x36db6db6, 0x17e22550, 0x36945276, 0x1785bdd2,
+    0x364e48f4, 0x172ba43e, 0x36094a64, 0x16d3c42c,
+    0x35c55036, 0x167e0a1f, 0x35825414, 0x162a6372,
+    0x35404fde, 0x15d8be4d, 0x34ff3dac, 0x1589099d,
+    0x34bf17c4, 0x153b3508, 0x347fd89e, 0x14ef30e4,
+    0x34417ade, 0x14a4ee2a, 0x3403f956, 0x145c5e76,
+    0x33c74f02, 0x141573fa, 0x338b7706, 0x13d02171,
+    0x33506cae, 0x138c5a28, 0x33162b6a, 0x134a11e6,
+    0x32dcaed0, 0x13093cee, 0x32a3f294, 0x12c9cffa,
+    0x326bf292, 0x128bc034, 0x3234aac0, 0x124f0330,
+    0x31fe1736, 0x12138ee5, 0x31c83428, 0x11d959b0,
+    0x3192fde6, 0x11a05a45, 0x315e70de, 0x116887b5,
+    0x312a8994, 0x1131d960, 0x30f744aa, 0x10fc46fd,
+    0x30c49ed6, 0x10c7c88b, 0x309294ea, 0x10945653,
+    0x306123cc, 0x1061e8e6, 0x30304878, 0x10307919
+};
+     #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE) */
+
+     #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
+const q15_t sqrtTable_Q15[256] = {    /* q15 table, built only under ARM_MATH_MVEI and the fast-sqrt table guard above */
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,    /* entries 0..63 are all zero */
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+    0x5fff, 0x7fff, 0x5e88, 0x7a39,    /* entries 64..255: even and odd indices form two separate decreasing series */
+    0x5d22, 0x74df, 0x5bcb, 0x6fe6,    /* NOTE(review): values look like the upper 16 bits of the sqrtTable_Q31 entries - confirm */
+    0x5a82, 0x6b45, 0x5947, 0x66f3,
+    0x5818, 0x62ea, 0x56f5, 0x5f22,
+    0x55dd, 0x5b96, 0x54cf, 0x5842,
+    0x53cb, 0x5520, 0x52d0, 0x522c,
+    0x51de, 0x4f63, 0x50f4, 0x4cc1,
+    0x5011, 0x4a44, 0x4f36, 0x47e8,
+    0x4e62, 0x45ac, 0x4d94, 0x438d,
+    0x4ccc, 0x4189, 0x4c0b, 0x3f9e,
+    0x4b4f, 0x3dca, 0x4a98, 0x3c0d,
+    0x49e6, 0x3a64, 0x4939, 0x38ce,
+    0x4891, 0x374a, 0x47ee, 0x35d7,
+    0x474e, 0x3474, 0x46b3, 0x3320,
+    0x461b, 0x31da, 0x4588, 0x30a2,
+    0x44f7, 0x2f76, 0x446b, 0x2e56,
+    0x43e1, 0x2d41, 0x435b, 0x2c36,
+    0x42d8, 0x2b36, 0x4258, 0x2a3f,
+    0x41da, 0x2952, 0x4160, 0x286d,
+    0x40e8, 0x2790, 0x4072, 0x26ba,
+    0x3fff, 0x25ed, 0x3f8f, 0x2526,
+    0x3f21, 0x2466, 0x3eb4, 0x23ac,
+    0x3e4b, 0x22f8, 0x3de3, 0x224a,
+    0x3d7d, 0x21a2, 0x3d19, 0x20ff,
+    0x3cb7, 0x2061, 0x3c56, 0x1fc8,
+    0x3bf8, 0x1f34, 0x3b9b, 0x1ea4,
+    0x3b40, 0x1e18, 0x3ae7, 0x1d91,
+    0x3a8f, 0x1d0d, 0x3a38, 0x1c8d,
+    0x39e3, 0x1c11, 0x3990, 0x1b98,
+    0x393e, 0x1b23, 0x38ed, 0x1ab1,
+    0x389e, 0x1a41, 0x384f, 0x19d5,
+    0x3803, 0x196c, 0x37b7, 0x1906,
+    0x376c, 0x18a2, 0x3723, 0x1840,
+    0x36db, 0x17e2, 0x3694, 0x1785,
+    0x364e, 0x172b, 0x3609, 0x16d3,
+    0x35c5, 0x167e, 0x3582, 0x162a,
+    0x3540, 0x15d8, 0x34ff, 0x1589,
+    0x34bf, 0x153b, 0x347f, 0x14ef,
+    0x3441, 0x14a4, 0x3403, 0x145c,
+    0x33c7, 0x1415, 0x338b, 0x13d0,
+    0x3350, 0x138c, 0x3316, 0x134a,
+    0x32dc, 0x1309, 0x32a3, 0x12c9,
+    0x326b, 0x128b, 0x3234, 0x124f,
+    0x31fe, 0x1213, 0x31c8, 0x11d9,
+    0x3192, 0x11a0, 0x315e, 0x1168,
+    0x312a, 0x1131, 0x30f7, 0x10fc,
+    0x30c4, 0x10c7, 0x3092, 0x1094,
+    0x3061, 0x1061, 0x3030, 0x1030
+};
+     #endif
+#endif /* defined(ARM_MATH_MVEI) */
+
+
 #endif /* if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FAST_TABLES) */
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/CMakeLists.txt b/CMSIS/DSP/Source/ComplexMathFunctions/CMakeLists.txt
index fbf40c7..87ea102 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/CMakeLists.txt
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/CMakeLists.txt
@@ -7,13 +7,47 @@
 
 file(GLOB SRC "./*_*.c")
 
-add_library(CMSISDSPComplexMath STATIC ${SRC})
+add_library(CMSISDSPComplexMath STATIC)
 
 configLib(CMSISDSPComplexMath ${ROOT})
 configDsp(CMSISDSPComplexMath ${ROOT})
 
+
+include(interpol)
+interpol(CMSISDSPComplexMath)  # configure the library defined in this file
+
+if (CONFIGTABLE AND ALLFAST)
+    target_compile_definitions(CMSISDSPComplexMath PUBLIC ARM_ALL_FAST_TABLES)
+endif()
+
+# The MVE versions of arm_cmplx_mag_q31 and arm_cmplx_mag_q15 rely on a
+# fast-sqrt table. With CONFIGTABLE they are only built when all fast tables
+# or the function itself is requested, or when neither HELIUM nor MVEI is set.
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q31 OR (NOT HELIUM AND NOT MVEI))
+    target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q15 OR (NOT HELIUM AND NOT MVEI))
+    target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q15.c)
+endif()
+
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_q15.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_q31.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_q15.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_q31.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_q15.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_q31.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_q15.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_q31.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_q15.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_q31.c)
+
+
 ### Includes
 target_include_directories(CMSISDSPComplexMath PUBLIC "${DSP}/Include")
-
-
-
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
index df5db00..601b74d 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
@@ -68,15 +68,66 @@
   @return        none
  */
 
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 
 void arm_cmplx_conj_f32(
+    const float32_t * pSrc,
+    float32_t * pDst,
+    uint32_t numSamples)
+{
+    /*
+     * Helium (MVE) implementation.
+     *
+     * pSrc and pDst hold complex values as interleaved (real, imag) floats,
+     * so one 4-lane vector carries two complex values. Multiplying it
+     * lane-wise by {+1, -1, +1, -1} flips the sign of the imaginary parts
+     * and leaves the real parts untouched.
+     *
+     * The vector loop covers groups of 4 floats; a scalar loop finishes
+     * whatever is left.
+     */
+    static const float32_t conjSign[4] = { 1.0, -1.0, 1.0, -1.0 };
+    int32_t nbReal = numSamples * CMPLX_DIM;   /* number of float values */
+    int32_t remaining;                         /* loop counter */
+    f32x4_t vSign = vld1q(conjSign);           /* per-lane sign factors */
+    f32x4_t vIn;
+
+    /* Vector body: 4 floats per iteration */
+    for (remaining = nbReal >> 2U; remaining > 0U; remaining--)
+    {
+        vIn = vld1q(pSrc);
+        vst1q(pDst, vmulq(vIn, vSign));
+
+        /* Step both pointers past the vector just processed */
+        pSrc += 4;
+        pDst += 4;
+    }
+
+    /*
+     * Scalar tail: one complex value (two floats) per iteration.
+     */
+    for (remaining = (nbReal & 0x3) >> 1; remaining > 0U; remaining--)
+    {
+        /* C[0] + jC[1] = A[0] + j(-1)A[1] */
+
+        /* Real part is copied as is */
+        *pDst++ =  *pSrc++;
+
+        /* Imaginary part is negated */
+        *pDst++ = -*pSrc++;
+    }
+
+}
+
+#else
+void arm_cmplx_conj_f32(
   const float32_t * pSrc,
         float32_t * pDst,
         uint32_t numSamples)
 {
         uint32_t blkCnt;                               /* Loop counter */
 
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    float32x4_t zero;
    float32x4x2_t vec;
 
@@ -105,7 +156,7 @@
    blkCnt = numSamples & 0x3;
 
 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
 
   /* Loop unrolling: Compute 4 outputs at a time */
   blkCnt = numSamples >> 2U;
@@ -155,6 +206,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
   @} end of cmplx_conj group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
index 073a337..65eb910 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
@@ -49,6 +49,58 @@
                    The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
  */
 
+
+#if defined(ARM_MATH_MVEI)
+void arm_cmplx_conj_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples)
+{
+    /*
+     * Helium (MVE) implementation.
+     *
+     * Data is interleaved (real, imag), so the imaginary parts sit in the
+     * odd 16-bit lanes of each vector. An MVE predicate has one bit per
+     * byte: 0xCCCC selects bytes 2-3, 6-7, 10-11 and 14-15, i.e. lanes
+     * 1, 3, 5 and 7. vqnegq_m negates those lanes with saturation
+     * (0x8000 -> 0x7FFF, as documented for this function) and copies the
+     * remaining lanes from its first argument. A plain multiply by -1
+     * would wrap 0x8000 back to 0x8000.
+     */
+    const mve_pred16_t imagLanes = 0xCCCC;
+    uint32_t blockSize = numSamples * CMPLX_DIM;   /* number of q15 values */
+    uint32_t blkCnt;                               /* loop counter */
+    q31_t in1;                                     /* widened imaginary part */
+    q15x8_t vecSrc;
+
+    /* Compute 8 real samples at a time */
+    blkCnt = blockSize >> 3U;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqnegq_m(vecSrc, vecSrc, imagLanes));
+
+        /* Advance vector source and destination pointers */
+        pSrc += 8;
+        pDst += 8;
+        blkCnt--;
+    }
+
+    /* Tail: leftover complex samples, one per iteration */
+    blkCnt = (blockSize & 0x7) >> 1;
+    while (blkCnt > 0U)
+    {
+        /* C[0] + jC[1] = A[0] + j(-1)A[1] */
+        *pDst++ = *pSrc++;
+
+        /* Negate in 32 bits, then saturate the result to 16 bits */
+        in1 = *pSrc++;
+        *pDst++ = __SSAT(-in1, 16);
+
+        blkCnt--;
+    }
+}
+#else
 void arm_cmplx_conj_q15(
   const q15_t * pSrc,
         q15_t * pDst,
@@ -151,6 +203,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEI) */
 
 /**
   @} end of cmplx_conj group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
index 6ef1ddb..b8f5dc2 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
@@ -49,6 +49,62 @@
                    The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
  */
 
+#if defined(ARM_MATH_MVEI)
+
+void arm_cmplx_conj_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples)
+{
+    /*
+     * Helium (MVE) implementation.
+     *
+     * Data is interleaved (real, imag), so the imaginary parts sit in the
+     * odd 32-bit lanes of each vector. An MVE predicate has one bit per
+     * byte: 0xF0F0 selects bytes 4-7 and 12-15, i.e. lanes 1 and 3.
+     * vqnegq_m negates those lanes with saturation
+     * (0x80000000 -> 0x7FFFFFFF, as documented for this function) and
+     * copies the remaining lanes from its first argument. A plain multiply
+     * by -1 would wrap 0x80000000 back to 0x80000000.
+     */
+    const mve_pred16_t imagLanes = 0xF0F0;
+    uint32_t blockSize = numSamples * CMPLX_DIM;   /* number of q31 values */
+    uint32_t blkCnt;                               /* loop counter */
+    q31x4_t vecSrc;
+    q31_t in;                                      /* Temporary input variable */
+
+    /* Compute 4 real samples at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vqnegq_m(vecSrc, vecSrc, imagLanes));
+
+        /* Advance vector source and destination pointers */
+        pSrc += 4;
+        pDst += 4;
+
+        /* Decrement the blkCnt loop counter */
+        blkCnt--;
+    }
+
+    /* Tail: leftover complex samples, one per iteration */
+    blkCnt = (blockSize & 0x3) >> 1;
+    while (blkCnt > 0U)
+    {
+        /* C[0] + jC[1] = A[0] + j(-1)A[1] */
+        *pDst++ = *pSrc++;
+
+        /* Saturating 0 - in, so 0x80000000 becomes 0x7FFFFFFF */
+        in = *pSrc++;
+        *pDst++ = __QSUB(0, in);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+}
+#else
+
 void arm_cmplx_conj_q31(
   const q31_t * pSrc,
         q31_t * pDst,
@@ -131,6 +187,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEI) */
 
 /**
   @} end of cmplx_conj group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
index 06f1bfa..c22ec2b 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
@@ -74,6 +74,74 @@
   @return        none
  */
 
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_cmplx_dot_prod_f32(
+    const float32_t * pSrcA,
+    const float32_t * pSrcB,
+    uint32_t numSamples,
+    float32_t * realResult,
+    float32_t * imagResult)
+{
+    /*
+     * Helium (MVE) implementation.
+     *
+     * Both inputs are interleaved (real, imag) float arrays. The vector
+     * accumulator keeps two partial complex sums: lanes 0/2 collect real
+     * parts and lanes 1/3 collect imaginary parts. They are folded into two
+     * scalars after the vector loop, and a scalar loop adds the samples the
+     * vector loop did not cover.
+     */
+    int32_t nbReal = numSamples * CMPLX_DIM;   /* number of float values */
+    int32_t remaining;                         /* loop counter */
+    float32_t real_sum, imag_sum;              /* scalar results */
+    float32_t a0, b0, c0, d0;                  /* tail operands */
+
+    /* Vector operands and accumulator */
+    f32x4_t vA, vB;
+    f32x4_t vAcc = vdupq_n_f32(0.0f);
+
+    /* Vector body: 2 complex samples per iteration */
+    for (remaining = nbReal >> 2U; remaining > 0U; remaining--)
+    {
+        vA = vld1q(pSrcA);
+        vB = vld1q(pSrcB);
+
+        /* Two complex multiply-accumulate steps (rotation 0, then 90) */
+        vAcc = vcmlaq(vAcc, vA, vB);
+        vAcc = vcmlaq_rot90(vAcc, vA, vB);
+
+        /* Step both inputs past the vector just consumed */
+        pSrcA += 4;
+        pSrcB += 4;
+    }
+
+    /* Fold the two partial complex sums into scalars */
+    real_sum = vgetq_lane(vAcc, 0) + vgetq_lane(vAcc, 2);
+    imag_sum = vgetq_lane(vAcc, 1) + vgetq_lane(vAcc, 3);
+
+    /* Scalar tail: one complex sample per iteration */
+    for (remaining = (nbReal & 3) >> 1; remaining > 0U; remaining--)
+    {
+        /* A = a0 + j*b0, B = c0 + j*d0 */
+        a0 = *pSrcA++;
+        b0 = *pSrcA++;
+        c0 = *pSrcB++;
+        d0 = *pSrcB++;
+
+        /* real += a0*c0 - b0*d0, imag += a0*d0 + b0*c0 */
+        real_sum += a0 * c0;
+        imag_sum += a0 * d0;
+        real_sum -= b0 * d0;
+        imag_sum += b0 * c0;
+    }
+
+    /* Store the real and imaginary results in the destination buffers */
+    *realResult = real_sum;
+    *imagResult = imag_sum;
+}
+
+#else
 void arm_cmplx_dot_prod_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
@@ -85,7 +153,7 @@
         float32_t real_sum = 0.0f, imag_sum = 0.0f;    /* Temporary result variables */
         float32_t a0,b0,c0,d0;
 
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     float32x4x2_t vec1,vec2,vec3,vec4;
     float32x4_t accR,accI;
     float32x2_t accum = vdup_n_f32(0);
@@ -145,7 +213,7 @@
     blkCnt = numSamples & 0x7;
 
 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
 
   /* Loop unrolling: Compute 4 outputs at a time */
   blkCnt = numSamples >> 2U;
@@ -227,6 +295,7 @@
   *realResult = real_sum;
   *imagResult = imag_sum;
 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
   @} end of cmplx_dot_prod group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
index 2ecd801..5fe5f0a 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
@@ -54,6 +54,86 @@
                    The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
  */
 
+#if defined(ARM_MATH_MVEI)
+void arm_cmplx_dot_prod_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t numSamples,
+        q31_t * realResult,
+        q31_t * imagResult)
+{
+  /*
+   * Helium (MVE) implementation.
+   *
+   * Both inputs are interleaved (real, imag) q15 arrays of numSamples
+   * complex values. Products are accumulated in 64 bits (34.30 format) and
+   * converted to 8.24 when stored.
+   *
+   * Every vector is loaded inside the loop, right before it is used, so
+   * that:
+   *  - no element beyond blockSize is ever read (in particular no vector
+   *    load is issued when fewer than 8 q15 values are available);
+   *  - each iteration consumes exactly the 8 q15 values (4 complex samples)
+   *    that the loop count (blockSize >> 3) and the tail count
+   *    ((blockSize & 7) >> 1) assume.
+   */
+
+  uint32_t blockSize = numSamples * CMPLX_DIM;  /* number of q15 values */
+  uint32_t blkCnt;                              /* loop counter */
+  q15_t a0,b0,c0,d0;                            /* tail operands */
+
+  q63_t accReal = 0LL;                          /* real accumulator */
+  q63_t accImag = 0LL;                          /* imaginary accumulator */
+  q15x8_t vecSrcA, vecSrcB;
+
+  /* Compute 4 complex samples at a time */
+  blkCnt = blockSize >> 3;
+  while (blkCnt > 0U)
+  {
+      vecSrcA = vld1q(pSrcA);
+      vecSrcB = vld1q(pSrcB);
+
+      /* real += sum(a.re * b.re - a.im * b.im) over the 4 pairs */
+      accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
+
+      /* imag += sum(a.re * b.im + a.im * b.re) over the 4 pairs */
+      accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
+
+      /* Advance vector source pointers */
+      pSrcA += 8;
+      pSrcB += 8;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+  }
+
+  /* Tail: remaining complex samples, one per iteration */
+  blkCnt = (blockSize & 7) >> 1;
+
+  while (blkCnt > 0U)
+  {
+    /* A = a0 + j*b0, B = c0 + j*d0 */
+    a0 = *pSrcA++;
+    b0 = *pSrcA++;
+    c0 = *pSrcB++;
+    d0 = *pSrcB++;
+
+    accReal += (q31_t)a0 * c0;
+    accImag += (q31_t)a0 * d0;
+    accReal -= (q31_t)b0 * d0;
+    accImag += (q31_t)b0 * c0;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store real and imaginary result in 8.24 format  */
+  /* Convert real data in 34.30 to 8.24 by 6 right shifts */
+  *realResult = (q31_t) (accReal >> 6);
+  /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
+  *imagResult = (q31_t) (accImag >> 6);
+}
+#else
 void arm_cmplx_dot_prod_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
@@ -66,7 +146,6 @@
         q15_t a0,b0,c0,d0;
 
 #if defined (ARM_MATH_LOOPUNROLL)
-
   /* Loop unrolling: Compute 4 outputs at a time */
   blkCnt = numSamples >> 2U;
 
@@ -148,6 +227,7 @@
   /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
   *imagResult = (q31_t) (imag_sum >> 6);
 }
+#endif /* defined(ARM_MATH_MVEI) */
 
 /**
   @} end of cmplx_dot_prod group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
index d715d98..821c6f3 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
@@ -55,6 +55,72 @@
                    Input down scaling is not required.
  */
 
+#if defined(ARM_MATH_MVEI)
+
+void arm_cmplx_dot_prod_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        uint32_t numSamples,
+        q63_t * realResult,
+        q63_t * imagResult)
+{
+    /*
+     * Helium (MVE) implementation.
+     *
+     * Both inputs are interleaved (real, imag) q31 arrays. The vector loop
+     * accumulates with the vrmlsldavhaq / vrmlaldavhaxq reductions, after
+     * which both sums are shifted right by (14 - 8) bits. The scalar tail
+     * then adds each remaining product shifted right by 14 bits.
+     */
+    int32_t nbReal = numSamples * CMPLX_DIM;   /* number of q31 values */
+    int32_t remaining;                         /* loop counter */
+    q63_t sumRe = 0LL;                         /* real accumulator */
+    q63_t sumIm = 0LL;                         /* imaginary accumulator */
+    q31_t a0, b0, c0, d0;                      /* tail operands */
+
+    q31x4_t vA, vB;
+
+    /* Vector body: 2 complex samples per iteration */
+    for (remaining = nbReal >> 2U; remaining > 0U; remaining--)
+    {
+        vA = vld1q(pSrcA);
+        vB = vld1q(pSrcB);
+
+        /* Reductions over the vector: real part, then imaginary part */
+        sumRe = vrmlsldavhaq(sumRe, vA, vB);
+        sumIm = vrmlaldavhaxq(sumIm, vA, vB);
+
+        /* Step both inputs past the vector just consumed */
+        pSrcA += 4;
+        pSrcB += 4;
+    }
+
+    /* Rescale the vector sums before the tail is added */
+    sumRe = asrl(sumRe, (14 - 8));
+    sumIm = asrl(sumIm, (14 - 8));
+
+    /* Scalar tail: one complex sample per iteration */
+    for (remaining = (nbReal & 3) >> 1; remaining > 0U; remaining--)
+    {
+        /* A = a0 + j*b0, B = c0 + j*d0 */
+        a0 = *pSrcA++;
+        b0 = *pSrcA++;
+        c0 = *pSrcB++;
+        d0 = *pSrcB++;
+
+        /* Each product is shifted right by 14 bits before accumulation */
+        sumRe += ((q63_t)a0 * c0) >> 14;
+        sumIm += ((q63_t)a0 * d0) >> 14;
+        sumRe -= ((q63_t)b0 * d0) >> 14;
+        sumIm += ((q63_t)b0 * c0) >> 14;
+    }
+
+    /* Store real and imaginary result in destination buffer. */
+    *realResult = sumRe;
+    *imagResult = sumIm;
+}
+
+#else
 void arm_cmplx_dot_prod_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
@@ -147,6 +213,7 @@
   *realResult = real_sum;
   *imagResult = imag_sum;
 }
+#endif /* defined(ARM_MATH_MVEI) */
 
 /**
   @} end of cmplx_dot_prod group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
index 84812dc..5dce795 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
@@ -69,6 +69,86 @@
   @return        none
  */
 
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+void arm_cmplx_mag_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples)
+{
+    int32_t blockSize = numSamples;  /* number of complex samples */
+    uint32_t  blkCnt;           /* loop counter */
+    f32x4x2_t vecSrc;           /* val[0] / val[1]: the two de-interleaved parts */
+    f32x4_t sum;                /* real^2 + imag^2, then the magnitude */
+    float32_t real, imag;                      /* Temporary variables to hold input values */
+
+    /* Helium (MVE) path: compute 4 complex samples at a time */
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0U)
+    {
+        q31x4_t newtonStartVec;
+        f32x4_t sumHalf, invSqrt;
+
+        vecSrc = vld2q(pSrc);
+        pSrc += 8;
+        sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
+        sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
+
+        /*
+         * Inlined fast square root: estimate 1/sqrt(sum) with Newton-Raphson, then multiply by sum
+         */
+
+        /* initial estimate: INVSQRT_MAGIC_F32 minus the integer view of sum (vector cast) shifted right by 1 */
+        newtonStartVec = vdupq_n_s32(INVSQRT_MAGIC_F32) - vshrq((q31x4_t) sum, 1);
+        sumHalf = sum * 0.5f;
+        /*
+         * compute 3 x iterations
+         *
+         * The more iterations, the more accuracy.
+         * If you need to trade a bit of accuracy for more performance,
+         * you can comment out the 3rd use of the macro.
+         */
+        INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, (f32x4_t) newtonStartVec);
+        INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
+        INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
+        /*
+         * lanes whose estimate is negative are replaced by 0
+         */
+        invSqrt = vdupq_m(invSqrt, 0.0f, vcmpltq(invSqrt, 0.0f));
+        /*
+         * sqrt(x) = x * invSqrt(x)
+         */
+        sum = vmulq(sum, invSqrt);
+        vst1q(pDst, sum);
+        pDst += 4;
+        /*
+         * Decrement the blkCnt loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: the (blockSize & 3) samples left over, one per iteration
+     */
+    blkCnt = blockSize & 3;
+    while (blkCnt > 0U)
+    {
+      /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+  
+      real = *pSrc++;
+      imag = *pSrc++;
+  
+      /* store result in destination buffer. */
+      arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+  
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+}
+
+#else
 void arm_cmplx_mag_f32(
   const float32_t * pSrc,
         float32_t * pDst,
@@ -77,7 +157,7 @@
   uint32_t blkCnt;                               /* loop counter */
   float32_t real, imag;                      /* Temporary variables to hold input values */
 
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
 
   float32x4x2_t vecA;
   float32x4_t vRealA;
@@ -125,7 +205,7 @@
 
 #else
 
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
 
   /* Loop unrolling: Compute 4 outputs at a time */
   blkCnt = numSamples >> 2U;
@@ -182,6 +262,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
   @} end of cmplx_mag group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
index a493274..498c0e6 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
@@ -47,7 +47,65 @@
   @par           Scaling and Overflow Behavior
                    The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
  */
+#if defined(ARM_MATH_MVEI)
 
+#include "arm_helium_utils.h"
+
+void arm_cmplx_mag_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples)
+{
+    /*
+     * Helium (MVE) implementation.
+     *
+     * pSrc holds numSamples complex values as interleaved (real, imag) q15
+     * pairs; pDst receives one q15 magnitude per complex value.
+     *
+     * The vector loop handles 8 complex values at a time; a scalar loop
+     * handles the rest.
+     */
+    int32_t nbCplx = numSamples;   /* number of complex samples */
+    uint32_t remaining;            /* loop counter */
+    q15x8x2_t vCplx;               /* val[0] / val[1]: de-interleaved parts */
+    q15x8_t vRe2, vIm2;            /* high halves of the squared parts */
+    q15x8_t vMagSq;
+
+    q31_t packed;                  /* one (real, imag) pair read as 32 bits */
+    q31_t sumSq;                   /* scalar accumulator */
+
+    /* Vector body: 8 complex samples per iteration */
+    for (remaining = nbCplx >> 3; remaining > 0U; remaining--)
+    {
+        vCplx = vld2q(pSrc);
+        pSrc += 16;
+
+        /* Saturating sum of the high halves of re * re and im * im */
+        vRe2 = vmulhq(vCplx.val[0], vCplx.val[0]);
+        vIm2 = vmulhq(vCplx.val[1], vCplx.val[1]);
+        vMagSq = vqaddq(vRe2, vIm2);
+        vMagSq = vshrq(vMagSq, 1);
+
+        /* Vector square root */
+        vMagSq = FAST_VSQRT_Q15(vMagSq);
+
+        vst1q(pDst, vMagSq);
+        pDst += 8;
+    }
+
+    /* Scalar tail: one complex sample per iteration */
+    for (remaining = nbCplx & 7; remaining > 0U; remaining--)
+    {
+        /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+        packed = read_q15x2_ia ((q15_t **) &pSrc);
+        sumSq = __SMUAD(packed, packed);
+
+        /* store result in 2.14 format in destination buffer. */
+        arm_sqrt_q15((q15_t) (sumSq >> 17), pDst++);
+    }
+}
+
+#else
 void arm_cmplx_mag_q15(
   const q15_t * pSrc,
         q15_t * pDst,
@@ -156,6 +214,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEI) */
 
 /**
   @} end of cmplx_mag group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
index 873e566..d0950ee 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
@@ -49,6 +49,76 @@
                    Input down scaling is not required.
  */
 
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_cmplx_mag_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples)
+{
+    /*
+     * Helium (MVE) implementation.
+     *
+     * pSrc holds numSamples complex values as interleaved (real, imag) q31
+     * pairs; pDst receives one q31 magnitude per complex value.
+     *
+     * The vector square root (FAST_VSQRT_Q31) relies on a lookup table.
+     * Build options can leave that table out; in that case
+     * arm_cmplx_mag_q31 must not be built and linked.
+     *
+     * The vector loop handles 4 complex values at a time; a scalar loop
+     * handles the rest.
+     */
+    int32_t nbCplx = numSamples;   /* number of complex samples */
+    uint32_t remaining;            /* loop counter */
+
+    q31x4x2_t vCplx;               /* val[0] / val[1]: de-interleaved parts */
+    q31x4_t vRe2, vIm2;            /* high halves of the squared parts */
+    q31x4_t vMagSq;
+
+    q31_t real, imag;              /* Temporary input variables */
+    q31_t acc0, acc1;              /* Accumulators */
+
+    /* Vector body: 4 complex samples per iteration */
+    for (remaining = nbCplx >> 2; remaining > 0U; remaining--)
+    {
+        vCplx = vld2q(pSrc);
+
+        /* Saturating sum of the high halves of re * re and im * im */
+        vRe2 = vmulhq(vCplx.val[0], vCplx.val[0]);
+        vIm2 = vmulhq(vCplx.val[1], vCplx.val[1]);
+        vMagSq = vqaddq(vRe2, vIm2);
+        vMagSq = vshrq(vMagSq, 1);
+
+        /* Table-based vector square root */
+        vMagSq = FAST_VSQRT_Q31(vMagSq);
+
+        vst1q(pDst, vMagSq);
+
+        /* Step past the 4 complex inputs and the 4 outputs */
+        pSrc += 8;
+        pDst += 4;
+    }
+
+    /* Scalar tail: one complex sample per iteration */
+    for (remaining = nbCplx & 3; remaining > 0U; remaining--)
+    {
+        /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+        real = *pSrc++;
+        imag = *pSrc++;
+
+        /* Each square is shifted right by 33 bits before the addition */
+        acc0 = (q31_t) (((q63_t) real * real) >> 33);
+        acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+
+        /* store result in 2.30 format in destination buffer. */
+        arm_sqrt_q31(acc0 + acc1, pDst++);
+    }
+}
+
+#else
 void arm_cmplx_mag_q31(
   const q31_t * pSrc,
         q31_t * pDst,
@@ -124,6 +194,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEI) */
 
 /**
   @} end of cmplx_mag group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
index 99f051c..1cea04e 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
@@ -69,6 +69,56 @@
   @return        none
  */
 
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_cmplx_mag_squared_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples)
+{
+    /*
+     * Helium (MVE) implementation.
+     *
+     * pSrc holds numSamples complex values as interleaved (real, imag)
+     * floats; pDst receives real^2 + imag^2 for each of them.
+     *
+     * The vector loop handles 4 complex values at a time, a scalar loop the rest.
+     */
+    int32_t nbCplx = numSamples;   /* number of complex samples */
+    uint32_t remaining;            /* loop counter */
+    f32x4x2_t vCplx;               /* val[0] / val[1]: de-interleaved parts */
+    f32x4_t vMagSq;
+    float32_t real, imag;          /* Temporary input variables */
+
+    /* Vector body: 4 complex samples per iteration */
+    for (remaining = nbCplx >> 2; remaining > 0U; remaining--)
+    {
+        vCplx = vld2q(pSrc);
+
+        /* re * re, then fused multiply-add of im * im */
+        vMagSq = vmulq(vCplx.val[0], vCplx.val[0]);
+        vMagSq = vfmaq(vMagSq, vCplx.val[1], vCplx.val[1]);
+        vst1q(pDst, vMagSq);
+
+        /* Step past the 4 complex inputs and the 4 outputs */
+        pSrc += 8;
+        pDst += 4;
+    }
+
+    /* Scalar tail: one complex sample per iteration */
+    for (remaining = nbCplx & 3; remaining > 0U; remaining--)
+    {
+        /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+        real = *pSrc++;
+        imag = *pSrc++;
+
+        /* store result in destination buffer. */
+        *pDst++ = (real * real) + (imag * imag);
+    }
+
+}
+
+#else
 void arm_cmplx_mag_squared_f32(
   const float32_t * pSrc,
         float32_t * pDst,
@@ -77,7 +127,7 @@
         uint32_t blkCnt;                               /* Loop counter */
         float32_t real, imag;                          /* Temporary input variables */
 
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
   float32x4x2_t vecA;
   float32x4_t vRealA;
   float32x4_t vImagA;
@@ -123,7 +173,7 @@
   blkCnt = numSamples & 7;
 
 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
 
   /* Loop unrolling: Compute 4 outputs at a time */
   blkCnt = numSamples >> 2U;
@@ -178,6 +228,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
   @} end of cmplx_mag_squared group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
index fa5f4e6..a6107be 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
@@ -48,6 +48,64 @@
                    The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
  */
 
+#if defined(ARM_MATH_MVEI)
+
+void arm_cmplx_mag_squared_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples)
+{
+  /*
+   * MVE integer kernel. Groups of eight (real, imag) pairs go through the
+   * vector loop; the remaining pairs are handled one at a time afterwards.
+   */
+
+  int32_t nbCmplx = numSamples;     /* Signed copy feeding both loop counts */
+  uint32_t remaining;               /* Iterations left in the current loop */
+  q31_t packed;                     /* Value returned by read_q15x2_ia */
+  q31_t sumSq;                      /* Result of __SMUAD(packed, packed) */
+  q15x8x2_t vecIn;                  /* Two vectors filled by vld2q */
+  q15x8_t vSqA;                     /* vmulhq(val[0], val[0]) */
+  q15x8_t vSqB;                     /* vmulhq(val[1], val[1]) */
+  q15x8_t vOut;                     /* Vector stored to pDst */
+
+  /* Vector loop: one pass reads 16 q15 values and writes 8 results */
+  for (remaining = nbCmplx >> 3; remaining > 0U; remaining--)
+  {
+    vecIn = vld2q(pSrc);
+    pSrc += 16;
+
+    vSqA = vmulhq(vecIn.val[0], vecIn.val[0]);
+    vSqB = vmulhq(vecIn.val[1], vecIn.val[1]);
+
+    /* Add the two squares with vqaddq, then shift the sum right by one */
+    vOut = vqaddq(vSqA, vSqB);
+    vOut = vshrq(vOut, 1);
+
+    vst1q(pDst, vOut);
+    pDst += 8;
+  }
+
+  /* Tail: the 0..7 pairs left over once the groups of eight are done */
+  for (remaining = nbCmplx & 7; remaining > 0U; remaining--)
+  {
+    /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+
+    /*
+     * pSrc is not advanced explicitly in this loop: read_q15x2_ia is handed
+     * &pSrc and presumably post-increments it (confirm in the helper).
+     */
+    packed = read_q15x2_ia ((q15_t **) &pSrc);
+    sumSq = __SMUAD(packed, packed);
+
+    /* store result in 3.13 format in destination buffer. */
+    *pDst = (q15_t) (sumSq >> 17);
+    pDst++;
+  }
+
+}
+
+#else
 void arm_cmplx_mag_squared_q15(
   const q15_t * pSrc,
         q15_t * pDst,
@@ -156,6 +214,8 @@
 
 }
 
+#endif /* defined(ARM_MATH_MVEI) */
+
 /**
   @} end of cmplx_mag_squared group
  */
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
index 54863ef..7127e6e 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
@@ -49,6 +49,62 @@
                    Input down scaling is not required.
  */
 
+#if defined(ARM_MATH_MVEI)
+
+void arm_cmplx_mag_squared_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples)
+{
+    /*
+     * MVE integer kernel. Groups of four (real, imag) pairs go through the
+     * vector loop; the remaining pairs are handled one at a time afterwards.
+     */
+    int32_t nbCmplx = numSamples;       /* Signed copy feeding both loop counts */
+    uint32_t remaining;                 /* Iterations left in the current loop */
+    q31x4x2_t vecIn;                    /* Two vectors filled by vld2q */
+    q31x4_t vSqA;                       /* vmulhq(val[0], val[0]) */
+    q31x4_t vSqB;                       /* vmulhq(val[1], val[1]) */
+    q31x4_t vOut;                       /* Vector stored to pDst */
+    q31_t re, im;                       /* Scalar parts used by the tail loop */
+    q31_t sqRe, sqIm;                   /* q63_t squares shifted right by 33 */
+
+    /* Vector loop: one pass reads 8 q31 values and writes 4 results */
+    for (remaining = nbCmplx >> 2; remaining > 0U; remaining--)
+    {
+        vecIn = vld2q(pSrc);
+        pSrc += 8;
+
+        vSqA = vmulhq(vecIn.val[0], vecIn.val[0]);
+        vSqB = vmulhq(vecIn.val[1], vecIn.val[1]);
+
+        /* Add the two squares with vqaddq, then shift the sum right by one */
+        vOut = vqaddq(vSqA, vSqB);
+        vOut = vshrq(vOut, 1);
+
+        vst1q(pDst, vOut);
+        pDst += 4;
+    }
+
+    /* Tail: the 0..3 pairs left over once the groups of four are done */
+    for (remaining = nbCmplx & 3; remaining > 0U; remaining--)
+    {
+        /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+        re = pSrc[0];
+        im = pSrc[1];
+        pSrc += 2;
+
+        /* Square in q63_t, shift right by 33, then narrow back to q31_t */
+        sqRe = (q31_t) (((q63_t) re * re) >> 33);
+        sqIm = (q31_t) (((q63_t) im * im) >> 33);
+
+        /* store result in 3.29 format in destination buffer. */
+        *pDst = sqRe + sqIm;
+        pDst++;
+    }
+}
+
+#else
 void arm_cmplx_mag_squared_q31(
   const q31_t * pSrc,
         q31_t * pDst,
@@ -124,6 +180,8 @@
 
 }
 
+#endif /* defined(ARM_MATH_MVEI) */
+
 /**
   @} end of cmplx_mag_squared group
  */
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
index 8d14821..1de597d 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
@@ -68,6 +68,67 @@
   @return        none
  */
 
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_cmplx_mult_cmplx_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t numSamples)
+{
+    uint32_t blkCnt;                    /* Loop counter, unsigned like numSamples */
+    float32_t a, b, c, d;               /* Temporary variables to store real and imaginary values */
+
+    f32x4x2_t vecA;                     /* De-interleaved A: val[0] real, val[1] imag */
+    f32x4x2_t vecB;                     /* De-interleaved B: val[0] real, val[1] imag */
+    f32x4x2_t vecDst;                   /* De-interleaved product, same layout */
+
+    /* Compute 4 complex outputs at a time */
+    blkCnt = numSamples >> 2U;
+    while (blkCnt > 0U)
+    {
+        /* load & separate real/imag of pSrcA and pSrcB (de-interleave 2) */
+        vecA = vld2q(pSrcA);
+        vecB = vld2q(pSrcB);
+        pSrcA += 8;
+        pSrcB += 8;
+
+        /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
+        vecDst.val[0] = vmulq(vecA.val[0], vecB.val[0]);
+        vecDst.val[0] = vfmsq(vecDst.val[0], vecA.val[1], vecB.val[1]);
+        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
+        vecDst.val[1] = vmulq(vecA.val[0], vecB.val[1]);
+        vecDst.val[1] = vfmaq(vecDst.val[1], vecA.val[1], vecB.val[0]);
+
+        /* vst2q writes the two result vectors back to pDst (8 floats) */
+        vst2q(pDst, vecDst);
+        pDst += 8;
+
+        blkCnt--;
+    }
+
+    /* Tail: the 0..3 complex samples not covered by the vector loop */
+    blkCnt = numSamples & 3U;
+    while (blkCnt > 0U)
+    {
+      /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+      /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
+
+      a = *pSrcA++;
+      b = *pSrcA++;
+      c = *pSrcB++;
+      d = *pSrcB++;
+
+      /* store result in destination buffer. */
+      *pDst++ = (a * c) - (b * d);
+      *pDst++ = (a * d) + (b * c);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+}
+
+#else
 void arm_cmplx_mult_cmplx_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
@@ -77,7 +138,7 @@
     uint32_t blkCnt;                               /* Loop counter */
     float32_t a, b, c, d;  /* Temporary variables to store real and imaginary values */
 
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     float32x4x2_t va, vb;
     float32x4_t real, imag;
     float32x4x2_t outCplx;
@@ -115,7 +176,7 @@
     blkCnt = numSamples & 3;
 
 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
 
   /* Loop unrolling: Compute 4 outputs at a time */
   blkCnt = numSamples >> 2U;
@@ -188,6 +249,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
   @} end of CmplxByCmplxMult group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
index 6659427..6650179 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
@@ -49,6 +49,65 @@
                    The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
  */
 
+#if defined(ARM_MATH_MVEI)
+
+void arm_cmplx_mult_cmplx_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t numSamples)
+{
+  uint32_t blkCnt;                              /* Loop counter */
+  uint32_t blockSize = numSamples * CMPLX_DIM;  /* Number of q15 values to process */
+  q15_t a, b, c, d;                             /* Real/imag parts of A and B (tail loop) */
+
+  q15x8_t vecA;
+  q15x8_t vecB;
+  q15x8_t vecDst;
+
+  /* Vector loop: 8 q15 values per iteration, kept interleaved */
+  blkCnt = blockSize >> 3U;
+  while (blkCnt > 0U)
+  {
+      vecA = vld1q(pSrcA);
+      vecB = vld1q(pSrcB);
+      /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
+      /* Per the ACLE description the first operand only supplies the lanes this
+         call leaves alone, so an uninitialised vector of the matching s16 type is enough. */
+      vecDst = vqdmlsdhq_s16(vuninitializedq_s16(), vecA, vecB);
+      /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
+      vecDst = vqdmladhxq_s16(vecDst, vecA, vecB);
+
+      vecDst = vshrq(vecDst, 2);
+      vst1q(pDst, vecDst);
+
+      blkCnt--;
+      pSrcA += 8;
+      pSrcB += 8;
+      pDst += 8;
+  }
+
+  /* Tail: complex samples left over after the vector loop */
+  blkCnt = (blockSize & 7U) >> 1U;
+  while (blkCnt > 0U)
+  {
+    /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
+
+    a = *pSrcA++;
+    b = *pSrcA++;
+    c = *pSrcB++;
+    d = *pSrcB++;
+
+    /* store result in 3.13 format in destination buffer. */
+    *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+    *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#else
 void arm_cmplx_mult_cmplx_q15(
   const q15_t * pSrcA,
   const q15_t * pSrcB,
@@ -130,6 +189,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEI) */
 
 /**
   @} end of CmplxByCmplxMult group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
index f6d6dc6..dbf7e2b 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
@@ -50,6 +50,62 @@
                    Input down scaling is not required.
  */
 
+#if defined(ARM_MATH_MVEI)
+void arm_cmplx_mult_cmplx_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t numSamples)
+{
+    uint32_t blkCnt;                              /* Loop counter */
+    uint32_t blockSize = numSamples * CMPLX_DIM;  /* Number of q31 values to process */
+    q31x4_t vecA;
+    q31x4_t vecB;
+    q31x4_t vecDst;
+    q31_t a, b, c, d;                             /* Temporary variables */
+
+    /* Compute 2 complex outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        /* Each load brings in 4 q31 values, left interleaved as stored */
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
+        vecDst = vqdmlsdhq(vuninitializedq_s32(), vecA, vecB);
+        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
+        vecDst = vqdmladhxq(vecDst, vecA, vecB);
+
+        vecDst = vshrq(vecDst, 2);
+        vst1q(pDst, vecDst);
+
+        blkCnt--;
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst += 4;
+    }
+
+    /* Tail: complex samples left over after the vector loop */
+    blkCnt = (blockSize & 3U) >> 1U;
+    while (blkCnt > 0U)
+    {
+      /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+      /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
+
+      a = *pSrcA++;
+      b = *pSrcA++;
+      c = *pSrcB++;
+      d = *pSrcB++;
+
+      /* store result in 3.29 format in destination buffer. */
+      *pDst++ = (q31_t) ( (((q63_t) a * c) >> 33) - (((q63_t) b * d) >> 33) );
+      *pDst++ = (q31_t) ( (((q63_t) a * d) >> 33) + (((q63_t) b * c) >> 33) );
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+}
+#else
 void arm_cmplx_mult_cmplx_q31(
   const q31_t * pSrcA,
   const q31_t * pSrcB,
@@ -131,6 +187,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEI) */
 
 /**
   @} end of CmplxByCmplxMult group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
index 9651999..6e556b5 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
@@ -69,6 +69,60 @@
   @return        none
  */
 
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_cmplx_mult_real_f32(
+  const float32_t * pSrcCmplx,
+  const float32_t * pSrcReal,
+        float32_t * pCmplxDst,
+        uint32_t numSamples)
+{
+    /* Gather offsets: every real input is fetched into two adjacent lanes */
+    static const uint32_t stride_cmplx_x_real_32[4] = { 0, 0, 1, 1 };
+
+    uint32_t blockSizeC = numSamples * CMPLX_DIM;   /* numSamples * CMPLX_DIM values to process */
+    uint32_t blkCnt;                                /* Loop counter */
+    f32x4_t rVec;
+    f32x4_t cmplxVec;
+    f32x4_t dstVec;
+    uint32x4_t strideVec;
+    float32_t in;                                   /* Real input used by the tail loop */
+
+    /* stride vector for pairs of real generation */
+    strideVec = vld1q(stride_cmplx_x_real_32);
+
+    /* Compute 2 complex outputs (4 floats) at a time */
+    blkCnt = blockSizeC >> 2U;
+    while (blkCnt > 0U)
+    {
+        cmplxVec = vld1q(pSrcCmplx);
+        rVec = vldrwq_gather_shifted_offset_f32(pSrcReal, strideVec);
+        dstVec = vmulq(cmplxVec, rVec);
+        vst1q(pCmplxDst, dstVec);
+
+        pSrcReal += 2;
+        pSrcCmplx += 4;
+        pCmplxDst += 4;
+        blkCnt--;
+    }
+
+    /* Tail: complex samples left over after the vector loop */
+    blkCnt = (blockSizeC & 3U) >> 1U;
+    while (blkCnt > 0U)
+    {
+      /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+      /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+      in = *pSrcReal++;
+      /* store result in destination buffer. */
+      *pCmplxDst++ = *pSrcCmplx++ * in;
+      *pCmplxDst++ = *pSrcCmplx++ * in;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+}
+
+#else
 void arm_cmplx_mult_real_f32(
   const float32_t * pSrcCmplx,
   const float32_t * pSrcReal,
@@ -78,7 +132,7 @@
         uint32_t blkCnt;                               /* Loop counter */
         float32_t in;                                  /* Temporary variable */
 
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
     float32x4_t r;
     float32x4x2_t ab,outCplx;
 
@@ -106,7 +160,7 @@
     /* Tail */
     blkCnt = numSamples & 3;
 #else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
 
   /* Loop unrolling: Compute 4 outputs at a time */
   blkCnt = numSamples >> 2U;
@@ -163,6 +217,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
 
 /**
   @} end of CmplxByRealMult group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
index 4877d20..854d9a6 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
@@ -49,6 +49,7 @@
                    The function uses saturating arithmetic.
                    Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
  */
+#if defined(ARM_MATH_MVEI)
 
 void arm_cmplx_mult_real_q15(
   const q15_t * pSrcCmplx,
@@ -56,6 +57,60 @@
         q15_t * pCmplxDst,
         uint32_t numSamples)
 {
+  /* Gather offsets: every real input is fetched into two adjacent lanes */
+  static const uint16_t stride_cmplx_x_real_16[8] =
+      { 0, 0, 1, 1, 2, 2, 3, 3 };
+  q15x8_t rVec;
+  q15x8_t cmplxVec;
+  q15x8_t dstVec;
+  uint16x8_t strideVec;
+  uint32_t blockSizeC = numSamples * CMPLX_DIM;   /* numSamples * CMPLX_DIM values to process */
+  uint32_t blkCnt;                                /* Loop counter */
+  q15_t in;                                       /* Real input used by the tail loop */
+
+  /* stride vector for pairs of real generation */
+  strideVec = vld1q(stride_cmplx_x_real_16);
+
+  /* Compute 4 complex outputs (8 q15 values) at a time */
+  blkCnt = blockSizeC >> 3U;
+
+  while (blkCnt > 0U)
+  {
+    cmplxVec = vld1q(pSrcCmplx);
+    rVec = vldrhq_gather_shifted_offset_s16(pSrcReal, strideVec);
+    /* Multiply each complex component by its (duplicated) real factor */
+    dstVec = vqdmulhq(cmplxVec, rVec);
+    vst1q(pCmplxDst, dstVec);
+
+    pSrcReal += 4;
+    pSrcCmplx += 8;
+    pCmplxDst += 8;
+    blkCnt--;
+  }
+
+  /* Tail: complex samples left over after the vector loop */
+  blkCnt = (blockSizeC & 7U) >> 1U;
+  while (blkCnt > 0U)
+  {
+    /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+    /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+
+    in = *pSrcReal++;
+    /* store the result in the destination buffer. */
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+    *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+}
+#else
+void arm_cmplx_mult_real_q15(
+  const q15_t * pSrcCmplx,
+  const q15_t * pSrcReal,
+        q15_t * pCmplxDst,
+        uint32_t numSamples)
+{
         uint32_t blkCnt;                               /* Loop counter */
         q15_t in;                                      /* Temporary variable */
 
@@ -176,6 +231,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEI) */
 
 /**
   @} end of CmplxByRealMult group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
index 906410f..7125cc8 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
@@ -50,6 +50,61 @@
                    Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
  */
 
+#if defined(ARM_MATH_MVEI)
+void arm_cmplx_mult_real_q31(
+  const q31_t * pSrcCmplx,
+  const q31_t * pSrcReal,
+        q31_t * pCmplxDst,
+        uint32_t numSamples)
+{
+    /* Gather offsets: every real input is fetched into two adjacent lanes */
+    static const uint32_t stride_cmplx_x_real_32[4] = {
+        0, 0, 1, 1
+    };
+    q31x4_t rVec;
+    q31x4_t cmplxVec;
+    q31x4_t dstVec;
+    uint32x4_t strideVec;
+    uint32_t blockSizeC = numSamples * CMPLX_DIM;   /* numSamples * CMPLX_DIM values to process */
+    uint32_t blkCnt;                                /* Loop counter */
+    q31_t in;                                       /* Real input used by the tail loop */
+
+    /* stride vector for pairs of real generation */
+    strideVec = vld1q(stride_cmplx_x_real_32);
+
+    /* Compute 2 complex outputs (4 q31 values) at a time */
+    blkCnt = blockSizeC >> 2U;
+    while (blkCnt > 0U)
+    {
+        cmplxVec = vld1q(pSrcCmplx);
+        rVec = vldrwq_gather_shifted_offset_s32(pSrcReal, strideVec);
+        /* Multiply each complex component by its (duplicated) real factor */
+        dstVec = vqdmulhq(cmplxVec, rVec);
+        vst1q(pCmplxDst, dstVec);
+
+        pSrcReal += 2;
+        pSrcCmplx += 4;
+        pCmplxDst += 4;
+        blkCnt--;
+    }
+
+    /* Tail: complex samples left over after the vector loop */
+    blkCnt = (blockSizeC & 3U) >> 1U;
+    while (blkCnt > 0U)
+    {
+      /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+      /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+
+      in = *pSrcReal++;
+      /* store saturated result in 1.31 format to destination buffer */
+      *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+      *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+}
+#else
 void arm_cmplx_mult_real_q31(
   const q31_t * pSrcCmplx,
   const q31_t * pSrcReal,
@@ -142,6 +197,7 @@
   }
 
 }
+#endif /* defined(ARM_MATH_MVEI) */
 
 /**
   @} end of CmplxByRealMult group
diff --git a/CMSIS/DSP/Source/interpol.cmake b/CMSIS/DSP/Source/interpol.cmake
index 80282cf..2538d91 100644
--- a/CMSIS/DSP/Source/interpol.cmake
+++ b/CMSIS/DSP/Source/interpol.cmake
@@ -40,4 +40,12 @@
     target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_RECIP_Q15)
 endif()
 
+if (CONFIGTABLE AND ARM_CMPLX_MAG_Q31 AND (MVEI OR HELIUM))
+    target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_FAST_SQRT_Q31_MVE)
+endif()
+
+if (CONFIGTABLE AND ARM_CMPLX_MAG_Q15 AND (MVEI OR HELIUM))
+    target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_FAST_SQRT_Q15_MVE)
+endif()
+
 endfunction()
\ No newline at end of file