CMSIS-DSP: Added Helium support to Complex Math Functions.
Added new test patterns for complex math functions.
Added new tests to test framework.
Improved error handling of test framework.
diff --git a/CMSIS/DSP/Source/CommonTables/arm_common_tables.c b/CMSIS/DSP/Source/CommonTables/arm_common_tables.c
index 7f7f79d..40de826 100644
--- a/CMSIS/DSP/Source/CommonTables/arm_common_tables.c
+++ b/CMSIS/DSP/Source/CommonTables/arm_common_tables.c
@@ -57213,4 +57213,129 @@
};
#endif /* defined(ARM_ALL_FAST_TABLES) */
+#if defined(ARM_MATH_MVEI)
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE)
+const q31_t sqrtTable_Q31[256] = {
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x5ffffffe, 0x7ffffffa, 0x5e88c596, 0x7a39e2ff,
+ 0x5d2238d8, 0x74dfa6b1, 0x5bcb268e, 0x6fe69562,
+ 0x5a827998, 0x6b454dba, 0x5947373a, 0x66f39063,
+ 0x58187bf8, 0x62ea1669, 0x56f578e6, 0x5f226e5f,
+ 0x55dd714e, 0x5b96df40, 0x54cfb8b0, 0x58424fca,
+ 0x53cbb102, 0x552031c7, 0x52d0c92c, 0x522c7048,
+ 0x51de7bb2, 0x4f6360a9, 0x50f44d86, 0x4cc1b594,
+ 0x5011cd0a, 0x4a4473f6, 0x4f36911a, 0x47e8e962,
+ 0x4e62384e, 0x45aca3d0, 0x4d946838, 0x438d6a6a,
+ 0x4cccccca, 0x41893746, 0x4c0b17c0, 0x3f9e31fa,
+ 0x4b4f0022, 0x3dcaaac3, 0x4a9841ce, 0x3c0d1660,
+ 0x49e69d14, 0x3a640a53, 0x4939d654, 0x38ce39a6,
+ 0x4891b5b2, 0x374a720b, 0x47ee06ba, 0x35d7993c,
+ 0x474e9830, 0x3474aacc, 0x46b33bc0, 0x3320b60c,
+ 0x461bc5d2, 0x31dadc4f, 0x45880d4c, 0x30a24f3c,
+ 0x44f7eb6a, 0x2f764f6a, 0x446b3b92, 0x2e562b0b,
+ 0x43e1db32, 0x2d413cc9, 0x435ba98a, 0x2c36eab0,
+ 0x42d887a4, 0x2b36a542, 0x42585824, 0x2a3fe69b,
+ 0x41daff34, 0x295231b0, 0x41606266, 0x286d118d,
+ 0x40e868a4, 0x279018c3, 0x4072fa10, 0x26bae0c8,
+ 0x3ffffffe, 0x25ed0979, 0x3f8f64d2, 0x25263894,
+ 0x3f2113f8, 0x24661958, 0x3eb4f9d8, 0x23ac5c10,
+ 0x3e4b03ba, 0x22f8b5bb, 0x3de31fc4, 0x224adfba,
+ 0x3d7d3cee, 0x21a2977a, 0x3d194ae8, 0x20ff9e30,
+ 0x3cb73a24, 0x2061b89a, 0x3c56fbba, 0x1fc8aebb,
+ 0x3bf88166, 0x1f344ba6, 0x3b9bbd80, 0x1ea45d4b,
+ 0x3b40a2f0, 0x1e18b448, 0x3ae7252e, 0x1d9123b9,
+ 0x3a8f382c, 0x1d0d8113, 0x3a38d062, 0x1c8da3fb,
+ 0x39e3e2b6, 0x1c116628, 0x39906482, 0x1b98a33c,
+ 0x393e4b8a, 0x1b2338aa, 0x38ed8df6, 0x1ab1059a,
+ 0x389e2250, 0x1a41eace, 0x384fff7a, 0x19d5ca89,
+ 0x38031cb2, 0x196c887b, 0x37b77184, 0x190609a9,
+ 0x376cf5ce, 0x18a2345b, 0x3723a1ba, 0x1840f00a,
+ 0x36db6db6, 0x17e22550, 0x36945276, 0x1785bdd2,
+ 0x364e48f4, 0x172ba43e, 0x36094a64, 0x16d3c42c,
+ 0x35c55036, 0x167e0a1f, 0x35825414, 0x162a6372,
+ 0x35404fde, 0x15d8be4d, 0x34ff3dac, 0x1589099d,
+ 0x34bf17c4, 0x153b3508, 0x347fd89e, 0x14ef30e4,
+ 0x34417ade, 0x14a4ee2a, 0x3403f956, 0x145c5e76,
+ 0x33c74f02, 0x141573fa, 0x338b7706, 0x13d02171,
+ 0x33506cae, 0x138c5a28, 0x33162b6a, 0x134a11e6,
+ 0x32dcaed0, 0x13093cee, 0x32a3f294, 0x12c9cffa,
+ 0x326bf292, 0x128bc034, 0x3234aac0, 0x124f0330,
+ 0x31fe1736, 0x12138ee5, 0x31c83428, 0x11d959b0,
+ 0x3192fde6, 0x11a05a45, 0x315e70de, 0x116887b5,
+ 0x312a8994, 0x1131d960, 0x30f744aa, 0x10fc46fd,
+ 0x30c49ed6, 0x10c7c88b, 0x309294ea, 0x10945653,
+ 0x306123cc, 0x1061e8e6, 0x30304878, 0x10307919
+};
+ #endif /* !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q31_MVE) */
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FAST_TABLES) || defined(ARM_TABLE_FAST_SQRT_Q15_MVE)
+const q15_t sqrtTable_Q15[256] = {
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x5fff, 0x7fff, 0x5e88, 0x7a39,
+ 0x5d22, 0x74df, 0x5bcb, 0x6fe6,
+ 0x5a82, 0x6b45, 0x5947, 0x66f3,
+ 0x5818, 0x62ea, 0x56f5, 0x5f22,
+ 0x55dd, 0x5b96, 0x54cf, 0x5842,
+ 0x53cb, 0x5520, 0x52d0, 0x522c,
+ 0x51de, 0x4f63, 0x50f4, 0x4cc1,
+ 0x5011, 0x4a44, 0x4f36, 0x47e8,
+ 0x4e62, 0x45ac, 0x4d94, 0x438d,
+ 0x4ccc, 0x4189, 0x4c0b, 0x3f9e,
+ 0x4b4f, 0x3dca, 0x4a98, 0x3c0d,
+ 0x49e6, 0x3a64, 0x4939, 0x38ce,
+ 0x4891, 0x374a, 0x47ee, 0x35d7,
+ 0x474e, 0x3474, 0x46b3, 0x3320,
+ 0x461b, 0x31da, 0x4588, 0x30a2,
+ 0x44f7, 0x2f76, 0x446b, 0x2e56,
+ 0x43e1, 0x2d41, 0x435b, 0x2c36,
+ 0x42d8, 0x2b36, 0x4258, 0x2a3f,
+ 0x41da, 0x2952, 0x4160, 0x286d,
+ 0x40e8, 0x2790, 0x4072, 0x26ba,
+ 0x3fff, 0x25ed, 0x3f8f, 0x2526,
+ 0x3f21, 0x2466, 0x3eb4, 0x23ac,
+ 0x3e4b, 0x22f8, 0x3de3, 0x224a,
+ 0x3d7d, 0x21a2, 0x3d19, 0x20ff,
+ 0x3cb7, 0x2061, 0x3c56, 0x1fc8,
+ 0x3bf8, 0x1f34, 0x3b9b, 0x1ea4,
+ 0x3b40, 0x1e18, 0x3ae7, 0x1d91,
+ 0x3a8f, 0x1d0d, 0x3a38, 0x1c8d,
+ 0x39e3, 0x1c11, 0x3990, 0x1b98,
+ 0x393e, 0x1b23, 0x38ed, 0x1ab1,
+ 0x389e, 0x1a41, 0x384f, 0x19d5,
+ 0x3803, 0x196c, 0x37b7, 0x1906,
+ 0x376c, 0x18a2, 0x3723, 0x1840,
+ 0x36db, 0x17e2, 0x3694, 0x1785,
+ 0x364e, 0x172b, 0x3609, 0x16d3,
+ 0x35c5, 0x167e, 0x3582, 0x162a,
+ 0x3540, 0x15d8, 0x34ff, 0x1589,
+ 0x34bf, 0x153b, 0x347f, 0x14ef,
+ 0x3441, 0x14a4, 0x3403, 0x145c,
+ 0x33c7, 0x1415, 0x338b, 0x13d0,
+ 0x3350, 0x138c, 0x3316, 0x134a,
+ 0x32dc, 0x1309, 0x32a3, 0x12c9,
+ 0x326b, 0x128b, 0x3234, 0x124f,
+ 0x31fe, 0x1213, 0x31c8, 0x11d9,
+ 0x3192, 0x11a0, 0x315e, 0x1168,
+ 0x312a, 0x1131, 0x30f7, 0x10fc,
+ 0x30c4, 0x10c7, 0x3092, 0x1094,
+ 0x3061, 0x1061, 0x3030, 0x1030
+};
+ #endif
+#endif /* defined(ARM_MATH_MVEI) */
+
+
#endif /* if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FAST_TABLES) */
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/CMakeLists.txt b/CMSIS/DSP/Source/ComplexMathFunctions/CMakeLists.txt
index fbf40c7..87ea102 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/CMakeLists.txt
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/CMakeLists.txt
@@ -7,13 +7,47 @@
file(GLOB SRC "./*_*.c")
-add_library(CMSISDSPComplexMath STATIC ${SRC})
+add_library(CMSISDSPComplexMath STATIC)
configLib(CMSISDSPComplexMath ${ROOT})
configDsp(CMSISDSPComplexMath ${ROOT})
+
+include(interpol)
+interpol(CMSISDSPComplexMath) # apply to the library defined in this file, not to CMSISDSPFastMath
+
+if (CONFIGTABLE AND ALLFAST)
+ target_compile_definitions(CMSISDSPComplexMath PUBLIC ARM_ALL_FAST_TABLES)
+endif()
+
+# The MVE versions of arm_cmplx_mag_q31 and arm_cmplx_mag_q15 use a table to
+# compute the fast sqrt. These functions can be left out of the build, in
+# which case the table does not need to be included.
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q31 OR (NOT HELIUM AND NOT MVEI))
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q31.c)
+endif()
+
+if (NOT CONFIGTABLE OR ALLFAST OR ARM_CMPLX_MAG_Q15 OR (NOT HELIUM AND NOT MVEI))
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_q15.c)
+endif()
+
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_q15.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_conj_q31.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_q15.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_dot_prod_q31.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_q15.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mag_squared_q31.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_q15.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_cmplx_q31.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_f32.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_q15.c)
+target_sources(CMSISDSPComplexMath PRIVATE arm_cmplx_mult_real_q31.c)
+
+
### Includes
target_include_directories(CMSISDSPComplexMath PUBLIC "${DSP}/Include")
-
-
-
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
index df5db00..601b74d 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_f32.c
@@ -68,15 +68,66 @@
@return none
*/
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
void arm_cmplx_conj_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples)
+{
+    static const float32_t cmplx_conj_sign[4] = { 1.0f, -1.0f, 1.0f, -1.0f };
+    uint32_t blockSize = numSamples * CMPLX_DIM;   /* number of float32 values to process */
+    uint32_t blkCnt;                                /* loop counter */
+    f32x4_t vecSrc;
+    f32x4_t vecSign;
+
+    /*
+     * load sign vector (vector load intrinsic; avoids casting the const array to f32x4_t *)
+     */
+    vecSign = vld1q(cmplx_conj_sign);
+
+    /* Compute 4 real samples (2 complex samples) at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        vecSrc = vld1q(pSrc);
+        vst1q(pDst, vmulq(vecSrc, vecSign));
+        /*
+         * Advance vector source and destination pointers
+         * Decrement the blkCnt loop counter
+         */
+        pSrc += 4;
+        pDst += 4;
+        blkCnt--;
+    }
+
+    /* Tail: at most one complex sample remains */
+    blkCnt = (blockSize & 0x3) >> 1;
+
+    while (blkCnt > 0U)
+    {
+        /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
+
+        /* Calculate Complex Conjugate and store result in destination buffer. */
+        *pDst++ = *pSrc++;
+        *pDst++ = -*pSrc++;
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+}
+
+#else
+void arm_cmplx_conj_f32(
const float32_t * pSrc,
float32_t * pDst,
uint32_t numSamples)
{
uint32_t blkCnt; /* Loop counter */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4_t zero;
float32x4x2_t vec;
@@ -105,7 +156,7 @@
blkCnt = numSamples & 0x3;
#else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
@@ -155,6 +206,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_conj group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
index 073a337..65eb910 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q15.c
@@ -49,6 +49,58 @@
The Q15 value -1 (0x8000) is saturated to the maximum allowable positive value 0x7FFF.
*/
+
+#if defined(ARM_MATH_MVEI)
+void arm_cmplx_conj_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples)
+{
+    uint32_t blkCnt;          /* loop counter */
+    q31_t in1;                /* widened imaginary part used by the scalar tail */
+
+    q15x8x2_t vecSrc;         /* val[0] = real parts, val[1] = imaginary parts */
+
+    /*
+     * The imaginary parts are negated with a saturating negate: a wrapping
+     * multiply by -1 would leave 0x8000 unchanged instead of producing 0x7FFF.
+     */
+
+    /* Compute 8 complex samples at a time */
+    blkCnt = numSamples >> 3U;
+    while (blkCnt > 0U)
+    {
+        /* De-interleaving load: 8 real parts and 8 imaginary parts */
+        vecSrc = vld2q(pSrc);
+        vecSrc.val[1] = vqnegq(vecSrc.val[1]);
+        /* Re-interleaving store */
+        vst2q(pDst, vecSrc);
+        /*
+         * Advance vector source and destination pointers
+         * Decrement the blkCnt loop counter
+         */
+        pSrc += 16;
+        pDst += 16;
+        blkCnt--;
+    }
+
+    /* Tail: remaining complex samples, one at a time */
+    blkCnt = numSamples & 0x7;
+
+    while (blkCnt > 0U)
+    {
+        /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
+
+        /* Calculate Complex Conjugate and store result in destination buffer. */
+        *pDst++ = *pSrc++;
+        in1 = *pSrc++;
+        *pDst++ = __SSAT(-in1, 16);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+}
+#else
void arm_cmplx_conj_q15(
const q15_t * pSrc,
q15_t * pDst,
@@ -151,6 +203,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of cmplx_conj group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
index 6ef1ddb..b8f5dc2 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_conj_q31.c
@@ -49,6 +49,62 @@
The Q31 value -1 (0x80000000) is saturated to the maximum allowable positive value 0x7FFFFFFF.
*/
+#if defined(ARM_MATH_MVEI)
+
+void arm_cmplx_conj_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples)
+{
+
+    uint32_t blkCnt;          /* loop counter */
+    q31x4x2_t vecSrc;         /* val[0] = real parts, val[1] = imaginary parts */
+    q31_t in;                 /* Temporary input variable */
+
+    /*
+     * The imaginary parts are negated with a saturating negate: a wrapping
+     * multiply by -1 would leave 0x80000000 unchanged instead of producing
+     * 0x7FFFFFFF.
+     */
+
+    /* Compute 4 complex samples at a time */
+    blkCnt = numSamples >> 2U;
+
+    while (blkCnt > 0U)
+    {
+
+        /* De-interleaving load: 4 real parts and 4 imaginary parts */
+        vecSrc = vld2q(pSrc);
+        vecSrc.val[1] = vqnegq(vecSrc.val[1]);
+        /* Re-interleaving store */
+        vst2q(pDst, vecSrc);
+        /*
+         * Advance vector source and destination pointers
+         * Decrement the blkCnt loop counter
+         */
+        pSrc += 8;
+        pDst += 8;
+        blkCnt--;
+    }
+
+    /* Tail: remaining complex samples, one at a time */
+    blkCnt = numSamples & 0x3;
+
+    while (blkCnt > 0U)
+    {
+        /* C[0] + jC[1] = A[0]+ j(-1)A[1] */
+
+        /* Calculate Complex Conjugate and store result in destination buffer. */
+        *pDst++ = *pSrc++;
+        in = *pSrc++;
+        *pDst++ = __QSUB(0, in);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+}
+#else
+
void arm_cmplx_conj_q31(
const q31_t * pSrc,
q31_t * pDst,
@@ -131,6 +187,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of cmplx_conj group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
index 06f1bfa..c22ec2b 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
@@ -74,6 +74,74 @@
@return none
*/
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_cmplx_dot_prod_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        uint32_t numSamples,
+        float32_t * realResult,
+        float32_t * imagResult)
+{
+    int32_t blockSize = numSamples * CMPLX_DIM;  /* float32 values in each input */
+    int32_t blkCnt;                               /* loop counter */
+    float32_t sumRe, sumIm;                       /* scalar accumulators */
+    f32x4_t vecA, vecB;                           /* input vectors */
+    f32x4_t vecAcc = vdupq_n_f32(0.0f);           /* vector accumulator */
+    float32_t aRe, aIm, bRe, bIm;                 /* tail operands */
+
+    /* Vector part: 4 float32 values (2 complex samples) of each input per pass */
+    for (blkCnt = blockSize >> 2U; blkCnt > 0U; blkCnt--)
+    {
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+
+        /* accumulate the product of A and B in two steps (vcmlaq, then vcmlaq_rot90) */
+        vecAcc = vcmlaq(vecAcc, vecA, vecB);
+        vecAcc = vcmlaq_rot90(vecAcc, vecA, vecB);
+        /* move both inputs to the next vector */
+        pSrcA += 4;
+        pSrcB += 4;
+    }
+
+    /*
+     * Reduce the vector accumulator:
+     * lanes 0 and 2 are summed into the real part,
+     * lanes 1 and 3 are summed into the imaginary part
+     */
+    sumRe = vgetq_lane(vecAcc, 0) + vgetq_lane(vecAcc, 2);
+    sumIm = vgetq_lane(vecAcc, 1) + vgetq_lane(vecAcc, 3);
+
+    /*
+     * Scalar tail: the complex sample left over by the vector part, if any
+     */
+    for (blkCnt = (blockSize & 3) >> 1; blkCnt > 0U; blkCnt--)
+    {
+        /* A = aRe + j * aIm */
+        aRe = *pSrcA++;
+        aIm = *pSrcA++;
+
+        /* B = bRe + j * bIm */
+        bRe = *pSrcB++;
+        bIm = *pSrcB++;
+
+        /* real part of A * B */
+        sumRe += aRe * bRe;
+        sumRe -= aIm * bIm;
+
+        /* imaginary part of A * B */
+        sumIm += aRe * bIm;
+        sumIm += aIm * bRe;
+    }
+
+    /*
+     * Write both sums to the caller's result locations
+     */
+    *realResult = sumRe;
+    *imagResult = sumIm;
+}
+
+#else
void arm_cmplx_dot_prod_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
@@ -85,7 +153,7 @@
float32_t real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result variables */
float32_t a0,b0,c0,d0;
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4x2_t vec1,vec2,vec3,vec4;
float32x4_t accR,accI;
float32x2_t accum = vdup_n_f32(0);
@@ -145,7 +213,7 @@
blkCnt = numSamples & 0x7;
#else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
@@ -227,6 +295,7 @@
*realResult = real_sum;
*imagResult = imag_sum;
}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_dot_prod group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
index 2ecd801..5fe5f0a 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q15.c
@@ -54,6 +54,86 @@
The return results <code>realResult</code> and <code>imagResult</code> are in 8.24 format.
*/
+#if defined(ARM_MATH_MVEI)
+void arm_cmplx_dot_prod_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        uint32_t numSamples,
+        q31_t * realResult,
+        q31_t * imagResult)
+{
+
+    uint32_t blockSize = numSamples * CMPLX_DIM;  /* number of q15 values in each input */
+    uint32_t blkCnt;                               /* loop counter */
+    q15_t a0,b0,c0,d0;
+
+    q63_t accReal = 0LL;                           /* real accumulator */
+    q63_t accImag = 0LL;                           /* imaginary accumulator */
+    q15x8_t vecSrcA, vecSrcB;
+
+    /*
+     * Each iteration consumes exactly one vector (8 q15 values, i.e. 4 complex
+     * samples) from each input, which matches the blockSize >> 3 trip count.
+     *
+     * The vectors are loaded inside the loop. Loading ahead of the loop and at
+     * the end of every iteration would read 8 values past the data that is
+     * actually processed, which is out of bounds for short buffers, and
+     * consuming two vectors per iteration would process twice the requested
+     * number of samples.
+     */
+
+    /* Compute 4 complex samples at a time */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        vecSrcA = vld1q(pSrcA);
+        vecSrcB = vld1q(pSrcB);
+
+        /* real part: vector form of the tail's a0 * c0 - b0 * d0 */
+        accReal = vmlsldavaq(accReal, vecSrcA, vecSrcB);
+
+        /* imaginary part: vector form of the tail's a0 * d0 + b0 * c0 */
+        accImag = vmlaldavaxq(accImag, vecSrcA, vecSrcB);
+
+        /*
+         * Advance vector source pointers
+         */
+        pSrcA += 8;
+        pSrcB += 8;
+
+        /*
+         * Decrement the blkCnt loop counter
+         */
+        blkCnt--;
+    }
+
+    /* Tail: remaining complex samples, one at a time */
+
+    blkCnt = (blockSize & 7) >> 1;
+
+    while (blkCnt > 0U)
+    {
+        a0 = *pSrcA++;
+        b0 = *pSrcA++;
+        c0 = *pSrcB++;
+        d0 = *pSrcB++;
+
+        accReal += (q31_t)a0 * c0;
+        accImag += (q31_t)a0 * d0;
+        accReal -= (q31_t)b0 * d0;
+        accImag += (q31_t)b0 * c0;
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /* Store real and imaginary result in 8.24 format */
+    /* Convert real data in 34.30 to 8.24 by 6 right shifts */
+    *realResult = (q31_t) (accReal >> 6);
+    /* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
+    *imagResult = (q31_t) (accImag >> 6);
+}
+#else
void arm_cmplx_dot_prod_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
@@ -66,7 +146,6 @@
q15_t a0,b0,c0,d0;
#if defined (ARM_MATH_LOOPUNROLL)
-
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
@@ -148,6 +227,7 @@
/* Convert imaginary data in 34.30 to 8.24 by 6 right shifts */
*imagResult = (q31_t) (imag_sum >> 6);
}
+#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of cmplx_dot_prod group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
index d715d98..821c6f3 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_dot_prod_q31.c
@@ -55,6 +55,72 @@
Input down scaling is not required.
*/
+#if defined(ARM_MATH_MVEI)
+
+void arm_cmplx_dot_prod_q31(
+ const q31_t * pSrcA,
+ const q31_t * pSrcB,
+ uint32_t numSamples,
+ q63_t * realResult,
+ q63_t * imagResult)
+{
+ int32_t blockSize = numSamples * CMPLX_DIM; /* number of q31 values in each input */
+ int32_t blkCnt; /* loop counter */
+ q31x4_t vecSrcA, vecSrcB;
+ q63_t accReal = 0LL; /* accumulates the real part of the result */
+ q63_t accImag = 0LL; /* accumulates the imaginary part of the result */
+
+ q31_t a0,b0,c0,d0; /* tail operands: a0/b0 are read from pSrcA, c0/d0 from pSrcB */
+
+ /* Compute 2 complex samples (4 q31 values) at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* load one vector from each input */
+ vecSrcA = vld1q(pSrcA);
+ vecSrcB = vld1q(pSrcB);
+
+ accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
+ accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
+
+ /*
+ * Advance vector source pointers
+ * Decrement the blkCnt loop counter
+ */
+ pSrcA += 4;
+ pSrcB += 4;
+ blkCnt --;
+ }
+ /* NOTE(review): the shift by (14 - 8) presumably aligns the vector sums with the tail's >> 14 scaling - confirm */
+ accReal = asrl(accReal, (14 - 8));
+ accImag = asrl(accImag, (14 - 8));
+
+ /* Tail: at most one complex sample remains */
+ blkCnt = (blockSize & 3) >> 1;
+
+ while (blkCnt > 0U)
+ {
+ a0 = *pSrcA++;
+ b0 = *pSrcA++;
+ c0 = *pSrcB++;
+ d0 = *pSrcB++;
+
+ accReal += ((q63_t)a0 * c0) >> 14;
+ accImag += ((q63_t)a0 * d0) >> 14;
+ accReal -= ((q63_t)b0 * d0) >> 14;
+ accImag += ((q63_t)b0 * c0) >> 14;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store real and imaginary result in destination buffer. */
+ *realResult = accReal;
+ *imagResult = accImag;
+}
+
+#else
void arm_cmplx_dot_prod_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
@@ -147,6 +213,7 @@
*realResult = real_sum;
*imagResult = imag_sum;
}
+#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of cmplx_dot_prod group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
index 84812dc..5dce795 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_f32.c
@@ -69,6 +69,86 @@
@return none
*/
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+void arm_cmplx_mag_f32(
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t numSamples)
+{
+ int32_t blockSize = numSamples; /* number of complex samples */
+ uint32_t blkCnt; /* loop counter */
+ f32x4x2_t vecSrc;
+ f32x4_t sum;
+ float32_t real, imag; /* Temporary variables to hold input values */
+
+ /* Compute 4 complex samples at a time */
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+ q31x4_t newtonStartVec;
+ f32x4_t sumHalf, invSqrt;
+
+ vecSrc = vld2q(pSrc);
+ pSrc += 8;
+ sum = vmulq(vecSrc.val[0], vecSrc.val[0]);
+ sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]);
+
+ /*
+ * Inlined fast SQRT using the inverse SQRT Newton-Raphson method
+ */
+
+ /* Compute the initial value from the integer view of sum */
+ newtonStartVec = vdupq_n_s32(INVSQRT_MAGIC_F32) - vshrq((q31x4_t) sum, 1);
+ sumHalf = sum * 0.5f;
+ /*
+ * Compute 3 iterations
+ *
+ * The more iterations, the more accuracy.
+ * If you need to trade a bit of accuracy for more performance,
+ * you can comment out the 3rd use of the macro.
+ */
+ INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, (f32x4_t) newtonStartVec);
+ INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
+ INVSQRT_NEWTON_MVE_F32(invSqrt, sumHalf, invSqrt);
+ /*
+ * Set negative values to 0
+ */
+ invSqrt = vdupq_m(invSqrt, 0.0f, vcmpltq(invSqrt, 0.0f));
+ /*
+ * sqrt(x) = x * invSqrt(x)
+ */
+ sum = vmulq(sum, invSqrt);
+ vst1q(pDst, sum);
+ pDst += 4;
+ /*
+ * Decrement the blkCnt loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * Tail: remaining complex samples, one at a time, using arm_sqrt_f32
+ */
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+
+ real = *pSrc++;
+ imag = *pSrc++;
+
+ /* store result in destination buffer. */
+ arm_sqrt_f32((real * real) + (imag * imag), pDst++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
void arm_cmplx_mag_f32(
const float32_t * pSrc,
float32_t * pDst,
@@ -77,7 +157,7 @@
uint32_t blkCnt; /* loop counter */
float32_t real, imag; /* Temporary variables to hold input values */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4x2_t vecA;
float32x4_t vRealA;
@@ -125,7 +205,7 @@
#else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
@@ -182,6 +262,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_mag group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
index a493274..498c0e6 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q15.c
@@ -47,7 +47,65 @@
@par Scaling and Overflow Behavior
The function implements 1.15 by 1.15 multiplications and finally output is converted into 2.14 format.
*/
+#if defined(ARM_MATH_MVEI)
+#include "arm_helium_utils.h"
+
+void arm_cmplx_mag_q15(
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t numSamples)
+{
+
+ int32_t blockSize = numSamples; /* number of complex samples */
+ uint32_t blkCnt; /* loop counter */
+ q15x8x2_t vecSrc;
+ q15x8_t sum;
+ q31_t in;
+ q31_t acc0;
+ /* Compute 8 complex samples at a time */
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ vecSrc = vld2q(pSrc);
+ pSrc += 16;
+ sum = vqaddq(vmulhq(vecSrc.val[0], vecSrc.val[0]),
+ vmulhq(vecSrc.val[1], vecSrc.val[1]));
+ /* halve the saturated sum of the two squares */
+ sum = vshrq(sum, 1);
+ /* vector square root; FAST_VSQRT_Q15 is presumably provided by arm_helium_utils.h - confirm */
+ sum = FAST_VSQRT_Q15(sum);
+
+ vst1q(pDst, sum);
+ pDst += 8;
+ /*
+ * Decrement the blkCnt loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * Tail: remaining complex samples, one at a time, using arm_sqrt_q15
+ */
+ blkCnt = blockSize & 7;
+
+ while (blkCnt > 0U)
+ {
+ /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+
+ in = read_q15x2_ia ((q15_t **) &pSrc);
+ acc0 = __SMUAD(in, in);
+
+ /* store result in 2.14 format in destination buffer. */
+ arm_sqrt_q15((q15_t) (acc0 >> 17), pDst++);
+
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
void arm_cmplx_mag_q15(
const q15_t * pSrc,
q15_t * pDst,
@@ -156,6 +214,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of cmplx_mag group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
index 873e566..d0950ee 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_q31.c
@@ -49,6 +49,76 @@
Input down scaling is not required.
*/
+#if defined(ARM_MATH_MVEI)
+
+#include "arm_helium_utils.h"
+
+void arm_cmplx_mag_q31(
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t numSamples)
+{
+ int32_t blockSize = numSamples; /* number of complex samples */
+ uint32_t blkCnt; /* loop counter */
+
+ q31x4x2_t vecSrc;
+ q31x4_t sum;
+
+ q31_t real, imag; /* Temporary input variables */
+ q31_t acc0, acc1; /* Accumulators */
+
+ /* Compute 4 complex samples at a time */
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+ vecSrc = vld2q(pSrc);
+
+ sum = vqaddq(vmulhq(vecSrc.val[0], vecSrc.val[0]),
+ vmulhq(vecSrc.val[1], vecSrc.val[1]));
+ /* halve the saturated sum of the two squares */
+ sum = vshrq(sum, 1);
+
+ /*
+
+ This function is using a table. There are compilation flags to avoid
+ including this table (and in this case, arm_cmplx_mag_q31 must not
+ be built and linked.)
+
+ */
+ sum = FAST_VSQRT_Q31(sum);
+
+ vst1q(pDst, sum);
+
+ /*
+ * Decrement the blkCnt loop counter and advance the pointers
+ */
+ blkCnt--;
+ pSrc += 8;
+ pDst += 4;
+ }
+
+ /*
+ * Tail: remaining complex samples, one at a time, using arm_sqrt_q31
+ */
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C[0] = sqrt(A[0] * A[0] + A[1] * A[1]) */
+
+ real = *pSrc++;
+ imag = *pSrc++;
+ acc0 = (q31_t) (((q63_t) real * real) >> 33);
+ acc1 = (q31_t) (((q63_t) imag * imag) >> 33);
+
+ /* store result in 2.30 format in destination buffer. */
+ arm_sqrt_q31(acc0 + acc1, pDst++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
void arm_cmplx_mag_q31(
const q31_t * pSrc,
q31_t * pDst,
@@ -124,6 +194,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of cmplx_mag group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
index 99f051c..1cea04e 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_f32.c
@@ -69,6 +69,56 @@
@return none
*/
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_cmplx_mag_squared_f32(
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t numSamples)
+{
+    int32_t blockSize = numSamples;     /* number of complex samples */
+    uint32_t blkCnt;                    /* loop counter */
+    f32x4x2_t vecIn;                    /* val[0] = real parts, val[1] = imaginary parts */
+    f32x4_t vecMagSq;                   /* squared magnitudes */
+    float32_t re;                       /* tail operand: real part */
+    float32_t im;                       /* tail operand: imaginary part */
+
+    /* Vector part: 4 complex samples per pass */
+    for (blkCnt = blockSize >> 2; blkCnt > 0U; blkCnt--)
+    {
+        /* de-interleaving load */
+        vecIn = vld2q(pSrc);
+        pSrc += 8;
+
+        /* re * re, then multiply-accumulate of im * im */
+        vecMagSq = vmulq(vecIn.val[0], vecIn.val[0]);
+        vecMagSq = vfmaq(vecMagSq, vecIn.val[1], vecIn.val[1]);
+
+        vst1q(pDst, vecMagSq);
+        pDst += 4;
+    }
+
+    /*
+     * Scalar tail: up to 3 remaining complex samples
+     */
+    for (blkCnt = blockSize & 3; blkCnt > 0U; blkCnt--)
+    {
+        /*
+         * C[0] = (A[0] * A[0] + A[1] * A[1])
+         */
+
+        re = *pSrc++;
+        im = *pSrc++;
+
+        /*
+         * write the squared magnitude to the destination buffer
+         */
+        *pDst++ = (re * re) + (im * im);
+    }
+
+}
+
+#else
void arm_cmplx_mag_squared_f32(
const float32_t * pSrc,
float32_t * pDst,
@@ -77,7 +127,7 @@
uint32_t blkCnt; /* Loop counter */
float32_t real, imag; /* Temporary input variables */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4x2_t vecA;
float32x4_t vRealA;
float32x4_t vImagA;
@@ -123,7 +173,7 @@
blkCnt = numSamples & 7;
#else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
@@ -178,6 +228,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of cmplx_mag_squared group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
index fa5f4e6..a6107be 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q15.c
@@ -48,6 +48,64 @@
The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
*/
+#if defined(ARM_MATH_MVEI)
+
+void arm_cmplx_mag_squared_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t numSamples)
+{
+    int32_t blockSize = numSamples;     /* number of complex samples */
+    uint32_t blkCnt;                    /* loop counter */
+    q31_t packed;                       /* value returned by read_q15x2_ia */
+    q31_t sumSq;                        /* result of __SMUAD on the packed value */
+    q15x8x2_t vecIn;                    /* val[0] = real parts, val[1] = imaginary parts */
+    q15x8_t vecReSq, vecImSq;           /* per-lane vmulhq results */
+    q15x8_t vecOut;                     /* per-lane results */
+
+    /*
+     * Vector part: 8 complex samples per pass
+     */
+    for (blkCnt = blockSize >> 3; blkCnt > 0U; blkCnt--)
+    {
+        /* de-interleaving load */
+        vecIn = vld2q(pSrc);
+        pSrc += 16;
+
+        /* square the real parts and the imaginary parts */
+        vecReSq = vmulhq(vecIn.val[0], vecIn.val[0]);
+        vecImSq = vmulhq(vecIn.val[1], vecIn.val[1]);
+
+        /* saturating sum of the squares, then shift right by one */
+        vecOut = vqaddq(vecReSq, vecImSq);
+        vecOut = vshrq(vecOut, 1);
+
+        vst1q(pDst, vecOut);
+        pDst += 8;
+    }
+
+    /*
+     * Scalar tail: up to 7 remaining complex samples
+     */
+    for (blkCnt = blockSize & 7; blkCnt > 0U; blkCnt--)
+    {
+        /*
+         * C[0] = (A[0] * A[0] + A[1] * A[1])
+         */
+
+        /* read one complex sample through the pSrc cursor */
+        packed = read_q15x2_ia ((q15_t **) &pSrc);
+        sumSq = __SMUAD(packed, packed);
+
+        /*
+         * store result in 3.13 format in destination buffer.
+         */
+        *pDst++ = (q15_t) (sumSq >> 17);
+    }
+
+}
+
+#else
void arm_cmplx_mag_squared_q15(
const q15_t * pSrc,
q15_t * pDst,
@@ -156,6 +214,8 @@
}
+#endif /* defined(ARM_MATH_MVEI) */
+
/**
@} end of cmplx_mag_squared group
*/
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
index 54863ef..7127e6e 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mag_squared_q31.c
@@ -49,6 +49,62 @@
Input down scaling is not required.
*/
+#if defined(ARM_MATH_MVEI)
+
+void arm_cmplx_mag_squared_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t numSamples)
+{
+    int32_t blockSize = numSamples;     /* number of complex samples */
+    uint32_t blkCnt;                    /* loop counter */
+    q31x4x2_t vecIn;                    /* val[0] = real parts, val[1] = imaginary parts */
+    q31x4_t vecReSq, vecImSq;           /* per-lane vmulhq results */
+    q31x4_t vecOut;                     /* per-lane results */
+    q31_t re, im;                       /* tail operands */
+    q31_t reSq, imSq;                   /* scaled squares of the tail operands */
+
+    /*
+     * Vector part: 4 complex samples per pass
+     */
+    for (blkCnt = blockSize >> 2; blkCnt > 0U; blkCnt--)
+    {
+        /* de-interleaving load */
+        vecIn = vld2q(pSrc);
+        pSrc += 8;
+
+        /* square the real parts and the imaginary parts */
+        vecReSq = vmulhq(vecIn.val[0], vecIn.val[0]);
+        vecImSq = vmulhq(vecIn.val[1], vecIn.val[1]);
+
+        /* saturating sum of the squares, then shift right by one */
+        vecOut = vqaddq(vecReSq, vecImSq);
+        vecOut = vshrq(vecOut, 1);
+
+        vst1q(pDst, vecOut);
+        pDst += 4;
+    }
+
+    /*
+     * Scalar tail: up to 3 remaining complex samples
+     */
+    for (blkCnt = blockSize & 3; blkCnt > 0U; blkCnt--)
+    {
+        /* C[0] = (A[0] * A[0] + A[1] * A[1]) */
+        re = *pSrc++;
+        reSq = (q31_t) (((q63_t) re * re) >> 33);
+
+        im = *pSrc++;
+        imSq = (q31_t) (((q63_t) im * im) >> 33);
+
+        /*
+         * store result in 3.29 format in destination buffer.
+         */
+        *pDst++ = reSq + imSq;
+    }
+}
+
+#else
void arm_cmplx_mag_squared_q31(
const q31_t * pSrc,
q31_t * pDst,
@@ -124,6 +180,8 @@
}
+#endif /* defined(ARM_MATH_MVEI) */
+
/**
@} end of cmplx_mag_squared group
*/
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
index 8d14821..1de597d 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c
@@ -68,6 +68,67 @@
@return none
*/
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_cmplx_mult_cmplx_f32(
+  const float32_t * pSrcA,
+  const float32_t * pSrcB,
+        float32_t * pDst,
+        uint32_t numSamples)
+{
+    /*
+     * Helium (MVE) float kernel for element-wise complex multiplication:
+     *   C[2 * i]     = A[2 * i] * B[2 * i]     - A[2 * i + 1] * B[2 * i + 1]
+     *   C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]
+     * Four complex products are formed per vector pass; the remaining 0..3
+     * products are computed one at a time.
+     */
+    int32_t   total = numSamples;   /* number of complex samples */
+    int32_t   remaining;            /* iterations left in the current loop */
+    float32_t aRe, aIm, bRe, bIm;   /* tail: real and imaginary parts of A and B */
+
+    f32x4x2_t inA;                  /* val[0] real parts, val[1] imaginary parts */
+    f32x4x2_t inB;
+    f32x4x2_t out;
+
+    for (remaining = total >> 2; remaining > 0U; remaining--)
+    {
+        /* de-interleaving loads: 8 floats from each source */
+        inA = vld2q(pSrcA);
+        inB = vld2q(pSrcB);
+
+        /* real part: aRe * bRe, then subtract aIm * bIm */
+        out.val[0] = vmulq(inA.val[0], inB.val[0]);
+        out.val[0] = vfmsq(out.val[0], inA.val[1], inB.val[1]);
+
+        /* imaginary part: aRe * bIm, then add aIm * bRe */
+        out.val[1] = vmulq(inA.val[0], inB.val[1]);
+        out.val[1] = vfmaq(out.val[1], inA.val[1], inB.val[0]);
+
+        /* re-interleave and store 8 floats */
+        vst2q(pDst, out);
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst += 8;
+    }
+
+    /* Tail */
+    for (remaining = total & 3; remaining > 0U; remaining--)
+    {
+        aRe = *pSrcA++;
+        aIm = *pSrcA++;
+        bRe = *pSrcB++;
+        bIm = *pSrcB++;
+
+        /* store result in destination buffer. */
+        *pDst++ = (aRe * bRe) - (aIm * bIm);
+        *pDst++ = (aRe * bIm) + (aIm * bRe);
+    }
+
+}
+
+#else
void arm_cmplx_mult_cmplx_f32(
const float32_t * pSrcA,
const float32_t * pSrcB,
@@ -77,7 +138,7 @@
uint32_t blkCnt; /* Loop counter */
float32_t a, b, c, d; /* Temporary variables to store real and imaginary values */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4x2_t va, vb;
float32x4_t real, imag;
float32x4x2_t outCplx;
@@ -115,7 +176,7 @@
blkCnt = numSamples & 3;
#else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
@@ -188,6 +249,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of CmplxByCmplxMult group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
index 6659427..6650179 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q15.c
@@ -49,6 +49,65 @@
The function implements 1.15 by 1.15 multiplications and finally output is converted into 3.13 format.
*/
+#if defined(ARM_MATH_MVEI)
+
+void arm_cmplx_mult_cmplx_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t numSamples)
+{
+    /*
+     * Helium (MVE) integer kernel for element-wise complex multiplication.
+     * Results are written in 3.13 format. The vector loop consumes eight q15
+     * elements per pass; a scalar loop handles what is left, one complex
+     * sample at a time.
+     */
+    int32_t  blkCnt;                              /* loop counter */
+    int32_t  blockSize = numSamples * CMPLX_DIM;  /* numSamples scaled by CMPLX_DIM */
+    q15_t    a, b, c, d;                          /* tail operands */
+    q15x8_t  vecA;
+    q15x8_t  vecB;
+    q15x8_t  vecDst;
+
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q(pSrcA);
+        vecB = vld1q(pSrcB);
+        /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
+        /* The first argument is only a placeholder vector; it must be the s16 flavour. */
+        vecDst = vqdmlsdhq_s16(vuninitializedq_s16(), vecA, vecB);
+        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
+        vecDst = vqdmladhxq_s16(vecDst, vecA, vecB);
+
+        vecDst = vshrq(vecDst, 2);
+        vst1q(pDst, vecDst);
+
+        blkCnt--;
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst += 8;
+    }
+
+    /* Tail */
+    blkCnt = (blockSize & 7) >> 1;
+    while (blkCnt > 0U)
+    {
+        /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
+        a = *pSrcA++;
+        b = *pSrcA++;
+        c = *pSrcB++;
+        d = *pSrcB++;
+
+        /* store result in 3.13 format in destination buffer. */
+        *pDst++ = (q15_t) ( (((q31_t) a * c) >> 17) - (((q31_t) b * d) >> 17) );
+        *pDst++ = (q15_t) ( (((q31_t) a * d) >> 17) + (((q31_t) b * c) >> 17) );
+        blkCnt--;
+    }
+}
+#else
void arm_cmplx_mult_cmplx_q15(
const q15_t * pSrcA,
const q15_t * pSrcB,
@@ -130,6 +189,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of CmplxByCmplxMult group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
index f6d6dc6..dbf7e2b 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_q31.c
@@ -50,6 +50,62 @@
Input down scaling is not required.
*/
+#if defined(ARM_MATH_MVEI)
+void arm_cmplx_mult_cmplx_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t numSamples)
+{
+
+    /*
+     * Helium (MVE) integer kernel for element-wise complex multiplication.
+     * The vector loop consumes four q31 elements (two complex samples) per
+     * pass; a scalar loop handles what is left and, per its own comment,
+     * stores results in 3.29 format.
+     */
+    int32_t  total = numSamples * CMPLX_DIM;   /* numSamples scaled by CMPLX_DIM */
+    int32_t  remaining;                        /* iterations left in the current loop */
+    q31x4_t  inA;
+    q31x4_t  inB;
+    q31x4_t  prod;
+    q31_t    aRe, aIm, bRe, bIm;               /* tail operands */
+
+    for (remaining = total >> 2; remaining > 0U; remaining--)
+    {
+        inA = vld1q(pSrcA);
+        inB = vld1q(pSrcB);
+        pSrcA += 4;
+        pSrcB += 4;
+
+        /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */
+        prod = vqdmlsdhq(vuninitializedq_s32(), inA, inB);
+        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */
+        prod = vqdmladhxq(prod, inA, inB);
+
+        /* two extra bits of down-scaling before the store */
+        prod = vshrq(prod, 2);
+
+        vst1q(pDst, prod);
+        pDst += 4;
+    }
+
+    /* Tail */
+    for (remaining = (total & 3) >> 1; remaining > 0U; remaining--)
+    {
+        /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
+        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
+        aRe = *pSrcA++;
+        aIm = *pSrcA++;
+        bRe = *pSrcB++;
+        bIm = *pSrcB++;
+
+        /* store result in 3.29 format in destination buffer. */
+        *pDst++ = (q31_t) ( (((q63_t) aRe * bRe) >> 33) - (((q63_t) aIm * bIm) >> 33) );
+        *pDst++ = (q31_t) ( (((q63_t) aRe * bIm) >> 33) + (((q63_t) aIm * bRe) >> 33) );
+    }
+}
+#else
void arm_cmplx_mult_cmplx_q31(
const q31_t * pSrcA,
const q31_t * pSrcB,
@@ -131,6 +187,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of CmplxByCmplxMult group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
index 9651999..6e556b5 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_f32.c
@@ -69,6 +69,60 @@
@return none
*/
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_cmplx_mult_real_f32(
+  const float32_t * pSrcCmplx,
+  const float32_t * pSrcReal,
+        float32_t * pCmplxDst,
+        uint32_t numSamples)
+{
+    /* gather offsets that repeat each real value for the real and imaginary lanes */
+    static const uint32_t stride_cmplx_x_real_32[4] = { 0, 0, 1, 1 };
+
+    int32_t blockSizeC = numSamples * CMPLX_DIM;   /* numSamples scaled by CMPLX_DIM */
+    int32_t blkCnt;                                /* loop counter */
+    f32x4_t rVec;                                  /* duplicated real operands */
+    f32x4_t cmplxVec;                              /* complex operands as loaded from pSrcCmplx */
+    f32x4_t dstVec;                                /* products to store */
+    uint32x4_t strideVec;                          /* gather offsets held in a vector */
+    float32_t in;                                  /* tail: current real operand */
+
+    /* stride vector for pairs of real generation */
+    strideVec = vld1q(stride_cmplx_x_real_32);
+
+    /* Compute 2 complex outputs (4 floats) at a time */
+    blkCnt = blockSizeC >> 2;
+    while (blkCnt > 0U)
+    {
+        cmplxVec = vld1q(pSrcCmplx);
+        rVec = vldrwq_gather_shifted_offset_f32(pSrcReal, strideVec);
+        dstVec = vmulq(cmplxVec, rVec);
+        vst1q(pCmplxDst, dstVec);
+
+        pSrcReal += 2;
+        pSrcCmplx += 4;
+        pCmplxDst += 4;
+        blkCnt--;
+    }
+
+    /* Tail: one complex sample per iteration */
+    blkCnt = (blockSizeC & 3) >> 1;
+    while (blkCnt > 0U)
+    {
+        /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+        /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+
+        in = *pSrcReal++;
+        /* store result in destination buffer. */
+        *pCmplxDst++ = *pSrcCmplx++ * in;
+        *pCmplxDst++ = *pSrcCmplx++ * in;
+
+        blkCnt--;
+    }
+}
+
+#else
void arm_cmplx_mult_real_f32(
const float32_t * pSrcCmplx,
const float32_t * pSrcReal,
@@ -78,7 +132,7 @@
uint32_t blkCnt; /* Loop counter */
float32_t in; /* Temporary variable */
-#if defined(ARM_MATH_NEON)
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
float32x4_t r;
float32x4x2_t ab,outCplx;
@@ -106,7 +160,7 @@
/* Tail */
blkCnt = numSamples & 3;
#else
-#if defined (ARM_MATH_LOOPUNROLL)
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
/* Loop unrolling: Compute 4 outputs at a time */
blkCnt = numSamples >> 2U;
@@ -163,6 +217,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/**
@} end of CmplxByRealMult group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
index 4877d20..854d9a6 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q15.c
@@ -49,6 +49,7 @@
The function uses saturating arithmetic.
Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
*/
+#if defined(ARM_MATH_MVEI)
void arm_cmplx_mult_real_q15(
const q15_t * pSrcCmplx,
@@ -56,6 +57,60 @@
q15_t * pCmplxDst,
uint32_t numSamples)
{
+    /* gather offsets that repeat each real value for the real and imaginary lanes */
+    static const uint16_t stride_cmplx_x_real_16[8] = {
+        0, 0, 1, 1, 2, 2, 3, 3
+    };
+    q15x8_t rVec;                                  /* duplicated real operands */
+    q15x8_t cmplxVec;                              /* complex operands as loaded from pSrcCmplx */
+    q15x8_t dstVec;                                /* products to store */
+    uint16x8_t strideVec;                          /* gather offsets held in a vector */
+    int32_t blockSizeC = numSamples * CMPLX_DIM;   /* numSamples scaled by CMPLX_DIM */
+    int32_t blkCnt;                                /* loop counter */
+    q15_t in;                                      /* tail: current real operand */
+
+    /* stride vector for pairs of real generation */
+    strideVec = vld1q(stride_cmplx_x_real_16);
+
+    /* Compute 4 complex outputs (8 q15 values) at a time */
+    blkCnt = blockSizeC >> 3;
+
+    while (blkCnt > 0U)
+    {
+        cmplxVec = vld1q(pSrcCmplx);
+        rVec = vldrhq_gather_shifted_offset_s16(pSrcReal, strideVec);
+        dstVec = vqdmulhq(cmplxVec, rVec);
+        vst1q(pCmplxDst, dstVec);
+
+        pSrcReal += 4;
+        pSrcCmplx += 8;
+        pCmplxDst += 8;
+        blkCnt--;
+    }
+
+    /* Tail */
+    blkCnt = (blockSizeC & 7) >> 1;
+    while (blkCnt > 0U)
+    {
+        /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+        /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+
+        in = *pSrcReal++;
+        /* store the result in the destination buffer. */
+        *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+        *pCmplxDst++ = (q15_t) __SSAT((((q31_t) *pSrcCmplx++ * in) >> 15), 16);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+}
+#else
+void arm_cmplx_mult_real_q15(
+ const q15_t * pSrcCmplx,
+ const q15_t * pSrcReal,
+ q15_t * pCmplxDst,
+ uint32_t numSamples)
+{
uint32_t blkCnt; /* Loop counter */
q15_t in; /* Temporary variable */
@@ -176,6 +231,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of CmplxByRealMult group
diff --git a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
index 906410f..7125cc8 100644
--- a/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
+++ b/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_real_q31.c
@@ -50,6 +50,61 @@
Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
*/
+#if defined(ARM_MATH_MVEI)
+void arm_cmplx_mult_real_q31(
+  const q31_t * pSrcCmplx,
+  const q31_t * pSrcReal,
+        q31_t * pCmplxDst,
+        uint32_t numSamples)
+{
+    /* gather offsets that repeat each real value for the real and imaginary lanes */
+    static const uint32_t stride_cmplx_x_real_32[4] = {
+        0, 0, 1, 1
+    };
+    q31x4_t rVec;                                  /* duplicated real operands */
+    q31x4_t cmplxVec;                              /* complex operands as loaded from pSrcCmplx */
+    q31x4_t dstVec;                                /* products to store */
+    uint32x4_t strideVec;                          /* gather offsets held in a vector */
+    int32_t blockSizeC = numSamples * CMPLX_DIM;   /* numSamples scaled by CMPLX_DIM */
+    int32_t blkCnt;                                /* loop counter */
+    q31_t in;                                      /* tail: current real operand */
+
+    /*
+     * stride vector for pairs of real generation
+     */
+    strideVec = vld1q(stride_cmplx_x_real_32);
+
+    /* Compute 2 complex outputs (4 q31 values) at a time */
+    blkCnt = blockSizeC >> 2;
+    while (blkCnt > 0U)
+    {
+        cmplxVec = vld1q(pSrcCmplx);
+        rVec = vldrwq_gather_shifted_offset_s32(pSrcReal, strideVec);
+        dstVec = vqdmulhq(cmplxVec, rVec);
+        vst1q(pCmplxDst, dstVec);
+
+        pSrcReal += 2;
+        pSrcCmplx += 4;
+        pCmplxDst += 4;
+        blkCnt--;
+    }
+
+    blkCnt = (blockSizeC & 3) >> 1;
+    while (blkCnt > 0U)
+    {
+        /* C[2 * i    ] = A[2 * i    ] * B[i]. */
+        /* C[2 * i + 1] = A[2 * i + 1] * B[i]. */
+
+        in = *pSrcReal++;
+        /* store saturated result in 1.31 format to destination buffer */
+        *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+        *pCmplxDst++ = (__SSAT((q31_t) (((q63_t) *pSrcCmplx++ * in) >> 32), 31) << 1);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+}
+#else
void arm_cmplx_mult_real_q31(
const q31_t * pSrcCmplx,
const q31_t * pSrcReal,
@@ -142,6 +197,7 @@
}
}
+#endif /* defined(ARM_MATH_MVEI) */
/**
@} end of CmplxByRealMult group
diff --git a/CMSIS/DSP/Source/interpol.cmake b/CMSIS/DSP/Source/interpol.cmake
index 80282cf..2538d91 100644
--- a/CMSIS/DSP/Source/interpol.cmake
+++ b/CMSIS/DSP/Source/interpol.cmake
@@ -40,4 +40,12 @@
target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_RECIP_Q15)
endif()
+if (CONFIGTABLE AND ARM_CMPLX_MAG_Q31 AND (MVEI OR HELIUM))
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_FAST_SQRT_Q31_MVE) # one of the macros that enables sqrtTable_Q31 in arm_common_tables.c
+endif()
+# Q15 counterpart of the selection above; the table this macro guards is not visible here.
+if (CONFIGTABLE AND ARM_CMPLX_MAG_Q15 AND (MVEI OR HELIUM))
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_FAST_SQRT_Q15_MVE)
+endif()
+
endfunction()
\ No newline at end of file