Added logical operators + sorting + spline
- Added NEON bitwise AND, NOT, OR, XOR (q7, q15, q31)
- Added Sorting algorithms f32 (NEON bitonic sort)
- Added cubic spline interpolation function
- Added test patterns for all
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q15.c
new file mode 100644
index 0000000..e53706f
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q15.c
@@ -0,0 +1,104 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_and_q15.c
+ * Description: Q15 bitwise AND
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @defgroup And Vector bitwise AND
+
+ Compute the logical bitwise AND.
+
+ There are separate functions for Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup And
+ @{
+ */
+
+/**
+  @brief         Bitwise AND of two Q15 vectors, element by element.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 8 at a
+                   time and the remaining (blockSize & 7) one at a time.
+ */
+
+void arm_and_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 8-sample iterations left */
+
+  /* Vector path: AND 8 samples per iteration */
+  for (vecIters = blockSize >> 3U; vecIters > 0U; vecIters--)
+  {
+    const int16x8_t a = vld1q_s16(pSrcA);
+    const int16x8_t b = vld1q_s16(pSrcB);
+
+    vst1q_s16(pDst, vandq_s16(a, b));
+
+    pSrcA += 8;
+    pSrcB += 8;
+    pDst  += 8;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 7;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = (*pSrcA) & (*pSrcB);
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+}
+
+/**
+ @} end of And group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q31.c
new file mode 100644
index 0000000..0348a59
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q31.c
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_and_q31.c
+ * Description: Q31 bitwise AND
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @addtogroup And
+ @{
+ */
+
+/**
+  @brief         Bitwise AND of two Q31 vectors, element by element.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 4 at a
+                   time and the remaining (blockSize & 3) one at a time.
+ */
+
+void arm_and_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 4-sample iterations left */
+
+  /* Vector path: AND 4 samples per iteration */
+  for (vecIters = blockSize >> 2U; vecIters > 0U; vecIters--)
+  {
+    const int32x4_t a = vld1q_s32(pSrcA);
+    const int32x4_t b = vld1q_s32(pSrcB);
+
+    vst1q_s32(pDst, vandq_s32(a, b));
+
+    pSrcA += 4;
+    pSrcB += 4;
+    pDst  += 4;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 3;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = (*pSrcA) & (*pSrcB);
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+}
+
+/**
+ @} end of And group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q7.c
new file mode 100644
index 0000000..97734a8
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q7.c
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_and_q7.c
+ * Description: Q7 bitwise AND
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @addtogroup And
+ @{
+ */
+
+/**
+  @brief         Bitwise AND of two Q7 vectors, element by element.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 16 at a
+                   time and the remaining (blockSize & 0xF) one at a time.
+ */
+
+void arm_and_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 16-sample iterations left */
+
+  /* Vector path: AND 16 samples per iteration */
+  for (vecIters = blockSize >> 4U; vecIters > 0U; vecIters--)
+  {
+    const int8x16_t a = vld1q_s8(pSrcA);
+    const int8x16_t b = vld1q_s8(pSrcB);
+
+    vst1q_s8(pDst, vandq_s8(a, b));
+
+    pSrcA += 16;
+    pSrcB += 16;
+    pDst  += 16;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 0xF;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = (*pSrcA) & (*pSrcB);
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+}
+
+/**
+ @} end of And group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q15.c
new file mode 100644
index 0000000..7cfe07e
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q15.c
@@ -0,0 +1,100 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_not_q15.c
+ * Description: Q15 bitwise NOT
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @defgroup Not Vector bitwise NOT
+
+ Compute the logical bitwise NOT.
+
+ There are separate functions for Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup Not
+ @{
+ */
+
+/**
+  @brief         Bitwise NOT of a Q15 vector, element by element.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 8 at a
+                   time and the remaining (blockSize & 7) one at a time.
+ */
+
+void arm_not_q15(
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 8-sample iterations left */
+
+  /* Vector path: invert 8 samples per iteration */
+  for (vecIters = blockSize >> 3U; vecIters > 0U; vecIters--)
+  {
+    const int16x8_t in = vld1q_s16(pSrc);
+
+    vst1q_s16(pDst, vmvnq_s16(in));
+
+    pSrc += 8;
+    pDst += 8;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 7;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = ~(*pSrc);
+
+    /* Advance to the next sample */
+    pSrc++;
+    pDst++;
+  }
+}
+
+/**
+ @} end of Not group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q31.c
new file mode 100644
index 0000000..c6b08bd
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q31.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_not_q31.c
+ * Description: Q31 bitwise NOT
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @addtogroup Not
+ @{
+ */
+
+/**
+  @brief         Bitwise NOT of a Q31 vector, element by element.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 4 at a
+                   time and the remaining (blockSize & 3) one at a time.
+ */
+
+void arm_not_q31(
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 4-sample iterations left */
+
+  /* Vector path: invert 4 samples per iteration */
+  for (vecIters = blockSize >> 2U; vecIters > 0U; vecIters--)
+  {
+    const int32x4_t in = vld1q_s32(pSrc);
+
+    vst1q_s32(pDst, vmvnq_s32(in));
+
+    pSrc += 4;
+    pDst += 4;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 3;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = ~(*pSrc);
+
+    /* Advance to the next sample */
+    pSrc++;
+    pDst++;
+  }
+}
+
+/**
+ @} end of Not group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q7.c
new file mode 100644
index 0000000..46baf2c
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q7.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_not_q7.c
+ * Description: Q7 bitwise NOT
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @addtogroup Not
+ @{
+ */
+
+/**
+  @brief         Bitwise NOT of a Q7 vector, element by element.
+  @param[in]     pSrc       points to the input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 16 at a
+                   time and the remaining (blockSize & 0xF) one at a time.
+ */
+
+void arm_not_q7(
+  const q7_t * pSrc,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 16-sample iterations left */
+
+  /* Vector path: invert 16 samples per iteration */
+  for (vecIters = blockSize >> 4U; vecIters > 0U; vecIters--)
+  {
+    const int8x16_t in = vld1q_s8(pSrc);
+
+    vst1q_s8(pDst, vmvnq_s8(in));
+
+    pSrc += 16;
+    pDst += 16;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 0xF;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = ~(*pSrc);
+
+    /* Advance to the next sample */
+    pSrc++;
+    pDst++;
+  }
+}
+
+/**
+ @} end of Not group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q15.c
new file mode 100644
index 0000000..d092071
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q15.c
@@ -0,0 +1,104 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_or_q15.c
+ * Description: Q15 bitwise inclusive OR
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @defgroup Or Vector bitwise inclusive OR
+
+ Compute the logical bitwise OR.
+
+ There are separate functions for Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup Or
+ @{
+ */
+
+/**
+  @brief         Bitwise inclusive OR of two Q15 vectors, element by element.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 8 at a
+                   time and the remaining (blockSize & 7) one at a time.
+ */
+
+void arm_or_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 8-sample iterations left */
+
+  /* Vector path: OR 8 samples per iteration */
+  for (vecIters = blockSize >> 3U; vecIters > 0U; vecIters--)
+  {
+    const int16x8_t a = vld1q_s16(pSrcA);
+    const int16x8_t b = vld1q_s16(pSrcB);
+
+    vst1q_s16(pDst, vorrq_s16(a, b));
+
+    pSrcA += 8;
+    pSrcB += 8;
+    pDst  += 8;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 7;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = (*pSrcA) | (*pSrcB);
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+}
+
+/**
+ @} end of Or group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q31.c
new file mode 100644
index 0000000..749165a
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q31.c
@@ -0,0 +1,95 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_or_q31.c
+ * Description: Q31 bitwise inclusive OR
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @addtogroup Or
+ @{
+ */
+
+/**
+  @brief         Bitwise inclusive OR of two Q31 vectors, element by element.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 4 at a
+                   time and the remaining (blockSize & 3) one at a time.
+ */
+
+void arm_or_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 4-sample iterations left */
+
+  /* Vector path: OR 4 samples per iteration */
+  for (vecIters = blockSize >> 2U; vecIters > 0U; vecIters--)
+  {
+    const int32x4_t a = vld1q_s32(pSrcA);
+    const int32x4_t b = vld1q_s32(pSrcB);
+
+    vst1q_s32(pDst, vorrq_s32(a, b));
+
+    pSrcA += 4;
+    pSrcB += 4;
+    pDst  += 4;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 3;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = (*pSrcA) | (*pSrcB);
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+}
+/**
+ @} end of Or group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q7.c
new file mode 100644
index 0000000..1b563d2
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q7.c
@@ -0,0 +1,95 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_or_q7.c
+ * Description: Q7 bitwise inclusive OR
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @addtogroup Or
+ @{
+ */
+
+/**
+  @brief         Bitwise inclusive OR of two Q7 vectors, element by element.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 16 at a
+                   time and the remaining (blockSize & 0xF) one at a time.
+ */
+
+void arm_or_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 16-sample iterations left */
+
+  /* Vector path: OR 16 samples per iteration */
+  for (vecIters = blockSize >> 4U; vecIters > 0U; vecIters--)
+  {
+    const int8x16_t a = vld1q_s8(pSrcA);
+    const int8x16_t b = vld1q_s8(pSrcB);
+
+    vst1q_s8(pDst, vorrq_s8(a, b));
+
+    pSrcA += 16;
+    pSrcB += 16;
+    pDst  += 16;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 0xF;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = (*pSrcA) | (*pSrcB);
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+}
+/**
+ @} end of Or group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q15.c
new file mode 100644
index 0000000..d0a6a72
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q15.c
@@ -0,0 +1,104 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_xor_q15.c
+ * Description: Q15 bitwise exclusive OR
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @defgroup Xor Vector bitwise exclusive OR
+
+ Compute the logical bitwise XOR.
+
+ There are separate functions for Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup Xor
+ @{
+ */
+
+/**
+  @brief         Bitwise exclusive OR of two Q15 vectors, element by element.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 8 at a
+                   time and the remaining (blockSize & 7) one at a time.
+ */
+
+#include "arm_math.h"
+
+void arm_xor_q15(
+  const q15_t * pSrcA,
+  const q15_t * pSrcB,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 8-sample iterations left */
+
+  /* Vector path: XOR 8 samples per iteration */
+  for (vecIters = blockSize >> 3U; vecIters > 0U; vecIters--)
+  {
+    const int16x8_t a = vld1q_s16(pSrcA);
+    const int16x8_t b = vld1q_s16(pSrcB);
+
+    vst1q_s16(pDst, veorq_s16(a, b));
+
+    pSrcA += 8;
+    pSrcB += 8;
+    pDst  += 8;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 7;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = (*pSrcA) ^ (*pSrcB);
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+}
+
+/**
+ @} end of Xor group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q31.c
new file mode 100644
index 0000000..51cba23
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q31.c
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_xor_q31.c
+ * Description: Q31 bitwise exclusive OR
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @addtogroup Xor
+ @{
+ */
+
+/**
+  @brief         Bitwise exclusive OR of two Q31 vectors, element by element.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 4 at a
+                   time and the remaining (blockSize & 3) one at a time.
+ */
+
+#include "arm_math.h"
+
+void arm_xor_q31(
+  const q31_t * pSrcA,
+  const q31_t * pSrcB,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 4-sample iterations left */
+
+  /* Vector path: XOR 4 samples per iteration */
+  for (vecIters = blockSize >> 2U; vecIters > 0U; vecIters--)
+  {
+    const int32x4_t a = vld1q_s32(pSrcA);
+    const int32x4_t b = vld1q_s32(pSrcB);
+
+    vst1q_s32(pDst, veorq_s32(a, b));
+
+    pSrcA += 4;
+    pSrcB += 4;
+    pDst  += 4;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 3;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = (*pSrcA) ^ (*pSrcB);
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+}
+
+/**
+ @} end of Xor group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q7.c
new file mode 100644
index 0000000..4c75e22
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q7.c
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_xor_q7.c
+ * Description: Q7 bitwise exclusive OR
+ *
+ * $Date: 14 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ @ingroup groupMath
+ */
+
+/**
+ @addtogroup Xor
+ @{
+ */
+
+/**
+  @brief         Bitwise exclusive OR of two Q7 vectors, element by element.
+  @param[in]     pSrcA      points to the first input vector
+  @param[in]     pSrcB      points to the second input vector
+  @param[out]    pDst       points to the output vector
+  @param[in]     blockSize  number of samples in each vector
+  @return        none
+  @par           Implementation
+                   When ARM_MATH_NEON is defined, samples are processed 16 at a
+                   time and the remaining (blockSize & 0xF) one at a time.
+ */
+
+#include "arm_math.h"
+
+void arm_xor_q7(
+  const q7_t * pSrcA,
+  const q7_t * pSrcB,
+        q7_t * pDst,
+        uint32_t blockSize)
+{
+  uint32_t remaining;                            /* Samples left for the scalar loop */
+
+#if defined(ARM_MATH_NEON)
+  uint32_t vecIters;                             /* Full 16-sample iterations left */
+
+  /* Vector path: XOR 16 samples per iteration */
+  for (vecIters = blockSize >> 4U; vecIters > 0U; vecIters--)
+  {
+    const int8x16_t a = vld1q_s8(pSrcA);
+    const int8x16_t b = vld1q_s8(pSrcB);
+
+    vst1q_s8(pDst, veorq_s8(a, b));
+
+    pSrcA += 16;
+    pSrcB += 16;
+    pDst  += 16;
+  }
+
+  /* Samples that did not fill a whole vector */
+  remaining = blockSize & 0xF;
+#else
+  /* No NEON: every sample goes through the scalar loop */
+  remaining = blockSize;
+#endif
+
+  /* Scalar path: one sample per iteration */
+  for (; remaining > 0U; remaining--)
+  {
+    *pDst = (*pSrcA) ^ (*pSrcB);
+
+    pSrcA++;
+    pSrcB++;
+    pDst++;
+  }
+}
+
+/**
+ @} end of Xor group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
new file mode 100644
index 0000000..67ecc8f
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
@@ -0,0 +1,1032 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bitonic_sort_f32.c
+ * Description: Floating point bitonic sort
+ *
+ * $Date: 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @defgroup Sorting Vector sorting algorithms
+
+ Sort the elements of a vector
+
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ */
+
+static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir)
+{
+    /* One in-place stage of the scalar bitonic sort over pSrc[0..n-1]; the
+     * caller runs it with n = 2, 4, 8, ... Every pair (left, right) that is
+     * compared gets exchanged when dir == (left > right). */
+    uint32_t step;
+    uint32_t k, j;
+    float32_t *leftPtr, *rightPtr;
+    float32_t temp;
+
+    step = n>>1;
+    leftPtr = pSrc;
+    rightPtr = pSrc+n-1;
+
+    /* Compare element k of the first half with element n-1-k. */
+    for(k=0; k<step; k++)
+    {
+        if(dir == (*leftPtr > *rightPtr))
+        {
+            temp=*leftPtr;
+            *leftPtr=*rightPtr;
+            *rightPtr=temp;
+        }
+        leftPtr++;  // Move right
+        rightPtr--; // Move left
+    }
+
+    /* Merge: halve the compare distance until neighbours are compared. */
+    for(step=(n>>2); step>0; step/=2)
+    {
+        for(j=0; j<n; j=j+step*2)
+        {
+            leftPtr = pSrc+j;
+            rightPtr = pSrc+j+step;
+            for(k=0; k<step; k++)
+            {
+                /* Honour dir here too; a bare '>' always ordered the block ascending. */
+                if(dir == (*leftPtr > *rightPtr))
+                {
+                    temp=*leftPtr;
+                    *leftPtr=*rightPtr;
+                    *rightPtr=temp;
+                }
+                leftPtr++;
+                rightPtr++;
+            }
+        }
+    }
+}
+
+#ifdef ARM_MATH_NEON
+/* Sorts the 16 floats at pSrc into pDst: lane-wise 4-input network, 4x4 transpose, then 8- and 16-float merges. */
+static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir)
+{
+ float32x4_t a;
+ float32x4_t b;
+ float32x4_t c;
+ float32x4_t d;
+ /* NOTE(review): vminmaxq is defined outside this file; it presumably leaves the lane-wise minima in its first argument and the maxima in its second - confirm. */
+ // Load 16 samples
+ a = vld1q_f32(pSrc);
+ b = vld1q_f32(pSrc+4);
+ c = vld1q_f32(pSrc+8);
+ d = vld1q_f32(pSrc+12);
+
+ // Bitonic sorting network for 4 samples x 4 times
+ if(dir)
+ {
+ vminmaxq(a, b);
+ vminmaxq(c, d);
+
+ vminmaxq(a, d);
+ vminmaxq(b, c);
+
+ vminmaxq(a, b);
+ vminmaxq(c, d);
+ }
+ else /* same network with the operands of every vminmaxq exchanged */
+ {
+ vminmaxq(b, a);
+ vminmaxq(d, c);
+
+ vminmaxq(d, a);
+ vminmaxq(c, b);
+
+ vminmaxq(b, a);
+ vminmaxq(d, c);
+ }
+
+ float32x4x2_t ab = vtrnq_f32 (a, b); /* val[0] = {a0,b0,a2,b2}, val[1] = {a1,b1,a3,b3} */
+ float32x4x2_t cd = vtrnq_f32 (c, d); /* val[0] = {c0,d0,c2,d2}, val[1] = {c1,d1,c3,d3} */
+
+ // Transpose 4 ordered arrays of 4 samples
+ a = vcombine_f32(vget_low_f32(ab.val[0]), vget_low_f32(cd.val[0])); /* {a0,b0,c0,d0} */
+ b = vcombine_f32(vget_low_f32(ab.val[1]), vget_low_f32(cd.val[1])); /* {a1,b1,c1,d1} */
+ c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0])); /* {a2,b2,c2,d2} */
+ d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1])); /* {a3,b3,c3,d3} */
+
+ // Merge pairs of arrays of 4 samples
+ ab = arm_bitonic_merge_8_f32(a, b, dir);
+ cd = arm_bitonic_merge_8_f32(c, d, dir);
+
+ // Merge arrays of 8 samples
+ arm_bitonic_merge_16_f32(pDst, ab, cd, dir); /* this call performs the store to pDst */
+}
+/* Merge step for two preordered 8-float groups a and b; arm_bitonic_resort_16_f32 stores the 16 results at pOut. */
+static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
+{
+ // Merge two preordered float32x4x2_t
+ vrev256q_f32(b); /* defined outside this file; presumably reverses the 8 floats of b in place - confirm */
+
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+
+ arm_bitonic_resort_16_f32(pOut, a, b, dir);
+}
+/* Last three compare stages of a 16-float merge; the lanes are interleaved back and the 16 results stored at pOut. */
+static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
+{
+ /* Start with two vectors:
+ * +---+---+---+---+---+---+---+---+
+ * | a | b | c | d | e | f | g | h |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | i | j | k | l | m | n | o | p |
+ * +---+---+---+---+---+---+---+---+
+ * All the elements of the first are guaranteed to be less than or equal to
+ * all of the elements in the second, and both vectors are bitonic.
+ * We need to perform these operations to completely sort both lists:
+ * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop])
+ * vminmax([abef],[cdgh]) vminmax([ijmn],[klop])
+ * vminmax([aceg],[bdfh]) vminmax([ikmo],[jlnp])
+ */
+ /* NOTE(review): the vtrn256_* and vminmax256q helpers are defined outside this file; the layouts drawn below are what they are expected to produce - confirm. */
+ vtrn256_128q(a, b);
+ /* +---+---+---+---+---+---+---+---+
+ * | a | b | c | d | i | j | k | l |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | e | f | g | h | m | n | o | p |
+ * +---+---+---+---+---+---+---+---+
+ */
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+
+ vtrn256_64q(a, b);
+
+ /* +---+---+---+---+---+---+---+---+
+ * | a | b | e | f | i | j | m | n |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | c | d | g | h | k | l | o | p |
+ * +---+---+---+---+---+---+---+---+
+ */
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+
+ vtrn256_32q(a, b);
+ /* We now have:
+ * +---+---+---+---+---+---+---+---+
+ * | a | c | e | g | i | k | m | o |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | b | d | f | h | j | l | n | p |
+ * +---+---+---+---+---+---+---+---+
+ */
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+ /* vzipq_f32 interleaves the lanes of its operands: val[0] = {x0,y0,x1,y1}, val[1] = {x2,y2,x3,y3} */
+ float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]);
+ float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]);
+
+ vst1q_f32(pOut, out1.val[0]);
+ vst1q_f32(pOut+4, out1.val[1]);
+ vst1q_f32(pOut+8, out2.val[0]);
+ vst1q_f32(pOut+12, out2.val[1]);
+}
+
+static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+    /* Combine two 4-float groups into one 8-float result: the second group
+     * is reversed, then both are compared lane by lane and resorted. */
+    b = vrev128q_f32(b);
+
+    /* The operand order of vminmaxq follows the requested direction. */
+    if(dir)
+    {
+        vminmaxq(a, b);
+    }
+    else
+    {
+        vminmaxq(b, a);
+    }
+    return arm_bitonic_resort_8_f32(a, b, dir);
+}
+/* Last two compare stages of an 8-float merge; returns the two vectors with their lanes interleaved back. */
+static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+ /* Start with two vectors:
+ * +---+---+---+---+
+ * | a | b | c | d |
+ * +---+---+---+---+
+ * +---+---+---+---+
+ * | e | f | g | h |
+ * +---+---+---+---+
+ * All the elements of the first are guaranteed to be less than or equal to
+ * all of the elements in the second, and both vectors are bitonic.
+ * We need to perform these operations to completely sort both lists:
+ * vminmax([abef],[cdgh])
+ * vminmax([aceg],[bdfh])
+ */
+ vtrn128_64q(a, b); /* defined outside this file; expected layout is drawn below */
+ /* +---+---+---+---+
+ * | a | b | e | f |
+ * +---+---+---+---+
+ * +---+---+---+---+
+ * | c | d | g | h |
+ * +---+---+---+---+
+ */
+ if(dir)
+ vminmaxq(a, b);
+ else
+ vminmaxq(b, a);
+
+ vtrn128_32q(a, b); /* defined outside this file; expected layout is drawn below */
+ /* +---+---+---+---+
+ * | a | c | e | g |
+ * +---+---+---+---+
+ * +---+---+---+---+
+ * | b | d | f | h |
+ * +---+---+---+---+
+ */
+ if(dir)
+ vminmaxq(a, b);
+ else
+ vminmaxq(b, a);
+ /* vzipq_f32 interleaves the lanes of a and b: val[0] = {a0,b0,a1,b1}, val[1] = {a2,b2,a3,b3} */
+ return vzipq_f32(a, b);
+}
+/* Merge step over 32 floats held in ab1, ab2, cd1, cd2 (8 floats each); the 32 results are stored at pSrc. */
+static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir)
+{
+ // Compare ab1 with cd1 and ab2 with cd2
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ }
+ // Transpose 256: exchange ab2 with cd1
+ float32x4_t temp;
+
+ temp = ab2.val[0];
+ ab2.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab2.val[1];
+ ab2.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+
+ // Compare again, now on the exchanged registers
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ }
+ /* NOTE(review): arm_bitonic_merge_16_f32 reverses its second operand and compares once more before resorting - confirm this is intended here. */
+ // Finish each pair with a 16-float merge, stored at pSrc and pSrc+16
+ arm_bitonic_merge_16_f32(pSrc+0, ab1, cd1, dir);
+ arm_bitonic_merge_16_f32(pSrc+16, ab2, cd2, dir);
+}
+/* In-place merge step over the 64 floats at pSrc: two compare passes here, then one arm_bitonic_merge_32_f32 per half. */
+static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir)
+{
+ float32x4x2_t ab1, ab2, ab3, ab4;
+ float32x4x2_t cd1, cd2, cd3, cd4;
+
+ //Load and reverse second array
+ ab1.val[0] = vld1q_f32(pSrc+0 );
+ ab1.val[1] = vld1q_f32(pSrc+4 );
+ ab2.val[0] = vld1q_f32(pSrc+8 );
+ ab2.val[1] = vld1q_f32(pSrc+12);
+ ab3.val[0] = vld1q_f32(pSrc+16);
+ ab3.val[1] = vld1q_f32(pSrc+20);
+ ab4.val[0] = vld1q_f32(pSrc+24);
+ ab4.val[1] = vld1q_f32(pSrc+28);
+ /* cd1..cd4 take pSrc[32..63] back to front (cd1 comes from the highest addresses). NOTE(review): vldrev128q_f32 is defined outside this file - confirm it also reverses the 4 lanes. */
+ vldrev128q_f32(cd4.val[1], pSrc+32);
+ vldrev128q_f32(cd4.val[0], pSrc+36);
+ vldrev128q_f32(cd3.val[1], pSrc+40);
+ vldrev128q_f32(cd3.val[0], pSrc+44);
+ vldrev128q_f32(cd2.val[1], pSrc+48);
+ vldrev128q_f32(cd2.val[0], pSrc+52);
+ vldrev128q_f32(cd1.val[1], pSrc+56);
+ vldrev128q_f32(cd1.val[0], pSrc+60);
+
+ // Compare abN with cdN
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ }
+
+ // Transpose 512: exchange ab3 with cd1 and ab4 with cd2
+ float32x4_t temp;
+
+ temp = ab3.val[0];
+ ab3.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab3.val[1];
+ ab3.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+ temp = ab4.val[0];
+ ab4.val[0] = cd2.val[0];
+ cd2.val[0] = temp;
+ temp = ab4.val[1];
+ ab4.val[1] = cd2.val[1];
+ cd2.val[1] = temp;
+
+ // Compare again, now on the exchanged registers
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ }
+
+ // Hand each group of four registers to the 32-float merge, stored at pSrc and pSrc+32
+ arm_bitonic_merge_32_f32(pSrc+0, ab1, ab2, cd1, cd2, dir);
+ arm_bitonic_merge_32_f32(pSrc+32, ab3, ab4, cd3, cd4, dir);
+}
+/* In-place merge step over the 128 floats at pSrc: two compare passes, a write-back, then one arm_bitonic_merge_64_f32 per half. */
+static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir)
+{
+ float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
+ float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
+
+ //Load and reverse second array
+ ab1.val[0] = vld1q_f32(pSrc+0 );
+ ab1.val[1] = vld1q_f32(pSrc+4 );
+ ab2.val[0] = vld1q_f32(pSrc+8 );
+ ab2.val[1] = vld1q_f32(pSrc+12);
+ ab3.val[0] = vld1q_f32(pSrc+16);
+ ab3.val[1] = vld1q_f32(pSrc+20);
+ ab4.val[0] = vld1q_f32(pSrc+24);
+ ab4.val[1] = vld1q_f32(pSrc+28);
+ ab5.val[0] = vld1q_f32(pSrc+32);
+ ab5.val[1] = vld1q_f32(pSrc+36);
+ ab6.val[0] = vld1q_f32(pSrc+40);
+ ab6.val[1] = vld1q_f32(pSrc+44);
+ ab7.val[0] = vld1q_f32(pSrc+48);
+ ab7.val[1] = vld1q_f32(pSrc+52);
+ ab8.val[0] = vld1q_f32(pSrc+56);
+ ab8.val[1] = vld1q_f32(pSrc+60);
+ /* cd1..cd8 take pSrc[64..127] back to front (cd1 comes from the highest addresses). NOTE(review): vldrev128q_f32 is defined outside this file - confirm it also reverses the 4 lanes. */
+ vldrev128q_f32(cd8.val[1], pSrc+64);
+ vldrev128q_f32(cd8.val[0], pSrc+68);
+ vldrev128q_f32(cd7.val[1], pSrc+72);
+ vldrev128q_f32(cd7.val[0], pSrc+76);
+ vldrev128q_f32(cd6.val[1], pSrc+80);
+ vldrev128q_f32(cd6.val[0], pSrc+84);
+ vldrev128q_f32(cd5.val[1], pSrc+88);
+ vldrev128q_f32(cd5.val[0], pSrc+92);
+ vldrev128q_f32(cd4.val[1], pSrc+96);
+ vldrev128q_f32(cd4.val[0], pSrc+100);
+ vldrev128q_f32(cd3.val[1], pSrc+104);
+ vldrev128q_f32(cd3.val[0], pSrc+108);
+ vldrev128q_f32(cd2.val[1], pSrc+112);
+ vldrev128q_f32(cd2.val[0], pSrc+116);
+ vldrev128q_f32(cd1.val[1], pSrc+120);
+ vldrev128q_f32(cd1.val[0], pSrc+124);
+
+ // Compare abN with cdN
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ vminmax256q(ab5, cd5);
+ vminmax256q(ab6, cd6);
+ vminmax256q(ab7, cd7);
+ vminmax256q(ab8, cd8);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ vminmax256q(cd5, ab5);
+ vminmax256q(cd6, ab6);
+ vminmax256q(cd7, ab7);
+ vminmax256q(cd8, ab8);
+ }
+
+ // Transpose: exchange ab5..ab8 with cd1..cd4
+ float32x4_t temp;
+
+ temp = ab5.val[0];
+ ab5.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab5.val[1];
+ ab5.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+ temp = ab6.val[0];
+ ab6.val[0] = cd2.val[0];
+ cd2.val[0] = temp;
+ temp = ab6.val[1];
+ ab6.val[1] = cd2.val[1];
+ cd2.val[1] = temp;
+ temp = ab7.val[0];
+ ab7.val[0] = cd3.val[0];
+ cd3.val[0] = temp;
+ temp = ab7.val[1];
+ ab7.val[1] = cd3.val[1];
+ cd3.val[1] = temp;
+ temp = ab8.val[0];
+ ab8.val[0] = cd4.val[0];
+ cd4.val[0] = temp;
+ temp = ab8.val[1];
+ ab8.val[1] = cd4.val[1];
+ cd4.val[1] = temp;
+
+ // Compare again, now on the exchanged registers
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ vminmax256q(ab5, cd5);
+ vminmax256q(ab6, cd6);
+ vminmax256q(ab7, cd7);
+ vminmax256q(ab8, cd8);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ vminmax256q(cd5, ab5);
+ vminmax256q(cd6, ab6);
+ vminmax256q(cd7, ab7);
+ vminmax256q(cd8, ab8);
+ }
+ /* Write all 128 floats back in the order ab1..ab4, cd1..cd4, ab5..ab8, cd5..cd8; the 64-float merges below reload them from memory. */
+ vst1q_f32(pSrc, ab1.val[0]);
+ vst1q_f32(pSrc+4, ab1.val[1]);
+ vst1q_f32(pSrc+8, ab2.val[0]);
+ vst1q_f32(pSrc+12, ab2.val[1]);
+ vst1q_f32(pSrc+16, ab3.val[0]);
+ vst1q_f32(pSrc+20, ab3.val[1]);
+ vst1q_f32(pSrc+24, ab4.val[0]);
+ vst1q_f32(pSrc+28, ab4.val[1]);
+ vst1q_f32(pSrc+32, cd1.val[0]);
+ vst1q_f32(pSrc+36, cd1.val[1]);
+ vst1q_f32(pSrc+40, cd2.val[0]);
+ vst1q_f32(pSrc+44, cd2.val[1]);
+ vst1q_f32(pSrc+48, cd3.val[0]);
+ vst1q_f32(pSrc+52, cd3.val[1]);
+ vst1q_f32(pSrc+56, cd4.val[0]);
+ vst1q_f32(pSrc+60, cd4.val[1]);
+ vst1q_f32(pSrc+64, ab5.val[0]);
+ vst1q_f32(pSrc+68, ab5.val[1]);
+ vst1q_f32(pSrc+72, ab6.val[0]);
+ vst1q_f32(pSrc+76, ab6.val[1]);
+ vst1q_f32(pSrc+80, ab7.val[0]);
+ vst1q_f32(pSrc+84, ab7.val[1]);
+ vst1q_f32(pSrc+88, ab8.val[0]);
+ vst1q_f32(pSrc+92, ab8.val[1]);
+ vst1q_f32(pSrc+96, cd5.val[0]);
+ vst1q_f32(pSrc+100, cd5.val[1]);
+ vst1q_f32(pSrc+104, cd6.val[0]);
+ vst1q_f32(pSrc+108, cd6.val[1]);
+ vst1q_f32(pSrc+112, cd7.val[0]);
+ vst1q_f32(pSrc+116, cd7.val[1]);
+ vst1q_f32(pSrc+120, cd8.val[0]);
+ vst1q_f32(pSrc+124, cd8.val[1]);
+
+ // Merge each 64-float half in place
+ arm_bitonic_merge_64_f32(pSrc+0 , dir);
+ arm_bitonic_merge_64_f32(pSrc+64, dir);
+}
+/* In-place merge step over the 256 floats at pSrc: two compare passes, a write-back, then one arm_bitonic_merge_128_f32 per half. */
+static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir)
+{
+ float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
+ float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16;
+ float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
+ float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16;
+
+ //Load and reverse second array
+ ab1.val[0] = vld1q_f32(pSrc+0 );
+ ab1.val[1] = vld1q_f32(pSrc+4 );
+ ab2.val[0] = vld1q_f32(pSrc+8 );
+ ab2.val[1] = vld1q_f32(pSrc+12 );
+ ab3.val[0] = vld1q_f32(pSrc+16 );
+ ab3.val[1] = vld1q_f32(pSrc+20 );
+ ab4.val[0] = vld1q_f32(pSrc+24 );
+ ab4.val[1] = vld1q_f32(pSrc+28 );
+ ab5.val[0] = vld1q_f32(pSrc+32 );
+ ab5.val[1] = vld1q_f32(pSrc+36 );
+ ab6.val[0] = vld1q_f32(pSrc+40 );
+ ab6.val[1] = vld1q_f32(pSrc+44 );
+ ab7.val[0] = vld1q_f32(pSrc+48 );
+ ab7.val[1] = vld1q_f32(pSrc+52 );
+ ab8.val[0] = vld1q_f32(pSrc+56 );
+ ab8.val[1] = vld1q_f32(pSrc+60 );
+ ab9.val[0] = vld1q_f32(pSrc+64 );
+ ab9.val[1] = vld1q_f32(pSrc+68 );
+ ab10.val[0] = vld1q_f32(pSrc+72 );
+ ab10.val[1] = vld1q_f32(pSrc+76 );
+ ab11.val[0] = vld1q_f32(pSrc+80 );
+ ab11.val[1] = vld1q_f32(pSrc+84 );
+ ab12.val[0] = vld1q_f32(pSrc+88 );
+ ab12.val[1] = vld1q_f32(pSrc+92 );
+ ab13.val[0] = vld1q_f32(pSrc+96 );
+ ab13.val[1] = vld1q_f32(pSrc+100);
+ ab14.val[0] = vld1q_f32(pSrc+104);
+ ab14.val[1] = vld1q_f32(pSrc+108);
+ ab15.val[0] = vld1q_f32(pSrc+112);
+ ab15.val[1] = vld1q_f32(pSrc+116);
+ ab16.val[0] = vld1q_f32(pSrc+120);
+ ab16.val[1] = vld1q_f32(pSrc+124);
+ /* cd1..cd16 take pSrc[128..255] back to front (cd1 comes from the highest addresses). NOTE(review): vldrev128q_f32 is defined outside this file - confirm it also reverses the 4 lanes. */
+ vldrev128q_f32(cd16.val[1], pSrc+128);
+ vldrev128q_f32(cd16.val[0], pSrc+132);
+ vldrev128q_f32(cd15.val[1], pSrc+136);
+ vldrev128q_f32(cd15.val[0], pSrc+140);
+ vldrev128q_f32(cd14.val[1], pSrc+144);
+ vldrev128q_f32(cd14.val[0], pSrc+148);
+ vldrev128q_f32(cd13.val[1], pSrc+152);
+ vldrev128q_f32(cd13.val[0], pSrc+156);
+ vldrev128q_f32(cd12.val[1], pSrc+160);
+ vldrev128q_f32(cd12.val[0], pSrc+164);
+ vldrev128q_f32(cd11.val[1], pSrc+168);
+ vldrev128q_f32(cd11.val[0], pSrc+172);
+ vldrev128q_f32(cd10.val[1], pSrc+176);
+ vldrev128q_f32(cd10.val[0], pSrc+180);
+ vldrev128q_f32(cd9.val[1] , pSrc+184);
+ vldrev128q_f32(cd9.val[0] , pSrc+188);
+ vldrev128q_f32(cd8.val[1] , pSrc+192);
+ vldrev128q_f32(cd8.val[0] , pSrc+196);
+ vldrev128q_f32(cd7.val[1] , pSrc+200);
+ vldrev128q_f32(cd7.val[0] , pSrc+204);
+ vldrev128q_f32(cd6.val[1] , pSrc+208);
+ vldrev128q_f32(cd6.val[0] , pSrc+212);
+ vldrev128q_f32(cd5.val[1] , pSrc+216);
+ vldrev128q_f32(cd5.val[0] , pSrc+220);
+ vldrev128q_f32(cd4.val[1] , pSrc+224);
+ vldrev128q_f32(cd4.val[0] , pSrc+228);
+ vldrev128q_f32(cd3.val[1] , pSrc+232);
+ vldrev128q_f32(cd3.val[0] , pSrc+236);
+ vldrev128q_f32(cd2.val[1] , pSrc+240);
+ vldrev128q_f32(cd2.val[0] , pSrc+244);
+ vldrev128q_f32(cd1.val[1] , pSrc+248);
+ vldrev128q_f32(cd1.val[0] , pSrc+252);
+
+ // Compare abN with cdN
+ if(dir)
+ {
+ vminmax256q(ab1 , cd1 );
+ vminmax256q(ab2 , cd2 );
+ vminmax256q(ab3 , cd3 );
+ vminmax256q(ab4 , cd4 );
+ vminmax256q(ab5 , cd5 );
+ vminmax256q(ab6 , cd6 );
+ vminmax256q(ab7 , cd7 );
+ vminmax256q(ab8 , cd8 );
+ vminmax256q(ab9 , cd9 );
+ vminmax256q(ab10, cd10);
+ vminmax256q(ab11, cd11);
+ vminmax256q(ab12, cd12);
+ vminmax256q(ab13, cd13);
+ vminmax256q(ab14, cd14);
+ vminmax256q(ab15, cd15);
+ vminmax256q(ab16, cd16);
+ }
+ else
+ {
+ vminmax256q(cd1 , ab1 );
+ vminmax256q(cd2 , ab2 );
+ vminmax256q(cd3 , ab3 );
+ vminmax256q(cd4 , ab4 );
+ vminmax256q(cd5 , ab5 );
+ vminmax256q(cd6 , ab6 );
+ vminmax256q(cd7 , ab7 );
+ vminmax256q(cd8 , ab8 );
+ vminmax256q(cd9 , ab9 );
+ vminmax256q(cd10, ab10);
+ vminmax256q(cd11, ab11);
+ vminmax256q(cd12, ab12);
+ vminmax256q(cd13, ab13);
+ vminmax256q(cd14, ab14);
+ vminmax256q(cd15, ab15);
+ vminmax256q(cd16, ab16);
+ }
+
+ // Transpose: exchange ab9..ab16 with cd1..cd8
+ float32x4_t temp;
+
+ temp = ab9.val[0];
+ ab9.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab9.val[1];
+ ab9.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+ temp = ab10.val[0];
+ ab10.val[0] = cd2.val[0];
+ cd2.val[0] = temp;
+ temp = ab10.val[1];
+ ab10.val[1] = cd2.val[1];
+ cd2.val[1] = temp;
+ temp = ab11.val[0];
+ ab11.val[0] = cd3.val[0];
+ cd3.val[0] = temp;
+ temp = ab11.val[1];
+ ab11.val[1] = cd3.val[1];
+ cd3.val[1] = temp;
+ temp = ab12.val[0];
+ ab12.val[0] = cd4.val[0];
+ cd4.val[0] = temp;
+ temp = ab12.val[1];
+ ab12.val[1] = cd4.val[1];
+ cd4.val[1] = temp;
+ temp = ab13.val[0];
+ ab13.val[0] = cd5.val[0];
+ cd5.val[0] = temp;
+ temp = ab13.val[1];
+ ab13.val[1] = cd5.val[1];
+ cd5.val[1] = temp;
+ temp = ab14.val[0];
+ ab14.val[0] = cd6.val[0];
+ cd6.val[0] = temp;
+ temp = ab14.val[1];
+ ab14.val[1] = cd6.val[1];
+ cd6.val[1] = temp;
+ temp = ab15.val[0];
+ ab15.val[0] = cd7.val[0];
+ cd7.val[0] = temp;
+ temp = ab15.val[1];
+ ab15.val[1] = cd7.val[1];
+ cd7.val[1] = temp;
+ temp = ab16.val[0];
+ ab16.val[0] = cd8.val[0];
+ cd8.val[0] = temp;
+ temp = ab16.val[1];
+ ab16.val[1] = cd8.val[1];
+ cd8.val[1] = temp;
+
+ // Compare again, now on the exchanged registers
+ if(dir)
+ {
+ vminmax256q(ab1 , cd1 );
+ vminmax256q(ab2 , cd2 );
+ vminmax256q(ab3 , cd3 );
+ vminmax256q(ab4 , cd4 );
+ vminmax256q(ab5 , cd5 );
+ vminmax256q(ab6 , cd6 );
+ vminmax256q(ab7 , cd7 );
+ vminmax256q(ab8 , cd8 );
+ vminmax256q(ab9 , cd9 );
+ vminmax256q(ab10, cd10);
+ vminmax256q(ab11, cd11);
+ vminmax256q(ab12, cd12);
+ vminmax256q(ab13, cd13);
+ vminmax256q(ab14, cd14);
+ vminmax256q(ab15, cd15);
+ vminmax256q(ab16, cd16);
+ }
+ else
+ {
+ vminmax256q(cd1 , ab1 );
+ vminmax256q(cd2 , ab2 );
+ vminmax256q(cd3 , ab3 );
+ vminmax256q(cd4 , ab4 );
+ vminmax256q(cd5 , ab5 );
+ vminmax256q(cd6 , ab6 );
+ vminmax256q(cd7 , ab7 );
+ vminmax256q(cd8 , ab8 );
+ vminmax256q(cd9 , ab9 );
+ vminmax256q(cd10, ab10);
+ vminmax256q(cd11, ab11);
+ vminmax256q(cd12, ab12);
+ vminmax256q(cd13, ab13);
+ vminmax256q(cd14, ab14);
+ vminmax256q(cd15, ab15);
+ vminmax256q(cd16, ab16);
+ }
+ /* Write all 256 floats back in the order ab1..ab8, cd1..cd8, ab9..ab16, cd9..cd16; the 128-float merges below reload them from memory. */
+ vst1q_f32(pSrc, ab1.val[0] );
+ vst1q_f32(pSrc+4, ab1.val[1] );
+ vst1q_f32(pSrc+8, ab2.val[0] );
+ vst1q_f32(pSrc+12, ab2.val[1] );
+ vst1q_f32(pSrc+16, ab3.val[0] );
+ vst1q_f32(pSrc+20, ab3.val[1] );
+ vst1q_f32(pSrc+24, ab4.val[0] );
+ vst1q_f32(pSrc+28, ab4.val[1] );
+ vst1q_f32(pSrc+32, ab5.val[0] );
+ vst1q_f32(pSrc+36, ab5.val[1] );
+ vst1q_f32(pSrc+40, ab6.val[0] );
+ vst1q_f32(pSrc+44, ab6.val[1] );
+ vst1q_f32(pSrc+48, ab7.val[0] );
+ vst1q_f32(pSrc+52, ab7.val[1] );
+ vst1q_f32(pSrc+56, ab8.val[0] );
+ vst1q_f32(pSrc+60, ab8.val[1] );
+ vst1q_f32(pSrc+64, cd1.val[0] );
+ vst1q_f32(pSrc+68, cd1.val[1] );
+ vst1q_f32(pSrc+72, cd2.val[0] );
+ vst1q_f32(pSrc+76, cd2.val[1] );
+ vst1q_f32(pSrc+80, cd3.val[0] );
+ vst1q_f32(pSrc+84, cd3.val[1] );
+ vst1q_f32(pSrc+88, cd4.val[0] );
+ vst1q_f32(pSrc+92, cd4.val[1] );
+ vst1q_f32(pSrc+96, cd5.val[0] );
+ vst1q_f32(pSrc+100, cd5.val[1] );
+ vst1q_f32(pSrc+104, cd6.val[0] );
+ vst1q_f32(pSrc+108, cd6.val[1] );
+ vst1q_f32(pSrc+112, cd7.val[0] );
+ vst1q_f32(pSrc+116, cd7.val[1] );
+ vst1q_f32(pSrc+120, cd8.val[0] );
+ vst1q_f32(pSrc+124, cd8.val[1] );
+ vst1q_f32(pSrc+128, ab9.val[0] );
+ vst1q_f32(pSrc+132, ab9.val[1] );
+ vst1q_f32(pSrc+136, ab10.val[0]);
+ vst1q_f32(pSrc+140, ab10.val[1]);
+ vst1q_f32(pSrc+144, ab11.val[0]);
+ vst1q_f32(pSrc+148, ab11.val[1]);
+ vst1q_f32(pSrc+152, ab12.val[0]);
+ vst1q_f32(pSrc+156, ab12.val[1]);
+ vst1q_f32(pSrc+160, ab13.val[0]);
+ vst1q_f32(pSrc+164, ab13.val[1]);
+ vst1q_f32(pSrc+168, ab14.val[0]);
+ vst1q_f32(pSrc+172, ab14.val[1]);
+ vst1q_f32(pSrc+176, ab15.val[0]);
+ vst1q_f32(pSrc+180, ab15.val[1]);
+ vst1q_f32(pSrc+184, ab16.val[0]);
+ vst1q_f32(pSrc+188, ab16.val[1]);
+ vst1q_f32(pSrc+192, cd9.val[0] );
+ vst1q_f32(pSrc+196, cd9.val[1] );
+ vst1q_f32(pSrc+200, cd10.val[0]);
+ vst1q_f32(pSrc+204, cd10.val[1]);
+ vst1q_f32(pSrc+208, cd11.val[0]);
+ vst1q_f32(pSrc+212, cd11.val[1]);
+ vst1q_f32(pSrc+216, cd12.val[0]);
+ vst1q_f32(pSrc+220, cd12.val[1]);
+ vst1q_f32(pSrc+224, cd13.val[0]);
+ vst1q_f32(pSrc+228, cd13.val[1]);
+ vst1q_f32(pSrc+232, cd14.val[0]);
+ vst1q_f32(pSrc+236, cd14.val[1]);
+ vst1q_f32(pSrc+240, cd15.val[0]);
+ vst1q_f32(pSrc+244, cd15.val[1]);
+ vst1q_f32(pSrc+248, cd16.val[0]);
+ vst1q_f32(pSrc+252, cd16.val[1]);
+
+ // Merge each 128-float half in place
+ arm_bitonic_merge_128_f32(pSrc+0 , dir);
+ arm_bitonic_merge_128_f32(pSrc+128, dir);
+}
+/* Order each 4-float half on its own, then merge the two halves. */
+static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+    const float32x4_t lowHalf  = arm_bitonic_sort_4_f32(a, dir);
+    const float32x4_t highHalf = arm_bitonic_sort_4_f32(b, dir);
+    return arm_bitonic_merge_8_f32(lowHalf, highHalf, dir);
+}
+
+static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir)
+{
+    /* Four-input sorting network: three stages of two compare-exchanges.
+     * Each entry of lanePairs names the two lanes one compare-exchange
+     * looks at, in the order the exchanges are carried out:
+     *
+     *   stage 1: (0,1) (2,3)
+     *   stage 2: (0,3) (1,2)
+     *   stage 3: (0,1) (2,3)
+     *
+     * A pair is exchanged when dir equals the outcome of the '>' test
+     * between its first and its second lane, so the same table serves
+     * both directions.
+     */
+    static const uint8_t lanePairs[6][2] =
+    {
+        {0U, 1U}, {2U, 3U},
+        {0U, 3U}, {1U, 2U},
+        {0U, 1U}, {2U, 3U}
+    };
+    float32_t lane[4];
+    uint32_t  idx;
+
+    /* Work on a scalar copy of the four lanes. */
+    vst1q_f32(lane, a);
+
+    /* Run the six compare-exchanges in sequence. */
+    for(idx = 0U; idx < 6U; idx++)
+    {
+        const uint8_t first  = lanePairs[idx][0];
+        const uint8_t second = lanePairs[idx][1];
+
+        if( dir==(lane[first]>lane[second]) )
+        {
+            /* Swap the two lanes. */
+            const float32_t held = lane[second];
+
+            lane[second] = lane[first];
+            lane[first]  = held;
+        }
+    }
+
+    /* Pack the reordered lanes back into a vector. */
+    return vld1q_f32(lane);
+}
+
+#endif
+/**
+ * @brief Bitonic sort of a floating-point vector.
+ * @param[in] S points to the sorting instance; S->dir selects the order
+ * @param[in] pSrc points to the block of input data
+ * @param[out] pDst points to the block of output data (may be equal to pSrc)
+ * @param[in] blockSize number of samples to process
+ *
+ * @par Only power-of-two block sizes are sorted. For any other size the
+ *   NEON build leaves pDst untouched and the scalar build only copies
+ *   pSrc to pDst.
+ */
+void arm_bitonic_sort_f32(
+const arm_sort_instance_f32 * S,
+      float32_t * pSrc,
+      float32_t * pDst,
+      uint32_t blockSize)
+{
+    /* 32-bit counters: 16-bit ones wrapped around for large blocks. */
+    uint32_t s, i;
+    uint8_t dir = S->dir;
+
+#ifdef ARM_MATH_NEON
+    float32_t * pOut = pDst; /* also right in place, where pDst == pSrc */
+    uint32_t counter = blockSize>>5;
+
+    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
+    {
+        float32x4x2_t ab1, ab2;
+        float32x4x2_t cd1, cd2;
+
+        if(blockSize == 1)
+            pOut[0] = pSrc[0]; /* nothing to order, but pDst must be filled */
+        else if(blockSize == 2)
+        {
+            /* Read both inputs before writing so in-place calls work, and
+             * always write both outputs so out-of-place calls fill pDst. */
+            float32_t first = pSrc[0];
+            float32_t second = pSrc[1];
+            uint8_t swap = (dir==(first>second));
+
+            pOut[0] = swap ? second : first;
+            pOut[1] = swap ? first : second;
+        }
+        else if(blockSize == 4)
+        {
+            float32x4_t a = vld1q_f32(pSrc);
+
+            a = arm_bitonic_sort_4_f32(a, dir);
+            vst1q_f32(pOut, a);
+        }
+        else if(blockSize == 8)
+        {
+            float32x4_t a = vld1q_f32(pSrc);
+            float32x4_t b = vld1q_f32(pSrc+4);
+            float32x4x2_t ab = arm_bitonic_sort_8_f32(a, b, dir);
+
+            vst1q_f32(pOut, ab.val[0]);
+            vst1q_f32(pOut+4, ab.val[1]);
+        }
+        else if(blockSize >=16)
+        {
+            // Sort each group of 16 samples
+            for(i=0; i<blockSize; i=i+16)
+                arm_bitonic_sort_16_f32(pSrc+i, pOut+i, dir);
+
+            // Merge pairs of sorted 16-sample groups
+            for(i=0; i<counter; i++)
+            {
+                // Load and reverse second vector
+                ab1.val[0] = vld1q_f32(pOut+32*i+0 );
+                ab1.val[1] = vld1q_f32(pOut+32*i+4 );
+                ab2.val[0] = vld1q_f32(pOut+32*i+8 );
+                ab2.val[1] = vld1q_f32(pOut+32*i+12);
+
+                vldrev128q_f32(cd2.val[1], pOut+32*i+16);
+                vldrev128q_f32(cd2.val[0], pOut+32*i+20);
+                vldrev128q_f32(cd1.val[1], pOut+32*i+24);
+                vldrev128q_f32(cd1.val[0], pOut+32*i+28);
+
+                arm_bitonic_merge_32_f32(pOut+32*i, ab1, ab2, cd1, cd2, dir);
+            }
+
+            // Each pass below doubles the length of the sorted runs
+            counter = counter>>1;
+            for(i=0; i<counter; i++)
+                arm_bitonic_merge_64_f32(pOut+64*i, dir);
+
+            counter = counter>>1;
+            for(i=0; i<counter; i++)
+                arm_bitonic_merge_128_f32(pOut+128*i, dir);
+
+            counter = counter>>1;
+            for(i=0; i<counter; i++)
+                arm_bitonic_merge_256_f32(pOut+256*i, dir);
+
+            /* No dedicated kernel exists above 256 samples: the remaining
+             * stages reuse the scalar stage so larger blocks are processed
+             * to the end. s!=0 stops the loop should s*2 wrap around. */
+            for(s=512; (s!=0) && (s<=blockSize); s=s*2)
+            {
+                for(i=0; i<blockSize; i=i+s)
+                    arm_bitonic_sort_core_f32(pOut+i, s, dir);
+            }
+        }
+    }
+
+#else
+    float32_t * pA = pDst; /* sorting happens in pDst (== pSrc in place) */
+
+    if(pSrc != pDst) // out-of-place
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+
+    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
+    {
+        /* Stage s merges runs of s/2 samples into runs of s samples.
+         * s!=0 stops the loop should s*2 wrap around. */
+        for(s=2; (s!=0) && (s<=blockSize); s=s*2)
+        {
+            for(i=0; i<blockSize; i=i+s)
+                arm_bitonic_sort_core_f32(pA+i, s, dir);
+        }
+    }
+#endif
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c
new file mode 100644
index 0000000..fcc4ceb
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bubble_sort_f32.c
+ * Description: Floating point bubble sort
+ *
+ * $Date: 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The bubble sort algorithm is a simple comparison algorithm that
+ * reads the elements of a vector from the beginning to the end,
+ * compares the adjacent ones and swaps them if they are in the
+ * wrong order. The procedure is repeated until there is nothing
+ * left to swap. Bubble sort is fast for input vectors that are
+ * nearly sorted.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed
+ */
+
+void arm_bubble_sort_f32(
+  const arm_sort_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint8_t dir = S->dir;
+    uint32_t i;
+    uint8_t swapped =1;
+    float32_t * pA;
+    float32_t temp;
+
+    if(pSrc != pDst) // out-of-place
+    {
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+        pA = pDst;
+    }
+    else
+        pA = pSrc;
+
+    /* blockSize>1: zero or one sample needs no pass, and with blockSize == 0
+     * the bound blockSize-1 below would wrap around to UINT32_MAX. */
+    while((swapped==1) && (blockSize>1)) // Stop once a pass swaps nothing
+    {
+        swapped=0;
+        for(i=0; i<blockSize-1; i++)
+        {
+            if(dir==(pA[i]>pA[i+1]))
+            {
+                // Swap
+                temp = pA[i];
+                pA[i] = pA[i+1];
+                pA[i+1] = temp;
+                // Update flag
+                swapped = 1;
+            }
+        }
+
+        blockSize--; // The last element of this pass is in its final place
+    }
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
new file mode 100644
index 0000000..09ae924
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
@@ -0,0 +1,117 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_heap_sort_f32.c
+ * Description: Floating point heap sort
+ *
+ * $Date: 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The heap sort algorithm is a comparison algorithm that
+ * divides the input array into a sorted and an unsorted region,
+ * and shrinks the unsorted region by extracting the largest
+ * element and moving it to the sorted region. A heap data
+ * structure is used to find the maximum.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+
+static void arm_heapify(float32_t * pSrc, uint32_t n, uint32_t i, uint8_t dir)
+{
+    /* Sift the element at index i down through the heap kept in
+     * pSrc[0..n-1] until neither child wins the comparison against it. */
+    for(;;)
+    {
+        const uint32_t left  = 2*i + 1;
+        const uint32_t right = 2*i + 2;
+        uint32_t pick = i; /* index whose value has to sit at i */
+        float32_t held;
+
+        if( (left < n) && (dir==(pSrc[left] > pSrc[pick])) )
+            pick = left;
+        if( (right < n) && (dir==(pSrc[right] > pSrc[pick])) )
+            pick = right;
+        if(pick == i)
+            return; /* heap order holds from here down */
+        held = pSrc[i];
+        pSrc[i] = pSrc[pick];
+        pSrc[pick] = held;
+        i = pick; /* carry on from the child that received the element */
+    }
+}
+
+void arm_heap_sort_f32(
+  const arm_sort_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    /* All the work happens in pDst: an out-of-place call starts from a copy
+     * of the input, an in-place call (pSrc == pDst) needs no copy. */
+    float32_t * const heap = pDst;
+    int32_t node;
+
+    if(pSrc != pDst)
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+
+    /* Build the heap: sift down every node that has a child, last first. */
+    node = blockSize/2 - 1;
+    while(node >= 0)
+    {
+        arm_heapify(heap, blockSize, node, S->dir);
+        node--;
+    }
+
+    /* Move the root behind the shrinking heap, then restore heap order. */
+    node = blockSize - 1;
+    while(node >= 0)
+    {
+        const float32_t root = heap[0];
+        heap[0] = heap[node];
+        heap[node] = root;
+        arm_heapify(heap, node, 0, S->dir);
+        node--;
+    }
+}
+/**
+ @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
new file mode 100644
index 0000000..92394c9
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_insertion_sort_f32.c
+ * Description: Floating point insertion sort
+ *
+ * $Date: 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The insertion sort is a simple sorting algorithm that
+ * reads all the elements of the input array, removes one element
+ * at a time, finds the location it belongs to in the final sorted list,
+ * and inserts it there.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+
+void arm_insertion_sort_f32(
+    const arm_sort_instance_f32 * S,
+    float32_t *pSrc,
+    float32_t* pDst,
+    uint32_t blockSize)
+{
+    /* All work happens in pDst: it either aliases pSrc (in-place call)
+     * or first receives a copy of the input (out-of-place call). */
+    float32_t * pWork = pDst;
+    const uint8_t order = S->dir;
+    uint32_t sorted;    // number of leading elements already in order
+    uint32_t pos;       // current position of the element being inserted
+    float32_t held;
+
+    // out-of-place: sort a copy of the input
+    if (pSrc != pDst)
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+
+    // Read all the elements of the input array, one at a time
+    for (sorted = 0; sorted < blockSize; sorted++)
+    {
+        // Walk the new element left while order==(element < left neighbour)
+        pos = sorted;
+        while (pos > 0 && order==(pWork[pos] < pWork[pos-1]))
+        {
+            held = pWork[pos];
+            pWork[pos] = pWork[pos-1];
+            pWork[pos-1] = held;
+            pos--;
+        }
+    }
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
new file mode 100644
index 0000000..60cceca
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
@@ -0,0 +1,113 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_merge_sort_f32.c
+ * Description: Floating point merge sort
+ *
+ * $Date: 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The merge sort algorithm is a comparison algorithm that
+ * divides the input array into sublists and merges them to produce
+ * longer sorted sublists until there is only one list remaining.
+ *
+ * @par A work array is always needed, hence pSrc and pDst cannot be
+ * equal and the results will be stored in pDst.
+ */
+
+/* Merge the two adjacent sorted runs pA[begin:middle-1] (left) and
+ * pA[middle:end-1] (right) into pB[begin:end-1].
+ * Defined ahead of arm_merge_sort_core_f32 so that the function is
+ * declared before its first use. */
+static void topDownMerge(float32_t * pA, uint32_t begin, uint32_t middle, uint32_t end, float32_t * pB, uint8_t dir)
+{
+    uint32_t i = begin;     // next unread element of the left run
+    uint32_t j = middle;    // next unread element of the right run
+    uint32_t k;
+
+    // Fill every output slot of the merged run
+    for (k = begin; k < end; k++)
+    {
+        /* Take from the left run when it still has elements and either the
+         * right run is exhausted or dir==(left <= right) selects it. */
+        if (i < middle && (j >= end || dir==(pA[i] <= pA[j])) )
+        {
+            pB[k] = pA[i];
+            i++;
+        }
+        else
+        {
+            pB[k] = pA[j];
+            j++;
+        }
+    }
+}
+
+/* Sort the run [begin, end) so that the result lands in pA: both halves are
+ * first sorted into pB (the buffers swap roles at each level), then merged. */
+static void arm_merge_sort_core_f32(float32_t * pB, uint32_t begin, uint32_t end, float32_t * pA, uint8_t dir)
+{
+    if((int32_t)end - (int32_t)begin >= 2 )           // If run size != 1 divide
+    {
+        uint32_t middle = begin + (end - begin) / 2;  // Middle point, no end+begin overflow
+        arm_merge_sort_core_f32(pA, begin, middle, pB, dir);  // Sort the left part
+        arm_merge_sort_core_f32(pA, middle, end, pB, dir);    // Sort the right part
+        topDownMerge(pB, begin, middle, end, pA, dir);        // Merge both halves
+    }
+}
+
+void arm_merge_sort_f32(
+    const arm_sort_instance_f32 * S,
+    float32_t *pSrc,
+    float32_t *pDst,
+    uint32_t blockSize)
+{
+    /* Out-of-place only: with pSrc == pDst there is no second buffer to
+     * merge into, so the call returns without touching the data. */
+    if (pSrc == pDst)
+        return;
+
+    memcpy(pDst, pSrc, blockSize*sizeof(float32_t));
+    arm_merge_sort_core_f32(pSrc, 0, blockSize, pDst, S->dir);
+}
+/**
+ @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
new file mode 100644
index 0000000..ad47310
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
@@ -0,0 +1,162 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_quick_sort_f32.c
+ * Description: Floating point quick sort
+ *
+ * $Date: 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The quick sort algorithm is a comparison algorithm that
+ * divides the input array into two smaller sub-arrays and
+ * recursively sort them. An element of the array (the pivot)
+ * is chosen, all the elements with values smaller than the
+ * pivot are moved before the pivot, while all elements with
+ * values greater than the pivot are moved after it (partition).
+ *
+ * @par
+ * In this implementation the Hoare partition scheme has been
+ * used and the first element has always been chosen as the pivot.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+
+/* Recursive core of the quick sort: orders pSrc[first..last] (both bounds
+ * inclusive) in place. A non-zero dir sorts in ascending order, dir == 0 in
+ * descending order. The first element of the range is used as the pivot. */
+static void arm_quick_sort_core_f32(float32_t *pSrc, uint32_t first, uint32_t last, uint8_t dir)
+{
+    uint32_t i, j, pivot;
+    float32_t temp;
+
+    // Ranges of zero or one element are already sorted
+    if(first>=last)
+    {
+        return;
+    }
+
+    pivot=first; // First element chosen as pivot
+    i=first;
+    j=last;
+
+    // Look for a pair of elements (one on the wrong side of the pivot at
+    // each end) that are in the wrong order relative to each other.
+    while(i<j)
+    {
+        if(dir)
+        {
+            // Ascending: skip the left elements that are <= pivot ...
+            while(pSrc[i]<=pSrc[pivot] && i<last)
+            {
+                i++; // Move to the right
+            }
+
+            // ... and the right elements that are > pivot
+            while(pSrc[j]>pSrc[pivot])
+            {
+                j--; // Move to the left
+            }
+        }
+        else
+        {
+            // Descending: skip the left elements that are >= pivot ...
+            while(pSrc[i]>=pSrc[pivot] && i<last)
+            {
+                i++; // Move to the right
+            }
+
+            // ... and the right elements that are < pivot
+            while(pSrc[j]<pSrc[pivot])
+            {
+                j--; // Move to the left
+            }
+        }
+
+        if(i<j)
+        {
+            // Swap i <-> j
+            temp=pSrc[i];
+            pSrc[i]=pSrc[j];
+            pSrc[j]=temp;
+        }
+    }
+
+    // Swap pivot <-> j: the pivot is now at its final position j
+    temp=pSrc[pivot];
+    pSrc[pivot]=pSrc[j];
+    pSrc[j]=temp;
+
+    // Sort the left part. The indexes are unsigned, so when the pivot ends
+    // up at the start of the range (j == first, possibly 0) computing j-1
+    // would wrap around to a huge upper bound and the recursion would run
+    // outside the array: that part is empty and must be skipped.
+    if(j>first)
+    {
+        arm_quick_sort_core_f32(pSrc, first, j-1, dir);
+    }
+
+    // Sort the right part (an empty range is rejected by the callee)
+    arm_quick_sort_core_f32(pSrc, j+1, last, dir);
+}
+
+void arm_quick_sort_f32(
+    const arm_sort_instance_f32 * S,
+    float32_t * pSrc,
+    float32_t * pDst,
+    uint32_t blockSize)
+{
+    float32_t * pA = pDst;  // sorted in place; equals pSrc for in-place calls
+
+    if(pSrc != pDst) // out-of-place: work on a copy of the input
+    {
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+    }
+
+    /* Fewer than two samples are already sorted; skipping them also keeps
+     * blockSize-1 from wrapping around to UINT32_MAX when blockSize is 0. */
+    if(blockSize > 1U)
+        arm_quick_sort_core_f32(pA, 0, blockSize-1, S->dir);
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
new file mode 100644
index 0000000..4821696
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
@@ -0,0 +1,107 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_selection_sort_f32.c
+ * Description: Floating point selection sort
+ *
+ * $Date: 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The Selection sort algorithm is a comparison algorithm that
+ * divides the input array into a sorted and an unsorted sublist
+ * (initially the sorted sublist is empty and the unsorted sublist
+ * is the input array), looks for the smallest (or biggest)
+ * element in the unsorted sublist, swapping it with the leftmost
+ * one, and moving the sublists boundary one element to the right.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+
+void arm_selection_sort_f32(
+    const arm_sort_instance_f32 * S,
+    float32_t * pSrc,
+    float32_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t i, j, k;
+    uint8_t dir = S->dir;
+    float32_t temp;
+
+    float32_t * pA = pDst;  // sorted in place; equals pSrc for in-place calls
+
+    if(pSrc != pDst) // out-of-place: work on a copy of the input
+    {
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+    }
+
+    /* Move the boundary one element to the right. The bound is written as
+     * i+1 < blockSize rather than i < blockSize-1: blockSize is unsigned, so
+     * for an empty vector blockSize-1 would wrap around to UINT32_MAX and
+     * the loop would no longer be skipped. */
+    for (i=0; i+1U<blockSize; i++)
+    {
+        /* Initialize the minimum/maximum as the first element */
+        k = i;
+
+        /* Look in the unsorted list to find the minimum/maximum value */
+        for (j=i+1; j<blockSize; j++)
+        {
+            if (dir==(pA[j] < pA[k]) )
+            {
+                /* Update value */
+                k = j;
+            }
+        }
+
+        if (k != i)
+        {
+            /* Swap the minimum/maximum with the leftmost element */
+            temp=pA[i];
+            pA[i]=pA[k];
+            pA[k]=temp;
+        }
+    }
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
new file mode 100644
index 0000000..5824b2d
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
@@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_sort_f32.c
+ * Description: Floating point sort
+ *
+ * $Date: 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ */
+
+void arm_sort_f32(
+    const arm_sort_instance_f32 * S,
+    float32_t * pSrc,
+    float32_t * pDst,
+    uint32_t blockSize)
+{
+    /* Forward the call to the implementation selected by S->alg; every
+     * implementation takes the same argument list. An algorithm value
+     * that matches no case performs no sort at all. */
+    switch(S->alg)
+    {
+        case ARM_SORT_BITONIC:
+            arm_bitonic_sort_f32(S, pSrc, pDst, blockSize);
+            break;
+        case ARM_SORT_BUBBLE:
+            arm_bubble_sort_f32(S, pSrc, pDst, blockSize);
+            break;
+        case ARM_SORT_HEAP:
+            arm_heap_sort_f32(S, pSrc, pDst, blockSize);
+            break;
+        case ARM_SORT_INSERTION:
+            arm_insertion_sort_f32(S, pSrc, pDst, blockSize);
+            break;
+        case ARM_SORT_MERGE:
+            arm_merge_sort_f32(S, pSrc, pDst, blockSize);
+            break;
+        case ARM_SORT_QUICK:
+            arm_quick_sort_f32(S, pSrc, pDst, blockSize);
+            break;
+        case ARM_SORT_SELECTION:
+            arm_selection_sort_f32(S, pSrc, pDst, blockSize);
+            break;
+        default:
+            /* Unknown algorithm: deliberately nothing to do */
+            break;
+    }
+}
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
new file mode 100644
index 0000000..80f10a5
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_sort_init_f32.c
+ * Description: Floating point sort initialization function
+ *
+ * $Date: 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+void arm_sort_init_f32(arm_sort_instance_f32 * S, arm_sort_alg alg, arm_sort_dir dir)
+{
+    S->dir = dir;   // sorting order requested by the caller
+    S->alg = alg;   // sorting algorithm requested by the caller
+}
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_f32.c
new file mode 100644
index 0000000..d70f2f0
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_f32.c
@@ -0,0 +1,330 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_spline_interp_f32.c
+ * Description: Floating-point cubic spline interpolation
+ *
+ * $Date: 13 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupInterpolation
+ */
+
+/**
+ * @defgroup SplineInterpolate Cubic Spline Interpolation
+ *
+ * Spline interpolation is a method of interpolation where the interpolant
+ * is a piecewise-defined polynomial called "spline".
+ *
+ * \par Introduction
+ * Given a function f defined on the interval [a,b], a set of n nodes x(i)
+ * where a=x(1)<x(2)<...<x(n)=b and a set of n values y(i) = f(x(i)),
+ * a cubic spline interpolant S(x) is defined as: <br>
+ * <pre>
+ * S1(x) x(1) < x < x(2)
+ * S(x) = ...
+ * Sn-1(x) x(n-1) < x < x(n)
+ * </pre><br>
+ * where<br>
+ * <pre>
+ * Si(x) = a_i+b_i(x-xi)+c_i(x-xi)^2+d_i(x-xi)^3 i=1, ..., n-1
+ * </pre>
+ *
+ * \par Algorithm
+ * Having defined h(i) = x(i+1) - x(i)<br>
+ * <pre>
+ * h(i-1)c(i-1)+2[h(i-1)+h(i)]c(i)+h(i)c(i+1) = 3/h(i)*[a(i+1)-a(i)]-3/h(i-1)*[a(i)-a(i-1)] i=2, ..., n-1
+ * </pre><br>
+ * It is possible to write the previous conditions in matrix form (Ax=B).<br>
+ * In order to solve the system two boundary conditions are needed.<br>
+ * - Natural spline: S1''(x1)=2*c(1)=0 ; Sn''(xn)=2*c(n)=0<br>
+ * In matrix form:<br>
+ * <pre>
+ * | 1 0 0 ... 0 0 0 || c(1) | | 0 |
+ * | h(0) 2[h(0)+h(1)] h(1) ... 0 0 0 || c(2) | | 3/h(2)*[a(3)-a(2)]-3/h(1)*[a(2)-a(1)] |
+ * | ... ... ... ... ... ... ... || ... |=| ... |
+ * | 0 0 0 ... h(n-2) 2[h(n-2)+h(n-1)] h(n-1) || c(n-1) | | 3/h(n-1)*[a(n)-a(n-1)]-3/h(n-2)*[a(n-1)-a(n-2)] |
+ * | 0 0 0 ... 0 0 1 || c(n) | | 0 |
+ * </pre><br>
+ * - Parabolic runout spline: S1''(x1)=2*c(1)=S2''(x2)=2*c(2) ; Sn-1''(xn-1)=2*c(n-1)=Sn''(xn)=2*c(n)<br>
+ * In matrix form:<br>
+ * <pre>
+ * | 1 -1 0 ... 0 0 0 || c(1) | | 0 |
+ * | h(0) 2[h(0)+h(1)] h(1) ... 0 0 0 || c(2) | | 3/h(2)*[a(3)-a(2)]-3/h(1)*[a(2)-a(1)] |
+ * | ... ... ... ... ... ... ... || ... |=| ... |
+ * | 0 0 0 ... h(n-2) 2[h(n-2)+h(n-1)] h(n-1) || c(n-1) | | 3/h(n-1)*[a(n)-a(n-1)]-3/h(n-2)*[a(n-1)-a(n-2)] |
+ * | 0 0 0 ... 0 -1 1 || c(n) | | 0 |
+ * </pre><br>
+ * A is a tridiagonal matrix (a band matrix of bandwidth 3) of size N=n+1. The factorization
+ * algorithms (A=LU) can be simplified considerably because a large number of zeros appear
+ * in regular patterns. The Crout method has been used:<br>
+ * 1) Solve LZ=B<br>
+ * <pre>
+ * u(1,2) = A(1,2)/A(1,1)
+ * z(1) = B(1)/l(11)
+ *
+ * FOR i=2, ..., N-1
+ * l(i,i) = A(i,i)-A(i,i-1)u(i-1,i)
+ * u(i,i+1) = a(i,i+1)/l(i,i)
+ * z(i) = [B(i)-A(i,i-1)z(i-1)]/l(i,i)
+ *
+ * l(N,N) = A(N,N)-A(N,N-1)u(N-1,N)
+ * z(N) = [B(N)-A(N,N-1)z(N-1)]/l(N,N)
+ * </pre><br>
+ * 2) Solve UX=Z<br>
+ * <pre>
+ * c(N)=z(N)
+ *
+ * FOR i=N-1, ..., 1
+ * c(i)=z(i)-u(i,i+1)c(i+1)
+ * </pre><br>
+ * c(i) for i=1, ..., n-1 are needed to compute the n-1 polynomials. <br>
+ * b(i) and d(i) are computed as:<br>
+ * - b(i) = [y(i+1)-y(i)]/h(i)-h(i)*[c(i+1)+2*c(i)]/3 <br>
+ * - d(i) = [c(i+1)-c(i)]/[3*h(i)] <br>
+ * Moreover, a(i)=y(i).
+ *
+ * \par Usage
+ * The x input array must be strictly sorted in ascending order and it must
+ * not contain twice the same value (x(i)<x(i+1)).
+ *
+ * \par
+ * It is possible to compute the interpolated vector for x values outside the
+ * input range (xq<x(1); xq>x(n)). The coefficients used to compute the y values for
+ * xq<x(1) are going to be the ones used for the first interval, while for xq>x(n) the
+ * coefficients used for the last interval.
+ *
+ */
+/**
+ @addtogroup SplineInterpolate
+ @{
+ */
+/**
+ * @brief Processing function for the floating-point cubic spline interpolation.
+ * @param[in] S points to an instance of the floating-point spline structure.
+ * @param[in] x points to the x values of the known data points.
+ * @param[in] y points to the y values of the known data points.
+ * @param[in] xq points to the x values of the interpolated data points.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of samples of output data.
+ */
+
+void arm_spline_f32(
+    arm_spline_instance_f32 * S,
+    const float32_t * x,
+    const float32_t * y,
+    const float32_t * xq,
+    float32_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t n = S->n_x;
+    arm_spline_type type = S->type;
+
+    float32_t hi, hm1;
+
+    /* Temporary variables for system AX=B */
+    float32_t u[n-1], z[n], c[n];
+    float32_t Bi, li;
+
+    float32_t bi, di;
+    float32_t x_sc;
+
+    const float32_t * pXq = xq;
+
+    int32_t blkCnt = (int32_t)blockSize;
+    int32_t blkCnt2;
+    int32_t i, j;
+
+#ifdef ARM_MATH_NEON
+    float32x4_t xiv;
+    float32x4_t aiv;
+    float32x4_t biv;
+    float32x4_t civ;
+    float32x4_t div;
+
+    float32x4_t xqv;
+
+    float32x4_t temp;
+    float32x4_t diff;
+    float32x4_t yv;
+#endif
+
+    /* === Solve LZ=B to obtain z(i) and u(i) === */
+    /* --- Row 1 --- */
+    /* B(0) = 0, not computed */
+    /* u(1,2) = a(1,2)/a(1,1) = a(1,2) */
+    if(type == ARM_SPLINE_NATURAL)
+        u[0] = 0; // a(1,2) = 0
+    else if(type == ARM_SPLINE_PARABOLIC_RUNOUT)
+        u[0] = -1; // a(1,2) = -1
+
+    z[0] = 0; // z(1) = B(1)/a(1,1) = 0
+
+    /* --- Rows 2 to N-1 (N=n+1) --- */
+    hm1 = x[1] - x[0]; // x(2)-x(1)
+
+    for (i=1; i<n-1; i++)
+    {
+        /* Compute B(i) */
+        hi = x[i+1]-x[i];
+        Bi = 3*(y[i+1]-y[i])/hi - 3*(y[i]-y[i-1])/hm1;
+        /* l(i) = a(ii)-a(i,i-1)*u(i-1) = 2[h(i-1)+h(i)]-h(i-1)*u(i-1) */
+        li = 2*(hi+hm1) - hm1*u[i-1];
+        /* u(i) = a(i,i+1)/l(i) = h(i)/l(i) */
+        u[i] = hi/li;
+        /* z(i) = [B(i)-h(i-1)*z(i-1)]/l(i) */
+        z[i] = (Bi-hm1*z[i-1])/li;
+        /* Update h(i-1) */
+        hm1 = hi;
+    }
+
+    /* --- Row N --- */
+    /* l(N) = a(N,N)-a(N,N-1)u(N-1) */
+    /* z(N) = [-a(N,N-1)z(N-1)]/l(N) */
+    if(type == ARM_SPLINE_NATURAL)
+    {
+        //li = 1; // a(N,N)=1; a(N,N-1)=0
+        z[n-1] = 0; // a(N,N-1)=0
+    }
+    else if(type == ARM_SPLINE_PARABOLIC_RUNOUT)
+    {
+        li = 1+u[n-2]; // a(N,N)=1; a(N,N-1)=-1
+        z[n-1] = z[n-2]/li; // a(N,N-1)=-1
+    }
+
+    /* === Solve UX = Z to obtain c(i) === */
+    /* c(N) = z(N) */
+    c[n-1] = z[n-1];
+    /* c(i) = z(i)-u(i+1)c(i+1) */
+    for (j = n-1-1; j >= 0; --j)
+        c[j] = z[j] - u[j] * c[j + 1];
+
+    /* === Compute b(i) and d(i) from c(i) and create output for x(i)<x<x(i+1) === */
+    for (i=0; i<n-1; i++)
+    {
+        hi = x[i+1]-x[i];
+        bi = (y[i+1]-y[i])/hi-hi*(c[i+1]+2*c[i])/3;
+        di = (c[i+1]-c[i])/(3*hi);
+
+#ifdef ARM_MATH_NEON
+        xiv = vdupq_n_f32(x[i]);
+
+        aiv = vdupq_n_f32(y[i]);
+        biv = vdupq_n_f32(bi);
+        civ = vdupq_n_f32(c[i]);
+        div = vdupq_n_f32(di);
+
+        while( blkCnt > 4 && *(pXq+4) <= x[i+1] ) // count first: never peek past the end of xq
+        {
+            /* Load [xq(k) xq(k+1) xq(k+2) xq(k+3)] */
+            xqv = vld1q_f32(pXq);
+            pXq+=4;
+
+            /* Compute [xq(k)-x(i) xq(k+1)-x(i) xq(k+2)-x(i) xq(k+3)-x(i)] */
+            diff = vsubq_f32(xqv, xiv);
+            temp = diff;
+
+            /* y(i) = a(i) + ... */
+            yv = aiv;
+            /* ... + b(i)*(x-x(i)) + ... */
+            yv = vmlaq_f32(yv, biv, temp);
+            /* ... + c(i)*(x-x(i))^2 + ... */
+            temp = vmulq_f32(temp, diff);
+            yv = vmlaq_f32(yv, civ, temp);
+            /* ... + d(i)*(x-x(i))^3 */
+            temp = vmulq_f32(temp, diff);
+            yv = vmlaq_f32(yv, div, temp);
+
+            /* Store [y(k) y(k+1) y(k+2) y(k+3)] */
+            vst1q_f32(pDst, yv);
+            pDst+=4;
+
+            blkCnt-=4;
+        }
+#endif
+        while( blkCnt > 0 && *pXq <= x[i+1] ) // count first: xq holds no further sample at 0
+        {
+            x_sc = *pXq++;
+
+            *pDst = y[i]+bi*(x_sc-x[i])+c[i]*(x_sc-x[i])*(x_sc-x[i])+di*(x_sc-x[i])*(x_sc-x[i])*(x_sc-x[i]);
+
+            pDst++;
+            blkCnt--;
+        }
+    }
+
+    /* == Create output for remaining samples (x>=x(n)) == */
+#ifdef ARM_MATH_NEON
+    /* Compute 4 outputs at a time */
+    blkCnt2 = blkCnt >> 2;
+
+    while(blkCnt2 > 0U)
+    {
+        /* Load [xq(k) xq(k+1) xq(k+2) xq(k+3)] */
+        xqv = vld1q_f32(pXq);
+        pXq+=4;
+
+        /* Compute [xq(k)-x(i) xq(k+1)-x(i) xq(k+2)-x(i) xq(k+3)-x(i)] */
+        diff = vsubq_f32(xqv, xiv);
+        temp = diff;
+
+        /* y(i) = a(i) + ... */
+        yv = aiv;
+        /* ... + b(i)*(x-x(i)) + ... */
+        yv = vmlaq_f32(yv, biv, temp);
+        /* ... + c(i)*(x-x(i))^2 + ... */
+        temp = vmulq_f32(temp, diff);
+        yv = vmlaq_f32(yv, civ, temp);
+        /* ... + d(i)*(x-x(i))^3 */
+        temp = vmulq_f32(temp, diff);
+        yv = vmlaq_f32(yv, div, temp);
+
+        /* Store [y(k) y(k+1) y(k+2) y(k+3)] */
+        vst1q_f32(pDst, yv);
+        pDst+=4;
+
+        blkCnt2--;
+    }
+
+    /* Tail */
+    blkCnt2 = blkCnt & 3;
+#else
+    blkCnt2 = blkCnt;
+#endif
+
+    while(blkCnt2 > 0U)
+    {
+        x_sc = *pXq++;
+        /* i == n-1 here: reuse the last interval (index i-1) for y, x and c, like bi and di */
+        *pDst = y[i-1]+bi*(x_sc-x[i-1])+c[i-1]*(x_sc-x[i-1])*(x_sc-x[i-1])+di*(x_sc-x[i-1])*(x_sc-x[i-1])*(x_sc-x[i-1]);
+
+        pDst++;
+        blkCnt2--;
+    }
+}
+
+/**
+ * @} end of SplineInterpolate group
+ * */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_init_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_init_f32.c
new file mode 100644
index 0000000..f9b5ee9
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_init_f32.c
@@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_spline_interp_init_f32.c
+ * Description: Floating-point cubic spline initialization function
+ *
+ * $Date: 13 November 2019
+ * $Revision: V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupInterpolation
+ */
+
+/**
+ @addtogroup SplineInterpolate
+ @{
+ */
+
+/**
+ * @brief Initialization function for the floating-point cubic spline interpolation.
+ * @param[in,out] S points to an instance of the floating-point spline structure.
+ * @param[in] n number of known data points.
+ * @param[in] type type of cubic spline interpolation (boundary conditions)
+ */
+
+void arm_spline_init_f32(
+    arm_spline_instance_f32 * S,
+    uint32_t n,
+    arm_spline_type type)
+{
+    /* Type (boundary conditions):
+     *  0: Natural spline          ( S1''(x1)=0 ; Sn''(xn)=0 )
+     *  1: Parabolic runout spline ( S1''(x1)=S2''(x2) ; Sn-1''(xn-1)=Sn''(xn) )
+     */
+    S->type = type;
+    /* Number of known data points */
+    S->n_x = n;
+}
+
+/**
+ * @} end of SplineInterpolate group
+ */
diff --git a/CMSIS/DSP/Source/configDsp.cmake b/CMSIS/DSP/Source/configDsp.cmake
index 08a4d70..32c11a0 100644
--- a/CMSIS/DSP/Source/configDsp.cmake
+++ b/CMSIS/DSP/Source/configDsp.cmake
@@ -30,8 +30,8 @@
target_compile_definitions(${project} PRIVATE ARM_MATH_FLOAT16)
endif()
-if (HELIUM OR MVEF)
+if (HELIUM OR MVEF OR SUPPORT)
target_include_directories(${project} PRIVATE "${root}/CMSIS/DSP/PrivateInclude")
endif()
-endfunction()
\ No newline at end of file
+endfunction()