Added logical operators + sorting + spline

- Added NEON bitwise AND, NOT, OR, XOR (q7, q15, q31)
- Added Sorting algorithms f32 (NEON bitonic sort)
- Added cubic spline interpolation function
- Added test patterns for all
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q15.c
new file mode 100644
index 0000000..e53706f
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q15.c
@@ -0,0 +1,104 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_and_q15.c
+ * Description:  Q15 bitwise AND
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @defgroup And Vector bitwise AND
+
+  Compute the logical bitwise AND.
+
+  There are separate functions for Q31, Q15, and Q7 data types.
+ */
+
+/**
+  @addtogroup And
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise AND of two Q15 vectors: pDst[i] = pSrcA[i] & pSrcB[i].
+  @param[in]     pSrcA      points to input vector A (blockSize samples are read)
+  @param[in]     pSrcB      points to input vector B (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+void arm_and_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int16x8_t vecA, vecB;
+
+    /* NEON path: each iteration ANDs one vector of 8 q15 samples */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_s16(pSrcA);
+        vecB = vld1q_s16(pSrcB);
+
+        vst1q_s16(pDst, vandq_s16(vecA, vecB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 8) samples go through the scalar loop below */
+    blkCnt = blockSize & 7;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)&(*pSrcB++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+
+/**
+  @} end of And group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q31.c
new file mode 100644
index 0000000..0348a59
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q31.c
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_and_q31.c
+ * Description:  Q31 bitwise AND
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup And
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise AND of two Q31 vectors: pDst[i] = pSrcA[i] & pSrcB[i].
+  @param[in]     pSrcA      points to input vector A (blockSize samples are read)
+  @param[in]     pSrcB      points to input vector B (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+void arm_and_q31(
+    const q31_t * pSrcA,
+    const q31_t * pSrcB,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int32x4_t vecA, vecB;
+
+    /* NEON path: each iteration ANDs one vector of 4 q31 samples */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_s32(pSrcA);
+        vecB = vld1q_s32(pSrcB);
+
+        vst1q_s32(pDst, vandq_s32(vecA, vecB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 4) samples go through the scalar loop below */
+    blkCnt = blockSize & 3;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)&(*pSrcB++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+
+/**
+  @} end of And group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q7.c
new file mode 100644
index 0000000..97734a8
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_and_q7.c
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_and_q7.c
+ * Description:  Q7 bitwise AND
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup And
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise AND of two Q7 vectors: pDst[i] = pSrcA[i] & pSrcB[i].
+  @param[in]     pSrcA      points to input vector A (blockSize samples are read)
+  @param[in]     pSrcB      points to input vector B (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+void arm_and_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int8x16_t vecA, vecB;
+
+    /* NEON path: each iteration ANDs one vector of 16 q7 samples */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_s8(pSrcA);
+        vecB = vld1q_s8(pSrcB);
+
+        vst1q_s8(pDst, vandq_s8(vecA, vecB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 16) samples go through the scalar loop below */
+    blkCnt = blockSize & 0xF;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)&(*pSrcB++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+
+/**
+  @} end of And group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q15.c
new file mode 100644
index 0000000..7cfe07e
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q15.c
@@ -0,0 +1,100 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_not_q15.c
+ * Description:  Q15 bitwise NOT
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @defgroup Not Vector bitwise NOT
+
+  Compute the logical bitwise NOT.
+
+  There are separate functions for Q31, Q15, and Q7 data types.
+ */
+
+/**
+  @addtogroup Not
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise NOT of a Q15 vector: pDst[i] = ~pSrc[i].
+  @param[in]     pSrc       points to input vector (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+void arm_not_q15(
+    const q15_t * pSrc,
+          q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int16x8_t inV;
+
+    /* NEON path: each iteration inverts one vector of 8 q15 samples */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        inV = vld1q_s16(pSrc);
+
+        vst1q_s16(pDst, vmvnq_s16(inV) );
+
+        pSrc += 8;
+        pDst += 8;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 8) samples go through the scalar loop below */
+    blkCnt = blockSize & 7;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = ~(*pSrc++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+
+/**
+  @} end of Not group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q31.c
new file mode 100644
index 0000000..c6b08bd
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q31.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_not_q31.c
+ * Description:  Q31 bitwise NOT
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Not
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise NOT of a Q31 vector: pDst[i] = ~pSrc[i].
+  @param[in]     pSrc       points to input vector (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+void arm_not_q31(
+    const q31_t * pSrc,
+          q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int32x4_t inV;
+
+    /* NEON path: each iteration inverts one vector of 4 q31 samples */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        inV = vld1q_s32(pSrc);
+
+        vst1q_s32(pDst, vmvnq_s32(inV) );
+
+        pSrc += 4;
+        pDst += 4;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 4) samples go through the scalar loop below */
+    blkCnt = blockSize & 3;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = ~(*pSrc++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+
+/**
+  @} end of Not group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q7.c
new file mode 100644
index 0000000..46baf2c
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_not_q7.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_not_q7.c
+ * Description:  Q7 bitwise NOT
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Not
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise NOT of a Q7 vector: pDst[i] = ~pSrc[i].
+  @param[in]     pSrc       points to input vector (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+void arm_not_q7(
+    const q7_t * pSrc,
+          q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int8x16_t inV;
+
+    /* NEON path: each iteration inverts one vector of 16 q7 samples */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        inV = vld1q_s8(pSrc);
+
+        vst1q_s8(pDst, vmvnq_s8(inV) );
+
+        pSrc += 16;
+        pDst += 16;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 16) samples go through the scalar loop below */
+    blkCnt = blockSize & 0xF;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = ~(*pSrc++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+
+/**
+  @} end of Not group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q15.c
new file mode 100644
index 0000000..d092071
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q15.c
@@ -0,0 +1,104 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_or_q15.c
+ * Description:  Q15 bitwise inclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @defgroup Or Vector bitwise inclusive OR
+
+  Compute the logical bitwise OR.
+
+  There are separate functions for Q31, Q15, and Q7 data types.
+ */
+
+/**
+  @addtogroup Or
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise OR of two Q15 vectors: pDst[i] = pSrcA[i] | pSrcB[i].
+  @param[in]     pSrcA      points to input vector A (blockSize samples are read)
+  @param[in]     pSrcB      points to input vector B (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+void arm_or_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int16x8_t vecA, vecB;
+
+    /* NEON path: each iteration ORs one vector of 8 q15 samples */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_s16(pSrcA);
+        vecB = vld1q_s16(pSrcB);
+
+        vst1q_s16(pDst, vorrq_s16(vecA, vecB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 8) samples go through the scalar loop below */
+    blkCnt = blockSize & 7;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)|(*pSrcB++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+
+/**
+  @} end of Or group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q31.c
new file mode 100644
index 0000000..749165a
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q31.c
@@ -0,0 +1,95 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_or_q31.c
+ * Description:  Q31 bitwise inclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Or
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise OR of two Q31 vectors: pDst[i] = pSrcA[i] | pSrcB[i].
+  @param[in]     pSrcA      points to input vector A (blockSize samples are read)
+  @param[in]     pSrcB      points to input vector B (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+void arm_or_q31(
+    const q31_t * pSrcA,
+    const q31_t * pSrcB,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int32x4_t vecA, vecB;
+
+    /* NEON path: each iteration ORs one vector of 4 q31 samples */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_s32(pSrcA);
+        vecB = vld1q_s32(pSrcB);
+
+        vst1q_s32(pDst, vorrq_s32(vecA, vecB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 4) samples go through the scalar loop below */
+    blkCnt = blockSize & 3;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)|(*pSrcB++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+/**
+  @} end of Or group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q7.c
new file mode 100644
index 0000000..1b563d2
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_or_q7.c
@@ -0,0 +1,95 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_or_q7.c
+ * Description:  Q7 bitwise inclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Or
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise OR of two Q7 vectors: pDst[i] = pSrcA[i] | pSrcB[i].
+  @param[in]     pSrcA      points to input vector A (blockSize samples are read)
+  @param[in]     pSrcB      points to input vector B (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+void arm_or_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int8x16_t vecA, vecB;
+
+    /* NEON path: each iteration ORs one vector of 16 q7 samples */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_s8(pSrcA);
+        vecB = vld1q_s8(pSrcB);
+
+        vst1q_s8(pDst, vorrq_s8(vecA, vecB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 16) samples go through the scalar loop below */
+    blkCnt = blockSize & 0xF;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)|(*pSrcB++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+/**
+  @} end of Or group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q15.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q15.c
new file mode 100644
index 0000000..d0a6a72
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q15.c
@@ -0,0 +1,104 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_xor_q15.c
+ * Description:  Q15 bitwise exclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @defgroup Xor Vector bitwise exclusive OR
+
+  Compute the logical bitwise XOR.
+
+  There are separate functions for Q31, Q15, and Q7 data types.
+ */
+
+/**
+  @addtogroup Xor
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise XOR of two Q15 vectors: pDst[i] = pSrcA[i] ^ pSrcB[i].
+  @param[in]     pSrcA      points to input vector A (blockSize samples are read)
+  @param[in]     pSrcB      points to input vector B (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+#include "arm_math.h"
+
+void arm_xor_q15(
+    const q15_t * pSrcA,
+    const q15_t * pSrcB,
+    q15_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int16x8_t vecA, vecB;
+
+    /* NEON path: each iteration XORs one vector of 8 q15 samples */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_s16(pSrcA);
+        vecB = vld1q_s16(pSrcB);
+
+        vst1q_s16(pDst, veorq_s16(vecA, vecB) );
+
+        pSrcA += 8;
+        pSrcB += 8;
+        pDst  += 8;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 8) samples go through the scalar loop below */
+    blkCnt = blockSize & 7;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)^(*pSrcB++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+
+/**
+  @} end of Xor group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q31.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q31.c
new file mode 100644
index 0000000..51cba23
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q31.c
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_xor_q31.c
+ * Description:  Q31 bitwise exclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Xor
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise XOR of two Q31 vectors: pDst[i] = pSrcA[i] ^ pSrcB[i].
+  @param[in]     pSrcA      points to input vector A (blockSize samples are read)
+  @param[in]     pSrcB      points to input vector B (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+#include "arm_math.h"
+
+void arm_xor_q31(
+    const q31_t * pSrcA,
+    const q31_t * pSrcB,
+    q31_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int32x4_t vecA, vecB;
+
+    /* NEON path: each iteration XORs one vector of 4 q31 samples */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_s32(pSrcA);
+        vecB = vld1q_s32(pSrcB);
+
+        vst1q_s32(pDst, veorq_s32(vecA, vecB) );
+
+        pSrcA += 4;
+        pSrcB += 4;
+        pDst  += 4;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 4) samples go through the scalar loop below */
+    blkCnt = blockSize & 3;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)^(*pSrcB++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+
+/**
+  @} end of Xor group
+ */
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q7.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q7.c
new file mode 100644
index 0000000..4c75e22
--- /dev/null
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_xor_q7.c
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_xor_q7.c
+ * Description:  Q7 bitwise exclusive OR
+ *
+ * $Date:        14 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Xor
+  @{
+ */
+
+/**
+  @brief         Compute the element-wise bitwise XOR of two Q7 vectors: pDst[i] = pSrcA[i] ^ pSrcB[i].
+  @param[in]     pSrcA      points to input vector A (blockSize samples are read)
+  @param[in]     pSrcB      points to input vector B (blockSize samples are read)
+  @param[out]    pDst       points to output vector (blockSize samples are written)
+  @param[in]     blockSize  number of samples in each vector; 0 leaves pDst untouched
+  @return        none
+ */
+
+#include "arm_math.h"
+
+void arm_xor_q7(
+    const q7_t * pSrcA,
+    const q7_t * pSrcB,
+    q7_t * pDst,
+    uint32_t blockSize)
+{
+    uint32_t blkCnt;      /* Iterations remaining in the loop currently running */
+
+#if defined(ARM_MATH_NEON)
+    int8x16_t vecA, vecB;
+
+    /* NEON path: each iteration XORs one vector of 16 q7 samples */
+    blkCnt = blockSize >> 4U;
+
+    while (blkCnt > 0U)
+    {
+        vecA = vld1q_s8(pSrcA);
+        vecB = vld1q_s8(pSrcB);
+
+        vst1q_s8(pDst, veorq_s8(vecA, vecB) );
+
+        pSrcA += 16;
+        pSrcB += 16;
+        pDst  += 16;
+
+        /* One full vector done; count it off */
+        blkCnt--;
+    }
+
+    /* Tail: the remaining (blockSize mod 16) samples go through the scalar loop below */
+    blkCnt = blockSize & 0xF;
+#else
+    /* No NEON: the scalar loop below handles all blockSize samples */
+    blkCnt = blockSize;
+#endif
+
+    while (blkCnt > 0U)
+    {
+        *pDst++ = (*pSrcA++)^(*pSrcB++);
+
+        /* One sample done; count it off */
+        blkCnt--;
+    }
+}
+
+/**
+  @} end of Xor group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
new file mode 100644
index 0000000..67ecc8f
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
@@ -0,0 +1,1032 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_bitonic_sort_f32.c
+ * Description:  Floating point bitonic sort
+ *
+ * $Date:        2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @defgroup Sorting Vector sorting algorithms
+
+  Sort the elements of a vector
+
+  There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   */
+
+/* Bitonic merge stage, in place, over the n samples at pSrc. The caller
+ * (arm_bitonic_sort_f32) passes n = 2, 4, 8, ... so that both halves of the
+ * block are already ordered along dir. dir is compared with the 0/1 result
+ * of '>': 1 leaves the samples increasing, 0 leaves them decreasing. */
+static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir)
+{
+    uint32_t step, j, k;
+    float32_t *leftPtr, *rightPtr;
+    float32_t temp;
+
+    /* Stage 1: compare sample k with its mirror n-1-k, moving inwards */
+    leftPtr  = pSrc;
+    rightPtr = pSrc + n - 1;
+    for (k = 0; k < (n >> 1); k++)
+    {
+        if (dir == (*leftPtr > *rightPtr))
+        {
+            temp      = *leftPtr;
+            *leftPtr  = *rightPtr;
+            *rightPtr = temp;
+        }
+        leftPtr++;
+        rightPtr--;
+    }
+
+    /* Stage 2: compare samples step apart, step = n/4, n/8, ..., 1. The test
+     * has to follow dir here too: a plain '>' always orders the samples
+     * increasing and undoes stage 1 when dir is 0. */
+    for (step = n >> 2; step > 0; step >>= 1)
+    {
+        for (j = 0; j < n; j += step * 2)
+        {
+            leftPtr  = pSrc + j;
+            rightPtr = leftPtr + step;
+
+            for (k = 0; k < step; k++)
+            {
+                if (dir == (*leftPtr > *rightPtr))
+                {
+                    temp      = *leftPtr;
+                    *leftPtr  = *rightPtr;
+                    *rightPtr = temp;
+                }
+                leftPtr++;
+                rightPtr++;
+            }
+        }
+    }
+}
+
+#ifdef ARM_MATH_NEON
+
+static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir)
+{
+    float32x4_t a;
+    float32x4_t b;
+    float32x4_t c;
+    float32x4_t d;
+    // Orders the 16 samples at pSrc along dir and stores them at pDst.
+    // Load 16 samples, 4 per register
+    a = vld1q_f32(pSrc);
+    b = vld1q_f32(pSrc+4);
+    c = vld1q_f32(pSrc+8); 
+    d = vld1q_f32(pSrc+12);
+    // NOTE(review): vminmaxq is defined outside this file; presumably min -> 1st argument, max -> 2nd. Confirm.
+    // Bitonic sorting network for 4 samples x 4 times: lane i of a,b,c,d forms one group of 4
+    if(dir)
+    {
+        vminmaxq(a, b);
+        vminmaxq(c, d);
+        
+        vminmaxq(a, d);
+        vminmaxq(b, c);
+        
+        vminmaxq(a, b);
+        vminmaxq(c, d);
+    }
+    else
+    {
+        vminmaxq(b, a);
+        vminmaxq(d, c);
+        
+        vminmaxq(d, a);
+        vminmaxq(c, b);
+        
+        vminmaxq(b, a);
+        vminmaxq(d, c);
+    }
+
+    float32x4x2_t ab = vtrnq_f32 (a, b);
+    float32x4x2_t cd = vtrnq_f32 (c, d);
+    
+    // Transpose 4 ordered arrays of 4 samples (vtrnq above, vcombine below): one ordered group per register
+    a = vcombine_f32(vget_low_f32(ab.val[0]), vget_low_f32(cd.val[0]));
+    b = vcombine_f32(vget_low_f32(ab.val[1]), vget_low_f32(cd.val[1]));
+    c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0]));
+    d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1]));
+
+    // Merge pairs of arrays of 4 samples (ab and cd are reused for the results)
+    ab = arm_bitonic_merge_8_f32(a, b, dir);
+    cd = arm_bitonic_merge_8_f32(c, d, dir);
+    
+    // Merge arrays of 8 samples; the 16 results are stored at pDst
+    arm_bitonic_merge_16_f32(pDst, ab, cd, dir);
+}
+
+static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
+{
+    // Merge two preordered float32x4x2_t (8 samples each); the 16 merged samples are stored at pOut
+    vrev256q_f32(b); // macro defined outside this file; presumably reverses the 8 samples of b in place - confirm
+    // Compare a with the reversed b; the argument order of vminmax256q follows dir
+    if(dir)
+        vminmax256q(a, b);
+    else
+        vminmax256q(b, a);
+    // Remaining compare stages and the store to pOut
+    arm_bitonic_resort_16_f32(pOut, a, b, dir);
+}
+
+static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
+{
+    /* Last stages of the 16-sample merge; the result is stored at pOut. Start with two vectors:
+     * +---+---+---+---+---+---+---+---+
+     * | a | b | c | d | e | f | g | h |
+     * +---+---+---+---+---+---+---+---+
+     * +---+---+---+---+---+---+---+---+
+     * | i | j | k | l | m | n | o | p |
+     * +---+---+---+---+---+---+---+---+
+     * All the elements of the first are guaranteed to be less than or equal to
+     * all of the elements in the second, and both vectors are bitonic.
+     * We need to perform these operations to completely sort both lists:
+     * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop])
+     * vminmax([abef],[cdgh]) vminmax([ijmn],[klop])
+     * vminmax([aceg],[bdfh]) vminmax([ikmo],[jlnp])
+     */
+    // NOTE(review): vtrn256_*q and vminmax256q are defined outside this file; the diagrams show their expected effect
+    vtrn256_128q(a, b);
+    /* +---+---+---+---+---+---+---+---+
+     * | a | b | c | d | i | j | k | l |
+     * +---+---+---+---+---+---+---+---+
+     * +---+---+---+---+---+---+---+---+
+     * | e | f | g | h | m | n | o | p |
+     * +---+---+---+---+---+---+---+---+
+     */
+    if(dir)
+        vminmax256q(a, b);
+    else
+        vminmax256q(b, a);
+    
+    vtrn256_64q(a, b);
+    
+    /* +---+---+---+---+---+---+---+---+
+     * | a | b | e | f | i | j | m | n |
+     * +---+---+---+---+---+---+---+---+
+     * +---+---+---+---+---+---+---+---+
+     * | c | d | g | h | k | l | o | p |
+     * +---+---+---+---+---+---+---+---+
+     */
+    if(dir)
+        vminmax256q(a, b);
+    else
+        vminmax256q(b, a);
+    
+    vtrn256_32q(a, b);
+    /* We now have:
+     * +---+---+---+---+---+---+---+---+
+     * | a | c | e | g | i | k | m | o |
+     * +---+---+---+---+---+---+---+---+
+     * +---+---+---+---+---+---+---+---+
+     * | b | d | f | h | j | l | n | p |
+     * +---+---+---+---+---+---+---+---+
+     */
+    if(dir)
+        vminmax256q(a, b);
+    else
+        vminmax256q(b, a);
+    // Interleave the lanes of a and b again (a b c d ...) before storing
+    float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]);
+    float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]);
+    // Store the 16 samples at pOut
+    vst1q_f32(pOut, out1.val[0]);
+    vst1q_f32(pOut+4, out1.val[1]);
+    vst1q_f32(pOut+8, out2.val[0]);
+    vst1q_f32(pOut+12, out2.val[1]);
+} 
+
+static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+    /* Merges two vectors of 4 samples into one float32x4x2_t. a and b are guaranteed to be bitonic */
+    // Reverse the elements of the second vector (vrev128q_f32 is defined outside this file)
+    b = vrev128q_f32(b);
+
+    // Compare the two vectors; the argument order of vminmaxq follows dir
+    if(dir)
+        vminmaxq(a, b);
+    else
+	vminmaxq(b, a);
+
+    // Merge the two vectors: remaining compare stages and lane re-ordering
+    float32x4x2_t ab = arm_bitonic_resort_8_f32(a, b, dir);
+
+    return ab;
+}
+
+static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+    /* Last stages of the 8-sample merge. Start with two vectors:
+     * +---+---+---+---+
+     * | a | b | c | d |
+     * +---+---+---+---+
+     * +---+---+---+---+
+     * | e | f | g | h |
+     * +---+---+---+---+
+     * All the elements of the first are guaranteed to be less than or equal to
+     * all of the elements in the second, and both vectors are bitonic.
+     * We need to perform these operations to completely sort both lists:
+     * vminmax([abef],[cdgh]) 
+     * vminmax([aceg],[bdfh]) 
+     */
+    vtrn128_64q(a, b); // vtrn128_*q and vminmaxq are defined outside this file; the diagrams show their expected effect
+    /* +---+---+---+---+
+     * | a | b | e | f |
+     * +---+---+---+---+
+     * +---+---+---+---+
+     * | c | d | g | h |
+     * +---+---+---+---+
+     */
+    if(dir)
+        vminmaxq(a, b);
+    else
+        vminmaxq(b, a);
+    
+    vtrn128_32q(a, b);
+    /* +---+---+---+---+
+     * | a | c | e | g |
+     * +---+---+---+---+
+     * +---+---+---+---+
+     * | b | d | f | h |
+     * +---+---+---+---+
+     */
+    if(dir)
+        vminmaxq(a, b);
+    else
+        vminmaxq(b, a);
+    // Interleave the lanes of a and b: val[0] = a b c d, val[1] = e f g h
+    return vzipq_f32(a, b);
+}
+
+static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir)
+{
+    //32-sample merge on register inputs; pSrc is only written (2 x 16 samples). Compare, argument order follows dir
+    if(dir)
+    {
+        vminmax256q(ab1, cd1);
+        vminmax256q(ab2, cd2);
+    }
+    else
+    {
+        vminmax256q(cd1, ab1);
+        vminmax256q(cd2, ab2);
+    }
+    //Transpose 256: exchange ab2 and cd1 so that the next compare pairs other registers
+    float32x4_t temp;
+
+    temp = ab2.val[0];
+    ab2.val[0] = cd1.val[0];
+    cd1.val[0] = temp;
+    temp = ab2.val[1];
+    ab2.val[1] = cd1.val[1];
+    cd1.val[1] = temp;
+
+    //Compare again on the exchanged registers
+    if(dir)
+    {
+        vminmax256q(ab1, cd1);
+        vminmax256q(ab2, cd2);
+    }
+    else
+    {
+        vminmax256q(cd1, ab1);
+        vminmax256q(cd2, ab2);
+    }
+    
+    //Transpose 128: each half is finished and stored by a 16-sample merge
+    arm_bitonic_merge_16_f32(pSrc+0,  ab1, cd1, dir);
+    arm_bitonic_merge_16_f32(pSrc+16, ab2, cd2, dir);
+}
+
+static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir)
+{
+    float32x4x2_t ab1, ab2, ab3, ab4;
+    float32x4x2_t cd1, cd2, cd3, cd4;
+    //In-place merge of the 64 samples at pSrc. NOTE(review): presumably both halves of 32 are expected to be ordered - confirm
+    //Load first half as is; load and reverse second array into cd4..cd1 (vldrev128q_f32 is defined outside this file)
+    ab1.val[0] = vld1q_f32(pSrc+0 );
+    ab1.val[1] = vld1q_f32(pSrc+4 );
+    ab2.val[0] = vld1q_f32(pSrc+8 ); 
+    ab2.val[1] = vld1q_f32(pSrc+12);
+    ab3.val[0] = vld1q_f32(pSrc+16);
+    ab3.val[1] = vld1q_f32(pSrc+20);
+    ab4.val[0] = vld1q_f32(pSrc+24); 
+    ab4.val[1] = vld1q_f32(pSrc+28);
+
+    vldrev128q_f32(cd4.val[1], pSrc+32);
+    vldrev128q_f32(cd4.val[0], pSrc+36);
+    vldrev128q_f32(cd3.val[1], pSrc+40);
+    vldrev128q_f32(cd3.val[0], pSrc+44);
+    vldrev128q_f32(cd2.val[1], pSrc+48);
+    vldrev128q_f32(cd2.val[0], pSrc+52);
+    vldrev128q_f32(cd1.val[1], pSrc+56);
+    vldrev128q_f32(cd1.val[0], pSrc+60);
+    
+    //Compare ab_i with cd_i; the argument order of vminmax256q follows dir
+    if(dir)
+    {
+        vminmax256q(ab1, cd1);
+        vminmax256q(ab2, cd2);
+        vminmax256q(ab3, cd3);
+        vminmax256q(ab4, cd4);
+    }
+    else
+    {
+        vminmax256q(cd1, ab1);
+        vminmax256q(cd2, ab2);
+        vminmax256q(cd3, ab3);
+        vminmax256q(cd4, ab4);
+    }
+
+    //Transpose 512: exchange ab3,ab4 with cd1,cd2
+    float32x4_t temp;
+
+    temp = ab3.val[0];
+    ab3.val[0] = cd1.val[0];
+    cd1.val[0] = temp;
+    temp = ab3.val[1];
+    ab3.val[1] = cd1.val[1];
+    cd1.val[1] = temp;
+    temp = ab4.val[0];
+    ab4.val[0] = cd2.val[0];
+    cd2.val[0] = temp;
+    temp = ab4.val[1];
+    ab4.val[1] = cd2.val[1];
+    cd2.val[1] = temp;
+
+    //Compare again on the exchanged registers
+    if(dir)
+    {
+        vminmax256q(ab1, cd1);
+        vminmax256q(ab2, cd2);
+        vminmax256q(ab3, cd3);
+        vminmax256q(ab4, cd4);
+    }
+    else
+    {
+        vminmax256q(cd1, ab1);
+        vminmax256q(cd2, ab2);
+        vminmax256q(cd3, ab3);
+        vminmax256q(cd4, ab4);
+    }
+    
+    //Transpose 256: each half of 32 samples is finished and stored by arm_bitonic_merge_32_f32
+    arm_bitonic_merge_32_f32(pSrc+0,  ab1, ab2, cd1, cd2, dir);
+    arm_bitonic_merge_32_f32(pSrc+32, ab3, ab4, cd3, cd4, dir);
+}
+
+static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir)
+{
+    float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
+    float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
+    //In-place merge of the 128 samples at pSrc. NOTE(review): presumably both halves of 64 are expected to be ordered - confirm
+    //Load first half as is; load and reverse second array into cd8..cd1 (vldrev128q_f32 is defined outside this file)
+    ab1.val[0] = vld1q_f32(pSrc+0 );
+    ab1.val[1] = vld1q_f32(pSrc+4 );
+    ab2.val[0] = vld1q_f32(pSrc+8 ); 
+    ab2.val[1] = vld1q_f32(pSrc+12);
+    ab3.val[0] = vld1q_f32(pSrc+16);
+    ab3.val[1] = vld1q_f32(pSrc+20);
+    ab4.val[0] = vld1q_f32(pSrc+24); 
+    ab4.val[1] = vld1q_f32(pSrc+28);
+    ab5.val[0] = vld1q_f32(pSrc+32);
+    ab5.val[1] = vld1q_f32(pSrc+36);
+    ab6.val[0] = vld1q_f32(pSrc+40); 
+    ab6.val[1] = vld1q_f32(pSrc+44);
+    ab7.val[0] = vld1q_f32(pSrc+48);
+    ab7.val[1] = vld1q_f32(pSrc+52);
+    ab8.val[0] = vld1q_f32(pSrc+56); 
+    ab8.val[1] = vld1q_f32(pSrc+60);
+
+    vldrev128q_f32(cd8.val[1], pSrc+64);
+    vldrev128q_f32(cd8.val[0], pSrc+68);
+    vldrev128q_f32(cd7.val[1], pSrc+72);
+    vldrev128q_f32(cd7.val[0], pSrc+76);
+    vldrev128q_f32(cd6.val[1], pSrc+80);
+    vldrev128q_f32(cd6.val[0], pSrc+84);
+    vldrev128q_f32(cd5.val[1], pSrc+88);
+    vldrev128q_f32(cd5.val[0], pSrc+92);
+    vldrev128q_f32(cd4.val[1], pSrc+96);
+    vldrev128q_f32(cd4.val[0], pSrc+100);
+    vldrev128q_f32(cd3.val[1], pSrc+104);
+    vldrev128q_f32(cd3.val[0], pSrc+108);
+    vldrev128q_f32(cd2.val[1], pSrc+112);
+    vldrev128q_f32(cd2.val[0], pSrc+116);
+    vldrev128q_f32(cd1.val[1], pSrc+120);
+    vldrev128q_f32(cd1.val[0], pSrc+124);
+    
+    //Compare ab_i with cd_i; the argument order of vminmax256q follows dir
+    if(dir)
+    {
+        vminmax256q(ab1, cd1);
+        vminmax256q(ab2, cd2);
+        vminmax256q(ab3, cd3);
+        vminmax256q(ab4, cd4);
+        vminmax256q(ab5, cd5);
+        vminmax256q(ab6, cd6);
+        vminmax256q(ab7, cd7);
+        vminmax256q(ab8, cd8);
+    }
+    else
+    {
+        vminmax256q(cd1, ab1);
+        vminmax256q(cd2, ab2);
+        vminmax256q(cd3, ab3);
+        vminmax256q(cd4, ab4);
+        vminmax256q(cd5, ab5);
+        vminmax256q(cd6, ab6);
+        vminmax256q(cd7, ab7);
+        vminmax256q(cd8, ab8);
+    }
+    
+    //Transpose: exchange ab5..ab8 with cd1..cd4
+    float32x4_t temp;
+
+    temp = ab5.val[0];
+    ab5.val[0] = cd1.val[0];
+    cd1.val[0] = temp;
+    temp = ab5.val[1];
+    ab5.val[1] = cd1.val[1];
+    cd1.val[1] = temp;
+    temp = ab6.val[0];
+    ab6.val[0] = cd2.val[0];
+    cd2.val[0] = temp;
+    temp = ab6.val[1];
+    ab6.val[1] = cd2.val[1];
+    cd2.val[1] = temp;
+    temp = ab7.val[0];
+    ab7.val[0] = cd3.val[0];
+    cd3.val[0] = temp;
+    temp = ab7.val[1];
+    ab7.val[1] = cd3.val[1];
+    cd3.val[1] = temp;
+    temp = ab8.val[0];
+    ab8.val[0] = cd4.val[0];
+    cd4.val[0] = temp;
+    temp = ab8.val[1];
+    ab8.val[1] = cd4.val[1];
+    cd4.val[1] = temp;
+
+    //Compare again on the exchanged registers
+    if(dir)
+    {
+        vminmax256q(ab1, cd1);
+        vminmax256q(ab2, cd2);
+        vminmax256q(ab3, cd3);
+        vminmax256q(ab4, cd4);
+        vminmax256q(ab5, cd5);
+        vminmax256q(ab6, cd6);
+        vminmax256q(ab7, cd7);
+        vminmax256q(ab8, cd8);
+    }
+    else
+    {
+        vminmax256q(cd1, ab1);
+        vminmax256q(cd2, ab2);
+        vminmax256q(cd3, ab3);
+        vminmax256q(cd4, ab4);
+        vminmax256q(cd5, ab5);
+        vminmax256q(cd6, ab6);
+        vminmax256q(cd7, ab7);
+        vminmax256q(cd8, ab8);
+    }
+    //Write the intermediate result back: arm_bitonic_merge_64_f32 below reloads it from pSrc
+    vst1q_f32(pSrc,     ab1.val[0]);
+    vst1q_f32(pSrc+4,   ab1.val[1]);
+    vst1q_f32(pSrc+8,   ab2.val[0]);
+    vst1q_f32(pSrc+12,  ab2.val[1]);
+    vst1q_f32(pSrc+16,  ab3.val[0]);
+    vst1q_f32(pSrc+20,  ab3.val[1]);
+    vst1q_f32(pSrc+24,  ab4.val[0]);
+    vst1q_f32(pSrc+28,  ab4.val[1]);
+    vst1q_f32(pSrc+32,  cd1.val[0]);
+    vst1q_f32(pSrc+36,  cd1.val[1]);
+    vst1q_f32(pSrc+40,  cd2.val[0]);
+    vst1q_f32(pSrc+44,  cd2.val[1]);
+    vst1q_f32(pSrc+48,  cd3.val[0]);
+    vst1q_f32(pSrc+52,  cd3.val[1]);
+    vst1q_f32(pSrc+56,  cd4.val[0]);
+    vst1q_f32(pSrc+60,  cd4.val[1]);
+    vst1q_f32(pSrc+64,  ab5.val[0]);
+    vst1q_f32(pSrc+68,  ab5.val[1]);
+    vst1q_f32(pSrc+72,  ab6.val[0]);
+    vst1q_f32(pSrc+76,  ab6.val[1]);
+    vst1q_f32(pSrc+80,  ab7.val[0]);
+    vst1q_f32(pSrc+84,  ab7.val[1]);
+    vst1q_f32(pSrc+88,  ab8.val[0]);
+    vst1q_f32(pSrc+92,  ab8.val[1]);
+    vst1q_f32(pSrc+96,  cd5.val[0]);
+    vst1q_f32(pSrc+100, cd5.val[1]);
+    vst1q_f32(pSrc+104, cd6.val[0]);
+    vst1q_f32(pSrc+108, cd6.val[1]);
+    vst1q_f32(pSrc+112, cd7.val[0]);
+    vst1q_f32(pSrc+116, cd7.val[1]);
+    vst1q_f32(pSrc+120, cd8.val[0]);
+    vst1q_f32(pSrc+124, cd8.val[1]);
+
+    //Transpose: continue on each half of 64 samples
+    arm_bitonic_merge_64_f32(pSrc+0 , dir);
+    arm_bitonic_merge_64_f32(pSrc+64, dir);
+}
+
+static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir)
+{
+    float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
+    float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16;
+    float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
+    float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16;
+    //In-place merge of the 256 samples at pSrc. NOTE(review): presumably both halves of 128 are expected to be ordered - confirm
+    //Load first half as is; load and reverse second array into cd16..cd1 (vldrev128q_f32 is defined outside this file)
+    ab1.val[0]  = vld1q_f32(pSrc+0  );
+    ab1.val[1]  = vld1q_f32(pSrc+4  );
+    ab2.val[0]  = vld1q_f32(pSrc+8  ); 
+    ab2.val[1]  = vld1q_f32(pSrc+12 );
+    ab3.val[0]  = vld1q_f32(pSrc+16 );
+    ab3.val[1]  = vld1q_f32(pSrc+20 );
+    ab4.val[0]  = vld1q_f32(pSrc+24 ); 
+    ab4.val[1]  = vld1q_f32(pSrc+28 );
+    ab5.val[0]  = vld1q_f32(pSrc+32 );
+    ab5.val[1]  = vld1q_f32(pSrc+36 );
+    ab6.val[0]  = vld1q_f32(pSrc+40 ); 
+    ab6.val[1]  = vld1q_f32(pSrc+44 );
+    ab7.val[0]  = vld1q_f32(pSrc+48 );
+    ab7.val[1]  = vld1q_f32(pSrc+52 );
+    ab8.val[0]  = vld1q_f32(pSrc+56 ); 
+    ab8.val[1]  = vld1q_f32(pSrc+60 );
+    ab9.val[0]  = vld1q_f32(pSrc+64 );
+    ab9.val[1]  = vld1q_f32(pSrc+68 );
+    ab10.val[0] = vld1q_f32(pSrc+72 ); 
+    ab10.val[1] = vld1q_f32(pSrc+76 );
+    ab11.val[0] = vld1q_f32(pSrc+80 );
+    ab11.val[1] = vld1q_f32(pSrc+84 );
+    ab12.val[0] = vld1q_f32(pSrc+88 ); 
+    ab12.val[1] = vld1q_f32(pSrc+92 );
+    ab13.val[0] = vld1q_f32(pSrc+96 );
+    ab13.val[1] = vld1q_f32(pSrc+100);
+    ab14.val[0] = vld1q_f32(pSrc+104); 
+    ab14.val[1] = vld1q_f32(pSrc+108);
+    ab15.val[0] = vld1q_f32(pSrc+112);
+    ab15.val[1] = vld1q_f32(pSrc+116);
+    ab16.val[0] = vld1q_f32(pSrc+120); 
+    ab16.val[1] = vld1q_f32(pSrc+124);
+
+    vldrev128q_f32(cd16.val[1], pSrc+128);
+    vldrev128q_f32(cd16.val[0], pSrc+132);
+    vldrev128q_f32(cd15.val[1], pSrc+136);
+    vldrev128q_f32(cd15.val[0], pSrc+140);
+    vldrev128q_f32(cd14.val[1], pSrc+144);
+    vldrev128q_f32(cd14.val[0], pSrc+148);
+    vldrev128q_f32(cd13.val[1], pSrc+152);
+    vldrev128q_f32(cd13.val[0], pSrc+156);
+    vldrev128q_f32(cd12.val[1], pSrc+160);
+    vldrev128q_f32(cd12.val[0], pSrc+164);
+    vldrev128q_f32(cd11.val[1], pSrc+168);
+    vldrev128q_f32(cd11.val[0], pSrc+172);
+    vldrev128q_f32(cd10.val[1], pSrc+176);
+    vldrev128q_f32(cd10.val[0], pSrc+180);
+    vldrev128q_f32(cd9.val[1] , pSrc+184);
+    vldrev128q_f32(cd9.val[0] , pSrc+188);
+    vldrev128q_f32(cd8.val[1] , pSrc+192);
+    vldrev128q_f32(cd8.val[0] , pSrc+196);
+    vldrev128q_f32(cd7.val[1] , pSrc+200);
+    vldrev128q_f32(cd7.val[0] , pSrc+204);
+    vldrev128q_f32(cd6.val[1] , pSrc+208);
+    vldrev128q_f32(cd6.val[0] , pSrc+212);
+    vldrev128q_f32(cd5.val[1] , pSrc+216);
+    vldrev128q_f32(cd5.val[0] , pSrc+220);
+    vldrev128q_f32(cd4.val[1] , pSrc+224);
+    vldrev128q_f32(cd4.val[0] , pSrc+228);
+    vldrev128q_f32(cd3.val[1] , pSrc+232);
+    vldrev128q_f32(cd3.val[0] , pSrc+236);
+    vldrev128q_f32(cd2.val[1] , pSrc+240);
+    vldrev128q_f32(cd2.val[0] , pSrc+244);
+    vldrev128q_f32(cd1.val[1] , pSrc+248);
+    vldrev128q_f32(cd1.val[0] , pSrc+252);
+    
+    //Compare ab_i with cd_i; the argument order of vminmax256q follows dir
+    if(dir)
+    {
+        vminmax256q(ab1 , cd1 );
+        vminmax256q(ab2 , cd2 );
+        vminmax256q(ab3 , cd3 );
+        vminmax256q(ab4 , cd4 );
+        vminmax256q(ab5 , cd5 );
+        vminmax256q(ab6 , cd6 );
+        vminmax256q(ab7 , cd7 );
+        vminmax256q(ab8 , cd8 );
+        vminmax256q(ab9 , cd9 );
+        vminmax256q(ab10, cd10);
+        vminmax256q(ab11, cd11);
+        vminmax256q(ab12, cd12);
+        vminmax256q(ab13, cd13);
+        vminmax256q(ab14, cd14);
+        vminmax256q(ab15, cd15);
+        vminmax256q(ab16, cd16);
+    }
+    else
+    {
+        vminmax256q(cd1 , ab1 );
+        vminmax256q(cd2 , ab2 );
+        vminmax256q(cd3 , ab3 );
+        vminmax256q(cd4 , ab4 );
+        vminmax256q(cd5 , ab5 );
+        vminmax256q(cd6 , ab6 );
+        vminmax256q(cd7 , ab7 );
+        vminmax256q(cd8 , ab8 );
+        vminmax256q(cd9 , ab9 );
+        vminmax256q(cd10, ab10);
+        vminmax256q(cd11, ab11);
+        vminmax256q(cd12, ab12);
+        vminmax256q(cd13, ab13);
+        vminmax256q(cd14, ab14);
+        vminmax256q(cd15, ab15);
+        vminmax256q(cd16, ab16);
+    }
+
+    //Transpose: exchange ab9..ab16 with cd1..cd8
+    float32x4_t temp;
+
+    temp = ab9.val[0];
+    ab9.val[0] = cd1.val[0];
+    cd1.val[0] = temp;
+    temp = ab9.val[1];
+    ab9.val[1] = cd1.val[1];
+    cd1.val[1] = temp;
+    temp = ab10.val[0];
+    ab10.val[0] = cd2.val[0];
+    cd2.val[0] = temp;
+    temp = ab10.val[1];
+    ab10.val[1] = cd2.val[1];
+    cd2.val[1] = temp;
+    temp = ab11.val[0];
+    ab11.val[0] = cd3.val[0];
+    cd3.val[0] = temp;
+    temp = ab11.val[1];
+    ab11.val[1] = cd3.val[1];
+    cd3.val[1] = temp;
+    temp = ab12.val[0];
+    ab12.val[0] = cd4.val[0];
+    cd4.val[0] = temp;
+    temp = ab12.val[1];
+    ab12.val[1] = cd4.val[1];
+    cd4.val[1] = temp;
+    temp = ab13.val[0];
+    ab13.val[0] = cd5.val[0];
+    cd5.val[0] = temp;
+    temp = ab13.val[1];
+    ab13.val[1] = cd5.val[1];
+    cd5.val[1] = temp;
+    temp = ab14.val[0];
+    ab14.val[0] = cd6.val[0];
+    cd6.val[0] = temp;
+    temp = ab14.val[1];
+    ab14.val[1] = cd6.val[1];
+    cd6.val[1] = temp;
+    temp = ab15.val[0];
+    ab15.val[0] = cd7.val[0];
+    cd7.val[0] = temp;
+    temp = ab15.val[1];
+    ab15.val[1] = cd7.val[1];
+    cd7.val[1] = temp;
+    temp = ab16.val[0];
+    ab16.val[0] = cd8.val[0];
+    cd8.val[0] = temp;
+    temp = ab16.val[1];
+    ab16.val[1] = cd8.val[1];
+    cd8.val[1] = temp;
+
+    //Compare again on the exchanged registers
+    if(dir)
+    {
+        vminmax256q(ab1 , cd1 );
+        vminmax256q(ab2 , cd2 );
+        vminmax256q(ab3 , cd3 );
+        vminmax256q(ab4 , cd4 );
+        vminmax256q(ab5 , cd5 );
+        vminmax256q(ab6 , cd6 );
+        vminmax256q(ab7 , cd7 );
+        vminmax256q(ab8 , cd8 );
+        vminmax256q(ab9 , cd9 );
+        vminmax256q(ab10, cd10);
+        vminmax256q(ab11, cd11);
+        vminmax256q(ab12, cd12);
+        vminmax256q(ab13, cd13);
+        vminmax256q(ab14, cd14);
+        vminmax256q(ab15, cd15);
+        vminmax256q(ab16, cd16);
+    }
+    else
+    {
+        vminmax256q(cd1 , ab1 );
+        vminmax256q(cd2 , ab2 );
+        vminmax256q(cd3 , ab3 );
+        vminmax256q(cd4 , ab4 );
+        vminmax256q(cd5 , ab5 );
+        vminmax256q(cd6 , ab6 );
+        vminmax256q(cd7 , ab7 );
+        vminmax256q(cd8 , ab8 );
+        vminmax256q(cd9 , ab9 );
+        vminmax256q(cd10, ab10);
+        vminmax256q(cd11, ab11);
+        vminmax256q(cd12, ab12);
+        vminmax256q(cd13, ab13);
+        vminmax256q(cd14, ab14);
+        vminmax256q(cd15, ab15);
+        vminmax256q(cd16, ab16);
+    }
+    //Write the intermediate result back: arm_bitonic_merge_128_f32 below reloads it from pSrc
+    vst1q_f32(pSrc,     ab1.val[0] );
+    vst1q_f32(pSrc+4,   ab1.val[1] );
+    vst1q_f32(pSrc+8,   ab2.val[0] );
+    vst1q_f32(pSrc+12,  ab2.val[1] );
+    vst1q_f32(pSrc+16,  ab3.val[0] );
+    vst1q_f32(pSrc+20,  ab3.val[1] );
+    vst1q_f32(pSrc+24,  ab4.val[0] );
+    vst1q_f32(pSrc+28,  ab4.val[1] );
+    vst1q_f32(pSrc+32,  ab5.val[0] );
+    vst1q_f32(pSrc+36,  ab5.val[1] );
+    vst1q_f32(pSrc+40,  ab6.val[0] );
+    vst1q_f32(pSrc+44,  ab6.val[1] );
+    vst1q_f32(pSrc+48,  ab7.val[0] );
+    vst1q_f32(pSrc+52,  ab7.val[1] );
+    vst1q_f32(pSrc+56,  ab8.val[0] );
+    vst1q_f32(pSrc+60,  ab8.val[1] );
+    vst1q_f32(pSrc+64,  cd1.val[0] );
+    vst1q_f32(pSrc+68,  cd1.val[1] );
+    vst1q_f32(pSrc+72,  cd2.val[0] );
+    vst1q_f32(pSrc+76,  cd2.val[1] );
+    vst1q_f32(pSrc+80,  cd3.val[0] );
+    vst1q_f32(pSrc+84,  cd3.val[1] );
+    vst1q_f32(pSrc+88,  cd4.val[0] );
+    vst1q_f32(pSrc+92,  cd4.val[1] );
+    vst1q_f32(pSrc+96,  cd5.val[0] );
+    vst1q_f32(pSrc+100, cd5.val[1] );
+    vst1q_f32(pSrc+104, cd6.val[0] );
+    vst1q_f32(pSrc+108, cd6.val[1] );
+    vst1q_f32(pSrc+112, cd7.val[0] );
+    vst1q_f32(pSrc+116, cd7.val[1] );
+    vst1q_f32(pSrc+120, cd8.val[0] );
+    vst1q_f32(pSrc+124, cd8.val[1] );
+    vst1q_f32(pSrc+128, ab9.val[0] );
+    vst1q_f32(pSrc+132, ab9.val[1] );
+    vst1q_f32(pSrc+136, ab10.val[0]);
+    vst1q_f32(pSrc+140, ab10.val[1]);
+    vst1q_f32(pSrc+144, ab11.val[0]);
+    vst1q_f32(pSrc+148, ab11.val[1]);
+    vst1q_f32(pSrc+152, ab12.val[0]);
+    vst1q_f32(pSrc+156, ab12.val[1]);
+    vst1q_f32(pSrc+160, ab13.val[0]);
+    vst1q_f32(pSrc+164, ab13.val[1]);
+    vst1q_f32(pSrc+168, ab14.val[0]);
+    vst1q_f32(pSrc+172, ab14.val[1]);
+    vst1q_f32(pSrc+176, ab15.val[0]);
+    vst1q_f32(pSrc+180, ab15.val[1]);
+    vst1q_f32(pSrc+184, ab16.val[0]);
+    vst1q_f32(pSrc+188, ab16.val[1]);
+    vst1q_f32(pSrc+192, cd9.val[0] );
+    vst1q_f32(pSrc+196, cd9.val[1] );
+    vst1q_f32(pSrc+200, cd10.val[0]);
+    vst1q_f32(pSrc+204, cd10.val[1]);
+    vst1q_f32(pSrc+208, cd11.val[0]);
+    vst1q_f32(pSrc+212, cd11.val[1]);
+    vst1q_f32(pSrc+216, cd12.val[0]);
+    vst1q_f32(pSrc+220, cd12.val[1]);
+    vst1q_f32(pSrc+224, cd13.val[0]);
+    vst1q_f32(pSrc+228, cd13.val[1]);
+    vst1q_f32(pSrc+232, cd14.val[0]);
+    vst1q_f32(pSrc+236, cd14.val[1]);
+    vst1q_f32(pSrc+240, cd15.val[0]);
+    vst1q_f32(pSrc+244, cd15.val[1]);
+    vst1q_f32(pSrc+248, cd16.val[0]);
+    vst1q_f32(pSrc+252, cd16.val[1]);
+
+    //Transpose: continue on each half of 128 samples
+    arm_bitonic_merge_128_f32(pSrc+0  , dir);
+    arm_bitonic_merge_128_f32(pSrc+128, dir);
+}
+
+static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+    float32x4_t lo = arm_bitonic_sort_4_f32(a, dir);   /* order each quad alone */
+    float32x4_t hi = arm_bitonic_sort_4_f32(b, dir);
+    return arm_bitonic_merge_8_f32(lo, hi, dir);       /* then merge the pair */
+}
+
+/*
+ * Orders the four lanes of one vector with a fixed compare-exchange network.
+ *   a    the four samples to order
+ *   dir  compared with the 0/1 result of '>', so 0 or 1 is expected:
+ *        1 leaves the lanes increasing, 0 leaves them decreasing
+ * Returns a vector holding the same four samples in their new order.
+ * The samples go through a scalar array; lane order is kept by vst1q/vld1q.
+ */
+static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir)
+{
+    /*
+     * Compare-exchange network for one quad, applied in this order:
+     *   (0,1) (2,3)   order each pair
+     *   (0,3) (1,2)   exchange across the pairs
+     *   (0,1) (2,3)   order each pair again
+     * A pair is exchanged when dir equals the result of "first > second".
+     */
+    static const uint8_t lanes[6][2] =
+    {
+        {0U, 1U}, {2U, 3U},
+        {0U, 3U}, {1U, 2U},
+        {0U, 1U}, {2U, 3U}
+    };
+    float32_t quad[4];
+    uint32_t  stage;
+
+    /* Work on a scalar copy of the four lanes */
+    vst1q_f32(quad, a);
+
+    for (stage = 0U; stage < 6U; stage++)
+    {
+        const uint8_t first  = lanes[stage][0];
+        const uint8_t second = lanes[stage][1];
+
+        if (dir == (quad[first] > quad[second]))
+        {
+            const float32_t held = quad[second];
+
+            quad[second] = quad[first];
+            quad[first]  = held;
+        }
+    }
+
+    return vld1q_f32(quad);
+}
+
+#endif
+
+/**
+  Bitonic sort of blockSize float samples from pSrc into pDst (pDst may be
+  pSrc), ordered along S->dir. blockSize must be a power of two; any other
+  length is left unsorted.
+ */
+void arm_bitonic_sort_f32(
+const arm_sort_instance_f32 * S,
+      float32_t * pSrc,
+      float32_t * pDst,
+      uint32_t blockSize)
+{
+    uint32_t s, i;  // 32-bit: as uint16_t they wrapped and the loops never ended for blockSize >= 32768
+    uint8_t dir = S->dir;
+
+#ifdef ARM_MATH_NEON
+    float32_t * pOut = pDst;            // equals pSrc for in-place calls
+    uint32_t counter = blockSize>>5;    // number of 32-sample blocks
+
+    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
+    {
+        float32x4x2_t ab1, ab2, cd1, cd2;
+
+        if(blockSize == 1)
+        {
+            // Nothing to order, but the sample still has to reach pDst
+            pOut[0] = pSrc[0];
+        }
+        else if(blockSize == 2)
+        {
+            // Read both samples first so that in-place calls stay correct
+            float32_t first  = pSrc[0];
+            float32_t second = pSrc[1];
+
+            if( dir==(first>second) )
+            {
+                pOut[0] = second;
+                pOut[1] = first;
+            }
+            else // already ordered: still copy, pDst may differ from pSrc
+            {
+                pOut[0] = first;
+                pOut[1] = second;
+            }
+        }
+        else if(blockSize == 4)
+        {
+            float32x4_t a = vld1q_f32(pSrc);
+
+            a = arm_bitonic_sort_4_f32(a, dir);
+            vst1q_f32(pOut, a);
+        }
+        else if(blockSize == 8)
+        {
+            float32x4_t a = vld1q_f32(pSrc);
+            float32x4_t b = vld1q_f32(pSrc+4);
+            float32x4x2_t ab = arm_bitonic_sort_8_f32(a, b, dir);
+
+            vst1q_f32(pOut,   ab.val[0]);
+            vst1q_f32(pOut+4, ab.val[1]);
+        }
+        else if(blockSize >=16)
+        {
+            // Order each group of 16 samples, moving the data to pOut
+            for(i=0; i<blockSize; i=i+16)
+                arm_bitonic_sort_16_f32(pSrc+i, pOut+i, dir);
+
+            // Merge pairs of 16-sample groups
+            for(i=0; i<counter; i++)
+            {
+                // Load and reverse second vector
+                ab1.val[0] = vld1q_f32(pOut+32*i+0 );
+                ab1.val[1] = vld1q_f32(pOut+32*i+4 );
+                ab2.val[0] = vld1q_f32(pOut+32*i+8 );
+                ab2.val[1] = vld1q_f32(pOut+32*i+12);
+
+                vldrev128q_f32(cd2.val[1], pOut+32*i+16);
+                vldrev128q_f32(cd2.val[0], pOut+32*i+20);
+                vldrev128q_f32(cd1.val[1], pOut+32*i+24);
+                vldrev128q_f32(cd1.val[0], pOut+32*i+28);
+
+                arm_bitonic_merge_32_f32(pOut+32*i, ab1, ab2, cd1, cd2, dir);
+            }
+
+            counter = counter>>1;
+            for(i=0; i<counter; i++)
+                arm_bitonic_merge_64_f32(pOut+64*i, dir);
+
+            counter = counter>>1;
+            for(i=0; i<counter; i++)
+                arm_bitonic_merge_128_f32(pOut+128*i, dir);
+
+            counter = counter>>1;
+            for(i=0; i<counter; i++)
+                arm_bitonic_merge_256_f32(pOut+256*i, dir);
+
+            // No NEON merge exists above 256 samples: run the remaining
+            // stages with the scalar merge (s != 0 guards the wrap of s*2)
+            for(s=512; (s != 0) && (s<=blockSize); s=s*2)
+                for(i=0; i<blockSize; i=i+s)
+                    arm_bitonic_sort_core_f32(pOut+i, s, dir);
+        }
+    }
+
+#else
+    float32_t * pA;
+
+    if(pSrc != pDst) // out-of-place
+    {
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+        pA = pDst;
+    }
+    else
+        pA = pSrc;
+
+    if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
+    {
+        // s != 0 guards the wrap of s*2 when blockSize is 2^31
+        for(s=2; (s != 0) && (s<=blockSize); s=s*2)
+            for(i=0; i<blockSize; i=i+s)
+                arm_bitonic_sort_core_f32(pA+i, s, dir);
+    }
+#endif
+}
+
+/**
+  @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c
new file mode 100644
index 0000000..fcc4ceb
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_bubble_sort_f32.c
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_bubble_sort_f32.c
+ * Description:  Floating point bubble sort
+ *
+ * $Date:        2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data.
+   * @param[in]  blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *               The bubble sort algorithm is a simple comparison algorithm that
+   *               reads the elements of a vector from the beginning to the end,
+   *               compares the adjacent ones and swaps them if they are in the 
+   *               wrong order. The procedure is repeated until there is nothing 
+   *               left to swap. Bubble sort is fast for input vectors that are
+   *               nearly sorted.
+   *
+   * @par          It's an in-place algorithm. In order to obtain an out-of-place
+   *               function, a memcpy of the source vector is performed
+   */
+
+void arm_bubble_sort_f32(
+  const arm_sort_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint8_t dir = S->dir;
+    uint32_t i;
+    uint8_t swapped = 1;
+    float32_t * pA;
+    float32_t temp;
+
+    if(pSrc != pDst) // out-of-place: sort a copy held in pDst
+    {
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+        pA = pDst;
+    }
+    else
+        pA = pSrc;
+
+    // blockSize > 1 also stops an empty vector: with blockSize == 0 the
+    // unsigned bound blockSize-1 wrapped and the scan ran past the buffer
+    while( (swapped==1) && (blockSize > 1U) )
+    {
+        swapped = 0;   // stays 0 when a full pass swaps nothing
+        // dir is compared with the 0/1 result of '>': with 1 the largest
+        // sample of the range ends up last, with 0 the smallest
+        for(i=0; i<blockSize-1; i++)
+        {
+            if(dir==(pA[i]>pA[i+1]))
+            {
+                temp = pA[i];
+                pA[i] = pA[i+1];
+                pA[i+1] = temp;
+                swapped = 1;
+            }
+        }
+
+        blockSize--;   // the last sample of this pass is in its final place
+    }
+}
+
+/**
+  @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
new file mode 100644
index 0000000..09ae924
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
@@ -0,0 +1,117 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_heap_sort_f32.c
+ * Description:  Floating point heap sort
+ *
+ * $Date:        2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *               The heap sort algorithm is a comparison algorithm that
+   *               divides the input array into a sorted and an unsorted region, 
+   *               and shrinks the unsorted region by extracting the largest 
+   *               element and moving it to the sorted region. A heap data 
+   *               structure is used to find the maximum.
+   *
+   * @par          It's an in-place algorithm. In order to obtain an out-of-place
+   *               function, a memcpy of the source vector is performed.
+   */
+
+static void arm_heapify(float32_t * pSrc, uint32_t n, uint32_t i, uint8_t dir)
+{
+    /* Sifts pSrc[i] down within pSrc[0:n-1]; written as a loop, not a recursion */
+    for (;;)
+    {
+        uint32_t left  = 2*i + 1;
+        uint32_t right = 2*i + 2;
+        uint32_t pick  = i;   // index that wins the comparisons below
+        float32_t held;
+
+        if (left < n && dir==(pSrc[left] > pSrc[pick]) )
+            pick = left;
+        if (right < n && dir==(pSrc[right] > pSrc[pick]) )
+            pick = right;
+        if (pick == i)
+            return;           // neither child has to move up: done
+
+        held = pSrc[i];
+        pSrc[i] = pSrc[pick];
+        pSrc[pick] = held;
+        i = pick;             // carry on from the slot the value moved to
+    }
+}
+
+void arm_heap_sort_f32(
+  const arm_sort_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    /* The sort always runs in pDst, which is pSrc itself for in-place calls */
+    float32_t * pHeap = pDst;
+    float32_t   top;
+    int32_t     idx;
+
+    if(pHeap != pSrc) // out-of-place: work on a copy of the input
+        memcpy(pHeap, pSrc, blockSize*sizeof(float32_t) );
+
+    /* Phase 1: heapify every parent node, from the last one up to the root */
+    idx = blockSize/2 - 1;
+    while (idx >= 0)
+    {
+        arm_heapify(pHeap, blockSize, idx, S->dir);
+        idx--;
+    }
+
+    /* Phase 2: swap the root behind the shrinking heap, then repair the heap */
+    idx = blockSize - 1;
+    while (idx >= 0)
+    {
+        top = pHeap[0];
+        pHeap[0] = pHeap[idx];
+        pHeap[idx] = top;
+        arm_heapify(pHeap, idx, 0, S->dir);
+        idx--;
+    }
+}
+/**
+  @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
new file mode 100644
index 0000000..92394c9
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_insertion_sort_f32.c
+ * Description:  Floating point insertion sort
+ *
+ * $Date:        2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *               The insertion sort is a simple sorting algorithm that
+   *               reads all the elements of the input array and removes one element
+   *               at a time, finds the location it belongs in the final sorted list, 
+   *               and inserts it there. 
+   *
+   * @par          It's an in-place algorithm. In order to obtain an out-of-place
+   *               function, a memcpy of the source vector is performed.
+   */
+
+void arm_insertion_sort_f32(
+  const arm_sort_instance_f32 * S,
+        float32_t *pSrc,
+        float32_t* pDst,
+        uint32_t blockSize)
+{
+    /* The sort always runs in pDst, which is pSrc itself for in-place calls */
+    float32_t * pWork = pDst;
+    uint8_t order = S->dir;
+    uint32_t next, pos;
+    float32_t held;
+
+    if(pWork != pSrc) // out-of-place: sort a copy of the input
+        memcpy(pWork, pSrc, blockSize*sizeof(float32_t) );
+
+    // Read all the elements of the input array
+    for(next=0; next<blockSize; next++)
+    {
+        // Walk element 'next' towards the front while the comparison with
+        // its left neighbour matches the requested direction
+        pos = next;
+        while (pos>0 && order==(pWork[pos]<pWork[pos-1]))
+        {
+            // Exchange with the left neighbour
+            held = pWork[pos-1];
+            pWork[pos-1] = pWork[pos];
+            pWork[pos] = held;
+
+            pos--;
+        }
+    }
+}
+
+/**
+  @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
new file mode 100644
index 0000000..60cceca
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
@@ -0,0 +1,113 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_merge_sort_f32.c
+ * Description:  Floating point merge sort
+ *
+ * $Date:        2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *               The merge sort algorithm is a comparison algorithm that
+   *               divides the input array into sublists and merges them to produce
+   *               longer sorted sublists until there is only one list remaining.
+   * @par          A work array is always needed, hence pSrc and pDst cannot be
+   *               equal and the results will be stored in pDst.
+   */
+
+static void topDownMerge(float32_t * pA, uint32_t begin, uint32_t middle, uint32_t end, float32_t * pB, uint8_t dir); // defined below; declared before its first use
+
+/* Leaves pA[begin:end-1] sorted; pB must start as a copy of pA and is used as work area. */
+static void arm_merge_sort_core_f32(float32_t * pB, uint32_t begin, uint32_t end, float32_t * pA, uint8_t dir)
+{
+    if((int32_t)end - (int32_t)begin >= 2 )           // A run of one sample needs no sorting
+    {
+        uint32_t middle = begin + (end - begin) / 2;  // Middle point; begin+end could overflow
+        arm_merge_sort_core_f32(pA, begin,  middle, pB, dir);  // Left half, sorted into pB
+        arm_merge_sort_core_f32(pA, middle,    end, pB, dir);  // Right half, sorted into pB
+        topDownMerge(pB, begin, middle, end, pA, dir);         // Both halves merged into pA
+    }
+}
+
+static void topDownMerge(float32_t * pA, uint32_t begin, uint32_t middle, uint32_t end, float32_t * pB, uint8_t dir)
+{
+    /* Merges the two runs pA[begin:middle-1] and pA[middle:end-1]
+     * into pB[begin:end-1].
+     */
+    uint32_t left  = begin;    // next unread sample of the left run
+    uint32_t right = middle;   // next unread sample of the right run
+    uint32_t out   = begin;    // next slot to fill in pB
+
+    while (out < end)
+    {
+        uint8_t fromLeft = 0;
+
+        // The left head is taken when the right run is used up or when
+        // the comparison agrees with dir; an exhausted left run never is.
+        if (left < middle)
+        {
+            if (right >= end || dir==(pA[left] <= pA[right]))
+                fromLeft = 1;
+        }
+        if (fromLeft)
+            pB[out] = pA[left++];
+        else
+            pB[out] = pA[right++];
+        out++;
+    }
+}
+
+void arm_merge_sort_f32(
+  const arm_sort_instance_f32 * S,
+        float32_t *pSrc,
+        float32_t *pDst,
+        uint32_t blockSize)
+{
+    // In-place requests are ignored: the merge needs two distinct buffers
+    if(pSrc == pDst)
+        return;
+
+    // Both buffers start with the same data; the sorted output lands in pDst
+    memcpy(pDst, pSrc, blockSize*sizeof(float32_t));
+    arm_merge_sort_core_f32(pSrc, 0, blockSize, pDst, S->dir);
+}
+/**
+  @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
new file mode 100644
index 0000000..ad47310
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
@@ -0,0 +1,162 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_quick_sort_f32.c
+ * Description:  Floating point quick sort
+ *
+ * $Date:        2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *                The quick sort algorithm is a comparison algorithm that
+   *                divides the input array into two smaller sub-arrays and 
+   *                recursively sort them. An element of the array (the pivot)
+   *                is chosen, all the elements with values smaller than the 
+   *                pivot are moved before the pivot, while all elements with 
+   *                values greater than the pivot are moved after it (partition).
+   *
+   * @par
+   * 		   In this implementation the Hoare partition scheme has been 
+   * 		   used and the first element has always been chosen as the pivot.
+   *
+   * @par          It's an in-place algorithm. In order to obtain an out-of-place
+   *               function, a memcpy of the source vector is performed.
+   */
+
+/*
+ * Sorts pSrc[first..last] in place; both bounds are inclusive indexes.
+ * Does nothing when the range holds fewer than two samples.
+ *   pSrc   vector holding the samples
+ *   first  index of the first sample of the range
+ *   last   index of the last sample of the range (an index, not a length)
+ *   dir    non-zero sorts ascending, zero sorts descending
+ *
+ * The first sample is the pivot. Index i walks right over the samples that
+ * may stay before the pivot, index j walks left over those that may stay
+ * after it, and every out-of-order pair met on the way is exchanged. The
+ * pivot is then moved to slot j and both sides are sorted recursively.
+ *
+ * The scan of j carries no bound test: it stops on the pivot slot at the
+ * latest, since a sample never compares strictly greater (or smaller) than
+ * itself.
+ */
+static void arm_quick_sort_core_f32(float32_t *pSrc, uint32_t first, uint32_t last, uint8_t dir)
+{
+    uint32_t i, j, pivot;
+    float32_t temp;
+
+    if(first>=last)
+        return; // Zero or one sample: nothing to sort
+
+    pivot=first; // First element chosen as pivot
+    i=first;
+    j=last;
+
+    // Look for a pair of elements (one greater than the pivot, one
+    // smaller) that are in the wrong order relative to each other.
+    while(i<j)
+    {
+        if(dir)
+        {
+            // Compare left elements with pivot
+            while(pSrc[i]<=pSrc[pivot] && i<last)
+                i++; // Move to the right
+
+            // Compare right elements with pivot
+            while(pSrc[j]>pSrc[pivot])
+                j--; // Move to the left
+        }
+        else
+        {
+            // Descending order: same scans, comparisons mirrored
+            while(pSrc[i]>=pSrc[pivot] && i<last)
+                i++;
+
+            while(pSrc[j]<pSrc[pivot])
+                j--;
+        }
+
+        if(i<j)
+        {
+            // Swap i <-> j
+            temp=pSrc[i];
+            pSrc[i]=pSrc[j];
+            pSrc[j]=temp;
+        }
+    }
+
+    // Swap pivot <-> j
+    temp=pSrc[pivot];
+    pSrc[pivot]=pSrc[j];
+    pSrc[j]=temp;
+
+    // The indexes are unsigned. When the pivot ends in slot 'first' the left
+    // part is empty and j-1 must not be formed: for j == 0 it wraps to
+    // UINT32_MAX and the call would sort far beyond the end of the vector.
+    if(j>first)
+        arm_quick_sort_core_f32(pSrc, first, j-1,  dir);
+
+    // j+1 can be last+1 when the pivot ends in the last slot; the range test
+    // at the top of the function then returns at once.
+    arm_quick_sort_core_f32(pSrc, j+1,   last, dir);
+}
+
+void arm_quick_sort_f32(
+  const arm_sort_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+   float32_t * pA = pDst; // same buffer as pSrc for in-place calls
+
+   if(pSrc != pDst) // out-of-place: sort a copy held in pDst
+   {
+       memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+   }
+
+   // Fewer than two samples are already sorted. Skipping them also keeps
+   // "blockSize-1" from wrapping to UINT32_MAX when blockSize is 0.
+   if(blockSize > 1)
+       arm_quick_sort_core_f32(pA, 0, blockSize-1, S->dir);
+}
+
+/**
+  @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
new file mode 100644
index 0000000..4821696
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
@@ -0,0 +1,107 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_selection_sort_f32.c
+ * Description:  Floating point selection sort
+ *
+ * $Date:        2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+  @ingroup groupSupport
+ */
+
+/**
+  @addtogroup Sorting
+  @{
+ */
+
+/**
+   * @param[in]  S          points to an instance of the sorting structure.
+   * @param[in]  pSrc       points to the block of input data.
+   * @param[out] pDst       points to the block of output data
+   * @param[in]  blockSize  number of samples to process.
+   *
+   * @par        Algorithm
+   *               The Selection sort algorithm is a comparison algorithm that
+   *               divides the input array into a sorted and an unsorted sublist 
+   *               (initially the sorted sublist is empty and the unsorted sublist
+   *               is the input array), looks for the smallest (or biggest)
+   *               element in the unsorted sublist, swapping it with the leftmost
+   *               one, and moving the sublists boundary one element to the right.
+   *
+   * @par          It's an in-place algorithm. In order to obtain an out-of-place
+   *               function, a memcpy of the source vector is performed.
+   */
+
+void arm_selection_sort_f32(
+  const arm_sort_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t i, j, k;
+    uint8_t dir = S->dir;
+    float32_t temp;
+    float32_t * pA = pDst; /* same buffer as pSrc for in-place calls */
+
+    if(pSrc != pDst) // out-of-place: sort a copy held in pDst
+    {
+        memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+    }
+
+    /*  Move the boundary one element to the right.
+     *  The bound is written as i+1<blockSize rather than i<blockSize-1:
+     *  for blockSize == 0 the subtraction wraps around and the loop
+     *  would spin through the whole uint32_t range.
+     */
+    for (i=0; i+1<blockSize; i++)
+    {
+        /* Initialize the minimum/maximum as the first element */
+        k = i;
+
+        /* Look in the unsorted list to find the minimum/maximum value */
+        for (j=i+1; j<blockSize; j++)
+        {
+            if (dir==(pA[j] < pA[k]) )
+            {
+                /* Update value */
+                k = j;
+            }
+        }
+
+        if (k != i)
+        {
+            /* Swap the minimum/maximum with the leftmost element */
+            temp=pA[i];
+            pA[i]=pA[k];
+            pA[k]=temp;
+        }
+    }
+}
+
+/**
+  @} end of Sorting group
+ */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
new file mode 100644
index 0000000..5824b2d
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
@@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sort_f32.c
+ * Description:  Floating point sort
+ *
+ * $Date:        2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+/**
+ * @brief Sorts a vector with the algorithm selected in the instance structure.
+ * @param[in]  S          points to an instance of the sorting structure.
+ * @param[in]  pSrc       points to the block of input data.
+ * @param[out] pDst       points to the block of output data.
+ * @param[in]  blockSize  number of samples to process.
+ *
+ * An S->alg value that matches none of the cases leaves pSrc and pDst untouched.
+ */
+
+void arm_sort_f32(
+  const arm_sort_instance_f32 * S,
+        float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    switch(S->alg)
+    {
+        case ARM_SORT_BITONIC:
+        arm_bitonic_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+        case ARM_SORT_BUBBLE:
+        arm_bubble_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+        case ARM_SORT_HEAP:
+        arm_heap_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+        case ARM_SORT_INSERTION:
+        arm_insertion_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+        case ARM_SORT_MERGE:
+        arm_merge_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+        case ARM_SORT_QUICK:
+        arm_quick_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+        case ARM_SORT_SELECTION:
+        arm_selection_sort_f32(S, pSrc, pDst, blockSize);
+        break;
+        default:
+        /* Unknown algorithm identifier: nothing is sorted */
+        break;
+    }
+}
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
new file mode 100644
index 0000000..80f10a5
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_sort_init_f32.c
+ * Description:  Floating point sort initialization function
+ *
+ * $Date:        2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_sorting.h"
+
+void arm_sort_init_f32(arm_sort_instance_f32 * S, arm_sort_alg alg, arm_sort_dir dir)
+{
+    S->dir = dir; /* direction flag the sort kernels compare against */
+    S->alg = alg; /* algorithm id that arm_sort_f32() dispatches on  */
+}
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_f32.c
new file mode 100644
index 0000000..d70f2f0
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_f32.c
@@ -0,0 +1,330 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_spline_interp_f32.c
+ * Description:  Floating-point cubic spline interpolation
+ *
+ * $Date:        13 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupInterpolation
+ */
+
+/**
+ * @defgroup SplineInterpolate Cubic Spline Interpolation
+ *
+ * Spline interpolation is a method of interpolation where the interpolant
+ * is a piecewise-defined polynomial called "spline". 
+ *
+ * \par Introduction
+ * Given a function f defined on the interval [a,b], a set of n nodes x(i) 
+ * where a=x(1)<x(2)<...<x(n)=b and a set of n values y(i) = f(x(i)), 
+ * a cubic spline interpolant S(x) is defined as: <br>
+ * <pre>
+ *         S1(x)       x(1) < x < x(2)
+ * S(x) =   ...         
+ *         Sn-1(x)   x(n-1) < x < x(n)
+ * </pre><br>
+ * where<br>
+ * <pre> 
+ * Si(x) = a_i+b_i(x-xi)+c_i(x-xi)^2+d_i(x-xi)^3    i=1, ..., n-1
+ * </pre>
+ *
+ * \par Algorithm
+ * Having defined h(i) = x(i+1) - x(i)<br>
+ * <pre>
+ * h(i-1)c(i-1)+2[h(i-1)+h(i)]c(i)+h(i)c(i+1) = 3/h(i)*[a(i+1)-a(i)]-3/h(i-1)*[a(i)-a(i-1)]    i=2, ..., n-1
+ * </pre><br>
+ * It is possible to write the previous conditions in matrix form (Ax=B).<br>
+ * In order to solve the system two boundary conditions are needed.<br>
+ * - Natural spline: S1''(x1)=2*c(1)=0 ; Sn''(xn)=2*c(n)=0<br>
+ * In matrix form:<br>
+ * <pre>
+ * |  1        0         0  ...    0         0           0     ||  c(1)  | |                        0                        |
+ * | h(0) 2[h(0)+h(1)] h(1) ...    0         0           0     ||  c(2)  | |      3/h(2)*[a(3)-a(2)]-3/h(1)*[a(2)-a(1)]      |
+ * | ...      ...       ... ...   ...       ...         ...    ||  ...   |=|                       ...                       |
+ * |  0        0         0  ... h(n-2) 2[h(n-2)+h(n-1)] h(n-1) || c(n-1) | | 3/h(n-1)*[a(n)-a(n-1)]-3/h(n-2)*[a(n-1)-a(n-2)] |
+ * |  0        0         0  ...    0         0           1     ||  c(n)  | |                        0                        |
+ * </pre><br>
+ * - Parabolic runout spline: S1''(x1)=2*c(1)=S2''(x2)=2*c(2) ; Sn-1''(xn-1)=2*c(n-1)=Sn''(xn)=2*c(n)<br>
+ * In matrix form:<br>
+ * <pre>
+ * |  1       -1         0  ...    0         0           0     ||  c(1)  | |                        0                        |
+ * | h(0) 2[h(0)+h(1)] h(1) ...    0         0           0     ||  c(2)  | |      3/h(2)*[a(3)-a(2)]-3/h(1)*[a(2)-a(1)]      |
+ * | ...      ...       ... ...   ...       ...         ...    ||  ...   |=|                       ...                       |
+ * |  0        0         0  ... h(n-2) 2[h(n-2)+h(n-1)] h(n-1) || c(n-1) | | 3/h(n-1)*[a(n)-a(n-1)]-3/h(n-2)*[a(n-1)-a(n-2)] |
+ * |  0        0         0  ...    0        -1           1     ||  c(n)  | |                        0                        |
+ * </pre><br>
+ * A is a tridiagonal matrix (a band matrix of bandwidth 3) of size N=n+1. The factorization
+ * algorithms (A=LU) can be simplified considerably because a large number of zeros appear
+ * in regular patterns. The Crout method has been used:<br>
+ * 1) Solve LZ=B<br>
+ * <pre>
+ * u(1,2) = A(1,2)/A(1,1)
+ * z(1)   = B(1)/l(1,1)
+ *
+ * FOR i=2, ..., N-1
+ *   l(i,i)   = A(i,i)-A(i,i-1)u(i-1,i)
+ *   u(i,i+1) = A(i,i+1)/l(i,i)
+ *   z(i)     = [B(i)-A(i,i-1)z(i-1)]/l(i,i)
+ * 
+ * l(N,N) = A(N,N)-A(N,N-1)u(N-1,N)
+ * z(N)   = [B(N)-A(N,N-1)z(N-1)]/l(N,N)
+ * </pre><br>
+ * 2) Solve UX=Z<br>
+ * <pre>
+ * c(N)=z(N)
+ * 
+ * FOR i=N-1, ..., 1
+ *   c(i)=z(i)-u(i,i+1)c(i+1) 
+ * </pre><br>
+ * c(i) for i=1, ..., n-1 are needed to compute the n-1 polynomials. <br>
+ * b(i) and d(i) are computed as:<br>
+ * - b(i) = [y(i+1)-y(i)]/h(i)-h(i)*[c(i+1)+2*c(i)]/3 <br>
+ * - d(i) = [c(i+1)-c(i)]/[3*h(i)] <br>
+ * Moreover, a(i)=y(i).
+ * 
+ * \par Usage
+ * The x input array must be strictly sorted in ascending order and it must
+ * not contain twice the same value (x(i)<x(i+1)).
+ *
+ * \par
+ * It is possible to compute the interpolated vector for x values outside the 
+ * input range (xq<x(1); xq>x(n)). The coefficients used to compute the y values for
+ * xq<x(1) are going to be the ones used for the first interval, while for xq>x(n) the 
+ * coefficients used for the last interval.
+ *
+ */
+/**
+  @addtogroup SplineInterpolate
+  @{
+ */
+/**
+ * @brief Processing function for the floating-point cubic spline interpolation.
+ * @param[in]  S          points to an instance of the floating-point spline structure.
+ * @param[in]  x          points to the x values of the known data points.
+ * @param[in]  y          points to the y values of the known data points.
+ * @param[in]  xq         points to the x values of the interpolated data points.
+ * @param[out] pDst       points to the block of output data.
+ * @param[in]  blockSize  number of samples of output data.
+ */
+
+void arm_spline_f32(
+        arm_spline_instance_f32 * S,
+  const float32_t * x,
+  const float32_t * y,
+  const float32_t * xq,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+    uint32_t n = S->n_x;
+    arm_spline_type type = S->type;
+
+    /* An interval needs two known points. Leaving here also keeps the work
+     * arrays below from being sized with a wrapped-around n-1. */
+    if(n < 2)
+        return;
+
+    float32_t hi, hm1;
+
+    /* Temporary variables for system AX=B */
+    float32_t u[n-1], z[n], c[n];
+    float32_t Bi, li;
+
+    float32_t bi, di;
+    float32_t x_sc;
+
+    const float32_t * pXq = xq;
+
+    int32_t blkCnt = (int32_t)blockSize;
+    int32_t blkCnt2;
+    int32_t i, j;
+
+#ifdef ARM_MATH_NEON
+    float32x4_t xiv;
+    float32x4_t aiv;
+    float32x4_t biv;
+    float32x4_t civ;
+    float32x4_t div;
+    float32x4_t xqv;
+    float32x4_t temp;
+    float32x4_t diff;
+    float32x4_t yv;
+#endif
+
+    /* === Solve LZ=B to obtain z(i) and u(i) === */
+    /* --- Row 1 --- */
+    /* B(0) = 0, not computed */
+    /* u(1,2) = a(1,2)/a(1,1) = a(1,2) */
+    if(type == ARM_SPLINE_PARABOLIC_RUNOUT)
+        u[0] = -1; // a(1,2) = -1
+    else
+        u[0] = 0;  // a(1,2) = 0: natural spline, also used for any other type value
+
+    z[0] = 0;  // z(1) = B(1)/a(1,1) = 0
+
+    /* --- Rows 2 to N-1, i.e. i = 1 .. n-2 --- */
+    hm1 = x[1] - x[0]; // x(2)-x(1)
+
+    for (i=1; i<n-1; i++)
+    {
+        /* Compute B(i) */
+        hi = x[i+1]-x[i];
+        Bi = 3*(y[i+1]-y[i])/hi - 3*(y[i]-y[i-1])/hm1;
+        /* l(i) = a(ii)-a(i,i-1)*u(i-1) = 2[h(i-1)+h(i)]-h(i-1)*u(i-1) */
+        li = 2*(hi+hm1) - hm1*u[i-1];
+        /* u(i) = a(i,i+1)/l(i) = h(i)/l(i) */
+        u[i] = hi/li;
+        /* z(i) = [B(i)-h(i-1)*z(i-1)]/l(i) */
+        z[i] = (Bi-hm1*z[i-1])/li;
+        /* Update h(i-1) */
+        hm1 = hi;
+    }
+
+    /* --- Row N --- */
+    /* l(N) = a(N,N)-a(N,N-1)u(N-1) */
+    /* z(N) = [-a(N,N-1)z(N-1)]/l(N) */
+    if(type == ARM_SPLINE_PARABOLIC_RUNOUT)
+    {
+        li = 1+u[n-2];      // a(N,N)=1; a(N,N-1)=-1
+        z[n-1] = z[n-2]/li; // a(N,N-1)=-1
+    }
+    else
+    {
+        /* Natural spline and fallback: a(N,N)=1; a(N,N-1)=0, hence l(N)=1 */
+        z[n-1] = 0; // a(N,N-1)=0
+    }
+
+    /* === Solve UX = Z to obtain c(i) === */
+    /* c(N) = z(N) */
+    c[n-1] = z[n-1];
+    /* c(i) = z(i)-u(i,i+1)c(i+1) */
+    for (j = n-1-1; j >= 0; --j)
+        c[j] = z[j] - u[j] * c[j + 1];
+
+    /* === Compute b(i) and d(i) from c(i) and create output for x(i)<x<x(i+1) === */
+    for (i=0; i<n-1; i++)
+    {
+        hi = x[i+1]-x[i];
+        bi = (y[i+1]-y[i])/hi-hi*(c[i+1]+2*c[i])/3;
+        di = (c[i+1]-c[i])/(3*hi);
+
+#ifdef ARM_MATH_NEON
+        xiv = vdupq_n_f32(x[i]);
+        aiv = vdupq_n_f32(y[i]);
+        biv = vdupq_n_f32(bi);
+        civ = vdupq_n_f32(c[i]);
+        div = vdupq_n_f32(di);
+
+        /* The count is tested first so that xq is not read past its end */
+        while( blkCnt > 4 && *(pXq+4) <= x[i+1] )
+        {
+            /* Load [xq(k) xq(k+1) xq(k+2) xq(k+3)] */
+            xqv = vld1q_f32(pXq);
+            pXq+=4;
+
+            /* Compute [xq(k)-x(i) xq(k+1)-x(i) xq(k+2)-x(i) xq(k+3)-x(i)] */
+            diff = vsubq_f32(xqv, xiv);
+            temp = diff;
+
+            /* y(i) = a(i) + ... */
+            yv = aiv;
+            /* ... + b(i)*(x-x(i)) + ... */
+            yv = vmlaq_f32(yv, biv, temp);
+            /* ... + c(i)*(x-x(i))^2 + ... */
+            temp = vmulq_f32(temp, diff);
+            yv = vmlaq_f32(yv, civ, temp);
+            /* ... + d(i)*(x-x(i))^3 */
+            temp = vmulq_f32(temp, diff);
+            yv = vmlaq_f32(yv, div, temp);
+
+            /* Store [y(k) y(k+1) y(k+2) y(k+3)] */
+            vst1q_f32(pDst, yv);
+            pDst+=4;
+            blkCnt-=4;
+        }
+#endif
+        /* The count is tested first so that xq is not read past its end */
+        while( blkCnt > 0 && *pXq <= x[i+1] )
+        {
+            x_sc = *pXq++ - x[i]; // xq(k)-x(i)
+            *pDst = y[i]+bi*x_sc+c[i]*x_sc*x_sc+di*x_sc*x_sc*x_sc;
+
+            pDst++;
+            blkCnt--;
+        }
+    }
+
+    /* == Create output for remaining samples (x>=x(n)) == */
+#ifdef ARM_MATH_NEON
+    /* Compute 4 outputs at a time */
+    blkCnt2 = blkCnt >> 2;
+    while(blkCnt2 > 0)
+    {
+        /* Load [xq(k) xq(k+1) xq(k+2) xq(k+3)] */
+        xqv = vld1q_f32(pXq);
+        pXq+=4;
+
+        /* Compute [xq(k)-x(i) xq(k+1)-x(i) xq(k+2)-x(i) xq(k+3)-x(i)] */
+        diff = vsubq_f32(xqv, xiv);
+        temp = diff;
+
+        /* y(i) = a(i) + ... */
+        yv = aiv;
+        /* ... + b(i)*(x-x(i)) + ... */
+        yv = vmlaq_f32(yv, biv, temp);
+        /* ... + c(i)*(x-x(i))^2 + ... */
+        temp = vmulq_f32(temp, diff);
+        yv = vmlaq_f32(yv, civ, temp);
+        /* ... + d(i)*(x-x(i))^3 */
+        temp = vmulq_f32(temp, diff);
+        yv = vmlaq_f32(yv, div, temp);
+
+        /* Store [y(k) y(k+1) y(k+2) y(k+3)] */
+        vst1q_f32(pDst, yv);
+        pDst+=4;
+        blkCnt2--;
+    }
+
+    /* Tail */
+    blkCnt2 = blkCnt & 3;
+#else
+    blkCnt2 = blkCnt;
+#endif
+
+    while(blkCnt2 > 0)
+    {
+        x_sc = *pXq++ - x[i-1];
+        /* i == n-1 after the loop above: every coefficient of the last interval, c included, sits at i-1 */
+        *pDst = y[i-1]+bi*x_sc+c[i-1]*x_sc*x_sc+di*x_sc*x_sc*x_sc;
+
+        pDst++;
+        blkCnt2--;
+    }
+}
+
+/**
+ *   @} end of SplineInterpolate group
+ *    */
diff --git a/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_init_f32.c b/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_init_f32.c
new file mode 100644
index 0000000..f9b5ee9
--- /dev/null
+++ b/CMSIS/DSP/Source/SupportFunctions/arm_spline_interp_init_f32.c
@@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_spline_interp_init_f32.c
+ * Description:  Floating-point cubic spline initialization function
+ *
+ * $Date:        13 November 2019
+ * $Revision:    V1.6.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupInterpolation
+ */
+
+/**
+  @addtogroup SplineInterpolate
+  @{
+ */
+
+/**
+  * @brief Initializes an instance of the floating-point cubic spline structure.
+  * @param[in,out] S        spline instance whose n_x and type fields are written.
+  * @param[in]     n        number of known data points; stored in S->n_x.
+  * @param[in]     type     boundary-condition selector; stored in S->type.
+  */
+
+void arm_spline_init_f32(
+    arm_spline_instance_f32 * S,
+    uint32_t n,
+    arm_spline_type type)
+{
+    /* Boundary conditions selected through 'type':
+     *   0: Natural spline          ( S1''(x1)=0 ; Sn''(xn)=0 )
+     *   1: Parabolic runout spline ( S1''(x1)=S2''(x2) ; Sn-1''(xn-1)=Sn''(xn) )
+     */
+    S->type = type;
+
+    S->n_x  = n;
+}
+
+/**
+ *   @} end of SplineInterpolate group
+ */
diff --git a/CMSIS/DSP/Source/configDsp.cmake b/CMSIS/DSP/Source/configDsp.cmake
index 08a4d70..32c11a0 100644
--- a/CMSIS/DSP/Source/configDsp.cmake
+++ b/CMSIS/DSP/Source/configDsp.cmake
@@ -30,8 +30,8 @@
     target_compile_definitions(${project} PRIVATE ARM_MATH_FLOAT16) 
 endif()
 
-if (HELIUM OR MVEF)
+if (HELIUM OR MVEF OR SUPPORT)
    target_include_directories(${project} PRIVATE "${root}/CMSIS/DSP/PrivateInclude")
 endif()
 
-endfunction()
\ No newline at end of file
+endfunction()