CMSIS-DSP: Improvement to testing scripts
diff --git a/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h b/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h
index 65fc752..b2a0690 100755
--- a/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h
+++ b/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h
@@ -253,291 +253,223 @@
     acc1 = vecAddAcrossF32Mve(acc1Vec);                                 \
 }
 
-#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    float32_t const *pSrcX;                                                                         \
-    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                       \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    acc0Vec = vdupq_n_f32(0.0f);                                                                    \
-    acc1Vec = vdupq_n_f32(0.0f);                                                                    \
-    pSrcX = (float32_t const *) pX;                                                                 \
-    k = (count - 1) >> 2;                                                                           \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vldrwq_f32(&pSrcX[1]);                                                               \
-        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                   \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                   \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = (count - 1) % 0x4U;                                                                         \
-    mve_pred16_t p0 = vctp32q(k);                                                        \
-    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                        \
-    xVec = vldrwq_f32(&pSrcX[1]);                                                                   \
-    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                                 \
-    xVec = vld1q(pSrcX);  pSrcX += 4;                                                   \
-    p0 = vctp32q(k+1);                                                                   \
-    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                                 \
-                                                                                                    \
-    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                             \
-    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                             \
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count)                             \
+{                                                                                                    \
+    float32_t const *pSrcX;                                                                          \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                          \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    acc0Vec = vdupq_n_f32(0.0f);                                                                     \
+    acc1Vec = vdupq_n_f32(0.0f);                                                                     \
+    pSrcX = (float32_t const *) pX;                                                                  \
+    k = (count - 1) >> 2;                                                                            \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
+        pY-=4;                                                                                       \
+        xVec = vldrwq_f32(&pSrcX[1]);                                                                \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                    \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = (count - 1) % 0x4U;                                                                          \
+    mve_pred16_t p0 = vctp32q(k);                                                                    \
+    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                         \
+    xVec = vldrwq_f32(&pSrcX[1]);                                                                    \
+    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                                  \
+    xVec = vld1q(pSrcX);  pSrcX += 4;                                                                \
+    p0 = vctp32q(k+1);                                                                               \
+    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                                  \
+                                                                                                     \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                              \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                              \
 }
 
-#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count)                          \
-{                                                                                                   \
-    float32_t const *pSrcX;                                                                         \
-    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                       \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    acc0Vec = vdupq_n_f32(0.0f);                                                                    \
-    acc1Vec = vdupq_n_f32(0.0f);                                                                    \
-    pSrcX = (float32_t const *) pX;                                                                 \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vldrwq_f32(&pSrcX[1]);                                                               \
-        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                   \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                   \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        xVec = vldrwq_f32(&pSrcX[1]);                                                               \
-        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                             \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                             \
-    }                                                                                               \
-    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                             \
-    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                             \
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count)                           \
+{                                                                                                    \
+    float32_t const *pSrcX;                                                                          \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                          \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    acc0Vec = vdupq_n_f32(0.0f);                                                                     \
+    acc1Vec = vdupq_n_f32(0.0f);                                                                     \
+    pSrcX = (float32_t const *) pX;                                                                  \
+    k = count >> 2;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
+        pY-=4;                                                                                       \
+        xVec = vldrwq_f32(&pSrcX[1]);                                                                \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                    \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x4U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp32q(k);                                                                \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
+        xVec = vldrwq_f32(&pSrcX[1]);                                                                \
+        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                              \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
+        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                              \
+    }                                                                                                \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                              \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                              \
 }
 
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    float32_t   const *pSrcX;                                                                       \
-    const float32_t  *pY1 = pY + 1;                                                                       \
-    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                                                       \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    acc0Vec = vdupq_n_f32(0.0f);                                                                    \
-    acc1Vec = vdupq_n_f32(0.0f);                                                                    \
-    pSrcX = (float32_t const *) pX;                                                                 \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                                                   \
-        yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);                                   \
-        pY1-=4;                                                                                     \
-        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                                                   \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x4U;                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    mve_pred16_t p0 = vctp32q(k);                                                        \
-    xVec = vld1q(pSrcX);  pSrcX += 4;                                                   \
-    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                        \
-    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                                                 \
-    yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);                                       \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp32q(k+1);                                                                   \
-    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                                                 \
-                                                                                                    \
-    acc0 = vecAddAcrossF32Mve(acc0Vec);                                                             \
-    acc1 = vecAddAcrossF32Mve(acc1Vec);                                                             \
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)\
+{                                                                       \
+    float32_t   const *pSrcX;                                           \
+    const float32_t  *pY1 = pY + 1;                                     \
+    f32x4_t   acc0Vec, acc1Vec, xVec, yVec;                             \
+    uint32_t    k;                                                      \
+                                                                        \
+    acc0Vec = vdupq_n_f32(0.0f);                                        \
+    acc1Vec = vdupq_n_f32(0.0f);                                        \
+    pSrcX = (float32_t const *) pX;                                     \
+    k = count >> 2;                                                     \
+                                                                        \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                               \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);        \
+        pY-=4;                                                          \
+        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec);                       \
+        yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);       \
+        pY1-=4;                                                         \
+        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec);                       \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    k = count % 0x4U;                                                   \
+    /* use predication to finalize MAC sum */                           \
+    /* acc0 requires exact number of samples */                        \
+    /* disable extra lanes in final MAC computation  */                 \
+    mve_pred16_t p0 = vctp32q(k);                                       \
+    xVec = vld1q(pSrcX);  pSrcX += 4;                                   \
+    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);            \
+    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0);                     \
+    yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec);           \
+    /* acc1 requires 1 additional sample  */                            \
+    /* so add 1 to unmask an extra lane in final MAC computation   */   \
+    p0 = vctp32q(k+1);                                                  \
+    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0);                     \
+                                                                        \
+    acc0 = vecAddAcrossF32Mve(acc0Vec);                                 \
+    acc1 = vecAddAcrossF32Mve(acc1Vec);                                 \
 }
 
-#define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count)                                                \
-{                                                                                                   \
-    float32_t const *pSrcX;                                                                         \
-    f32x4_t   accVec, xVec, yVec;                                                                 \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    accVec = vdupq_n_f32(0.0f);                                                                     \
-    pSrcX = (float32_t const *) pX;                                                                 \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        accVec = vfmaq_f32(accVec, xVec, yVec);                                                     \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        xVec = vld1q(pSrcX);  pSrcX += 4;                                               \
-        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                    \
-        accVec = vfmaq_m_f32(accVec, xVec, yVec, p0);                                               \
-    }                                                                                               \
-    acc = vecAddAcrossF32Mve(accVec);                                                               \
+#define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count)                                                 \
+{                                                                                                    \
+    float32_t const *pSrcX;                                                                          \
+    f32x4_t   accVec, xVec, yVec;                                                                    \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    accVec = vdupq_n_f32(0.0f);                                                                      \
+    pSrcX = (float32_t const *) pX;                                                                  \
+    k = count >> 2;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
+        pY-=4;                                                                                       \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
+        accVec = vfmaq_f32(accVec, xVec, yVec);                                                      \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x4U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp32q(k);                                                                \
+        xVec = vld1q(pSrcX);  pSrcX += 4;                                                            \
+        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec);                                     \
+        accVec = vfmaq_m_f32(accVec, xVec, yVec, p0);                                                \
+    }                                                                                                \
+    acc = vecAddAcrossF32Mve(accVec);                                                                \
 }
 
 #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
 
 #if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
 
-#define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count)                                                \
-{                                                                                                   \
-    q31_t const *pSrcX;                                                                             \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
-    }                                                                                               \
-    acc = asrl(acc, 31);                                                                            \
+#define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count)                                                 \
+{                                                                                                    \
+    q31_t const *pSrcX;                                                                              \
+    q31x4_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q31_t const *) pX;                                                                      \
+    k = count >> 2;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
+        pY-=4;                                                                                       \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        acc = vmlaldavaq(acc, xVec, yVec);                                                           \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x4U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp32q(k);                                                                \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                     \
+    }                                                                                                \
+    acc = asrl(acc, 31);                                                                             \
 }
 
 
 
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q31_t const *pSrcX;                                                                             \
-    const q31_t       *pY1 = pY + 1;                                                                      \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);                                   \
-        pY1-=4;                                                                                     \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x4U;                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    mve_pred16_t p0 = vctp32q(k);                                                        \
-    xVec = vld1q(pSrcX); pSrcX += 4;                                                    \
-    yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                        \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
-    yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);                                       \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp32q(k+1);                                                                   \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
-                                                                                                    \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
-}
-
-
-
-
-#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)\
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\
 {                                                                       \
     q31_t const *pSrcX;                                                 \
+    const q31_t       *pY1 = pY + 1;                                    \
     q31x4_t   xVec, yVec;                                               \
     uint32_t    k;                                                      \
                                                                         \
     pSrcX = (q31_t const *) pX;                                         \
-    k = (count-1) >> 2;                                                 \
+    k = count >> 2;                                                     \
                                                                         \
     while (k > 0U)                                                      \
     {                                                                   \
-        /* note */                                                      \
-        /* could can be more efficient using Vector Scatter Store: */   \
-        /* + pre-increment + WB */                                      \
-        /* To be revisited when intrinsic available */                  \
-        /* SDCOMP-52618 */                                              \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                \
         yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);        \
         pY-=4;                                                          \
-        xVec = vldrwq_s32(&pSrcX[1]);                                   \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
-        xVec = vld1q(pSrcX);                                            \
-        pSrcX += 4;                                                     \
         acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);       \
+        pY1-=4;                                                         \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
         /*  Decrement the loop counter   */                             \
         k--;                                                            \
     }                                                                   \
-    k = (count - 1) % 0x4U;                                             \
+    k = count % 0x4U;                                                   \
     /* use predication to finalize MAC sum */                           \
-    /* acc1 requires exact number of sample (count-1)  */               \
+    /* acc0 requires exact number of samples */                         \
     /* disable extra lanes in final MAC computation  */                 \
     mve_pred16_t p0 = vctp32q(k);                                       \
+    xVec = vld1q(pSrcX); pSrcX += 4;                                    \
     yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);            \
-    xVec = vldrwq_s32(&pSrcX[1]);                                       \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
-    /* acc0 requires 1 additional sample  (count) */                    \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
+    yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec);           \
+    /* acc1 requires 1 additional sample  */                            \
     /* so add 1 to unmask an extra lane  in final MAC computation  */   \
     p0 = vctp32q(k+1);                                                  \
-    xVec = vld1q(pSrcX);                                                \
-    pSrcX += 4;                                                         \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
                                                                         \
     acc0 = asrl(acc0, 31);                                              \
     acc1 = asrl(acc1, 31);                                              \
@@ -545,1110 +477,1103 @@
 
 
 
-#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)                          \
-{                                                                                                   \
-    q31_t const *pSrcX;                                                                             \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
+
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count) \
+{                                                                        \
+    q31_t const *pSrcX;                                                  \
+    q31x4_t   xVec, yVec;                                                \
+    uint32_t    k;                                                       \
+                                                                         \
+    pSrcX = (q31_t const *) pX;                                          \
+    k = (count-1) >> 2;                                                  \
+                                                                         \
+    while (k > 0U)                                                       \
+    {                                                                    \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);         \
+        pY-=4;                                                           \
+        xVec = vldrwq_s32(&pSrcX[1]);                                    \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                             \
+        xVec = vld1q(pSrcX);                                             \
+        pSrcX += 4;                                                      \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                             \
+        /*  Decrement the loop counter   */                              \
+        k--;                                                             \
+    }                                                                    \
+    k = (count - 1) % 0x4U;                                              \
+    /* use predication to finalize MAC sum */                            \
+    /* acc1 requires exact number of samples (count-1) */                \
+    /* disable extra lanes in final MAC computation  */                  \
+    mve_pred16_t p0 = vctp32q(k);                                        \
+    yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);             \
+    xVec = vldrwq_s32(&pSrcX[1]);                                        \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                           \
+    /* acc0 requires 1 additional sample  (count) */                     \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */    \
+    p0 = vctp32q(k+1);                                                   \
+    xVec = vld1q(pSrcX);                                                 \
+    pSrcX += 4;                                                          \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                           \
+                                                                         \
+    acc0 = asrl(acc0, 31);                                               \
+    acc1 = asrl(acc1, 31);                                               \
 }
 
 
 
-#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)              \
-{                                                                                                   \
-    q31_t const *pSrcX;                                                                             \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        pY-=4;                                                                                      \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
-        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                    \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
-        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
-    acc2 = asrl(acc2, 31);                                                                          \
-    acc3 = asrl(acc3, 31);                                                                          \
-}
-
-#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q31_t const *pSrcX, *pSrcY;                                                                     \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    pSrcY  = (q31_t const *) pY;                                                                    \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        yVec = vldrwq_s32(&pSrcY[-1]);                                                              \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x4U;                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    mve_pred16_t p0 = vctp32q(k+1);                                                      \
-    xVec = vld1q(pSrcX); pSrcX += 4;                                                    \
-    yVec = vldrwq_s32(&pSrcY[-1]);                                                                  \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0);                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    p0 = vctp32q(k);                                                                     \
-    yVec = vld1q(pSrcY);  pSrcY += 4;                                                   \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0);                                                       \
-                                                                                                    \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
-}
-
-#define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count)                                                \
-{                                                                                                   \
-    q31_t const *pSrcX, *pSrcY;                                                                     \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    pSrcY = (q31_t const *) pY;                                                                     \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /*  tail predication expected here  */                                                          \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
-    }                                                                                               \
-    acc = asrl(acc, 31);                                                                            \
-}
-
-#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)              \
-{                                                                                                   \
-    q31_t const *pSrcX, *pSrcY;                                                                     \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    pSrcY  = (q31_t const *) pY;                                                                    \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
-        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* loop + tail predication expected here  */                                                    \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vldrwq_s32(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
-        xVec = vldrwq_s32(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-                                                                                                    \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
-    acc2 = asrl(acc2, 31);                                                                          \
-    acc3 = asrl(acc3, 31);                                                                          \
-}
-
-#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)                          \
-{                                                                                                   \
-    q31_t const *pSrcX, *pSrcY;                                                                     \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    pSrcY  = (q31_t const *) pY;                                                                    \
-    k = count >> 2;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* loop + tail predication expected here  */                                                    \
-    k = count % 0x4U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp32q(k);                                                    \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-                                                                                                    \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
-}
-
-#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q31_t const *pSrcX, *pSrcY;                                                                     \
-    q31x4_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q31_t const *) pX;                                                                     \
-    pSrcY  = (q31_t const *) pY;                                                                    \
-    k = (count-1) >> 2;                                                                             \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY); pSrcY += 4;                                                \
-        xVec = vldrwq_s32(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX); pSrcX += 4;                                                \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires exact number of sample (count-1)  */                                           \
-    /* disable extra lanes in final MAC computation  */                                             \
-    k = (count-1) % 0x4U;                                                                           \
-    mve_pred16_t p0 = vctp32q(k);                                                        \
-    yVec = vld1q(pSrcY);  pSrcY += 4;                                                   \
-    xVec = vldrwq_s32(&pSrcX[1]);                                                                   \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
-    /* acc0 requires 1 additional sample  (count) */                                                \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp32q(k+1);                                                                   \
-    xVec = vld1q(pSrcX); pSrcX += 4;                                                    \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
-                                                                                                    \
-    acc0 = asrl(acc0, 31);                                                                          \
-    acc1 = asrl(acc1, 31);                                                                          \
-}
-
-#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q15_t const *pSrcX, *pSrcY;                                                                     \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    pSrcY  = (q15_t const *) pY;                                                                    \
-    k = count >> 3;                                                                                 \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        yVec = vldrhq_s16(&pSrcY[-1]);                                                              \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x8U;                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    mve_pred16_t p0 = vctp16q(k+1);                                                      \
-    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
-    yVec = vldrhq_s16(&pSrcY[-1]);                                                                  \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0);                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    p0 = vctp16q(k);                                                                     \
-    yVec = vld1q(pSrcY);  pSrcY += 8;                                                   \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0);                                                       \
-                                                                                                    \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
-}
-
-#define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count)                                                \
-{                                                                                                   \
-    q15_t const *pSrcX, *pSrcY;                                                                     \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    pSrcY = (q15_t const *) pY;                                                                     \
-    k = count >> 3;                                                                                 \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /*  tail predication expected here  */                                                          \
-    k = count % 0x8U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp16q(k);                                                    \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
-    }                                                                                               \
-    acc = asrl(acc, 15);                                                                            \
-    acc = __SSAT(acc, 16);                                                                          \
-}
-
-#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)              \
-{                                                                                                   \
-    q15_t const *pSrcX, *pSrcY;                                                                     \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    pSrcY  = (q15_t const *) pY;                                                                    \
-    k = count >> 3;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
-        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* loop + tail predication expected here  */                                                    \
-    k = count % 0x8U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp16q(k);                                                    \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
-        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
-        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
-        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-                                                                                                    \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc2 = asrl(acc2, 15);                                                                          \
-    acc3 = asrl(acc3, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
-    acc2 = __SSAT(acc2, 16);                                                                        \
-    acc3 = __SSAT(acc3, 16);                                                                        \
-}
-
-#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)                          \
-{                                                                                                   \
-    q15_t const *pSrcX, *pSrcY;                                                                     \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    pSrcY  = (q15_t const *) pY;                                                                    \
-    k = count >> 3;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* loop + tail predication expected here  */                                                    \
-    k = count % 0x8U;                                                                               \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp16q(k);                                                    \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
-    }                                                                                               \
-                                                                                                    \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
-}
-
-#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q15_t const *pSrcX, *pSrcY;                                                                     \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    pSrcY  = (q15_t const *) pY;                                                                    \
-    k = (count-1) >> 3;                                                                             \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 8;                                               \
-        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires exact number of sample (count-1)  */                                           \
-    /* disable extra lanes in final MAC computation  */                                             \
-    k = (count-1) % 0x8U;                                                                           \
-    mve_pred16_t p0 = vctp16q(k);                                                        \
-    yVec = vld1q(pSrcY);  pSrcY += 8;                                                   \
-    xVec = vldrhq_s16(&pSrcX[1]);                                                                   \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
-    /* acc0 requires 1 additional sample  (count) */                                                \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp16q(k+1);                                                                   \
-    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
-                                                                                                    \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
-}
-
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
-{                                                                                                   \
-    q15_t const *pSrcX;                                                                             \
-    const q15_t       *pY1 = pY + 1;                                                                      \
-    q15x8_t   xVec, yVec;                                                                         \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q15_t const *) pX;                                                                     \
-    k = count >> 3;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
-        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
-        pY-=8;                                                                                      \
-        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
-        yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);                                   \
-        pY1-=8;                                                                                     \
-        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x8U;                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    mve_pred16_t p0 = vctp16q(k);                                                        \
-    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
-    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                        \
-    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
-    yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);                                       \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp16q(k+1);                                                                   \
-    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
-                                                                                                    \
-    acc0 = asrl(acc0, 15);                                                                          \
-    acc1 = asrl(acc1, 15);                                                                          \
-    acc0 = __SSAT(acc0, 16);                                                                        \
-    acc1 = __SSAT(acc1, 16);                                                                        \
-}
-
/**
 * Q15 convolution MAC kernel (single output) using Arm Helium/MVE intrinsics.
 *
 * Accumulates `count` Q15 products into the 64-bit accumulator `acc`:
 * x is read forward in blocks of 8 halfwords, y is read through a gather
 * load indexed by `decrIdxVec` with pY decremented by 8 per block, i.e.
 * y is consumed in reverse order as convolution requires. The tail
 * (count % 8 samples) is handled with a VCTP predicate so no out-of-range
 * lane contributes to the sum. Finally the accumulator is narrowed to Q15
 * (arithmetic shift right by 15, saturate to 16 bits).
 *
 * Macro arguments (expanded by name, modified in place):
 *   acc   - 64-bit accumulator; overwritten with the saturated Q15 result.
 *   pX    - x operand pointer; read via a local copy, caller's pointer untouched.
 *   pY    - y operand pointer; DECREMENTED by this macro (8 per full vector).
 *   count - number of Q15 samples to accumulate.
 *
 * NOTE(review): `decrIdxVec` must be declared at the expansion site;
 * presumably a descending offset vector {7..0} so the gather reverses y —
 * confirm against callers in arm_conv_q15.c.
 */
#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count)                                                \
{                                                                                                   \
    q15_t const *pSrcX;                                                                             \
    q15x8_t   xVec, yVec;                                                                         \
    uint32_t    k;                                                                                  \
                                                                                                    \
    pSrcX = (q15_t const *) pX;                                                                     \
    k = count >> 3;                                                                                 \
                                                                                                    \
    while (k > 0U)                                                                                  \
    {                                                                                               \
        /* note */                                                                                  \
        /* could can be more efficient using Vector Scatter Store: */                               \
        /* + pre-increment + WB */                                                                  \
        /* To be revisited when intrinsic available */                                              \
        /* SDCOMP-52618 */                                                                          \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
        pY-=8;                                                                                      \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
        acc = vmlaldavaq(acc, xVec, yVec);                                                          \
        /*  Decrement the loop counter   */                                                         \
        k--;                                                                                        \
    }                                                                                               \
    /* Loop with tail predication expected here  */                                                 \
    k = count % 0x8U;                                                                               \
    if (k > 0U)                                                                                     \
    {                                                                                               \
        /* predicate disables the (8 - k) surplus lanes of the last block */                        \
        mve_pred16_t p0 = vctp16q(k);                                                    \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                    \
    }                                                                                               \
    /* 64-bit accumulator -> Q15: shift out the fractional product bits, saturate */                \
    acc = asrl(acc, 15);                                                                            \
    acc = __SSAT(acc, 16);                                                                          \
}
-
/**
 * Q15 convolution MAC kernel producing FOUR adjacent outputs per pass
 * (Helium/MVE). The four accumulators acc0..acc3 correspond to x windows
 * starting at offsets 0..3 (loads from pSrcX, &pSrcX[1], &pSrcX[2],
 * &pSrcX[3]); all four share the same reversed y vector, amortizing the
 * gather load across outputs. Each accumulator sums exactly `count`
 * products ("FIXED_SIZE"); the tail applies one common predicate
 * vctp16q(count % 8) to all four MACs. Results are narrowed to Q15
 * (asrl 15 + SSAT 16).
 *
 * Macro arguments (modified in place):
 *   acc0..acc3 - 64-bit accumulators; overwritten with saturated Q15 results.
 *   pX         - x operand pointer; read via a local copy.
 *   pY         - y operand pointer; DECREMENTED by this macro.
 *   count      - number of Q15 samples per accumulator.
 *
 * NOTE(review): requires `decrIdxVec` declared at the expansion site
 * (descending gather offsets, presumably {7..0}) — confirm against callers.
 */
#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)              \
{                                                                                                   \
    q15_t const *pSrcX;                                                                             \
    q15x8_t   xVec, yVec;                                                                         \
    uint32_t    k;                                                                                  \
                                                                                                    \
    pSrcX = (q15_t const *) pX;                                                                     \
    k = count >> 3;                                                                                 \
                                                                                                    \
    while (k > 0U)                                                                                  \
    {                                                                                               \
        /* note */                                                                                  \
        /* could can be more efficient using Vector Scatter Store: */                               \
        /* + pre-increment + WB */                                                                  \
        /* To be revisited when intrinsic available */                                              \
        /* SDCOMP-52618 */                                                                          \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
        pY-=8;                                                                                      \
        /* unaligned x loads at +1/+2/+3 reuse yVec for the 3 shifted outputs */                    \
        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                        \
        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                        \
        /* base-offset load last: it post-increments pSrcX for the next block */                    \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
        /*  Decrement the loop counter   */                                                         \
        k--;                                                                                        \
    }                                                                                               \
    /* Loop with tail predication expected here  */                                                 \
    k = count % 0x8U;                                                                               \
    if (k > 0U)                                                                                     \
    {                                                                                               \
        /* same predicate for all four MACs: each output needs exactly k more samples */            \
        mve_pred16_t p0 = vctp16q(k);                                                    \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
        xVec = vldrhq_s16(&pSrcX[2]);                                                               \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                  \
        xVec = vldrhq_s16(&pSrcX[3]);                                                               \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                  \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
    }                                                                                               \
    /* 64-bit accumulators -> Q15: shift and saturate each result */                                \
    acc0 = asrl(acc0, 15);                                                                          \
    acc1 = asrl(acc1, 15);                                                                          \
    acc2 = asrl(acc2, 15);                                                                          \
    acc3 = asrl(acc3, 15);                                                                          \
    acc0 = __SSAT(acc0, 16);                                                                        \
    acc1 = __SSAT(acc1, 16);                                                                        \
    acc2 = __SSAT(acc2, 16);                                                                        \
    acc3 = __SSAT(acc3, 16);                                                                        \
}
-
/**
 * Q15 convolution MAC kernel producing TWO adjacent outputs per pass
 * (Helium/MVE). acc0 uses the x window at offset 0, acc1 the window at
 * offset 1 (&pSrcX[1]); both share one reversed-y gather load. Both
 * accumulators sum exactly `count` products ("FIXED_SIZE"), so the tail
 * applies a single predicate vctp16q(count % 8) to both MACs. Results
 * are narrowed to Q15 (asrl 15 + SSAT 16).
 *
 * Macro arguments (modified in place):
 *   acc0, acc1 - 64-bit accumulators; overwritten with saturated Q15 results.
 *   pX         - x operand pointer; read via a local copy.
 *   pY         - y operand pointer; DECREMENTED by this macro.
 *   count      - number of Q15 samples per accumulator.
 *
 * NOTE(review): requires `decrIdxVec` declared at the expansion site
 * (descending gather offsets) — confirm against callers.
 */
#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)                          \
{                                                                                                   \
    q15_t const *pSrcX;                                                                             \
    q15x8_t   xVec, yVec;                                                                         \
    uint32_t    k;                                                                                  \
                                                                                                    \
    pSrcX = (q15_t const *) pX;                                                                     \
    k = count >> 3;                                                                                 \
                                                                                                    \
    while (k > 0U)                                                                                  \
    {                                                                                               \
        /* note */                                                                                  \
        /* could can be more efficient using Vector Scatter Store: */                               \
        /* + pre-increment + WB */                                                                  \
        /* To be revisited when intrinsic available */                                              \
        /* SDCOMP-52618 */                                                                          \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
        pY-=8;                                                                                      \
        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
        /* offset-0 load last: it post-increments pSrcX for the next block */                       \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
        /*  Decrement the loop counter   */                                                         \
        k--;                                                                                        \
    }                                                                                               \
    /* Loop with tail predication expected here  */                                                 \
    k = count % 0x8U;                                                                               \
    if (k > 0U)                                                                                     \
    {                                                                                               \
        /* one predicate for both MACs: both outputs need exactly k more samples */                 \
        mve_pred16_t p0 = vctp16q(k);                                                    \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                  \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                  \
    }                                                                                               \
    /* 64-bit accumulators -> Q15: shift and saturate */                                            \
    acc0 = asrl(acc0, 15);                                                                          \
    acc1 = asrl(acc1, 15);                                                                          \
    acc0 = __SSAT(acc0, 16);                                                                        \
    acc1 = __SSAT(acc1, 16);                                                                        \
}
-
/**
 * Q15 convolution MAC kernel producing TWO outputs of UNEQUAL length
 * ("DEC_SIZE", Helium/MVE): acc0 accumulates `count` products while acc1
 * accumulates `count - 1` (its x window starts at &pSrcX[1] and it is one
 * sample shorter). The main loop therefore runs (count-1)/8 full blocks,
 * and the tail uses TWO predicates: vctp16q(k) for acc1 (exact remainder)
 * and vctp16q(k+1) for acc0 (one extra lane enabled for its extra sample).
 * Results are narrowed to Q15 (asrl 15 + SSAT 16).
 *
 * Macro arguments (modified in place):
 *   acc0, acc1 - 64-bit accumulators; overwritten with saturated Q15 results.
 *   pX         - x operand pointer; read via a local copy.
 *   pY         - y operand pointer; DECREMENTED by this macro.
 *   count      - sample count for acc0; acc1 uses count - 1.
 *
 * NOTE(review): requires `decrIdxVec` declared at the expansion site
 * (descending gather offsets). Tail is unconditional, so behavior assumes
 * count >= 1 — confirm callers guarantee this.
 */
#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)                            \
{                                                                                                   \
    q15_t const *pSrcX;                                                                             \
    q15x8_t   xVec, yVec;                                                                         \
    uint32_t    k;                                                                                  \
                                                                                                    \
    pSrcX = (q15_t const *) pX;                                                                     \
    k = (count-1) >> 3;                                                                             \
                                                                                                    \
    while (k > 0U)                                                                                  \
    {                                                                                               \
        /* note */                                                                                  \
        /* could can be more efficient using Vector Scatter Store: */                               \
        /* + pre-increment + WB */                                                                  \
        /* To be revisited when intrinsic available */                                              \
        /* SDCOMP-52618 */                                                                          \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                    \
        pY-=8;                                                                                      \
        xVec = vldrhq_s16(&pSrcX[1]);                                                               \
        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                        \
        /* offset-0 load last: it post-increments pSrcX for the next block */                       \
        xVec = vld1q(pSrcX);  pSrcX += 8;                                               \
        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                        \
        /*  Decrement the loop counter   */                                                         \
        k--;                                                                                        \
    }                                                                                               \
    k = (count - 1) % 0x8U;                                                                         \
    /* use predication to finalize MAC sum */                                                       \
    /* acc1 requires exact number of sample (count-1)  */                                           \
    /* disable extra lanes in final MAC computation  */                                             \
    mve_pred16_t p0 = vctp16q(k);                                                        \
    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                        \
    xVec = vldrhq_s16(&pSrcX[1]);                                                                   \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                      \
    /* acc0 requires 1 additional sample  (count) */                                                \
    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
    p0 = vctp16q(k+1);                                                                   \
    xVec = vld1q(pSrcX);  pSrcX += 8;                                                   \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                      \
                                                                                                    \
    /* 64-bit accumulators -> Q15: shift and saturate */                                            \
    acc0 = asrl(acc0, 15);                                                                          \
    acc1 = asrl(acc1, 15);                                                                          \
    acc0 = __SSAT(acc0, 16);                                                                        \
    acc1 = __SSAT(acc1, 16);                                                                        \
}
-
/**
 * Q7 correlation MAC kernel producing TWO outputs with y windows offset by
 * one sample ("DEC_Y", Helium/MVE). Both operands are read FORWARD (16
 * int8 lanes per block): acc0 pairs pSrcX with pSrcY, acc1 pairs pSrcX
 * with pSrcY shifted back one sample (&pSrcY[-1]). acc1 therefore needs
 * one MORE product than acc0 ("INC_SIZE"): the unconditional tail uses
 * vctp8q(k+1) for acc1 and vctp8q(k) for acc0. The 32-bit accumulators
 * are narrowed to Q7 (shift right 7, SSAT 8).
 *
 * Macro arguments (modified in place):
 *   acc0, acc1 - 32-bit accumulators; overwritten with saturated Q7 results.
 *   pX, pY     - operand pointers; read via local copies, callers' pointers untouched.
 *   count      - sample count for acc0; acc1 uses count + 1.
 *
 * NOTE(review): acc1's first load reads pSrcY[-1], i.e. one byte BEFORE pY —
 * callers presumably guarantee that address is valid; verify.
 */
#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
{                                                                                                   \
    q7_t const *pSrcX, *pSrcY;                                                                      \
    q7x16_t   xVec, yVec;                                                                          \
    uint32_t    k;                                                                                  \
                                                                                                    \
    pSrcX = (q7_t const *) pX;                                                                      \
    pSrcY = (q7_t const *) pY;                                                                      \
    k = count >> 4;                                                                                 \
    while (k > 0U)                                                                                  \
    {                                                                                               \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
        yVec = vldrbq_s8(&pSrcY[-1]);                                                               \
        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
        /* offset-0 load last: it post-increments pSrcY for the next block */                       \
        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
        /*  Decrement the loop counter   */                                                         \
        k--;                                                                                        \
    }                                                                                               \
    k = count % 0x10U;                                                                              \
    /* use predication to finalize MAC sum */                                                       \
    /* acc1 requires 1 additional sample  */                                                        \
    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
    mve_pred16_t p0 = vctp8q(k+1);                                                       \
    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
    yVec = vldrbq_s8(&pSrcY[-1]);                                                                   \
    acc1 = vmladavaq_p(acc1, xVec, yVec,p0);                                                        \
    /* acc0 requires exact number of sample  */                                                     \
    /* disable extra lanes in final MAC computation  */                                             \
    p0 = vctp8q(k);                                                                      \
    yVec = vld1q(pSrcY);  pSrcY += 16;                                                    \
    acc0 = vmladavaq_p(acc0, xVec, yVec,p0);                                                        \
                                                                                                    \
    /* 32-bit accumulators -> Q7: shift and saturate */                                             \
    acc0 = (acc0 >> 7);                                                                             \
    acc1 = (acc1 >> 7);                                                                             \
    acc0 = __SSAT(acc0, 8);                                                                         \
    acc1 = __SSAT(acc1, 8);                                                                         \
}
-
/**
 * Q7 correlation MAC kernel (single output) using Arm Helium/MVE
 * intrinsics. Accumulates `count` Q7 products into `acc`, reading both
 * operands forward in blocks of 16 int8 lanes; the tail (count % 16
 * samples) is predicated with vctp8q so surplus lanes do not contribute.
 * The 32-bit accumulator is narrowed to Q7 (shift right 7, SSAT 8).
 *
 * Macro arguments (modified in place):
 *   acc   - 32-bit accumulator; overwritten with the saturated Q7 result.
 *   pX    - x operand pointer; read via a local copy, caller's pointer untouched.
 *   pY    - y operand pointer; read via a local copy, caller's pointer untouched.
 *   count - number of Q7 samples to accumulate.
 */
#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count)                                                 \
{                                                                                                   \
    q7_t const *pSrcX, *pSrcY;                                                                      \
    q7x16_t   xVec, yVec;                                                                          \
    uint32_t    k;                                                                                  \
                                                                                                    \
    pSrcX = (q7_t const *) pX;                                                                      \
    pSrcY = (q7_t const *) pY;                                                                      \
    k = count >> 4;                                                                                 \
    while (k > 0U)                                                                                  \
    {                                                                                               \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
        acc = vmladavaq(acc, xVec, yVec);                                                           \
        /*  Decrement the loop counter   */                                                         \
        k--;                                                                                        \
    }                                                                                               \
    /*  tail predication expected here  */                                                          \
    k = count % 0x10U;                                                                              \
    if (k > 0U)                                                                                     \
    {                                                                                               \
        /* predicate disables the (16 - k) surplus lanes of the last block */                       \
        mve_pred16_t p0 = vctp8q(k);                                                     \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
        acc = vmladavaq_p(acc, xVec, yVec, p0);                                                     \
    }                                                                                               \
    /* 32-bit accumulator -> Q7: shift and saturate */                                              \
    acc =(acc >> 7);                                                                                \
    acc = __SSAT(acc, 8);                                                                           \
}
-
/**
 * Q7 correlation MAC kernel producing FOUR adjacent outputs per pass
 * (Helium/MVE). The four accumulators acc0..acc3 use x windows starting
 * at offsets 0..3 (loads from pSrcX, &pSrcX[1], &pSrcX[2], &pSrcX[3]);
 * all four share one forward y load, amortizing it across outputs. Each
 * accumulator sums exactly `count` products ("FIXED_SIZE"); the tail
 * applies one common predicate vctp8q(count % 16) to all four MACs.
 * The 32-bit accumulators are narrowed to Q7 (shift right 7, SSAT 8).
 *
 * Macro arguments (modified in place):
 *   acc0..acc3 - 32-bit accumulators; overwritten with saturated Q7 results.
 *   pX, pY     - operand pointers; read via local copies, callers' pointers untouched.
 *   count      - number of Q7 samples per accumulator.
 */
#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)               \
{                                                                                                   \
    q7_t const *pSrcX, *pSrcY;                                                                      \
    q7x16_t   xVec, yVec;                                                                          \
    uint32_t    k;                                                                                  \
                                                                                                    \
    pSrcX = (q7_t const *) pX;                                                                      \
    pSrcY = (q7_t const *) pY;                                                                      \
    k = count >> 4;                                                                                 \
                                                                                                    \
    while (k > 0U)                                                                                  \
    {                                                                                               \
        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
        /* unaligned x loads at +1/+2/+3 reuse yVec for the 3 shifted outputs */                    \
        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
        acc2 = vmladavaq(acc2, xVec, yVec);                                                         \
        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
        acc3 = vmladavaq(acc3, xVec, yVec);                                                         \
        /* base-offset load last: it post-increments pSrcX for the next block */                    \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
        /*  Decrement the loop counter   */                                                         \
        k--;                                                                                        \
    }                                                                                               \
    /* loop + tail predication expected here  */                                                    \
    k = count % 0x10U;                                                                              \
    if (k > 0U)                                                                                     \
    {                                                                                               \
        /* same predicate for all four MACs: each output needs exactly k more samples */            \
        mve_pred16_t p0 = vctp8q(k);                                                     \
        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                                   \
        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                                   \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
    }                                                                                               \
                                                                                                    \
    /* 32-bit accumulators -> Q7: shift and saturate each result */                                 \
    acc0 = (acc0 >> 7);                                                                             \
    acc1 = (acc1 >> 7);                                                                             \
    acc2 = (acc2 >> 7);                                                                             \
    acc3 = (acc3 >> 7);                                                                             \
    acc0 = __SSAT(acc0, 8);                                                                         \
    acc1 = __SSAT(acc1, 8);                                                                         \
    acc2 = __SSAT(acc2, 8);                                                                         \
    acc3 = __SSAT(acc3, 8);                                                                         \
}
-
/**
 * Q7 correlation MAC kernel producing TWO adjacent outputs per pass
 * (Helium/MVE). acc0 uses the x window at offset 0, acc1 the window at
 * offset 1 (&pSrcX[1]); both share one forward y load. Both accumulators
 * sum exactly `count` products ("FIXED_SIZE"), so the tail applies a
 * single predicate vctp8q(count % 16) to both MACs. The 32-bit
 * accumulators are narrowed to Q7 (shift right 7, SSAT 8).
 *
 * Macro arguments (modified in place):
 *   acc0, acc1 - 32-bit accumulators; overwritten with saturated Q7 results.
 *   pX, pY     - operand pointers; read via local copies, callers' pointers untouched.
 *   count      - number of Q7 samples per accumulator.
 */
#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)                           \
{                                                                                                   \
    q7_t const *pSrcX, *pSrcY;                                                                      \
    q7x16_t   xVec, yVec;                                                                          \
    uint32_t    k;                                                                                  \
                                                                                                    \
    pSrcX = (q7_t const *) pX;                                                                      \
    pSrcY = (q7_t const *) pY;                                                                      \
    k = count >> 4;                                                                                 \
                                                                                                    \
    while (k > 0U)                                                                                  \
    {                                                                                               \
        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
        /* offset-0 load last: it post-increments pSrcX for the next block */                       \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
        /*  Decrement the loop counter   */                                                         \
        k--;                                                                                        \
    }                                                                                               \
    /* loop + tail predication expected here  */                                                    \
    k = count % 0x10U;                                                                              \
    if (k > 0U)                                                                                     \
    {                                                                                               \
        /* one predicate for both MACs: both outputs need exactly k more samples */                 \
        mve_pred16_t p0 = vctp8q(k);                                                     \
        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
    }                                                                                               \
                                                                                                    \
    /* 32-bit accumulators -> Q7: shift and saturate */                                             \
    acc0 = (acc0 >> 7);                                                                             \
    acc1 = (acc1 >> 7);                                                                             \
    acc0 = __SSAT(acc0, 8);                                                                         \
    acc1 = __SSAT(acc1, 8);                                                                         \
}
-
-#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
-{                                                                                                   \
-    q7_t const *pSrcX, *pSrcY;                                                                      \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    pSrcY = (q7_t const *) pY;                                                                      \
-    k = (count-1) >> 4;                                                                             \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        yVec = vld1q(pSrcY);  pSrcY += 16;                                                \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires exact number of sample (count-1)  */                                           \
-    /* disable extra lanes in final MAC computation  */                                             \
-    k = (count-1) % 0x10U;                                                                          \
-    mve_pred16_t p0 = vctp8q(k);                                                         \
-    yVec = vld1q(pSrcY);  pSrcY += 16;                                                    \
-    xVec = vldrbq_s8(&pSrcX[1]);                                                                    \
-    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                       \
-    /* acc0 requires 1 additional sample  (count) */                                                \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp8q(k+1);                                                                    \
-    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
-    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                       \
-                                                                                                    \
-    acc0 = (acc0 >> 7);                                                                             \
-    acc1 = (acc1 >> 7);                                                                             \
-    acc0 = __SSAT(acc0, 8);                                                                         \
-    acc1 = __SSAT(acc1, 8);                                                                         \
-}
-
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
-{                                                                                                   \
-    q7_t const *pSrcX;                                                                              \
-    const q7_t       *pY1 = pY + 1;                                                                       \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    k = count >> 4;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        pY-=16;                                                                                     \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);                                            \
-        pY1-=16;                                                                                    \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = count % 0x10U;                                                                              \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc0 requires exact number of sample  */                                                     \
-    /* disable extra lanes in final MAC computation  */                                             \
-    mve_pred16_t p0 = vctp8q(k);                                                         \
-    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
-    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                                 \
-    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                       \
-    yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);                                                \
-    /* acc1 requires 1 additional sample  */                                                        \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp8q(k+1);                                                                    \
-    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                       \
-                                                                                                    \
-    acc0 = (acc0 >> 7);                                                                             \
-    acc1 = (acc1 >> 7);                                                                             \
-    acc0 = __SSAT(acc0, 8);                                                                         \
-    acc1 = __SSAT(acc1, 8);                                                                         \
-}
-
-#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count)                                                 \
-{                                                                                                   \
-    q7_t const *pSrcX;                                                                              \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    k = count >> 4;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        pY-=16;                                                                                     \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc = vmladavaq(acc, xVec, yVec);                                                           \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x10U;                                                                              \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp8q(k);                                                     \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        acc = vmladavaq_p(acc, xVec, yVec, p0);                                                     \
-    }                                                                                               \
-    acc = __SSAT(acc >> 7, 8);                                                                      \
-}
-
-#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)               \
-{                                                                                                   \
-    q7_t const *pSrcX;                                                                              \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    k = count >> 4;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        pY-=16;                                                                                     \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
-        acc2 = vmladavaq(acc2, xVec, yVec);                                                         \
-        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
-        acc3 = vmladavaq(acc3, xVec, yVec);                                                         \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x10U;                                                                              \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp8q(k);                                                     \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
-        xVec = vldrbq_s8(&pSrcX[2]);                                                                \
-        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                                   \
-        xVec = vldrbq_s8(&pSrcX[3]);                                                                \
-        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                                   \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
-    }                                                                                               \
-    acc0 = __SSAT(acc0 >> 7, 8);                                                                    \
-    acc1 = __SSAT(acc1 >> 7, 8);                                                                    \
-    acc2 = __SSAT(acc2 >> 7, 8);                                                                    \
-    acc3 = __SSAT(acc3 >> 7, 8);                                                                    \
-}
-
-#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)                           \
-{                                                                                                   \
-    q7_t const *pSrcX;                                                                              \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    k = count >> 4;                                                                                 \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        pY-=16;                                                                                     \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    /* Loop with tail predication expected here  */                                                 \
-    k = count % 0x10U;                                                                              \
-    if (k > 0U)                                                                                     \
-    {                                                                                               \
-        mve_pred16_t p0 = vctp8q(k);                                                     \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                   \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                   \
-    }                                                                                               \
-    acc0 = __SSAT(acc0 >> 7, 8);                                                                    \
-    acc1 = __SSAT(acc1 >> 7, 8);                                                                    \
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)                           \
+{   /* dual Q31 convolution MACs: acc1 taps x one sample ahead of acc0; both share y read reversed */ \
+    q31_t const *pSrcX;                                                                              \
+    q31x4_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q31_t const *) pX;                                                                      \
+    k = count >> 2;  /* number of full 4-lane vector iterations */                                   \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); /* reversed gather; decrIdxVec presumably descending offsets, defined elsewhere in this header -- confirm */ \
+        pY-=4;                                                                                       \
+        xVec = vldrwq_s32(&pSrcX[1]);  /* x advanced by one tap for acc1 */                           \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
+        /* decrement the 4-sample block counter */                                                   \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* tail: accumulate the remaining count %% 4 samples under lane predication */                   \
+    k = count % 0x4U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp32q(k);  /* enable only the k valid lanes */                           \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                                \
+    acc0 = asrl(acc0, 31);  /* >> 31: scale 64-bit accumulated sum back to Q31 range */              \
+    acc1 = asrl(acc1, 31);                                                                           \
 }
 
 
-#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)                             \
-{                                                                                                   \
-    q7_t const *pSrcX;                                                                              \
-    q7x16_t   xVec, yVec;                                                                          \
-    uint32_t    k;                                                                                  \
-                                                                                                    \
-    pSrcX = (q7_t const *) pX;                                                                      \
-    k = (count-1) >> 4;                                                                             \
-                                                                                                    \
-    while (k > 0U)                                                                                  \
-    {                                                                                               \
-        /* note */                                                                                  \
-        /* could can be more efficient using Vector Scatter Store: */                               \
-        /* + pre-increment + WB */                                                                  \
-        /* To be revisited when intrinsic available */                                              \
-        /* SDCOMP-52618 */                                                                          \
-        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                             \
-        pY-=16;                                                                                     \
-        xVec = vldrbq_s8(&pSrcX[1]);                                                                \
-        acc1 = vmladavaq(acc1, xVec, yVec);                                                         \
-        xVec = vld1q(pSrcX);  pSrcX += 16;                                                \
-        acc0 = vmladavaq(acc0, xVec, yVec);                                                         \
-        /*  Decrement the loop counter   */                                                         \
-        k--;                                                                                        \
-    }                                                                                               \
-    k = (count - 1) % 0x10U;                                                                        \
-    /* use predication to finalize MAC sum */                                                       \
-    /* acc1 requires exact number of sample (count-1)  */                                           \
-    /* disable extra lanes in final MAC computation  */                                             \
-    mve_pred16_t p0 = vctp8q(k);                                                         \
-    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                                 \
-    xVec = vldrbq_s8(&pSrcX[1]);                                                                    \
-    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                       \
-    /* acc0 requires 1 additional sample  (count) */                                                \
-    /* so add 1 to unmask an extra lane  in final MAC computation  */                               \
-    p0 = vctp8q(k+1);                                                                    \
-    xVec = vld1q(pSrcX);  pSrcX += 16;                                                    \
-    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                       \
-                                                                                                    \
-    acc0 = (acc0 >> 7);                                                                             \
-    acc1 = (acc1 >> 7);                                                                             \
-    acc0 = __SSAT(acc0, 8);                                                                         \
-    acc1 = __SSAT(acc1, 8);                                                                         \
+
+#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)               \
+{   /* quad Q31 convolution MACs: acc1..acc3 tap x 1..3 samples ahead of acc0; y shared, reversed */ \
+    q31_t const *pSrcX;                                                                              \
+    q31x4_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q31_t const *) pX;                                                                      \
+    k = count >> 2;  /* number of full 4-lane vector iterations */                                   \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); /* reversed gather; decrIdxVec presumably descending offsets, defined elsewhere in this header -- confirm */ \
+        pY-=4;                                                                                       \
+        xVec = vldrwq_s32(&pSrcX[1]);  /* x advanced by one tap for acc1 */                           \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
+        xVec = vldrwq_s32(&pSrcX[2]);  /* two taps ahead for acc2 */                                 \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                         \
+        xVec = vldrwq_s32(&pSrcX[3]);  /* three taps ahead for acc3 */                               \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
+        /* decrement the 4-sample block counter */                                                   \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* tail: accumulate the remaining count %% 4 samples under lane predication */                   \
+    k = count % 0x4U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp32q(k);  /* enable only the k valid lanes */                           \
+        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec);                                     \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vldrwq_s32(&pSrcX[2]);                                                                \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                   \
+        xVec = vldrwq_s32(&pSrcX[3]);                                                                \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                                             \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                                \
+    acc0 = asrl(acc0, 31);  /* >> 31: scale 64-bit accumulated sum back to Q31 range */              \
+    acc1 = asrl(acc1, 31);                                                                           \
+    acc2 = asrl(acc2, 31);                                                                           \
+    acc3 = asrl(acc3, 31);                                                                           \
+}
+
+/*
+ * Correlation core (Q31, Helium/MVE): accumulates two MAC sums in one pass.
+ * acc0 += sum(x[i] * y[i]) over 'count' samples; acc1 uses the same X stream
+ * against Y shifted back one element (&pSrcY[-1]) and therefore consumes one
+ * extra sample — hence vctp32q(k+1) in the tail versus vctp32q(k) for acc0.
+ * Both 64-bit accumulators are scaled down by 31 bits (asrl) on exit.
+ * NOTE(review): the tail loads (vld1q/vldrwq) execute even when count % 4 == 0
+ * (only the MACs are predicated), so the buffers must be readable slightly
+ * past their ends — TODO confirm the callers guarantee this.
+ */
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\
+{                                                                       \
+    q31_t const *pSrcX, *pSrcY;                                         \
+    q31x4_t   xVec, yVec;                                               \
+    uint32_t    k;                                                      \
+                                                                        \
+    pSrcX = (q31_t const *) pX;                                         \
+    pSrcY  = (q31_t const *) pY;                                        \
+    k = count >> 2;                                                     \
+                                                                        \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                \
+        yVec = vldrwq_s32(&pSrcY[-1]);                                  \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    k = count % 0x4U;                                                   \
+    /* use predication to finalize MAC sum */                           \
+    /* acc1 requires 1 additional sample  */                            \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */   \
+    mve_pred16_t p0 = vctp32q(k+1);                                     \
+    xVec = vld1q(pSrcX); pSrcX += 4;                                    \
+    yVec = vldrwq_s32(&pSrcY[-1]);                                      \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0);                           \
+    /* acc0 requires exact number of sample  */                         \
+    /* disable extra lanes in final MAC computation  */                 \
+    p0 = vctp32q(k);                                                    \
+    yVec = vld1q(pSrcY);  pSrcY += 4;                                   \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0);                           \
+                                                                        \
+    acc0 = asrl(acc0, 31);                                              \
+    acc1 = asrl(acc1, 31);                                              \
+}
+
+/*
+ * Correlation core (Q31, Helium/MVE): single MAC sum.
+ * acc += sum(x[i] * y[i]) over 'count' samples, 4 lanes per iteration with a
+ * vctp32q-predicated tail for count % 4 leftovers; the 64-bit accumulator is
+ * scaled down by 31 bits (asrl) on exit.
+ */
+#define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count)\
+{                                                   \
+    q31_t const *pSrcX, *pSrcY;                     \
+    q31x4_t   xVec, yVec;                           \
+    uint32_t    k;                                  \
+                                                    \
+    pSrcX = (q31_t const *) pX;                     \
+    pSrcY = (q31_t const *) pY;                     \
+    k = count >> 2;                                 \
+                                                    \
+    while (k > 0U)                                  \
+    {                                               \
+        xVec = vld1q(pSrcX); pSrcX += 4;            \
+        yVec = vld1q(pSrcY); pSrcY += 4;            \
+        acc = vmlaldavaq(acc, xVec, yVec);          \
+        /*  Decrement the loop counter   */         \
+        k--;                                        \
+    }                                               \
+    /*  tail predication expected here  */          \
+    k = count % 0x4U;                               \
+    if (k > 0U)                                     \
+    {                                               \
+        mve_pred16_t p0 = vctp32q(k);               \
+        xVec = vld1q(pSrcX); pSrcX += 4;            \
+        yVec = vld1q(pSrcY); pSrcY += 4;            \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);    \
+    }                                               \
+    acc = asrl(acc, 31);                            \
+}
+
+/*
+ * Correlation core (Q31, Helium/MVE): four MAC sums sharing one Y stream.
+ * accN += sum(x[i+N] * y[i]) for N = 0..3 — the X vector is reloaded at lane
+ * offsets 0/1/2/3 (&pSrcX[N]) against the same yVec, so each yVec load is
+ * amortized over four accumulators. All four sums cover the same fixed
+ * 'count' samples; the tail uses one vctp32q(k) predicate for all four MACs.
+ * Accumulators are scaled down by 31 bits (asrl) on exit.
+ * NOTE(review): the offset loads &pSrcX[1..3] read up to 3 elements beyond
+ * the 'count'-sample window in the tail — assumes readable padding; confirm.
+ */
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)\
+{                                                                                     \
+    q31_t const *pSrcX, *pSrcY;                                                       \
+    q31x4_t   xVec, yVec;                                                             \
+    uint32_t    k;                                                                    \
+                                                                                      \
+    pSrcX = (q31_t const *) pX;                                                       \
+    pSrcY  = (q31_t const *) pY;                                                      \
+    k = count >> 2;                                                                   \
+                                                                                      \
+    while (k > 0U)                                                                    \
+    {                                                                                 \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                              \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                 \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                          \
+        xVec = vldrwq_s32(&pSrcX[2]);                                                 \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                          \
+        xVec = vldrwq_s32(&pSrcX[3]);                                                 \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                          \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                              \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                          \
+        /*  Decrement the loop counter   */                                           \
+        k--;                                                                          \
+    }                                                                                 \
+    /* loop + tail predication expected here  */                                      \
+    k = count % 0x4U;                                                                 \
+    if (k > 0U)                                                                       \
+    {                                                                                 \
+        mve_pred16_t p0 = vctp32q(k);                                                 \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                              \
+        xVec = vldrwq_s32(&pSrcX[1]);                                                 \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                    \
+        xVec = vldrwq_s32(&pSrcX[2]);                                                 \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                    \
+        xVec = vldrwq_s32(&pSrcX[3]);                                                 \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                    \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                              \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                    \
+    }                                                                                 \
+                                                                                      \
+    acc0 = asrl(acc0, 31);                                                            \
+    acc1 = asrl(acc1, 31);                                                            \
+    acc2 = asrl(acc2, 31);                                                            \
+    acc3 = asrl(acc3, 31);                                                            \
+}
+
+/*
+ * Correlation core (Q31, Helium/MVE): two MAC sums sharing one Y stream.
+ * acc0 += sum(x[i] * y[i]); acc1 += sum(x[i+1] * y[i]) via the lane-offset
+ * load &pSrcX[1]. Both sums cover the same fixed 'count' samples; the tail
+ * uses a single vctp32q(k) predicate for both MACs. Accumulators are scaled
+ * down by 31 bits (asrl) on exit.
+ */
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)\
+{                                                                         \
+    q31_t const *pSrcX, *pSrcY;                                           \
+    q31x4_t   xVec, yVec;                                                 \
+    uint32_t    k;                                                        \
+                                                                          \
+    pSrcX = (q31_t const *) pX;                                           \
+    pSrcY  = (q31_t const *) pY;                                          \
+    k = count >> 2;                                                       \
+                                                                          \
+    while (k > 0U)                                                        \
+    {                                                                     \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                  \
+        xVec = vldrwq_s32(&pSrcX[1]);                                     \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                              \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                  \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                              \
+        /*  Decrement the loop counter   */                               \
+        k--;                                                              \
+    }                                                                     \
+    /* loop + tail predication expected here  */                          \
+    k = count % 0x4U;                                                     \
+    if (k > 0U)                                                           \
+    {                                                                     \
+        mve_pred16_t p0 = vctp32q(k);                                     \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                  \
+        xVec = vldrwq_s32(&pSrcX[1]);                                     \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                        \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                  \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                        \
+    }                                                                     \
+                                                                          \
+    acc0 = asrl(acc0, 31);                                                \
+    acc1 = asrl(acc1, 31);                                                \
+}
+
+/*
+ * Correlation core (Q31, Helium/MVE): two MAC sums of DIFFERENT lengths.
+ * acc1 += sum(x[i+1] * y[i]) over (count-1) samples; acc0 += sum(x[i] * y[i])
+ * over 'count' samples. The main loop runs (count-1)/4 iterations; the tail
+ * predicates acc1 with vctp32q(k) and acc0 with vctp32q(k+1) to include the
+ * one extra sample. Accumulators are scaled down by 31 bits (asrl) on exit.
+ * NOTE(review): tail loads are unconditional (only MACs are predicated);
+ * assumes buffers readable slightly past the end — TODO confirm callers.
+ */
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)\
+{                                                                       \
+    q31_t const *pSrcX, *pSrcY;                                         \
+    q31x4_t   xVec, yVec;                                               \
+    uint32_t    k;                                                      \
+                                                                        \
+    pSrcX = (q31_t const *) pX;                                         \
+    pSrcY  = (q31_t const *) pY;                                        \
+    k = (count-1) >> 2;                                                 \
+                                                                        \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        yVec = vld1q(pSrcY); pSrcY += 4;                                \
+        xVec = vldrwq_s32(&pSrcX[1]);                                   \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
+        xVec = vld1q(pSrcX); pSrcX += 4;                                \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    /* use predication to finalize MAC sum */                           \
+    /* acc1 requires exact number of sample (count-1)  */               \
+    /* disable extra lanes in final MAC computation  */                 \
+    k = (count-1) % 0x4U;                                               \
+    mve_pred16_t p0 = vctp32q(k);                                       \
+    yVec = vld1q(pSrcY);  pSrcY += 4;                                   \
+    xVec = vldrwq_s32(&pSrcX[1]);                                       \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
+    /* acc0 requires 1 additional sample  (count) */                    \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */   \
+    p0 = vctp32q(k+1);                                                  \
+    xVec = vld1q(pSrcX); pSrcX += 4;                                    \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
+                                                                        \
+    acc0 = asrl(acc0, 31);                                              \
+    acc1 = asrl(acc1, 31);                                              \
+}
+
+/*
+ * Correlation core (Q15, Helium/MVE): 16-bit counterpart of the Q31 dual
+ * DEC_Y variant, processing 8 lanes per iteration. acc0 += sum(x[i] * y[i])
+ * over 'count' samples; acc1 uses Y shifted back one element (&pSrcY[-1]) and
+ * one extra sample (vctp16q(k+1) in the tail). Accumulators are scaled down
+ * by 15 bits (asrl) then saturated to 16-bit (__SSAT) on exit.
+ * NOTE(review): tail loads are unpredicated even when count % 8 == 0 —
+ * assumes readable padding past the buffers; TODO confirm callers.
+ */
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\
+{                                                                       \
+    q15_t const *pSrcX, *pSrcY;                                         \
+    q15x8_t   xVec, yVec;                                               \
+    uint32_t    k;                                                      \
+                                                                        \
+    pSrcX = (q15_t const *) pX;                                         \
+    pSrcY  = (q15_t const *) pY;                                        \
+    k = count >> 3;                                                     \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                               \
+        yVec = vldrhq_s16(&pSrcY[-1]);                                  \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                               \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    k = count % 0x8U;                                                   \
+    /* use predication to finalize MAC sum */                           \
+    /* acc1 requires 1 additional sample  */                            \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */   \
+    mve_pred16_t p0 = vctp16q(k+1);                                     \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+    yVec = vldrhq_s16(&pSrcY[-1]);                                      \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0);                           \
+    /* acc0 requires exact number of sample  */                         \
+    /* disable extra lanes in final MAC computation  */                 \
+    p0 = vctp16q(k);                                                    \
+    yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0);                           \
+                                                                        \
+    acc0 = asrl(acc0, 15);                                              \
+    acc1 = asrl(acc1, 15);                                              \
+    acc0 = __SSAT(acc0, 16);                                            \
+    acc1 = __SSAT(acc1, 16);                                            \
+}
+
+/*
+ * Correlation core (Q15, Helium/MVE): single MAC sum.
+ * acc += sum(x[i] * y[i]) over 'count' samples, 8 lanes per iteration with a
+ * vctp16q-predicated tail; the accumulator is scaled down by 15 bits (asrl)
+ * and saturated to 16-bit (__SSAT) on exit.
+ */
+#define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count)\
+{                                                   \
+    q15_t const *pSrcX, *pSrcY;                     \
+    q15x8_t   xVec, yVec;                           \
+    uint32_t    k;                                  \
+                                                    \
+    pSrcX = (q15_t const *) pX;                     \
+    pSrcY = (q15_t const *) pY;                     \
+    k = count >> 3;                                 \
+    while (k > 0U)                                  \
+    {                                               \
+        xVec = vld1q(pSrcX);  pSrcX += 8;           \
+        yVec = vld1q(pSrcY);  pSrcY += 8;           \
+        acc = vmlaldavaq(acc, xVec, yVec);          \
+        /*  Decrement the loop counter   */         \
+        k--;                                        \
+    }                                               \
+    /*  tail predication expected here  */          \
+    k = count % 0x8U;                               \
+    if (k > 0U)                                     \
+    {                                               \
+        mve_pred16_t p0 = vctp16q(k);               \
+        xVec = vld1q(pSrcX);  pSrcX += 8;           \
+        yVec = vld1q(pSrcY);  pSrcY += 8;           \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);    \
+    }                                               \
+    acc = asrl(acc, 15);                            \
+    acc = __SSAT(acc, 16);                          \
+}
+
+/*
+ * Correlation core (Q15, Helium/MVE): four MAC sums sharing one Y stream,
+ * 8 lanes per iteration. accN += sum(x[i+N] * y[i]) for N = 0..3, with X
+ * reloaded at element offsets 0/1/2/3 (&pSrcX[N]) against the same yVec.
+ * All four sums cover the same fixed 'count' samples; the tail shares one
+ * vctp16q(k) predicate. Accumulators are scaled down by 15 bits (asrl) and
+ * saturated to 16-bit (__SSAT) on exit.
+ * NOTE(review): &pSrcX[1..3] reads extend past the window in the tail —
+ * assumes readable padding; confirm.
+ */
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)\
+{                                                                                     \
+    q15_t const *pSrcX, *pSrcY;                                                       \
+    q15x8_t   xVec, yVec;                                                             \
+    uint32_t    k;                                                                    \
+                                                                                      \
+    pSrcX = (q15_t const *) pX;                                                       \
+    pSrcY  = (q15_t const *) pY;                                                      \
+    k = count >> 3;                                                                   \
+                                                                                      \
+    while (k > 0U)                                                                    \
+    {                                                                                 \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                             \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                 \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                          \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                 \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                          \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                 \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                          \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                             \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                          \
+        /*  Decrement the loop counter   */                                           \
+        k--;                                                                          \
+    }                                                                                 \
+    /* loop + tail predication expected here  */                                      \
+    k = count % 0x8U;                                                                 \
+    if (k > 0U)                                                                       \
+    {                                                                                 \
+        mve_pred16_t p0 = vctp16q(k);                                                 \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                             \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                 \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                    \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                 \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                    \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                 \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                             \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                    \
+    }                                                                                 \
+                                                                                      \
+    acc0 = asrl(acc0, 15);                                                            \
+    acc1 = asrl(acc1, 15);                                                            \
+    acc2 = asrl(acc2, 15);                                                            \
+    acc3 = asrl(acc3, 15);                                                            \
+    acc0 = __SSAT(acc0, 16);                                                          \
+    acc1 = __SSAT(acc1, 16);                                                          \
+    acc2 = __SSAT(acc2, 16);                                                          \
+    acc3 = __SSAT(acc3, 16);                                                          \
+}
+
+/*
+ * Correlation core (Q15, Helium/MVE): two MAC sums sharing one Y stream,
+ * 8 lanes per iteration. acc0 += sum(x[i] * y[i]); acc1 += sum(x[i+1] * y[i])
+ * via the element-offset load &pSrcX[1]. Both sums cover the same fixed
+ * 'count' samples; the tail shares one vctp16q(k) predicate. Accumulators
+ * are scaled down by 15 bits (asrl) and saturated to 16-bit (__SSAT).
+ */
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)\
+{                                                                         \
+    q15_t const *pSrcX, *pSrcY;                                           \
+    q15x8_t   xVec, yVec;                                                 \
+    uint32_t    k;                                                        \
+                                                                          \
+    pSrcX = (q15_t const *) pX;                                           \
+    pSrcY  = (q15_t const *) pY;                                          \
+    k = count >> 3;                                                       \
+                                                                          \
+    while (k > 0U)                                                        \
+    {                                                                     \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                 \
+        xVec = vldrhq_s16(&pSrcX[1]);                                     \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                              \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                 \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                              \
+        /*  Decrement the loop counter   */                               \
+        k--;                                                              \
+    }                                                                     \
+    /* loop + tail predication expected here  */                          \
+    k = count % 0x8U;                                                     \
+    if (k > 0U)                                                           \
+    {                                                                     \
+        mve_pred16_t p0 = vctp16q(k);                                     \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                 \
+        xVec = vldrhq_s16(&pSrcX[1]);                                     \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                        \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                 \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                        \
+    }                                                                     \
+                                                                          \
+    acc0 = asrl(acc0, 15);                                                \
+    acc1 = asrl(acc1, 15);                                                \
+    acc0 = __SSAT(acc0, 16);                                              \
+    acc1 = __SSAT(acc1, 16);                                              \
+}
+
+/*
+ * Correlation core (Q15, Helium/MVE): two MAC sums of DIFFERENT lengths,
+ * 8 lanes per iteration. acc1 += sum(x[i+1] * y[i]) over (count-1) samples;
+ * acc0 += sum(x[i] * y[i]) over 'count' samples. The main loop runs
+ * (count-1)/8 iterations; the tail predicates acc1 with vctp16q(k) and acc0
+ * with vctp16q(k+1) for the extra sample. Accumulators are scaled down by
+ * 15 bits (asrl) and saturated to 16-bit (__SSAT) on exit.
+ * NOTE(review): tail loads are unconditional — assumes readable padding
+ * past the buffers; TODO confirm callers.
+ */
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)\
+{                                                                       \
+    q15_t const *pSrcX, *pSrcY;                                         \
+    q15x8_t   xVec, yVec;                                               \
+    uint32_t    k;                                                      \
+                                                                        \
+    pSrcX = (q15_t const *) pX;                                         \
+    pSrcY  = (q15_t const *) pY;                                        \
+    k = (count-1) >> 3;                                                 \
+                                                                        \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                               \
+        xVec = vldrhq_s16(&pSrcX[1]);                                   \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                               \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    /* use predication to finalize MAC sum */                           \
+    /* acc1 requires exact number of sample (count-1)  */               \
+    /* disable extra lanes in final MAC computation  */                 \
+    k = (count-1) % 0x8U;                                               \
+    mve_pred16_t p0 = vctp16q(k);                                       \
+    yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+    xVec = vldrhq_s16(&pSrcX[1]);                                       \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
+    /* acc0 requires 1 additional sample  (count) */                    \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */   \
+    p0 = vctp16q(k+1);                                                  \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
+                                                                        \
+    acc0 = asrl(acc0, 15);                                              \
+    acc1 = asrl(acc1, 15);                                              \
+    acc0 = __SSAT(acc0, 16);                                            \
+    acc1 = __SSAT(acc1, 16);                                            \
+}
+
+/*
+ * Convolution core (Q15, Helium/MVE): two MAC sums with Y traversed in
+ * reverse, as convolution requires. Each yVec is gathered through
+ * 'decrIdxVec' (a lane-reversing offset vector that must be declared at the
+ * expansion site — not defined here; presumably {7..0}, TODO confirm) while
+ * pY/pY1 step backwards by 8. acc0 covers exactly 'count' samples of
+ * x[i]*y_rev[i]; acc1 uses the Y window advanced one element (pY1 = pY + 1)
+ * and one extra sample (vctp16q(k+1) in the tail). Accumulators are scaled
+ * down by 15 bits (asrl) and saturated to 16-bit (__SSAT) on exit.
+ * NOTE(review): the tail gather at pY/pY1 executes unpredicated even when
+ * count % 8 == 0 — assumes readable memory below the Y window; confirm.
+ */
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\
+{                                                                       \
+    q15_t const *pSrcX;                                                 \
+    const q15_t       *pY1 = pY + 1;                                    \
+    q15x8_t   xVec, yVec;                                               \
+    uint32_t    k;                                                      \
+                                                                        \
+    pSrcX = (q15_t const *) pX;                                         \
+    k = count >> 3;                                                     \
+                                                                        \
+    while (k > 0U)                                                      \
+    {                                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                               \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);        \
+        pY-=8;                                                          \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                            \
+        yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);       \
+        pY1-=8;                                                         \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                             \
+        k--;                                                            \
+    }                                                                   \
+    k = count % 0x8U;                                                   \
+    /* use predication to finalize MAC sum */                           \
+    /* acc0 requires exact number of sample  */                         \
+    /* disable extra lanes in final MAC computation  */                 \
+    mve_pred16_t p0 = vctp16q(k);                                       \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);            \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                          \
+    yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec);           \
+    /* acc1 requires 1 additional sample  */                            \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */   \
+    p0 = vctp16q(k+1);                                                  \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                          \
+                                                                        \
+    acc0 = asrl(acc0, 15);                                              \
+    acc1 = asrl(acc1, 15);                                              \
+    acc0 = __SSAT(acc0, 16);                                            \
+    acc1 = __SSAT(acc1, 16);                                            \
+}
+
+/*
+ * Q15 convolution MAC core, single accumulator.
+ * Accumulates 'count' products into the 64-bit accumulator 'acc': pX is read
+ * forward with contiguous vector loads while pY is read backward through a
+ * gather load indexed by 'decrIdxVec' (a decrementing offset vector that must
+ * be declared at the expansion site -- not defined here), with pY stepped
+ * back 8 samples per iteration.  The remaining (count % 8) samples are
+ * handled with vctp16q tail predication.  The result is renormalized to Q15
+ * via asrl(acc, 15) and saturated to 16 bits with __SSAT.
+ * NOTE: 'acc' and 'pY' are macro arguments updated in place; 'pX' is only read.
+ */
+#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count)                                                 \
+{                                                                                                    \
+    q15_t const *pSrcX;                                                                              \
+    q15x8_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q15_t const *) pX;                                                                      \
+    k = count >> 3;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        pY-=8;                                                                                       \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc = vmlaldavaq(acc, xVec, yVec);                                                           \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x8U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp16q(k);                                                                \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        acc = vmlaldavaq_p(acc, xVec, yVec, p0);                                                     \
+    }                                                                                                \
+    acc = asrl(acc, 15);                                                                             \
+    acc = __SSAT(acc, 16);                                                                           \
+}
+
+/*
+ * Q15 convolution MAC core, four accumulators for four adjacent outputs.
+ * All four accumulators share the same reversed yVec (gathered via
+ * 'decrIdxVec', declared at the expansion site); accN pairs it with x data
+ * starting at pSrcX[N] (N = 0..3), so the four outputs correspond to x
+ * offsets 0..3.  "FIXED_SIZE": every accumulator consumes exactly 'count'
+ * samples, so the single tail predicate vctp16q(count % 8) applies to all
+ * four MACs.  Each accumulator is renormalized with asrl(.,15) and saturated
+ * to 16 bits.  NOTE: acc0..acc3 and 'pY' are updated in place.
+ */
+#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)               \
+{                                                                                                    \
+    q15_t const *pSrcX;                                                                              \
+    q15x8_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q15_t const *) pX;                                                                      \
+    k = count >> 3;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        pY-=8;                                                                                       \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                                \
+        acc2 = vmlaldavaq(acc2, xVec, yVec);                                                         \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                                \
+        acc3 = vmlaldavaq(acc3, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x8U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp16q(k);                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vldrhq_s16(&pSrcX[2]);                                                                \
+        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0);                                                   \
+        xVec = vldrhq_s16(&pSrcX[3]);                                                                \
+        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                                \
+    acc0 = asrl(acc0, 15);                                                                           \
+    acc1 = asrl(acc1, 15);                                                                           \
+    acc2 = asrl(acc2, 15);                                                                           \
+    acc3 = asrl(acc3, 15);                                                                           \
+    acc0 = __SSAT(acc0, 16);                                                                         \
+    acc1 = __SSAT(acc1, 16);                                                                         \
+    acc2 = __SSAT(acc2, 16);                                                                         \
+    acc3 = __SSAT(acc3, 16);                                                                         \
+}
+
+/*
+ * Q15 convolution MAC core, two accumulators for two adjacent outputs.
+ * Dual-output variant of the QUAD macro above: acc0 pairs the reversed yVec
+ * (gathered via 'decrIdxVec', declared at the expansion site) with x starting
+ * at pSrcX[0], acc1 with x starting at pSrcX[1].  "FIXED_SIZE": both
+ * accumulators consume exactly 'count' samples, so one vctp16q(count % 8)
+ * predicate covers both tail MACs.  Results are renormalized with
+ * asrl(.,15) and saturated to 16 bits.
+ * NOTE: acc0, acc1 and 'pY' are updated in place.
+ */
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)                           \
+{                                                                                                    \
+    q15_t const *pSrcX;                                                                              \
+    q15x8_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q15_t const *) pX;                                                                      \
+    k = count >> 3;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        pY-=8;                                                                                       \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x8U;                                                                                \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp16q(k);                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                   \
+    }                                                                                                \
+    acc0 = asrl(acc0, 15);                                                                           \
+    acc1 = asrl(acc1, 15);                                                                           \
+    acc0 = __SSAT(acc0, 16);                                                                         \
+    acc1 = __SSAT(acc1, 16);                                                                         \
+}
+
+/*
+ * Q15 convolution MAC core, two accumulators with DIFFERENT sample counts
+ * ("DEC_SIZE"): acc1 (x starting at pSrcX[1]) accumulates (count - 1)
+ * products, acc0 (x starting at pSrcX[0]) accumulates 'count' products.
+ * The main loop runs (count-1)/8 iterations; the tail is ALWAYS executed
+ * (no guard) using two predicates: vctp16q(k) for acc1's exact (count-1)
+ * total, then vctp16q(k+1) to unmask one extra lane for acc0.  The tail
+ * reuses the same reversed yVec (gathered via 'decrIdxVec', declared at the
+ * expansion site) for both MACs.  Results are renormalized with asrl(.,15)
+ * and saturated to 16 bits.
+ * NOTE: acc0, acc1 and 'pY' are updated in place.
+ */
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)                             \
+{                                                                                                    \
+    q15_t const *pSrcX;                                                                              \
+    q15x8_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q15_t const *) pX;                                                                      \
+    k = (count-1) >> 3;                                                                              \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                     \
+        pY-=8;                                                                                       \
+        xVec = vldrhq_s16(&pSrcX[1]);                                                                \
+        acc1 = vmlaldavaq(acc1, xVec, yVec);                                                         \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                                            \
+        acc0 = vmlaldavaq(acc0, xVec, yVec);                                                         \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    k = (count - 1) % 0x8U;                                                                          \
+    /* use predication to finalize MAC sum */                                                        \
+    /* acc1 requires exact number of samples (count-1)  */                                          \
+    /* disable extra lanes in final MAC computation  */                                              \
+    mve_pred16_t p0 = vctp16q(k);                                                                    \
+    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec);                                         \
+    xVec = vldrhq_s16(&pSrcX[1]);                                                                    \
+    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0);                                                       \
+    /* acc0 requires 1 additional sample  (count) */                                                 \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                                \
+    p0 = vctp16q(k+1);                                                                               \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                                                \
+    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0);                                                       \
+                                                                                                     \
+    acc0 = asrl(acc0, 15);                                                                           \
+    acc1 = asrl(acc1, 15);                                                                           \
+    acc0 = __SSAT(acc0, 16);                                                                         \
+    acc1 = __SSAT(acc1, 16);                                                                         \
+}
+
+/*
+ * Q7 correlation MAC core, two accumulators with y offset by one sample
+ * ("DEC_Y"): both x and y are read forward with contiguous loads (no gather
+ * -- correlation keeps y in natural order).  acc1 pairs xVec with y starting
+ * one sample earlier (pSrcY[-1]) and consumes (count + 1) samples, so its
+ * final predicate is vctp8q(k+1); acc0 pairs xVec with y at pSrcY[0] for
+ * exactly 'count' samples (predicate vctp8q(k)).  The tail is ALWAYS
+ * executed (no guard).  Results are renormalized with >> 7 and saturated to
+ * 8 bits.  NOTE: acc0 and acc1 are updated in place; pX/pY are only read
+ * (local copies pSrcX/pSrcY advance).
+ */
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{                                                                      \
+    q7_t const *pSrcX, *pSrcY;                                         \
+    q7x16_t   xVec, yVec;                                              \
+    uint32_t    k;                                                     \
+                                                                       \
+    pSrcX = (q7_t const *) pX;                                         \
+    pSrcY = (q7_t const *) pY;                                         \
+    k = count >> 4;                                                    \
+    while (k > 0U)                                                     \
+    {                                                                  \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                             \
+        yVec = vldrbq_s8(&pSrcY[-1]);                                  \
+        acc1 = vmladavaq(acc1, xVec, yVec);                            \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                             \
+        acc0 = vmladavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                            \
+        k--;                                                           \
+    }                                                                  \
+    k = count % 0x10U;                                                 \
+    /* use predication to finalize MAC sum */                          \
+    /* acc1 requires 1 additional sample  */                           \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */  \
+    mve_pred16_t p0 = vctp8q(k+1);                                     \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                 \
+    yVec = vldrbq_s8(&pSrcY[-1]);                                      \
+    acc1 = vmladavaq_p(acc1, xVec, yVec,p0);                           \
+    /* acc0 requires exact number of sample  */                        \
+    /* disable extra lanes in final MAC computation  */                \
+    p0 = vctp8q(k);                                                    \
+    yVec = vld1q(pSrcY);  pSrcY += 16;                                 \
+    acc0 = vmladavaq_p(acc0, xVec, yVec,p0);                           \
+                                                                       \
+    acc0 = (acc0 >> 7);                                                \
+    acc1 = (acc1 >> 7);                                                \
+    acc0 = __SSAT(acc0, 8);                                            \
+    acc1 = __SSAT(acc1, 8);                                            \
+}
+
+/*
+ * Q7 correlation MAC core, single accumulator.
+ * Straight dot product over 'count' samples: both pX and pY are read forward
+ * with contiguous 16-lane loads (correlation keeps y in natural order).
+ * The remaining (count % 16) samples are handled with vctp8q tail
+ * predication.  The 32-bit sum is renormalized with >> 7 and saturated to
+ * 8 bits.  NOTE: 'acc' is updated in place; pX/pY are only read (local
+ * copies pSrcX/pSrcY advance).
+ */
+#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count)\
+{                                                  \
+    q7_t const *pSrcX, *pSrcY;                     \
+    q7x16_t   xVec, yVec;                          \
+    uint32_t    k;                                 \
+                                                   \
+    pSrcX = (q7_t const *) pX;                     \
+    pSrcY = (q7_t const *) pY;                     \
+    k = count >> 4;                                \
+    while (k > 0U)                                 \
+    {                                              \
+        xVec = vld1q(pSrcX);  pSrcX += 16;         \
+        yVec = vld1q(pSrcY);  pSrcY += 16;         \
+        acc = vmladavaq(acc, xVec, yVec);          \
+        /*  Decrement the loop counter   */        \
+        k--;                                       \
+    }                                              \
+    /*  tail predication expected here  */         \
+    k = count % 0x10U;                             \
+    if (k > 0U)                                    \
+    {                                              \
+        mve_pred16_t p0 = vctp8q(k);               \
+        xVec = vld1q(pSrcX);  pSrcX += 16;         \
+        yVec = vld1q(pSrcY);  pSrcY += 16;         \
+        acc = vmladavaq_p(acc, xVec, yVec, p0);    \
+    }                                              \
+    acc =(acc >> 7);                               \
+    acc = __SSAT(acc, 8);                          \
+}
+
+/*
+ * Q7 correlation MAC core, four accumulators for four adjacent outputs.
+ * All four accumulators share the same forward-loaded yVec; accN pairs it
+ * with x data starting at pSrcX[N] (N = 0..3).  "FIXED_SIZE": every
+ * accumulator consumes exactly 'count' samples, so the single tail predicate
+ * vctp8q(count % 16) applies to all four MACs.  Each accumulator is
+ * renormalized with >> 7 and saturated to 8 bits.
+ * NOTE: acc0..acc3 are updated in place; pX/pY are only read (local copies
+ * pSrcX/pSrcY advance).
+ */
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)\
+{                                                                                    \
+    q7_t const *pSrcX, *pSrcY;                                                       \
+    q7x16_t   xVec, yVec;                                                            \
+    uint32_t    k;                                                                   \
+                                                                                     \
+    pSrcX = (q7_t const *) pX;                                                       \
+    pSrcY = (q7_t const *) pY;                                                       \
+    k = count >> 4;                                                                  \
+                                                                                     \
+    while (k > 0U)                                                                   \
+    {                                                                                \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                           \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                 \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                          \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                 \
+        acc2 = vmladavaq(acc2, xVec, yVec);                                          \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                 \
+        acc3 = vmladavaq(acc3, xVec, yVec);                                          \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                           \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                          \
+        /*  Decrement the loop counter   */                                          \
+        k--;                                                                         \
+    }                                                                                \
+    /* loop + tail predication expected here  */                                     \
+    k = count % 0x10U;                                                               \
+    if (k > 0U)                                                                      \
+    {                                                                                \
+        mve_pred16_t p0 = vctp8q(k);                                                 \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                                           \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                 \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                    \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                 \
+        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                    \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                 \
+        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                           \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                    \
+    }                                                                                \
+                                                                                     \
+    acc0 = (acc0 >> 7);                                                              \
+    acc1 = (acc1 >> 7);                                                              \
+    acc2 = (acc2 >> 7);                                                              \
+    acc3 = (acc3 >> 7);                                                              \
+    acc0 = __SSAT(acc0, 8);                                                          \
+    acc1 = __SSAT(acc1, 8);                                                          \
+    acc2 = __SSAT(acc2, 8);                                                          \
+    acc3 = __SSAT(acc3, 8);                                                          \
+}
+
+/*
+ * Q7 correlation MAC core, two accumulators for two adjacent outputs.
+ * Dual-output variant of the QUAD macro above: acc0 pairs the forward-loaded
+ * yVec with x starting at pSrcX[0], acc1 with x starting at pSrcX[1].
+ * "FIXED_SIZE": both accumulators consume exactly 'count' samples, so one
+ * vctp8q(count % 16) predicate covers both tail MACs.  Results are
+ * renormalized with >> 7 and saturated to 8 bits.
+ * NOTE: acc0 and acc1 are updated in place; pX/pY are only read (local
+ * copies pSrcX/pSrcY advance).
+ */
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{                                                                        \
+    q7_t const *pSrcX, *pSrcY;                                           \
+    q7x16_t   xVec, yVec;                                                \
+    uint32_t    k;                                                       \
+                                                                         \
+    pSrcX = (q7_t const *) pX;                                           \
+    pSrcY = (q7_t const *) pY;                                           \
+    k = count >> 4;                                                      \
+                                                                         \
+    while (k > 0U)                                                       \
+    {                                                                    \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                               \
+        xVec = vldrbq_s8(&pSrcX[1]);                                     \
+        acc1 = vmladavaq(acc1, xVec, yVec);                              \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                               \
+        acc0 = vmladavaq(acc0, xVec, yVec);                              \
+        /*  Decrement the loop counter   */                              \
+        k--;                                                             \
+    }                                                                    \
+    /* loop + tail predication expected here  */                         \
+    k = count % 0x10U;                                                   \
+    if (k > 0U)                                                          \
+    {                                                                    \
+        mve_pred16_t p0 = vctp8q(k);                                     \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                               \
+        xVec = vldrbq_s8(&pSrcX[1]);                                     \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                        \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                               \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                        \
+    }                                                                    \
+                                                                         \
+    acc0 = (acc0 >> 7);                                                  \
+    acc1 = (acc1 >> 7);                                                  \
+    acc0 = __SSAT(acc0, 8);                                              \
+    acc1 = __SSAT(acc1, 8);                                              \
+}
+
+/*
+ * Q7 correlation MAC core, two accumulators with DIFFERENT sample counts
+ * ("DEC_SIZE"): acc1 (x starting at pSrcX[1]) accumulates (count - 1)
+ * products, acc0 (x starting at pSrcX[0]) accumulates 'count' products.
+ * The main loop runs (count-1)/16 iterations; the tail is ALWAYS executed
+ * (no guard) using two predicates: vctp8q(k) for acc1's exact (count-1)
+ * total, then vctp8q(k+1) to unmask one extra lane for acc0.  The tail
+ * reuses the same forward-loaded yVec for both MACs.  Results are
+ * renormalized with >> 7 and saturated to 8 bits.
+ * NOTE: acc0 and acc1 are updated in place; pX/pY are only read (local
+ * copies pSrcX/pSrcY advance).
+ */
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{                                                                      \
+    q7_t const *pSrcX, *pSrcY;                                         \
+    q7x16_t   xVec, yVec;                                              \
+    uint32_t    k;                                                     \
+                                                                       \
+    pSrcX = (q7_t const *) pX;                                         \
+    pSrcY = (q7_t const *) pY;                                         \
+    k = (count-1) >> 4;                                                \
+                                                                       \
+    while (k > 0U)                                                     \
+    {                                                                  \
+        yVec = vld1q(pSrcY);  pSrcY += 16;                             \
+        xVec = vldrbq_s8(&pSrcX[1]);                                   \
+        acc1 = vmladavaq(acc1, xVec, yVec);                            \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                             \
+        acc0 = vmladavaq(acc0, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                            \
+        k--;                                                           \
+    }                                                                  \
+    /* use predication to finalize MAC sum */                          \
+    /* acc1 requires exact number of samples (count-1)  */             \
+    /* disable extra lanes in final MAC computation  */                \
+    k = (count-1) % 0x10U;                                             \
+    mve_pred16_t p0 = vctp8q(k);                                       \
+    yVec = vld1q(pSrcY);  pSrcY += 16;                                 \
+    xVec = vldrbq_s8(&pSrcX[1]);                                       \
+    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                          \
+    /* acc0 requires 1 additional sample  (count) */                   \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */  \
+    p0 = vctp8q(k+1);                                                  \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                 \
+    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                          \
+                                                                       \
+    acc0 = (acc0 >> 7);                                                \
+    acc1 = (acc1 >> 7);                                                \
+    acc0 = __SSAT(acc0, 8);                                            \
+    acc1 = __SSAT(acc1, 8);                                            \
+}
+
+/*
+ * Q7 convolution MAC core, two accumulators with y offset by one sample
+ * ("INC_Y"): x is read forward; y is read backward through gather loads
+ * indexed by 'decrIdxVec' (a decrementing offset vector that must be
+ * declared at the expansion site -- not defined here) from two reversed
+ * streams, pY for acc0 and pY1 = pY + 1 for acc1, each stepped back 16
+ * samples per iteration.  The tail is ALWAYS executed (no guard) with two
+ * predicates: vctp8q(k) for acc0's exact 'count' total, then vctp8q(k+1) to
+ * unmask one extra lane for acc1 (count + 1 samples).  Results are
+ * renormalized with >> 7 and saturated to 8 bits.
+ * NOTE: acc0, acc1 and 'pY' are updated in place; pX is only read.
+ */
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{                                                                      \
+    q7_t const *pSrcX;                                                 \
+    const q7_t       *pY1 = pY + 1;                                    \
+    q7x16_t   xVec, yVec;                                              \
+    uint32_t    k;                                                     \
+                                                                       \
+    pSrcX = (q7_t const *) pX;                                         \
+    k = count >> 4;                                                    \
+                                                                       \
+    while (k > 0U)                                                     \
+    {                                                                  \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                             \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                \
+        pY-=16;                                                        \
+        acc0 = vmladavaq(acc0, xVec, yVec);                            \
+        yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);               \
+        pY1-=16;                                                       \
+        acc1 = vmladavaq(acc1, xVec, yVec);                            \
+        /*  Decrement the loop counter   */                            \
+        k--;                                                           \
+    }                                                                  \
+    k = count % 0x10U;                                                 \
+    /* use predication to finalize MAC sum */                          \
+    /* acc0 requires exact number of sample  */                        \
+    /* disable extra lanes in final MAC computation  */                \
+    mve_pred16_t p0 = vctp8q(k);                                       \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                 \
+    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                    \
+    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                          \
+    yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec);                   \
+    /* acc1 requires 1 additional sample  */                           \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */  \
+    p0 = vctp8q(k+1);                                                  \
+    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                          \
+                                                                       \
+    acc0 = (acc0 >> 7);                                                \
+    acc1 = (acc1 >> 7);                                                \
+    acc0 = __SSAT(acc0, 8);                                            \
+    acc1 = __SSAT(acc1, 8);                                            \
+}
+
+#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count)                                                  \
+{                                                                                                    \
+    q7_t const *pSrcX;                                                                               \
+    q7x16_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q7_t const *) pX;                                                                       \
+    k = count >> 4;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        pY-=16;                                                                                      \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc = vmladavaq(acc, xVec, yVec);                                                            \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x10U;                                                                               \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp8q(k);                                                                 \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        acc = vmladavaq_p(acc, xVec, yVec, p0);                                                      \
+    }                                                                                                \
+    acc = __SSAT(acc >> 7, 8);                                                                       \
+}
+
+#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)                \
+{                                                                                                    \
+    q7_t const *pSrcX;                                                                               \
+    q7x16_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q7_t const *) pX;                                                                       \
+    k = count >> 4;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        pY-=16;                                                                                      \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                          \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                                 \
+        acc2 = vmladavaq(acc2, xVec, yVec);                                                          \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                                 \
+        acc3 = vmladavaq(acc3, xVec, yVec);                                                          \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                          \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x10U;                                                                               \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp8q(k);                                                                 \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                    \
+        xVec = vldrbq_s8(&pSrcX[2]);                                                                 \
+        acc2 = vmladavaq_p(acc2, xVec, yVec, p0);                                                    \
+        xVec = vldrbq_s8(&pSrcX[3]);                                                                 \
+        acc3 = vmladavaq_p(acc3, xVec, yVec, p0);                                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                    \
+    }                                                                                                \
+    acc0 = __SSAT(acc0 >> 7, 8);                                                                     \
+    acc1 = __SSAT(acc1 >> 7, 8);                                                                     \
+    acc2 = __SSAT(acc2 >> 7, 8);                                                                     \
+    acc3 = __SSAT(acc3 >> 7, 8);                                                                     \
+}
+
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)                            \
+{                                                                                                    \
+    q7_t const *pSrcX;                                                                               \
+    q7x16_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q7_t const *) pX;                                                                       \
+    k = count >> 4;                                                                                  \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        pY-=16;                                                                                      \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                          \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                          \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    /* Loop with tail predication expected here  */                                                  \
+    k = count % 0x10U;                                                                               \
+    if (k > 0U)                                                                                      \
+    {                                                                                                \
+        mve_pred16_t p0 = vctp8q(k);                                                                 \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
+        acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                    \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                    \
+    }                                                                                                \
+    acc0 = __SSAT(acc0 >> 7, 8);                                                                     \
+    acc1 = __SSAT(acc1 >> 7, 8);                                                                     \
+}
+
+
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)                              \
+{                                                                                                    \
+    q7_t const *pSrcX;                                                                               \
+    q7x16_t   xVec, yVec;                                                                            \
+    uint32_t    k;                                                                                   \
+                                                                                                     \
+    pSrcX = (q7_t const *) pX;                                                                       \
+    k = (count-1) >> 4;                                                                              \
+                                                                                                     \
+    while (k > 0U)                                                                                   \
+    {                                                                                                \
+        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                              \
+        pY-=16;                                                                                      \
+        xVec = vldrbq_s8(&pSrcX[1]);                                                                 \
+        acc1 = vmladavaq(acc1, xVec, yVec);                                                          \
+        xVec = vld1q(pSrcX);  pSrcX += 16;                                                           \
+        acc0 = vmladavaq(acc0, xVec, yVec);                                                          \
+        /*  Decrement the loop counter   */                                                          \
+        k--;                                                                                         \
+    }                                                                                                \
+    k = (count - 1) % 0x10U;                                                                         \
+    /* use predication to finalize MAC sum */                                                        \
+    /* acc1 requires exact number of samples (count-1) */                                            \
+    /* disable extra lanes in final MAC computation  */                                              \
+    mve_pred16_t p0 = vctp8q(k);                                                                     \
+    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec);                                                  \
+    xVec = vldrbq_s8(&pSrcX[1]);                                                                     \
+    acc1 = vmladavaq_p(acc1, xVec, yVec, p0);                                                        \
+    /* acc0 requires 1 additional sample  (count) */                                                 \
+    /* so add 1 to unmask an extra lane  in final MAC computation  */                                \
+    p0 = vctp8q(k+1);                                                                                \
+    xVec = vld1q(pSrcX);  pSrcX += 16;                                                               \
+    acc0 = vmladavaq_p(acc0, xVec, yVec, p0);                                                        \
+                                                                                                     \
+    acc0 = (acc0 >> 7);                                                                              \
+    acc1 = (acc1 >> 7);                                                                              \
+    acc0 = __SSAT(acc0, 8);                                                                          \
+    acc1 = __SSAT(acc1, 8);                                                                          \
 }
 
 #endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
diff --git a/CMSIS/DSP/Testing/CMakeLists.txt b/CMSIS/DSP/Testing/CMakeLists.txt
index d132de8..c3d5267 100644
--- a/CMSIS/DSP/Testing/CMakeLists.txt
+++ b/CMSIS/DSP/Testing/CMakeLists.txt
@@ -54,7 +54,13 @@
   endif()
 
   list(APPEND output ",${PLATFORMID}")
-  list(APPEND output ",${COREID},")
+  if (CONFIGID)
+     # Specific config ID (like M55 with autovectorizer)
+     list(APPEND output ",${CONFIGID},")
+  else()
+     # Core ID is used as config ID when not specified
+     list(APPEND output ",${COREID},")
+  endif()
   if (ARMAC6)
     list(APPEND output "AC6")
   elseif(GCC)
diff --git a/CMSIS/DSP/Testing/TestScripts/Regression/Commands.py b/CMSIS/DSP/Testing/TestScripts/Regression/Commands.py
index 6c970e1..19d1866 100755
--- a/CMSIS/DSP/Testing/TestScripts/Regression/Commands.py
+++ b/CMSIS/DSP/Testing/TestScripts/Regression/Commands.py
@@ -48,14 +48,14 @@
         yield delimiter
         yield x
 
-def addToDb(db,desc):
+def addToDb(db,desc,runid):
     msg("Add %s to summary database\n" % desc)
-    completed = subprocess.run([sys.executable, "addToDB.py","-o",db,desc], timeout=3600)
+    completed = subprocess.run([sys.executable, "addToDB.py","-o",db,"-r",str(runid),desc], timeout=3600)
     check(completed)
 
-def addToRegDb(db,desc):
+def addToRegDb(db,desc,runid):
     msg("Add %s to regression database\n" % desc)
-    completed = subprocess.run([sys.executable, "addToRegDB.py","-o",db,desc], timeout=3600)
+    completed = subprocess.run([sys.executable, "addToRegDB.py","-o",db,"-r",str(runid),desc], timeout=3600)
     check(completed)
 
 
@@ -187,7 +187,7 @@
     
 
     # Launch cmake command.
-    def createCMake(self,flags,benchMode,platform):
+    def createCMake(self,configid,flags,benchMode,platform):
         with self.buildFolder() as b:
             self.saveEnv()
             if benchMode:
@@ -198,7 +198,8 @@
             cmd += ["-DCMAKE_PREFIX_PATH=%s" % self.compiler(),
                              "-DCMAKE_TOOLCHAIN_FILE=%s" % toolchainCmake,
                              "-DARM_CPU=%s" % self.core(),
-                             "-DPLATFORM=%s" % platform
+                             "-DPLATFORM=%s" % platform,
+                             "-DCONFIGID=%s" % configid
                     ]
             cmd += flags 
 
@@ -373,7 +374,7 @@
         else:
            return(TESTFAILED)
 
-    def runAndProcess(self,compiler,fvp,sim,benchmode,db,regdb):
+    def runAndProcess(self,compiler,fvp,sim,benchmode,db,regdb,benchid,regid):
         # If we can't parse test description we fail all tests
         self.processTest()
         # Otherwise if only building or those tests are failing, we continue
@@ -393,9 +394,9 @@
               if benchmode and (error == NOTESTFAILED):
                  error = self.computeSummaryStat()
                  if db is not None:
-                    addToDb(db,self.testName())
+                    addToDb(db,self.testName(),benchid)
                  if regdb is not None:
-                    addToRegDb(regdb,self.testName())
+                    addToRegDb(regdb,self.testName(),regid)
               return(error)
            else:
               msg("No FVP available")
diff --git a/CMSIS/DSP/Testing/addToDB.py b/CMSIS/DSP/Testing/addToDB.py
index 4d7b668..5b43cff 100755
--- a/CMSIS/DSP/Testing/addToDB.py
+++ b/CMSIS/DSP/Testing/addToDB.py
@@ -22,12 +22,13 @@
 MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
 MKINTFIELD=['ID', 'CYCLES']
 MKDATEFIELD=['DATE']
-MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE']
+MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE',"RUN"]
 MKKEYFIELDID={'CATEGORY':'categoryid', 
    'PLATFORM':'platformid', 
    'CORE':'coreid', 
    'COMPILER':'compilerid',
-   'TYPE':'typeid'}
+   'TYPE':'typeid',
+   'RUN':'runid'}
 
 # For table value extraction
 VALSTRFIELD=['NAME','VERSION']
@@ -56,7 +57,7 @@
   colsToKeep=[]
   cols = list(full.columns)
   params = list(elem.params.full)
-  common = diff(cols + ["TYPE"] , ['OLDID'] + params)  
+  common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params)  
  
   for field in common:
        if field in MKSTRFIELD:
@@ -76,7 +77,7 @@
      sql = "CREATE TABLE %s (" % tableName
      cols = list(full.columns)
      params = list(elem.params.full)
-     common = diff(cols + ["TYPE"] , ['OLDID'] + params)
+     common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params)
 
      sql += "%sid INTEGER PRIMARY KEY"  % (tableName)
      start = ","   
@@ -103,6 +104,7 @@
      sql += "FOREIGN KEY(platformid) REFERENCES PLATFORM(platformid),"
      sql += "FOREIGN KEY(coreid) REFERENCES CORE(coreid),"
      sql += "FOREIGN KEY(compilerid) REFERENCES COMPILER(compilerid)"
+     sql += ",FOREIGN KEY(runid) REFERENCES RUN(runid)"
      sql += "  )"
      conn.execute(sql)
 
@@ -143,7 +145,7 @@
          return(None)
 
 
-def addRows(conn,elem,tableName,full):
+def addRows(conn,elem,tableName,full,runid=0):
    # List of columns we have in DB which is
    # different from the columns in the table
    compilerid = 0
@@ -211,6 +213,7 @@
             keys[field]=row[field]
         
          
+       keys['RUN']=runid
        # Get foreign keys and create missing data
        for field in common:
         if field in VALKEYFIELD:
@@ -261,31 +264,34 @@
   conn.execute("INSERT INTO CONFIG(compilerid,platformid,coreid,date) VALUES(?,?,?,?)" ,(config['compilerid'],config['platformid'],config['coreid'],fullDate))
   conn.commit()
 
-def addOneBenchmark(elem,fullPath,db,group):
+def getGroup(a):
+    return(re.sub(r'^(.+)(F64|F32|F16|Q31|Q15|Q7|U32|U16|U8|S32|S16|S8)$',r'\1',a))
+
+def addOneBenchmark(elem,fullPath,db,group,runid):
    if os.path.isfile(fullPath):
       full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
       fullDate = datetime.datetime.now()
       full['DATE'] = fullDate
       if group:
-         tableName = group
+         tableName = getGroup(group)
       else:
-         tableName = elem.data["class"]
+         tableName = getGroup(elem.data["class"])
       conn = sqlite3.connect(db)
       createTableIfMissing(conn,elem,tableName,full)
-      config = addRows(conn,elem,tableName,full)
+      config = addRows(conn,elem,tableName,full,runid)
       addConfig(conn,config,fullDate)
       conn.close()
 
 
-def addToDB(benchmark,dbpath,elem,group):
+def addToDB(benchmark,dbpath,elem,group,runid):
   if not elem.data["deprecated"]:
      if elem.params:
          benchPath = os.path.join(benchmark,elem.fullPath(),"fullBenchmark.csv")
          print("Processing %s" % benchPath)
-         addOneBenchmark(elem,benchPath,dbpath,group)
+         addOneBenchmark(elem,benchPath,dbpath,group,runid)
          
      for c in elem.children:
-       addToDB(benchmark,dbpath,c,group)
+       addToDB(benchmark,dbpath,c,group,runid)
 
 
 
@@ -295,6 +301,7 @@
 parser.add_argument('-b', nargs='?',type = str, default="FullBenchmark", help="Full Benchmark dir path")
 #parser.add_argument('-e', action='store_true', help="Embedded test")
 parser.add_argument('-o', nargs='?',type = str, default="bench.db", help="Benchmark database")
+parser.add_argument('-r', nargs='?',type = int, default=0, help="Run ID")
 
 parser.add_argument('others', nargs=argparse.REMAINDER)
 
@@ -310,7 +317,7 @@
       group=args.others[0] 
     else:
       group=None
-    addToDB(args.b,args.o,root,group)
+    addToDB(args.b,args.o,root,group,args.r)
     
 else:
     parser.print_help()
\ No newline at end of file
diff --git a/CMSIS/DSP/Testing/addToRegDB.py b/CMSIS/DSP/Testing/addToRegDB.py
index 3e7a54a..cf6297d 100755
--- a/CMSIS/DSP/Testing/addToRegDB.py
+++ b/CMSIS/DSP/Testing/addToRegDB.py
@@ -23,12 +23,13 @@
 MKINTFIELD=['ID','MAX']
 MKREALFIELD=['MAXREGCOEF']
 MKDATEFIELD=['DATE']
-MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE']
+MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE','RUN']
 MKKEYFIELDID={'CATEGORY':'categoryid', 
    'PLATFORM':'platformid', 
    'CORE':'coreid', 
    'COMPILER':'compilerid',
-   'TYPE':'typeid'}
+   'TYPE':'typeid',
+   'RUN':'runid'}
 
 # For table value extraction
 VALSTRFIELD=['NAME','VERSION','Regression']
@@ -58,7 +59,7 @@
   colsToKeep=[]
   cols = list(full.columns)
   params=diff(elem.params.full , elem.params.summary)
-  common = diff(cols + ["TYPE"] , ['OLDID'] + params)  
+  common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params)  
  
   for field in common:
        if field in MKSTRFIELD:
@@ -80,7 +81,7 @@
      sql = "CREATE TABLE %s (" % tableName
      cols = list(full.columns)
      params=diff(elem.params.full , elem.params.summary)
-     common = diff(cols + ["TYPE"] , ['OLDID'] + params)
+     common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params)
 
      sql += "%sid INTEGER PRIMARY KEY"  % (tableName)
      start = ","   
@@ -109,6 +110,7 @@
      sql += "FOREIGN KEY(platformid) REFERENCES PLATFORM(platformid),"
      sql += "FOREIGN KEY(coreid) REFERENCES CORE(coreid),"
      sql += "FOREIGN KEY(compilerid) REFERENCES COMPILER(compilerid)"
+     sql += ",FOREIGN KEY(runid) REFERENCES RUN(runid)"
      sql += "  )"
      conn.execute(sql)
 
@@ -149,7 +151,7 @@
          return(None)
 
 
-def addRows(conn,elem,tableName,full):
+def addRows(conn,elem,tableName,full,runid=0):
    # List of columns we have in DB which is
    # different from the columns in the table
    compilerid = 0
@@ -218,7 +220,7 @@
         if field in VALBOOLFIELD:
             keys[field]=row[field]
         
-         
+       keys['RUN']=runid
        # Get foreign keys and create missing data
        for field in common:
         if field in VALKEYFIELD:
@@ -272,31 +274,34 @@
   conn.execute("INSERT INTO CONFIG(compilerid,platformid,coreid,date) VALUES(?,?,?,?)" ,(config['compilerid'],config['platformid'],config['coreid'],fullDate))
   conn.commit()
 
-def addOneBenchmark(elem,fullPath,db,group):
+def getGroup(a):
+    return(re.sub(r'^(.+)(F64|F32|F16|Q31|Q15|Q7|U32|U16|U8|S32|S16|S8)$',r'\1',a))
+
+def addOneBenchmark(elem,fullPath,db,group,runid):
    if os.path.isfile(fullPath):
       full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
       fullDate = datetime.datetime.now()
       full['DATE'] = fullDate
       if group:
-         tableName = group
+         tableName = getGroup(group)
       else:
-         tableName = elem.data["class"]
+         tableName = getGroup(elem.data["class"])
       conn = sqlite3.connect(db)
       createTableIfMissing(conn,elem,tableName,full)
-      config = addRows(conn,elem,tableName,full)
+      config = addRows(conn,elem,tableName,full,runid)
       addConfig(conn,config,fullDate)
       conn.close()
 
 
-def addToDB(benchmark,dbpath,elem,group):
+def addToDB(benchmark,dbpath,elem,group,runid):
   if not elem.data["deprecated"]:
      if elem.params:
          benchPath = os.path.join(benchmark,elem.fullPath(),"regression.csv")
          print("Processing %s" % benchPath)
-         addOneBenchmark(elem,benchPath,dbpath,group)
+         addOneBenchmark(elem,benchPath,dbpath,group,runid)
          
      for c in elem.children:
-       addToDB(benchmark,dbpath,c,group)
+       addToDB(benchmark,dbpath,c,group,runid)
 
 
 
@@ -306,6 +311,7 @@
 parser.add_argument('-b', nargs='?',type = str, default="FullBenchmark", help="Full Benchmark dir path")
 #parser.add_argument('-e', action='store_true', help="Embedded test")
 parser.add_argument('-o', nargs='?',type = str, default="reg.db", help="Regression benchmark database")
+parser.add_argument('-r', nargs='?',type = int, default=0, help="Run ID")
 
 parser.add_argument('others', nargs=argparse.REMAINDER)
 
@@ -321,7 +327,7 @@
       group=args.others[0] 
     else:
       group=None
-    addToDB(args.b,args.o,root,group)
+    addToDB(args.b,args.o,root,group,args.r)
     
 else:
     parser.print_help()
\ No newline at end of file
diff --git a/CMSIS/DSP/Testing/bench.txt b/CMSIS/DSP/Testing/bench.txt
index c4a9a3a..2a5c83c 100755
--- a/CMSIS/DSP/Testing/bench.txt
+++ b/CMSIS/DSP/Testing/bench.txt
@@ -351,8 +351,8 @@
                 Output  ERR_F32_ID : Err
 
                 Params PARAM1_ID = {
-                  NumTaps = [4,8,16,32,64]
-                  NB = [16,64,128,256]
+                  NumTaps = [16,32,64]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -388,8 +388,8 @@
                 Output  ERR_Q31_ID : Err
 
                 Params PARAM1_ID = {
-                  NumTaps = [4,8,16,32,64]
-                  NB = [16,64,128,256]
+                  NumTaps = [16,32,64]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -425,8 +425,8 @@
                 Output  ERR_Q15_ID : Err
 
                 Params PARAM1_ID = {
-                  NumTaps = [4,8,16,32,64]
-                  NB = [16,64,128,256]
+                  NumTaps = [16,32,64]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -464,8 +464,8 @@
                 Output  OUT_SAMPLES_F32_ID : Output
 
                 Params PARAM1_ID = {
-                  NBA = [4,5,9,16,64]
-                  NBB = [5,9,16,128]
+                  NBA = [9,16,64]
+                  NBB = [9,16,128]
                 }
 
                 Functions {
@@ -495,8 +495,8 @@
                 Output  OUT_SAMPLES_Q31_ID : Output
 
                 Params PARAM1_ID = {
-                  NBA = [4,5,9,16,64]
-                  NBB = [5,9,16,128]
+                  NBA = [9,16,64]
+                  NBB = [9,16,128]
                 }
 
                 Functions {
@@ -526,8 +526,8 @@
                 Output  OUT_SAMPLES_Q15_ID : Output
 
                 Params PARAM1_ID = {
-                  NBA = [4,5,9,16,64]
-                  NBB = [5,9,16,128]
+                  NBA = [9,16,64]
+                  NBB = [9,16,128]
                 }
 
                 Functions {
@@ -557,8 +557,8 @@
                 Output  OUT_SAMPLES_Q7_ID : Output
 
                 Params PARAM1_ID = {
-                  NBA = [4,5,9,16,64]
-                  NBB = [5,9,16,128]
+                  NBA = [9,16,64]
+                  NBB = [9,16,128]
                 }
 
                 Functions {
@@ -762,7 +762,7 @@
                 Output  OUT_SAMPLES_F32_ID : Output
 
                 Params PARAM1_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -792,7 +792,7 @@
                 Output  OUT_SAMPLES_Q31_ID : Output
 
                 Params PARAM1_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -820,7 +820,7 @@
                 Output  OUT_SAMPLES_Q15_ID : Output
 
                 Params PARAM1_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -848,7 +848,7 @@
                 Output  OUT_SAMPLES_F32_ID : Output
 
                 Params PARAM1_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -873,7 +873,7 @@
                 Output  OUT_SAMPLES_Q31_ID : Output
 
                 Params PARAM1_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -898,7 +898,7 @@
                 Output  OUT_SAMPLES_Q15_ID : Output
 
                 Params PARAM1_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -962,7 +962,7 @@
                 Output  OUT_SAMPLES_F32_ID : Output
 
                 Params PARAM1_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -992,7 +992,7 @@
                 Output  OUT_SAMPLES_Q31_ID : Output
 
                 Params PARAM1_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -1020,7 +1020,7 @@
                 Output  OUT_SAMPLES_Q15_ID : Output
 
                 Params PARAM1_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -1048,7 +1048,7 @@
                 Output  OUT_SAMPLES_Q7_ID : Output
 
                 Params PARAM1_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                 }
 
                 Functions {
@@ -1292,7 +1292,7 @@
 
                 
                 Params CFFT_PARAM_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                   IFFT = [0,1]
                   REV = [0,1]
                 }
@@ -1304,7 +1304,7 @@
                 }
 
                 Params RFFT_PARAM_ID = {
-                  NB = [32,64,128,256]
+                  NB = [64,128,256]
                   IFFT = [0,1]
                   REV = [1]
                 }
@@ -1342,7 +1342,7 @@
 
                 
                 Params CFFT_PARAM_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                   IFFT = [0,1]
                   REV = [0,1]
                 }
@@ -1354,7 +1354,7 @@
                 }
 
                 Params RFFT_PARAM_ID = {
-                  NB = [32,64,128,256]
+                  NB = [64,128,256]
                   IFFT = [0,1]
                   REV = [0,1]
                 }
@@ -1392,7 +1392,7 @@
 
                 
                 Params CFFT_PARAM_ID = {
-                  NB = [16,64,128,256]
+                  NB = [64,128,256]
                   IFFT = [0,1]
                   REV = [0,1]
                 }
@@ -1404,7 +1404,7 @@
                 }
 
                 Params RFFT_PARAM_ID = {
-                  NB = [32,64,128,256]
+                  NB = [64,128,256]
                   IFFT = [0,1]
                   REV = [1]
                 }
diff --git a/CMSIS/DSP/Testing/createDb.sql b/CMSIS/DSP/Testing/createDb.sql
index eca7a75..9285be6 100755
--- a/CMSIS/DSP/Testing/createDb.sql
+++ b/CMSIS/DSP/Testing/createDb.sql
@@ -29,6 +29,11 @@
 CREATE INDEX compiler_date_index ON COMPILER(date);
 CREATE INDEX compiler_all_index ON COMPILER(compilerkindid,version,date);
 
+CREATE TABLE RUN (
+    runid INTEGER PRIMARY KEY,
+    date text
+    );
+
 CREATE TABLE TYPE (
     typeid INTEGER PRIMARY KEY,
     type text );
@@ -76,15 +81,23 @@
 INSERT INTO CORE VALUES(3,"m3","ARMCM3");
 INSERT INTO CORE VALUES(4,"m4","ARMCM4");
 INSERT INTO CORE VALUES(5,"m4f","ARMCM4_FP");
-INSERT INTO CORE VALUES(6,"m7","ARMCM7_DP");
+INSERT INTO CORE VALUES(6,"m7d","ARMCM7_DP");
 INSERT INTO CORE VALUES(7,"m23","ARMCM23");
-INSERT INTO CORE VALUES(8,"m33","ARMCM33_DSP_FP");
-INSERT INTO CORE VALUES(9,"m35","ARMCM35P_DSP_FP");
+INSERT INTO CORE VALUES(8,"m33f","ARMCM33_DSP_FP");
+INSERT INTO CORE VALUES(9,"m35f","ARMCM35P_DSP_FP");
 INSERT INTO CORE VALUES(10,"a5","ARMCA5");
 INSERT INTO CORE VALUES(11,"a7","ARMCA7");
 INSERT INTO CORE VALUES(12,"a9","ARMCA9");
 INSERT INTO CORE VALUES(13,"a15","ARMCA15");
-INSERT INTO CORE VALUES(14,"m55","ARMv81MML_DSP_DP_MVE_FP");
+INSERT INTO CORE VALUES(14,"m55mvef","ARMv81MML_DSP_DP_MVE_FP");
+
+INSERT INTO CORE VALUES(15,"m0","M0");
+INSERT INTO CORE VALUES(16,"m7","M7");
+INSERT INTO CORE VALUES(17,"m33","M33");
+INSERT INTO CORE VALUES(18,"m4","M4");
+INSERT INTO CORE VALUES(19,"m55 mve","M55");
+INSERT INTO CORE VALUES(20,"m55 scalar","M55SCALAR");
+
 
 .quit
 
diff --git a/CMSIS/DSP/Testing/examples.sql b/CMSIS/DSP/Testing/examples.sql
index e4c201c..660ff20 100755
--- a/CMSIS/DSP/Testing/examples.sql
+++ b/CMSIS/DSP/Testing/examples.sql
@@ -6,7 +6,7 @@
 .headers ON
 .mode csv 
 
-/*
+
 select NB,CATEGORY.category,NAME,CYCLES,PLATFORM.platform,CORE.core,COMPILERKIND.compiler,COMPILER.version,BasicMathsBenchmarksF32.DATE 
   from BasicMathsBenchmarksF32
   INNER JOIN CATEGORY USING(categoryid)
@@ -15,8 +15,9 @@
   INNER JOIN COMPILER USING(compilerid)
   INNER JOIN COMPILERKIND USING(compilerkindid)
   ;
-*/
 
+
+/*
 select Regression,MAX,MAXREGCOEF,CATEGORY.category,NAME,PLATFORM.platform,CORE.core,COMPILERKIND.compiler,COMPILER.version,BasicMathsBenchmarksF32.DATE 
   from BasicMathsBenchmarksF32
   INNER JOIN CATEGORY USING(categoryid)
@@ -25,7 +26,7 @@
   INNER JOIN COMPILER USING(compilerid)
   INNER JOIN COMPILERKIND USING(compilerkindid)
   ;
-
+*/
 /* 
 
 Compute the max cycles for a test configuration (category + name)
diff --git a/CMSIS/DSP/Testing/extractDb.py b/CMSIS/DSP/Testing/extractDb.py
new file mode 100755
index 0000000..13595bd
--- /dev/null
+++ b/CMSIS/DSP/Testing/extractDb.py
@@ -0,0 +1,283 @@
+import argparse
+import sqlite3
+import re
+import pandas as pd
+import numpy as np
+
+# Command to get last runid 
+lastID="""SELECT runid FROM RUN ORDER BY runid DESC LIMIT 1
+"""
+
+def getLastRunID():
+  r=c.execute(lastID)
+  return(int(r.fetchone()[0]))
+
+
+runid = 1
+
+parser = argparse.ArgumentParser(description='Generate summary benchmarks')
+
+parser.add_argument('-b', nargs='?',type = str, default="bench.db", help="Benchmark database")
+parser.add_argument('-o', nargs='?',type = str, default="full.md", help="Full summary")
+parser.add_argument('-r', action='store_true', help="Regression database")
+
+# For runid or runid range
+parser.add_argument('others', nargs=argparse.REMAINDER)
+
+args = parser.parse_args()
+
+c = sqlite3.connect(args.b)
+
+if args.others:
+   runid=int(args.others[0])
+else:
+   runid=getLastRunID()
+
+# We extract data only from data tables
+# The tables below are only used for descriptions
+REMOVETABLES=['RUN','CORE', 'PLATFORM', 'COMPILERKIND', 'COMPILER', 'TYPE', 'CATEGORY', 'CONFIG']
+
+# This is assuming the database is generated by the regression script
+# So platform is the same for all benchmarks.
+# Category and type come from the test name in the yaml
+# So no need to add this information here
+# Name is removed here because it is added at the beginning
+REMOVECOLUMNS=['runid','NAME','type','platform','category','coredef','OPTIMIZED','HARDFP','FASTMATH','NEON','HELIUM','UNROLL','ROUNDING','DATE','compilerkindid','date','categoryid', 'ID', 'platformid', 'coreid', 'compilerid', 'typeid']
+
+# Get existing benchmark tables
+def getBenchTables():
+    r=c.execute("SELECT name FROM sqlite_master WHERE type='table'")
+    benchtables=[]
+    for table in r:
+        if not table[0] in REMOVETABLES:
+          benchtables.append(table[0])
+    return(benchtables)
+
+# get existing types in a table
+def getExistingTypes(benchTable):
+    r=c.execute("select distinct typeid from %s" % benchTable).fetchall()
+    result=[x[0] for x in r]
+    return(result)
+
+# Get compilers from specific type and table
+versioncompiler="""select distinct compiler,version from %s 
+  INNER JOIN COMPILER USING(compilerid)
+  INNER JOIN COMPILERKIND USING(compilerkindid) WHERE typeid=?"""
+
+# Get existing compiler in a table for a specific type
+# (In case report is structured by types)
+def getExistingCompiler(benchTable,typeid):
+    r=c.execute(versioncompiler % benchTable,(typeid,)).fetchall()
+    return(r)
+
+# Get type name from type id
+def getTypeName(typeid):
+    r=c.execute("select type from TYPE where typeid=?",(typeid,)).fetchone()
+    return(r[0])
+ 
+# Diff of 2 lists 
+def diff(first, second):
+        second = set(second)
+        return [item for item in first if item not in second]
+
+
+# Command to get data for specific compiler 
+# and type
+benchCmd="""select %s from %s
+  INNER JOIN CATEGORY USING(categoryid)
+  INNER JOIN PLATFORM USING(platformid)
+  INNER JOIN CORE USING(coreid)
+  INNER JOIN COMPILER USING(compilerid)
+  INNER JOIN COMPILERKIND USING(compilerkindid)
+  INNER JOIN TYPE USING(typeid)
+  WHERE compiler=? AND VERSION=? AND typeid = ? AND runid = ?
+  """
+
+# Command to get test names for specific compiler 
+# and type
+benchNames="""select distinct NAME from %s
+  INNER JOIN COMPILER USING(compilerid)
+  INNER JOIN COMPILERKIND USING(compilerkindid)
+  INNER JOIN TYPE USING(typeid)
+  WHERE compiler=? AND VERSION=? AND typeid = ? AND runid = ?
+  """
+
+# Command to get columns for specific table
+benchCmdColumns="""select * from %s
+  INNER JOIN CATEGORY USING(categoryid)
+  INNER JOIN PLATFORM USING(platformid)
+  INNER JOIN CORE USING(coreid)
+  INNER JOIN COMPILER USING(compilerid)
+  INNER JOIN COMPILERKIND USING(compilerkindid)
+  INNER JOIN TYPE USING(typeid)
+  """
+
+def joinit(iterable, delimiter):
+    it = iter(iterable)
+    yield next(it)
+    for x in it:
+        yield delimiter
+        yield x
+
+# True when the column name does not end with "id"
+# (such columns are often the primary key for the table)
+def isNotIDColumn(col):
+    if re.match(r'^.*id$',col):
+        return(False)
+    else:
+        return(True)
+    
+# Get test names
+# for specific typeid and compiler (for the data)
+def getTestNames(benchTable,comp,typeid):
+    vals=(comp[0],comp[1],typeid,runid)
+    result=c.execute(benchNames % benchTable,vals).fetchall()
+    return([x[0] for x in list(result)])
+
+# Get names of columns and data for a table
+# for specific typeid and compiler (for the data)
+def getColNamesAndData(benchTable,comp,typeid):
+    cursor=c.cursor()
+    result=cursor.execute(benchCmdColumns % (benchTable))
+    cols= [member[0] for member in cursor.description]
+    keepCols = ['NAME'] + [c for c in diff(cols , REMOVECOLUMNS) if isNotIDColumn(c)]
+    keepColsStr = "".join(joinit(keepCols,","))
+    vals=(comp[0],comp[1],typeid,runid)
+    result=cursor.execute(benchCmd % (keepColsStr,benchTable),vals)
+    vals =np.array([list(x) for x in list(result)])
+    return(keepCols,vals)
+
+# Write columns in markdown format
+def writeColumns(f,cols):
+    colStr = "".join(joinit(cols,"|"))
+    f.write("|")
+    f.write(colStr)
+    f.write("|\n")
+    sepStr="".join(joinit([":-:" for x in cols],"|"))
+    f.write("|")
+    f.write(sepStr)
+    f.write("|\n")
+
+# Write row in markdown format
+def writeRow(f,row):
+    row=[str(x) for x in row]
+    rowStr = "".join(joinit(row,"|"))
+    f.write("|")
+    f.write(rowStr)
+    f.write("|\n")
+
+PARAMS=["NB","NumTaps", "NBA", "NBB", "Factor", "NumStages","VECDIM","NBR","NBC","NBI","IFFT", "BITREV"]
+
+def regressionTableFor(name,output,ref,toSort,indexCols,field):
+    data=ref.pivot_table(index=indexCols, columns='core', 
+    values=[field], aggfunc='first')
+       
+    data=data.sort_values(toSort)
+       
+    cores = [c[1] for c in list(data.columns)]
+    columns = diff(indexCols,['NAME']) + cores
+
+    writeColumns(output,columns)
+    dataForFunc=data.loc[name]
+    if type(dataForFunc) is pd.DataFrame:
+       for row in dataForFunc.itertuples():
+           row=list(row)
+           if type(row[0]) is int:
+              row=[row[0]] + row[1:]
+           else: 
+              row=list(row[0]) + row[1:]
+           writeRow(output,row)
+    else:
+       writeRow(output,dataForFunc)
+
+def formatTableByCore(output,testNames,cols,vals):
+    if vals.size != 0:
+       ref=pd.DataFrame(vals,columns=cols)
+       toSort=["NAME"]
+       
+       for param in PARAMS:
+          if param in ref.columns:
+             ref[param]=pd.to_numeric(ref[param])
+             toSort.append(param)
+       if args.r:
+         #  Regression table
+         ref['MAX']=pd.to_numeric(ref['MAX'])
+         ref['MAXREGCOEF']=pd.to_numeric(ref['MAXREGCOEF'])
+       
+         indexCols=diff(cols,['core','Regression','MAXREGCOEF','MAX','version','compiler'])
+         valList = ['Regression']
+       else:
+         ref['CYCLES']=pd.to_numeric(ref['CYCLES'])
+       
+         indexCols=diff(cols,['core','CYCLES','version','compiler'])
+         valList = ['CYCLES']
+      
+       
+
+       for name in testNames:
+           if args.r:
+              output.write("#### %s\n" % name)
+
+              output.write("##### Regression\n" )
+              regressionTableFor(name,output,ref,toSort,indexCols,'Regression')
+              
+              output.write("##### Max cycles\n" )
+              regressionTableFor(name,output,ref,toSort,indexCols,'MAX')
+              
+              output.write("##### Max Reg Coef\n" )
+              regressionTableFor(name,output,ref,toSort,indexCols,'MAXREGCOEF')
+
+           else:
+              data=ref.pivot_table(index=indexCols, columns='core', 
+              values=valList, aggfunc='first')
+       
+              data=data.sort_values(toSort)
+       
+              cores = [c[1] for c in list(data.columns)]
+              columns = diff(indexCols,['NAME']) + cores
+
+              output.write("#### %s\n" % name)
+              writeColumns(output,columns)
+              dataForFunc=data.loc[name]
+              if type(dataForFunc) is pd.DataFrame:
+                 for row in dataForFunc.itertuples():
+                     row=list(row)
+                     if type(row[0]) is int:
+                        row=[row[0]] + row[1:]
+                     else: 
+                        row=list(row[0]) + row[1:]
+                     writeRow(output,row)
+              else:
+                 writeRow(output,dataForFunc)
+
+# Add a report for each table
+def addReportFor(output,benchName):
+    print("Process %s\n" % benchName)
+    output.write("# %s\n" % benchName)
+    allTypes = getExistingTypes(benchName)
+    # Add report for each type
+    for aTypeID in allTypes:
+        typeName = getTypeName(aTypeID)
+        output.write("## %s\n" % typeName)
+        ## Add report for each compiler
+        allCompilers = getExistingCompiler(benchName,aTypeID)
+        for compiler in allCompilers:
+            #print(compiler)
+            output.write("### %s (%s)\n" % compiler)
+            cols,vals=getColNamesAndData(benchName,compiler,aTypeID)
+            names=getTestNames(benchName,compiler,aTypeID)
+            formatTableByCore(output,names,cols,vals)
+           
+
+
+
+
+try:
+  with open(args.o,"w") as output:
+      benchtables=getBenchTables()
+      for bench in benchtables:
+          addReportFor(output,bench)
+finally:
+     c.close()
+
+
diff --git a/CMSIS/DSP/Testing/runAllBenchmarks.bat b/CMSIS/DSP/Testing/runAllBenchmarks.bat
deleted file mode 100755
index 3074ddc..0000000
--- a/CMSIS/DSP/Testing/runAllBenchmarks.bat
+++ /dev/null
@@ -1,70 +0,0 @@
-@ECHO OFF 
-
-echo "Basic Maths"
-python processTests.py -e BasicBenchmarks
-call:runBench
-
-echo "Complex Maths"
-python processTests.py -e ComplexBenchmarks
-call:runBench
-
-echo "FIR"
-python processTests.py -e FIR
-call:runBench
-
-echo "Convolution / Correlation"
-python processTests.py -e MISC
-call:runBench
-
-echo "Decimation / Interpolation"
-python processTests.py -e DECIM
-call:runBench
-
-echo "BiQuad"
-python processTests.py -e BIQUAD
-call:runBench
-
-echo "Controller"
-python processTests.py -e Controller
-call:runBench
-
-echo "Fast Math"
-python processTests.py -e FastMath
-call:runBench
-
-echo "Barycenter"
-python processTests.py -e SupportBarF32
-call:runBench
-
-echo "Support"
-python processTests.py -e Support
-call:runBench
-
-echo "Unary Matrix"
-python processTests.py -e Unary 
-call:runBench
-
-echo "Binary Matrix"
-python processTests.py -e Binary
-call:runBench
-
-echo "Transform"
-python processTests.py -e Transform
-call:runBench
-
-EXIT /B
-
-:runBench
-REM pushd build_m7
-REM pushd build_m0
-pushd build_a5
-make
-REM "C:\Program Files\ARM\Development Studio 2019.0\sw\models\bin\FVP_MPS2_Cortex-M7.exe" -a Testing > result.txt
-REM "C:\Program Files\ARM\Development Studio 2019.0\sw\models\bin\FVP_MPS2_Cortex-M0.exe" -a Testing > result.txt
-"C:\Program Files\ARM\Development Studio 2019.0\sw\models\bin\FVP_VE_Cortex-A5x1.exe" -a Testing  > result.txt
-popd
-echo "Parse result"
-REM python processResult.py -e -r build_m7\result.txt
-REM python processResult.py -e -r build_m0\result.txt
-python processResult.py -e -r build_a5\result.txt
-goto:eof
\ No newline at end of file
diff --git a/CMSIS/DSP/Testing/runAllBenchmarks.py b/CMSIS/DSP/Testing/runAllBenchmarks.py
deleted file mode 100755
index b62a15c..0000000
--- a/CMSIS/DSP/Testing/runAllBenchmarks.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import os
-import os.path
-import subprocess 
-import colorama
-from colorama import init,Fore, Back, Style
-import argparse
-
-GROUPS = [
-"BasicBenchmarks",
-"ComplexBenchmarks",
-"FIR",
-"MISC",
-"DECIM",
-"BIQUAD",
-"Controller",
-"FastMath",
-"SupportBarF32",
-"Support",
-"Unary",
-"Binary",
-"Transform"
-]
-
-init()
-
-def msg(t):
-    print(Fore.CYAN + t + Style.RESET_ALL)
-
-def processTest(test):
-    subprocess.call(["python","processTests.py","-e",test])
-
-def addToDB(cmd):
-    for g in GROUPS:
-        msg("Add group %s" % g)
-        subprocess.call(["python",cmd,g])
-
-def run(build,fvp,custom=None):
-    result = "results.txt"
-    resultPath = os.path.join(build,result)
-
-    current=os.getcwd()
-    try:
-       msg("Build" )
-       os.chdir(build)
-       subprocess.call(["make"])
-       msg("Run")
-       with open(result,"w") as results:
-          if custom:
-            subprocess.call([fvp] + custom,stdout=results)
-          else:
-             subprocess.call([fvp,"-a","Testing"],stdout=results)
-    finally:
-       os.chdir(current)
-
-    msg("Parse result")
-    subprocess.call(["python","processResult.py","-e","-r",resultPath])
-
-    msg("Regression computations")
-    subprocess.call(["python","summaryBench.py","-r",resultPath])
-
-    msg("Add results to benchmark database")
-    addToDB("addToDB.py")
-
-    msg("Add results to regression database")
-    addToDB("addToRegDB.py")
-
-
-
-def processAndRun(buildfolder,fvp,custom=None):
-    processTest("DSPBenchmarks")
-    run(buildfolder,fvp,custom=custom)
-
-parser = argparse.ArgumentParser(description='Parse test description')
-parser.add_argument('-f', nargs='?',type = str, default="build_benchmark_m7", help="Build folder")
-parser.add_argument('-v', nargs='?',type = str, default="C:\\Program Files\\ARM\\Development Studio 2019.0\\sw\\models\\bin\\FVP_MPS2_Cortex-M7.exe", help="Fast Model")
-parser.add_argument('-c', nargs='?',type = str, help="Custom args")
-
-args = parser.parse_args()
-
-if args.f is not None:
-   BUILDFOLDER=args.f
-else:
-   BUILDFOLDER="build_benchmark_m7"
-
-if args.v is not None:
-   FVP=args.v
-else:
-   FVP="C:\\Program Files\\ARM\\Development Studio 2019.0\\sw\\models\\bin\\FVP_MPS2_Cortex-M7.exe"
-
-
-if args.c:
-    custom = args.c.split()
-else:
-    custom = None
-
-print(Fore.RED + "bench.db and reg.db databases must exist before running this script" + Style.RESET_ALL)
-
-msg("Process benchmark description file")
-subprocess.call(["python", "preprocess.py","-f","bench.txt"])
-
-msg("Generate all missing C files")
-subprocess.call(["python","processTests.py", "-e"])
-
-
-processAndRun(BUILDFOLDER,FVP,custom=custom)
diff --git a/CMSIS/DSP/Testing/runAllTests.py b/CMSIS/DSP/Testing/runAllTests.py
index d1455ed..12d1e3e 100755
--- a/CMSIS/DSP/Testing/runAllTests.py
+++ b/CMSIS/DSP/Testing/runAllTests.py
@@ -9,7 +9,29 @@
 import sys
 import itertools
 from pathlib import Path
+import sqlite3
 
+# Command to get last runid 
+lastID="""SELECT runid FROM RUN ORDER BY runid DESC LIMIT 1
+"""
+
+addNewIDCmd="""INSERT INTO RUN VALUES(?,date('now'))
+"""
+
+benchID = 0
+regID = 0
+
+def getLastRunID(c):
+  r=c.execute(lastID)
+  result=r.fetchone()
+  if result is None:
+     return(0)
+  else:
+     return(int(result[0]))
+
+def addNewID(c,newid):
+  c.execute(addNewIDCmd,(newid,))
+  c.commit()
 
 # Small state machine
 def updateTestStatus(testStatusForThisBuild,newTestStatus):
@@ -120,10 +142,26 @@
    if not os.path.exists(args.db):
       createDb(args.sqlite,args.db)
 
+   conn = sqlite3.connect(args.db)
+   try:
+      currentID = getLastRunID(conn)
+      benchID = currentID + 1 
+      addNewID(conn,benchID)
+   finally:
+     conn.close()
+
 if args.regdb is not None:
    if not os.path.exists(args.regdb):
       createDb(args.sqlite,args.regdb)
 
+   conn = sqlite3.connect(args.regdb)
+   try:
+      currentID = getLastRunID(conn)
+      regID = currentID + 1 
+      addNewID(conn,regID)
+   finally:
+     conn.close()
+
 
 with open(args.i,"r") as f:
      config=yaml.safe_load(f)
@@ -208,7 +246,7 @@
                   build.createArchive(flags)
                   msg("Config " + str(flagConfig) + "\n")
 
-                  build.createCMake(flags,args.b,args.p)
+                  build.createCMake(core,flags,args.b,args.p)
                   for test in config["TESTS"]:
                       msg(test["testName"]+"\n")
                       testClass=test["testClass"]
@@ -220,7 +258,7 @@
                       if 'SIM' in config:
                         if core in config['SIM']:
                            fvp = config['SIM'][core] 
-                      newTestStatus = test.runAndProcess(compiler,fvp,sim,args.b,args.db,args.regdb)
+                      newTestStatus = test.runAndProcess(compiler,fvp,sim,args.b,args.db,args.regdb,benchID,regID)
                       testStatusForThisBuild = updateTestStatus(testStatusForThisBuild,newTestStatus)
                       if testStatusForThisBuild != NOTESTFAILED:
                          failedBuild[buildStr] = testStatusForThisBuild
diff --git a/CMSIS/DSP/Toolchain/AC6.cmake b/CMSIS/DSP/Toolchain/AC6.cmake
index ec98852..f232acd 100644
--- a/CMSIS/DSP/Toolchain/AC6.cmake
+++ b/CMSIS/DSP/Toolchain/AC6.cmake
@@ -17,7 +17,7 @@
   get_target_property(DISABLEOPTIM ${PROJECTNAME} DISABLEOPTIMIZATION)
   if ((OPTIMIZED) AND (NOT DISABLEOPTIM))
     #cmake_print_variables(DISABLEOPTIM)
-    target_compile_options(${PROJECTNAME} PRIVATE "-O3")
+    target_compile_options(${PROJECTNAME} PRIVATE "-Ofast")
   endif()
 
   if (FASTMATHCOMPUTATIONS)