CMSIS-DSP: Improvement to testing scripts
diff --git a/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h b/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h
index 65fc752..b2a0690 100755
--- a/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h
+++ b/CMSIS/DSP/PrivateInclude/arm_vec_filtering.h
@@ -253,291 +253,223 @@
acc1 = vecAddAcrossF32Mve(acc1Vec); \
}
-#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count) \
-{ \
- float32_t const *pSrcX; \
- f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
- uint32_t k; \
- \
- acc0Vec = vdupq_n_f32(0.0f); \
- acc1Vec = vdupq_n_f32(0.0f); \
- pSrcX = (float32_t const *) pX; \
- k = (count - 1) >> 2; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
- pY-=4; \
- xVec = vldrwq_f32(&pSrcX[1]); \
- acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = (count - 1) % 0x4U; \
- mve_pred16_t p0 = vctp32q(k); \
- yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
- xVec = vldrwq_f32(&pSrcX[1]); \
- acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- p0 = vctp32q(k+1); \
- acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
- \
- acc0 = vecAddAcrossF32Mve(acc0Vec); \
- acc1 = vecAddAcrossF32Mve(acc1Vec); \
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count) \
+{ \
+ float32_t const *pSrcX; \
+ f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
+ uint32_t k; \
+ \
+ acc0Vec = vdupq_n_f32(0.0f); \
+ acc1Vec = vdupq_n_f32(0.0f); \
+ pSrcX = (float32_t const *) pX; \
+ k = (count - 1) >> 2; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
+ pY-=4; \
+ xVec = vldrwq_f32(&pSrcX[1]); \
+ acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
+ acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ /* Loop with tail predication expected here */ \
+ k = (count - 1) % 0x4U; \
+ mve_pred16_t p0 = vctp32q(k); \
+ yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
+ xVec = vldrwq_f32(&pSrcX[1]); \
+ acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
+ p0 = vctp32q(k+1); \
+ acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
+ \
+ acc0 = vecAddAcrossF32Mve(acc0Vec); \
+ acc1 = vecAddAcrossF32Mve(acc1Vec); \
}
-#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count) \
-{ \
- float32_t const *pSrcX; \
- f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
- uint32_t k; \
- \
- acc0Vec = vdupq_n_f32(0.0f); \
- acc1Vec = vdupq_n_f32(0.0f); \
- pSrcX = (float32_t const *) pX; \
- k = count >> 2; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
- pY-=4; \
- xVec = vldrwq_f32(&pSrcX[1]); \
- acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = count % 0x4U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp32q(k); \
- yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
- xVec = vldrwq_f32(&pSrcX[1]); \
- acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
- } \
- acc0 = vecAddAcrossF32Mve(acc0Vec); \
- acc1 = vecAddAcrossF32Mve(acc1Vec); \
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count) \
+{ \
+ float32_t const *pSrcX; \
+ f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
+ uint32_t k; \
+ \
+ acc0Vec = vdupq_n_f32(0.0f); \
+ acc1Vec = vdupq_n_f32(0.0f); \
+ pSrcX = (float32_t const *) pX; \
+ k = count >> 2; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
+ pY-=4; \
+ xVec = vldrwq_f32(&pSrcX[1]); \
+ acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
+ acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ /* Loop with tail predication expected here */ \
+ k = count % 0x4U; \
+ if (k > 0U) \
+ { \
+ mve_pred16_t p0 = vctp32q(k); \
+ yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
+ xVec = vldrwq_f32(&pSrcX[1]); \
+ acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
+ acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
+ } \
+ acc0 = vecAddAcrossF32Mve(acc0Vec); \
+ acc1 = vecAddAcrossF32Mve(acc1Vec); \
}
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count) \
-{ \
- float32_t const *pSrcX; \
- const float32_t *pY1 = pY + 1; \
- f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
- uint32_t k; \
- \
- acc0Vec = vdupq_n_f32(0.0f); \
- acc1Vec = vdupq_n_f32(0.0f); \
- pSrcX = (float32_t const *) pX; \
- k = count >> 2; \
- \
- while (k > 0U) \
- { \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
- pY-=4; \
- acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
- yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \
- pY1-=4; \
- acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- k = count % 0x4U; \
- /* use predication to finalize MAC sum */ \
- /* acc0 requires exact number of sample */ \
- /* disable extra lanes in final MAC computation */ \
- mve_pred16_t p0 = vctp32q(k); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
- acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
- yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \
- /* acc1 requires 1 additional sample */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- p0 = vctp32q(k+1); \
- acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
- \
- acc0 = vecAddAcrossF32Mve(acc0Vec); \
- acc1 = vecAddAcrossF32Mve(acc1Vec); \
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count)\
+{ \
+ float32_t const *pSrcX; \
+ const float32_t *pY1 = pY + 1; \
+ f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
+ uint32_t k; \
+ \
+ acc0Vec = vdupq_n_f32(0.0f); \
+ acc1Vec = vdupq_n_f32(0.0f); \
+ pSrcX = (float32_t const *) pX; \
+ k = count >> 2; \
+ \
+ while (k > 0U) \
+ { \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
+ yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
+ pY-=4; \
+ acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
+ yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \
+ pY1-=4; \
+ acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ k = count % 0x4U; \
+ /* use predication to finalize MAC sum */ \
+    /* acc0 requires exact number of samples */                   \
+ /* disable extra lanes in final MAC computation */ \
+ mve_pred16_t p0 = vctp32q(k); \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
+ yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
+ acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
+ yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \
+ /* acc1 requires 1 additional sample */ \
+ /* so add 1 to unmask an extra lane in final MAC computation */ \
+ p0 = vctp32q(k+1); \
+ acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
+ \
+ acc0 = vecAddAcrossF32Mve(acc0Vec); \
+ acc1 = vecAddAcrossF32Mve(acc1Vec); \
}
-#define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count) \
-{ \
- float32_t const *pSrcX; \
- f32x4_t accVec, xVec, yVec; \
- uint32_t k; \
- \
- accVec = vdupq_n_f32(0.0f); \
- pSrcX = (float32_t const *) pX; \
- k = count >> 2; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
- pY-=4; \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- accVec = vfmaq_f32(accVec, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = count % 0x4U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp32q(k); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
- accVec = vfmaq_m_f32(accVec, xVec, yVec, p0); \
- } \
- acc = vecAddAcrossF32Mve(accVec); \
+#define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count) \
+{ \
+ float32_t const *pSrcX; \
+ f32x4_t accVec, xVec, yVec; \
+ uint32_t k; \
+ \
+ accVec = vdupq_n_f32(0.0f); \
+ pSrcX = (float32_t const *) pX; \
+ k = count >> 2; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
+ pY-=4; \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
+ accVec = vfmaq_f32(accVec, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ /* Loop with tail predication expected here */ \
+ k = count % 0x4U; \
+ if (k > 0U) \
+ { \
+ mve_pred16_t p0 = vctp32q(k); \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
+ yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
+ accVec = vfmaq_m_f32(accVec, xVec, yVec, p0); \
+ } \
+ acc = vecAddAcrossF32Mve(accVec); \
}
#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
-#define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count) \
-{ \
- q31_t const *pSrcX; \
- q31x4_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q31_t const *) pX; \
- k = count >> 2; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
- pY-=4; \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc = vmlaldavaq(acc, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = count % 0x4U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp32q(k); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
- acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
- } \
- acc = asrl(acc, 31); \
+#define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count) \
+{ \
+ q31_t const *pSrcX; \
+ q31x4_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q31_t const *) pX; \
+ k = count >> 2; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
+ pY-=4; \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
+ acc = vmlaldavaq(acc, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ /* Loop with tail predication expected here */ \
+ k = count % 0x4U; \
+ if (k > 0U) \
+ { \
+ mve_pred16_t p0 = vctp32q(k); \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
+ yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
+ acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
+ } \
+ acc = asrl(acc, 31); \
}
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count) \
-{ \
- q31_t const *pSrcX; \
- const q31_t *pY1 = pY + 1; \
- q31x4_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q31_t const *) pX; \
- k = count >> 2; \
- \
- while (k > 0U) \
- { \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
- pY-=4; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \
- pY1-=4; \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- k = count % 0x4U; \
- /* use predication to finalize MAC sum */ \
- /* acc0 requires exact number of sample */ \
- /* disable extra lanes in final MAC computation */ \
- mve_pred16_t p0 = vctp32q(k); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \
- /* acc1 requires 1 additional sample */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- p0 = vctp32q(k+1); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- \
- acc0 = asrl(acc0, 31); \
- acc1 = asrl(acc1, 31); \
-}
-
-
-
-
-#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)\
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\
{ \
q31_t const *pSrcX; \
+ const q31_t *pY1 = pY + 1; \
q31x4_t xVec, yVec; \
uint32_t k; \
\
pSrcX = (q31_t const *) pX; \
- k = (count-1) >> 2; \
+ k = count >> 2; \
\
while (k > 0U) \
{ \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
pY-=4; \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); \
- pSrcX += 4; \
acc0 = vmlaldavaq(acc0, xVec, yVec); \
+ yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \
+ pY1-=4; \
+ acc1 = vmlaldavaq(acc1, xVec, yVec); \
/* Decrement the loop counter */ \
k--; \
} \
- k = (count - 1) % 0x4U; \
+ k = count % 0x4U; \
/* use predication to finalize MAC sum */ \
- /* acc1 requires exact number of sample (count-1) */ \
+    /* acc0 requires exact number of samples */           \
/* disable extra lanes in final MAC computation */ \
mve_pred16_t p0 = vctp32q(k); \
+ xVec = vld1q(pSrcX); pSrcX += 4; \
yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- /* acc0 requires 1 additional sample (count) */ \
+ acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
+ yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \
+ /* acc1 requires 1 additional sample */ \
/* so add 1 to unmask an extra lane in final MAC computation */ \
p0 = vctp32q(k+1); \
- xVec = vld1q(pSrcX); \
- pSrcX += 4; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
+ acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
\
acc0 = asrl(acc0, 31); \
acc1 = asrl(acc1, 31); \
@@ -545,1110 +477,1103 @@
-#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count) \
-{ \
- q31_t const *pSrcX; \
- q31x4_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q31_t const *) pX; \
- k = count >> 2; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
- pY-=4; \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = count % 0x4U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp32q(k); \
- yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- } \
- acc0 = asrl(acc0, 31); \
- acc1 = asrl(acc1, 31); \
+
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count) \
+{ \
+ q31_t const *pSrcX; \
+ q31x4_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q31_t const *) pX; \
+ k = (count-1) >> 2; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
+ pY-=4; \
+ xVec = vldrwq_s32(&pSrcX[1]); \
+ acc1 = vmlaldavaq(acc1, xVec, yVec); \
+ xVec = vld1q(pSrcX); \
+ pSrcX += 4; \
+ acc0 = vmlaldavaq(acc0, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ k = (count - 1) % 0x4U; \
+ /* use predication to finalize MAC sum */ \
+    /* acc1 requires exact number of samples (count-1) */             \
+ /* disable extra lanes in final MAC computation */ \
+ mve_pred16_t p0 = vctp32q(k); \
+ yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
+ xVec = vldrwq_s32(&pSrcX[1]); \
+ acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
+ /* acc0 requires 1 additional sample (count) */ \
+ /* so add 1 to unmask an extra lane in final MAC computation */ \
+ p0 = vctp32q(k+1); \
+ xVec = vld1q(pSrcX); \
+ pSrcX += 4; \
+ acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
+ \
+ acc0 = asrl(acc0, 31); \
+ acc1 = asrl(acc1, 31); \
}
-#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count) \
-{ \
- q31_t const *pSrcX; \
- q31x4_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q31_t const *) pX; \
- k = count >> 2; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
- pY-=4; \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vldrwq_s32(&pSrcX[2]); \
- acc2 = vmlaldavaq(acc2, xVec, yVec); \
- xVec = vldrwq_s32(&pSrcX[3]); \
- acc3 = vmlaldavaq(acc3, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = count % 0x4U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp32q(k); \
- yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- xVec = vldrwq_s32(&pSrcX[2]); \
- acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
- xVec = vldrwq_s32(&pSrcX[3]); \
- acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- } \
- acc0 = asrl(acc0, 31); \
- acc1 = asrl(acc1, 31); \
- acc2 = asrl(acc2, 31); \
- acc3 = asrl(acc3, 31); \
-}
-
-#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count) \
-{ \
- q31_t const *pSrcX, *pSrcY; \
- q31x4_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q31_t const *) pX; \
- pSrcY = (q31_t const *) pY; \
- k = count >> 2; \
- \
- while (k > 0U) \
- { \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- yVec = vldrwq_s32(&pSrcY[-1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- yVec = vld1q(pSrcY); pSrcY += 4; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- k = count % 0x4U; \
- /* use predication to finalize MAC sum */ \
- /* acc1 requires 1 additional sample */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- mve_pred16_t p0 = vctp32q(k+1); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- yVec = vldrwq_s32(&pSrcY[-1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0); \
- /* acc0 requires exact number of sample */ \
- /* disable extra lanes in final MAC computation */ \
- p0 = vctp32q(k); \
- yVec = vld1q(pSrcY); pSrcY += 4; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0); \
- \
- acc0 = asrl(acc0, 31); \
- acc1 = asrl(acc1, 31); \
-}
-
-#define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count) \
-{ \
- q31_t const *pSrcX, *pSrcY; \
- q31x4_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q31_t const *) pX; \
- pSrcY = (q31_t const *) pY; \
- k = count >> 2; \
- \
- while (k > 0U) \
- { \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- yVec = vld1q(pSrcY); pSrcY += 4; \
- acc = vmlaldavaq(acc, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* tail predication expected here */ \
- k = count % 0x4U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp32q(k); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- yVec = vld1q(pSrcY); pSrcY += 4; \
- acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
- } \
- acc = asrl(acc, 31); \
-}
-
-#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count) \
-{ \
- q31_t const *pSrcX, *pSrcY; \
- q31x4_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q31_t const *) pX; \
- pSrcY = (q31_t const *) pY; \
- k = count >> 2; \
- \
- while (k > 0U) \
- { \
- yVec = vld1q(pSrcY); pSrcY += 4; \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vldrwq_s32(&pSrcX[2]); \
- acc2 = vmlaldavaq(acc2, xVec, yVec); \
- xVec = vldrwq_s32(&pSrcX[3]); \
- acc3 = vmlaldavaq(acc3, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* loop + tail predication expected here */ \
- k = count % 0x4U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp32q(k); \
- yVec = vld1q(pSrcY); pSrcY += 4; \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- xVec = vldrwq_s32(&pSrcX[2]); \
- acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
- xVec = vldrwq_s32(&pSrcX[3]); \
- acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- } \
- \
- acc0 = asrl(acc0, 31); \
- acc1 = asrl(acc1, 31); \
- acc2 = asrl(acc2, 31); \
- acc3 = asrl(acc3, 31); \
-}
-
-#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count) \
-{ \
- q31_t const *pSrcX, *pSrcY; \
- q31x4_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q31_t const *) pX; \
- pSrcY = (q31_t const *) pY; \
- k = count >> 2; \
- \
- while (k > 0U) \
- { \
- yVec = vld1q(pSrcY); pSrcY += 4; \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* loop + tail predication expected here */ \
- k = count % 0x4U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp32q(k); \
- yVec = vld1q(pSrcY); pSrcY += 4; \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- } \
- \
- acc0 = asrl(acc0, 31); \
- acc1 = asrl(acc1, 31); \
-}
-
-#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count) \
-{ \
- q31_t const *pSrcX, *pSrcY; \
- q31x4_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q31_t const *) pX; \
- pSrcY = (q31_t const *) pY; \
- k = (count-1) >> 2; \
- \
- while (k > 0U) \
- { \
- yVec = vld1q(pSrcY); pSrcY += 4; \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* use predication to finalize MAC sum */ \
- /* acc1 requires exact number of sample (count-1) */ \
- /* disable extra lanes in final MAC computation */ \
- k = (count-1) % 0x4U; \
- mve_pred16_t p0 = vctp32q(k); \
- yVec = vld1q(pSrcY); pSrcY += 4; \
- xVec = vldrwq_s32(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- /* acc0 requires 1 additional sample (count) */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- p0 = vctp32q(k+1); \
- xVec = vld1q(pSrcX); pSrcX += 4; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- \
- acc0 = asrl(acc0, 31); \
- acc1 = asrl(acc1, 31); \
-}
-
-#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count) \
-{ \
- q15_t const *pSrcX, *pSrcY; \
- q15x8_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q15_t const *) pX; \
- pSrcY = (q15_t const *) pY; \
- k = count >> 3; \
- while (k > 0U) \
- { \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- yVec = vldrhq_s16(&pSrcY[-1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- yVec = vld1q(pSrcY); pSrcY += 8; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- k = count % 0x8U; \
- /* use predication to finalize MAC sum */ \
- /* acc1 requires 1 additional sample */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- mve_pred16_t p0 = vctp16q(k+1); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- yVec = vldrhq_s16(&pSrcY[-1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0); \
- /* acc0 requires exact number of sample */ \
- /* disable extra lanes in final MAC computation */ \
- p0 = vctp16q(k); \
- yVec = vld1q(pSrcY); pSrcY += 8; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0); \
- \
- acc0 = asrl(acc0, 15); \
- acc1 = asrl(acc1, 15); \
- acc0 = __SSAT(acc0, 16); \
- acc1 = __SSAT(acc1, 16); \
-}
-
-#define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count) \
-{ \
- q15_t const *pSrcX, *pSrcY; \
- q15x8_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q15_t const *) pX; \
- pSrcY = (q15_t const *) pY; \
- k = count >> 3; \
- while (k > 0U) \
- { \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- yVec = vld1q(pSrcY); pSrcY += 8; \
- acc = vmlaldavaq(acc, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* tail predication expected here */ \
- k = count % 0x8U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp16q(k); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- yVec = vld1q(pSrcY); pSrcY += 8; \
- acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
- } \
- acc = asrl(acc, 15); \
- acc = __SSAT(acc, 16); \
-}
-
-#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count) \
-{ \
- q15_t const *pSrcX, *pSrcY; \
- q15x8_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q15_t const *) pX; \
- pSrcY = (q15_t const *) pY; \
- k = count >> 3; \
- \
- while (k > 0U) \
- { \
- yVec = vld1q(pSrcY); pSrcY += 8; \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vldrhq_s16(&pSrcX[2]); \
- acc2 = vmlaldavaq(acc2, xVec, yVec); \
- xVec = vldrhq_s16(&pSrcX[3]); \
- acc3 = vmlaldavaq(acc3, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* loop + tail predication expected here */ \
- k = count % 0x8U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp16q(k); \
- yVec = vld1q(pSrcY); pSrcY += 8; \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- xVec = vldrhq_s16(&pSrcX[2]); \
- acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
- xVec = vldrhq_s16(&pSrcX[3]); \
- acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- } \
- \
- acc0 = asrl(acc0, 15); \
- acc1 = asrl(acc1, 15); \
- acc2 = asrl(acc2, 15); \
- acc3 = asrl(acc3, 15); \
- acc0 = __SSAT(acc0, 16); \
- acc1 = __SSAT(acc1, 16); \
- acc2 = __SSAT(acc2, 16); \
- acc3 = __SSAT(acc3, 16); \
-}
-
-#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count) \
-{ \
- q15_t const *pSrcX, *pSrcY; \
- q15x8_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q15_t const *) pX; \
- pSrcY = (q15_t const *) pY; \
- k = count >> 3; \
- \
- while (k > 0U) \
- { \
- yVec = vld1q(pSrcY); pSrcY += 8; \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* loop + tail predication expected here */ \
- k = count % 0x8U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp16q(k); \
- yVec = vld1q(pSrcY); pSrcY += 8; \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- } \
- \
- acc0 = asrl(acc0, 15); \
- acc1 = asrl(acc1, 15); \
- acc0 = __SSAT(acc0, 16); \
- acc1 = __SSAT(acc1, 16); \
-}
-
-#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count) \
-{ \
- q15_t const *pSrcX, *pSrcY; \
- q15x8_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q15_t const *) pX; \
- pSrcY = (q15_t const *) pY; \
- k = (count-1) >> 3; \
- \
- while (k > 0U) \
- { \
- yVec = vld1q(pSrcY); pSrcY += 8; \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* use predication to finalize MAC sum */ \
- /* acc1 requires exact number of sample (count-1) */ \
- /* disable extra lanes in final MAC computation */ \
- k = (count-1) % 0x8U; \
- mve_pred16_t p0 = vctp16q(k); \
- yVec = vld1q(pSrcY); pSrcY += 8; \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- /* acc0 requires 1 additional sample (count) */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- p0 = vctp16q(k+1); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- \
- acc0 = asrl(acc0, 15); \
- acc1 = asrl(acc1, 15); \
- acc0 = __SSAT(acc0, 16); \
- acc1 = __SSAT(acc1, 16); \
-}
-
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count) \
-{ \
- q15_t const *pSrcX; \
- const q15_t *pY1 = pY + 1; \
- q15x8_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q15_t const *) pX; \
- k = count >> 3; \
- \
- while (k > 0U) \
- { \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
- pY-=8; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \
- pY1-=8; \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- k = count % 0x8U; \
- /* use predication to finalize MAC sum */ \
- /* acc0 requires exact number of sample */ \
- /* disable extra lanes in final MAC computation */ \
- mve_pred16_t p0 = vctp16q(k); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \
- /* acc1 requires 1 additional sample */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- p0 = vctp16q(k+1); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- \
- acc0 = asrl(acc0, 15); \
- acc1 = asrl(acc1, 15); \
- acc0 = __SSAT(acc0, 16); \
- acc1 = __SSAT(acc1, 16); \
-}
-
-#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count) \
-{ \
- q15_t const *pSrcX; \
- q15x8_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q15_t const *) pX; \
- k = count >> 3; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
- pY-=8; \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc = vmlaldavaq(acc, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = count % 0x8U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp16q(k); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
- acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
- } \
- acc = asrl(acc, 15); \
- acc = __SSAT(acc, 16); \
-}
-
-#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count) \
-{ \
- q15_t const *pSrcX; \
- q15x8_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q15_t const *) pX; \
- k = count >> 3; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
- pY-=8; \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vldrhq_s16(&pSrcX[2]); \
- acc2 = vmlaldavaq(acc2, xVec, yVec); \
- xVec = vldrhq_s16(&pSrcX[3]); \
- acc3 = vmlaldavaq(acc3, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = count % 0x8U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp16q(k); \
- yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- xVec = vldrhq_s16(&pSrcX[2]); \
- acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
- xVec = vldrhq_s16(&pSrcX[3]); \
- acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- } \
- acc0 = asrl(acc0, 15); \
- acc1 = asrl(acc1, 15); \
- acc2 = asrl(acc2, 15); \
- acc3 = asrl(acc3, 15); \
- acc0 = __SSAT(acc0, 16); \
- acc1 = __SSAT(acc1, 16); \
- acc2 = __SSAT(acc2, 16); \
- acc3 = __SSAT(acc3, 16); \
-}
-
-#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count) \
-{ \
- q15_t const *pSrcX; \
- q15x8_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q15_t const *) pX; \
- k = count >> 3; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
- pY-=8; \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = count % 0x8U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp16q(k); \
- yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- } \
- acc0 = asrl(acc0, 15); \
- acc1 = asrl(acc1, 15); \
- acc0 = __SSAT(acc0, 16); \
- acc1 = __SSAT(acc1, 16); \
-}
-
-#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count) \
-{ \
- q15_t const *pSrcX; \
- q15x8_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q15_t const *) pX; \
- k = (count-1) >> 3; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
- pY-=8; \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- k = (count - 1) % 0x8U; \
- /* use predication to finalize MAC sum */ \
- /* acc1 requires exact number of sample (count-1) */ \
- /* disable extra lanes in final MAC computation */ \
- mve_pred16_t p0 = vctp16q(k); \
- yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
- xVec = vldrhq_s16(&pSrcX[1]); \
- acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
- /* acc0 requires 1 additional sample (count) */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- p0 = vctp16q(k+1); \
- xVec = vld1q(pSrcX); pSrcX += 8; \
- acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
- \
- acc0 = asrl(acc0, 15); \
- acc1 = asrl(acc1, 15); \
- acc0 = __SSAT(acc0, 16); \
- acc1 = __SSAT(acc1, 16); \
-}
-
-#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count) \
-{ \
- q7_t const *pSrcX, *pSrcY; \
- q7x16_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q7_t const *) pX; \
- pSrcY = (q7_t const *) pY; \
- k = count >> 4; \
- while (k > 0U) \
- { \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- yVec = vldrbq_s8(&pSrcY[-1]); \
- acc1 = vmladavaq(acc1, xVec, yVec); \
- yVec = vld1q(pSrcY); pSrcY += 16; \
- acc0 = vmladavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- k = count % 0x10U; \
- /* use predication to finalize MAC sum */ \
- /* acc1 requires 1 additional sample */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- mve_pred16_t p0 = vctp8q(k+1); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- yVec = vldrbq_s8(&pSrcY[-1]); \
- acc1 = vmladavaq_p(acc1, xVec, yVec,p0); \
- /* acc0 requires exact number of sample */ \
- /* disable extra lanes in final MAC computation */ \
- p0 = vctp8q(k); \
- yVec = vld1q(pSrcY); pSrcY += 16; \
- acc0 = vmladavaq_p(acc0, xVec, yVec,p0); \
- \
- acc0 = (acc0 >> 7); \
- acc1 = (acc1 >> 7); \
- acc0 = __SSAT(acc0, 8); \
- acc1 = __SSAT(acc1, 8); \
-}
-
-#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count) \
-{ \
- q7_t const *pSrcX, *pSrcY; \
- q7x16_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q7_t const *) pX; \
- pSrcY = (q7_t const *) pY; \
- k = count >> 4; \
- while (k > 0U) \
- { \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- yVec = vld1q(pSrcY); pSrcY += 16; \
- acc = vmladavaq(acc, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* tail predication expected here */ \
- k = count % 0x10U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp8q(k); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- yVec = vld1q(pSrcY); pSrcY += 16; \
- acc = vmladavaq_p(acc, xVec, yVec, p0); \
- } \
- acc =(acc >> 7); \
- acc = __SSAT(acc, 8); \
-}
-
-#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count) \
-{ \
- q7_t const *pSrcX, *pSrcY; \
- q7x16_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q7_t const *) pX; \
- pSrcY = (q7_t const *) pY; \
- k = count >> 4; \
- \
- while (k > 0U) \
- { \
- yVec = vld1q(pSrcY); pSrcY += 16; \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq(acc1, xVec, yVec); \
- xVec = vldrbq_s8(&pSrcX[2]); \
- acc2 = vmladavaq(acc2, xVec, yVec); \
- xVec = vldrbq_s8(&pSrcX[3]); \
- acc3 = vmladavaq(acc3, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* loop + tail predication expected here */ \
- k = count % 0x10U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp8q(k); \
- yVec = vld1q(pSrcY); pSrcY += 16; \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
- xVec = vldrbq_s8(&pSrcX[2]); \
- acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \
- xVec = vldrbq_s8(&pSrcX[3]); \
- acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
- } \
- \
- acc0 = (acc0 >> 7); \
- acc1 = (acc1 >> 7); \
- acc2 = (acc2 >> 7); \
- acc3 = (acc3 >> 7); \
- acc0 = __SSAT(acc0, 8); \
- acc1 = __SSAT(acc1, 8); \
- acc2 = __SSAT(acc2, 8); \
- acc3 = __SSAT(acc3, 8); \
-}
-
-#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count) \
-{ \
- q7_t const *pSrcX, *pSrcY; \
- q7x16_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q7_t const *) pX; \
- pSrcY = (q7_t const *) pY; \
- k = count >> 4; \
- \
- while (k > 0U) \
- { \
- yVec = vld1q(pSrcY); pSrcY += 16; \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* loop + tail predication expected here */ \
- k = count % 0x10U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp8q(k); \
- yVec = vld1q(pSrcY); pSrcY += 16; \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
- } \
- \
- acc0 = (acc0 >> 7); \
- acc1 = (acc1 >> 7); \
- acc0 = __SSAT(acc0, 8); \
- acc1 = __SSAT(acc1, 8); \
-}
-
-#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count) \
-{ \
- q7_t const *pSrcX, *pSrcY; \
- q7x16_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q7_t const *) pX; \
- pSrcY = (q7_t const *) pY; \
- k = (count-1) >> 4; \
- \
- while (k > 0U) \
- { \
- yVec = vld1q(pSrcY); pSrcY += 16; \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* use predication to finalize MAC sum */ \
- /* acc1 requires exact number of sample (count-1) */ \
- /* disable extra lanes in final MAC computation */ \
- k = (count-1) % 0x10U; \
- mve_pred16_t p0 = vctp8q(k); \
- yVec = vld1q(pSrcY); pSrcY += 16; \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
- /* acc0 requires 1 additional sample (count) */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- p0 = vctp8q(k+1); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
- \
- acc0 = (acc0 >> 7); \
- acc1 = (acc1 >> 7); \
- acc0 = __SSAT(acc0, 8); \
- acc1 = __SSAT(acc1, 8); \
-}
-
-#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count) \
-{ \
- q7_t const *pSrcX; \
- const q7_t *pY1 = pY + 1; \
- q7x16_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q7_t const *) pX; \
- k = count >> 4; \
- \
- while (k > 0U) \
- { \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
- pY-=16; \
- acc0 = vmladavaq(acc0, xVec, yVec); \
- yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \
- pY1-=16; \
- acc1 = vmladavaq(acc1, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- k = count % 0x10U; \
- /* use predication to finalize MAC sum */ \
- /* acc0 requires exact number of sample */ \
- /* disable extra lanes in final MAC computation */ \
- mve_pred16_t p0 = vctp8q(k); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
- acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
- yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \
- /* acc1 requires 1 additional sample */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- p0 = vctp8q(k+1); \
- acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
- \
- acc0 = (acc0 >> 7); \
- acc1 = (acc1 >> 7); \
- acc0 = __SSAT(acc0, 8); \
- acc1 = __SSAT(acc1, 8); \
-}
-
-#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count) \
-{ \
- q7_t const *pSrcX; \
- q7x16_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q7_t const *) pX; \
- k = count >> 4; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
- pY-=16; \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc = vmladavaq(acc, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = count % 0x10U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp8q(k); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
- acc = vmladavaq_p(acc, xVec, yVec, p0); \
- } \
- acc = __SSAT(acc >> 7, 8); \
-}
-
-#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count) \
-{ \
- q7_t const *pSrcX; \
- q7x16_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q7_t const *) pX; \
- k = count >> 4; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
- pY-=16; \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq(acc1, xVec, yVec); \
- xVec = vldrbq_s8(&pSrcX[2]); \
- acc2 = vmladavaq(acc2, xVec, yVec); \
- xVec = vldrbq_s8(&pSrcX[3]); \
- acc3 = vmladavaq(acc3, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = count % 0x10U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp8q(k); \
- yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
- xVec = vldrbq_s8(&pSrcX[2]); \
- acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \
- xVec = vldrbq_s8(&pSrcX[3]); \
- acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
- } \
- acc0 = __SSAT(acc0 >> 7, 8); \
- acc1 = __SSAT(acc1 >> 7, 8); \
- acc2 = __SSAT(acc2 >> 7, 8); \
- acc3 = __SSAT(acc3 >> 7, 8); \
-}
-
-#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count) \
-{ \
- q7_t const *pSrcX; \
- q7x16_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q7_t const *) pX; \
- k = count >> 4; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
- pY-=16; \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- /* Loop with tail predication expected here */ \
- k = count % 0x10U; \
- if (k > 0U) \
- { \
- mve_pred16_t p0 = vctp8q(k); \
- yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
- } \
- acc0 = __SSAT(acc0 >> 7, 8); \
- acc1 = __SSAT(acc1 >> 7, 8); \
/* Dual-accumulator Q31 convolution inner loop (MVE/Helium).
 * Accumulates two dot products over `count` samples into the 64-bit
 * accumulators acc0 (x starting at pX) and acc1 (x starting at pX+1);
 * x is read forward, y is read backward via a gather load through
 * `decrIdxVec` and `pY -= 4` per iteration (convolution ordering).
 * NOTE(review): `decrIdxVec` must be in scope at expansion time,
 * presumably holding descending lane offsets {3,2,1,0} — confirm at
 * the call sites. The residual (count % 4) lanes are handled with a
 * single predicated MAC. Results are scaled back to Q31 via asrl(.,31). */
#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count) \
{ \
    q31_t const *pSrcX; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q31_t const *) pX; \
    k = count >> 2; \
 \
    while (k > 0U) \
    { \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        pY-=4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
}
-#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count) \
-{ \
- q7_t const *pSrcX; \
- q7x16_t xVec, yVec; \
- uint32_t k; \
- \
- pSrcX = (q7_t const *) pX; \
- k = (count-1) >> 4; \
- \
- while (k > 0U) \
- { \
- /* note */ \
- /* could can be more efficient using Vector Scatter Store: */ \
- /* + pre-increment + WB */ \
- /* To be revisited when intrinsic available */ \
- /* SDCOMP-52618 */ \
- yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
- pY-=16; \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq(acc1, xVec, yVec); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq(acc0, xVec, yVec); \
- /* Decrement the loop counter */ \
- k--; \
- } \
- k = (count - 1) % 0x10U; \
- /* use predication to finalize MAC sum */ \
- /* acc1 requires exact number of sample (count-1) */ \
- /* disable extra lanes in final MAC computation */ \
- mve_pred16_t p0 = vctp8q(k); \
- yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
- xVec = vldrbq_s8(&pSrcX[1]); \
- acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
- /* acc0 requires 1 additional sample (count) */ \
- /* so add 1 to unmask an extra lane in final MAC computation */ \
- p0 = vctp8q(k+1); \
- xVec = vld1q(pSrcX); pSrcX += 16; \
- acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
- \
- acc0 = (acc0 >> 7); \
- acc1 = (acc1 >> 7); \
- acc0 = __SSAT(acc0, 8); \
- acc1 = __SSAT(acc1, 8); \
+
/* Quad-accumulator Q31 convolution inner loop (MVE/Helium).
 * Computes four dot products sharing the same backward-read y vector:
 * acc0..acc3 use x starting at pX, pX+1, pX+2, pX+3 respectively, all
 * over the same `count` samples. y is gathered through `decrIdxVec`
 * (assumed descending offsets — TODO confirm) and stepped back 4 words
 * per iteration. Tail (count % 4) uses one predicated pass; final
 * 64-bit accumulators are rescaled to Q31 with asrl(.,31). */
#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count) \
{ \
    q31_t const *pSrcX; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q31_t const *) pX; \
    k = count >> 2; \
 \
    while (k > 0U) \
    { \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        pY-=4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vldrwq_s32(&pSrcX[2]); \
        acc2 = vmlaldavaq(acc2, xVec, yVec); \
        xVec = vldrwq_s32(&pSrcX[3]); \
        acc3 = vmlaldavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrwq_s32(&pSrcX[2]); \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrwq_s32(&pSrcX[3]); \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
    acc2 = asrl(acc2, 31); \
    acc3 = asrl(acc3, 31); \
}
+
/* Dual-accumulator Q31 correlation inner loop (MVE/Helium).
 * Both x and y are read forward (correlation ordering). acc0 pairs x
 * with y starting at pY; acc1 pairs x with y starting at pY-1 (one
 * sample earlier, hence DEC_Y). The tail is unconditional: acc1 uses
 * one extra lane (vctp32q(k+1)) while acc0 masks to exactly k lanes —
 * see the inline comments. 64-bit accumulators are rescaled to Q31
 * with asrl(.,31). */
#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count)\
{ \
    q31_t const *pSrcX, *pSrcY; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q31_t const *) pX; \
    pSrcY = (q31_t const *) pY; \
    k = count >> 2; \
 \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        yVec = vldrwq_s32(&pSrcY[-1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        yVec = vld1q(pSrcY); pSrcY += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x4U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    mve_pred16_t p0 = vctp32q(k+1); \
    xVec = vld1q(pSrcX); pSrcX += 4; \
    yVec = vldrwq_s32(&pSrcY[-1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0); \
    /* acc0 requires exact number of sample */ \
    /* disable extra lanes in final MAC computation */ \
    p0 = vctp32q(k); \
    yVec = vld1q(pSrcY); pSrcY += 4; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0); \
 \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
}
+
/* Single-accumulator Q31 correlation inner loop (MVE/Helium).
 * Straight forward-forward dot product of `count` Q31 samples into the
 * 64-bit accumulator `acc`, 4 lanes at a time, with a predicated MAC
 * for the (count % 4) remainder. Result is rescaled to Q31 with
 * asrl(acc, 31). */
#define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count)\
{ \
    q31_t const *pSrcX, *pSrcY; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q31_t const *) pX; \
    pSrcY = (q31_t const *) pY; \
    k = count >> 2; \
 \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        yVec = vld1q(pSrcY); pSrcY += 4; \
        acc = vmlaldavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        yVec = vld1q(pSrcY); pSrcY += 4; \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
    } \
    acc = asrl(acc, 31); \
}
+
/* Quad-accumulator Q31 correlation inner loop (MVE/Helium).
 * One forward y vector is reused against four x streams starting at
 * pX, pX+1, pX+2, pX+3 (acc0..acc3), all over the same `count`
 * samples. Remainder lanes are handled with a single predicated pass;
 * 64-bit accumulators are rescaled to Q31 with asrl(.,31). */
#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count)\
{ \
    q31_t const *pSrcX, *pSrcY; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q31_t const *) pX; \
    pSrcY = (q31_t const *) pY; \
    k = count >> 2; \
 \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); pSrcY += 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vldrwq_s32(&pSrcX[2]); \
        acc2 = vmlaldavaq(acc2, xVec, yVec); \
        xVec = vldrwq_s32(&pSrcX[3]); \
        acc3 = vmlaldavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vld1q(pSrcY); pSrcY += 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrwq_s32(&pSrcX[2]); \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrwq_s32(&pSrcX[3]); \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
 \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
    acc2 = asrl(acc2, 31); \
    acc3 = asrl(acc3, 31); \
}
+
/* Dual-accumulator Q31 correlation inner loop (MVE/Helium).
 * Like the QUAD variant but with two x streams only: acc0 pairs y with
 * x at pX, acc1 with x at pX+1, both over the same `count` samples.
 * (count % 4) remainder handled with a predicated MAC; results
 * rescaled to Q31 with asrl(.,31). */
#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count)\
{ \
    q31_t const *pSrcX, *pSrcY; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q31_t const *) pX; \
    pSrcY = (q31_t const *) pY; \
    k = count >> 2; \
 \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); pSrcY += 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vld1q(pSrcY); pSrcY += 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
 \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
}
+
/* Dual-accumulator Q31 correlation inner loop, decreasing size
 * (MVE/Helium). acc1 accumulates exactly count-1 products (x at pX+1),
 * acc0 accumulates count products (x at pX) — the bulk loop runs on
 * (count-1)/4 vectors and the unconditional tail uses vctp32q(k) for
 * acc1 and vctp32q(k+1) for acc0 to unmask the extra lane. Results
 * rescaled to Q31 with asrl(.,31). */
#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count)\
{ \
    q31_t const *pSrcX, *pSrcY; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q31_t const *) pX; \
    pSrcY = (q31_t const *) pY; \
    k = (count-1) >> 2; \
 \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); pSrcY += 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires exact number of sample (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    k = (count-1) % 0x4U; \
    mve_pred16_t p0 = vctp32q(k); \
    yVec = vld1q(pSrcY); pSrcY += 4; \
    xVec = vldrwq_s32(&pSrcX[1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    p0 = vctp32q(k+1); \
    xVec = vld1q(pSrcX); pSrcX += 4; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
 \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
}
+
/* Dual-accumulator Q15 correlation inner loop (MVE/Helium), 8 lanes.
 * acc0 pairs x with y starting at pY; acc1 pairs x with y starting at
 * pY-1. Unconditional predicated tail: acc1 unmasks one extra lane
 * (vctp16q(k+1)), acc0 masks to exactly k lanes. 64-bit accumulators
 * are rescaled with asrl(.,15) then saturated to 16 bits. */
#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\
{ \
    q15_t const *pSrcX, *pSrcY; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q15_t const *) pX; \
    pSrcY = (q15_t const *) pY; \
    k = count >> 3; \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        yVec = vldrhq_s16(&pSrcY[-1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        yVec = vld1q(pSrcY); pSrcY += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x8U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    mve_pred16_t p0 = vctp16q(k+1); \
    xVec = vld1q(pSrcX); pSrcX += 8; \
    yVec = vldrhq_s16(&pSrcY[-1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec,p0); \
    /* acc0 requires exact number of sample */ \
    /* disable extra lanes in final MAC computation */ \
    p0 = vctp16q(k); \
    yVec = vld1q(pSrcY); pSrcY += 8; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec,p0); \
 \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
+
/* Single-accumulator Q15 correlation inner loop (MVE/Helium).
 * Forward-forward dot product of `count` Q15 samples into the 64-bit
 * accumulator `acc`, 8 lanes at a time, with a predicated MAC for the
 * (count % 8) remainder. Result is rescaled with asrl(acc, 15) and
 * saturated to 16 bits. */
#define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count)\
{ \
    q15_t const *pSrcX, *pSrcY; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q15_t const *) pX; \
    pSrcY = (q15_t const *) pY; \
    k = count >> 3; \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        yVec = vld1q(pSrcY); pSrcY += 8; \
        acc = vmlaldavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        yVec = vld1q(pSrcY); pSrcY += 8; \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
    } \
    acc = asrl(acc, 15); \
    acc = __SSAT(acc, 16); \
}
+
/* Quad-accumulator Q15 correlation inner loop (MVE/Helium).
 * One forward y vector reused against four x streams starting at pX,
 * pX+1, pX+2, pX+3 (acc0..acc3), all over the same `count` samples.
 * Remainder handled with one predicated pass; 64-bit accumulators are
 * rescaled with asrl(.,15) and saturated to 16 bits. */
#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count)\
{ \
    q15_t const *pSrcX, *pSrcY; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q15_t const *) pX; \
    pSrcY = (q15_t const *) pY; \
    k = count >> 3; \
 \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); pSrcY += 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vldrhq_s16(&pSrcX[2]); \
        acc2 = vmlaldavaq(acc2, xVec, yVec); \
        xVec = vldrhq_s16(&pSrcX[3]); \
        acc3 = vmlaldavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        yVec = vld1q(pSrcY); pSrcY += 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrhq_s16(&pSrcX[2]); \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrhq_s16(&pSrcX[3]); \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
 \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc2 = asrl(acc2, 15); \
    acc3 = asrl(acc3, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
    acc2 = __SSAT(acc2, 16); \
    acc3 = __SSAT(acc3, 16); \
}
+
/* Dual-accumulator Q15 correlation inner loop (MVE/Helium).
 * acc0 pairs y with x at pX, acc1 with x at pX+1, both over the same
 * `count` samples. (count % 8) remainder handled with a predicated
 * MAC; results rescaled with asrl(.,15) and saturated to 16 bits. */
#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count)\
{ \
    q15_t const *pSrcX, *pSrcY; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q15_t const *) pX; \
    pSrcY = (q15_t const *) pY; \
    k = count >> 3; \
 \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); pSrcY += 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        yVec = vld1q(pSrcY); pSrcY += 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
 \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
+
/* Dual-accumulator Q15 correlation inner loop, decreasing size
 * (MVE/Helium). acc1 accumulates exactly count-1 products (x at pX+1),
 * acc0 accumulates count products (x at pX): the bulk loop runs on
 * (count-1)/8 vectors and the unconditional tail uses vctp16q(k) for
 * acc1 and vctp16q(k+1) for acc0. Results rescaled with asrl(.,15)
 * then saturated to 16 bits. */
#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count)\
{ \
    q15_t const *pSrcX, *pSrcY; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q15_t const *) pX; \
    pSrcY = (q15_t const *) pY; \
    k = (count-1) >> 3; \
 \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); pSrcY += 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires exact number of sample (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    k = (count-1) % 0x8U; \
    mve_pred16_t p0 = vctp16q(k); \
    yVec = vld1q(pSrcY); pSrcY += 8; \
    xVec = vldrhq_s16(&pSrcX[1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    p0 = vctp16q(k+1); \
    xVec = vld1q(pSrcX); pSrcX += 8; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
 \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
+
/* Dual-accumulator Q15 convolution inner loop with incrementing y
 * start (MVE/Helium). Two backward y streams — pY for acc0 and
 * pY1 = pY+1 for acc1 — are gathered through `decrIdxVec` (assumed
 * descending lane offsets, e.g. {7..0} — TODO confirm at call sites)
 * against one forward x stream. Unconditional tail: acc0 masks to
 * exactly k lanes, acc1 unmasks one extra (vctp16q(k+1)). Results
 * rescaled with asrl(.,15) then saturated to 16 bits. */
#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count)\
{ \
    q15_t const *pSrcX; \
    const q15_t *pY1 = pY + 1; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q15_t const *) pX; \
    k = count >> 3; \
 \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY-=8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \
        pY1-=8; \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x8U; \
    /* use predication to finalize MAC sum */ \
    /* acc0 requires exact number of sample */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp16q(k); \
    xVec = vld1q(pSrcX); pSrcX += 8; \
    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    p0 = vctp16q(k+1); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
 \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
+
/* Single-accumulator Q15 convolution inner loop (MVE/Helium).
 * Dot product of forward x against backward y (gathered through
 * `decrIdxVec`, with pY stepped back 8 halfwords per iteration) over
 * `count` samples, with a predicated MAC for the (count % 8)
 * remainder. Result rescaled with asrl(acc, 15) and saturated to
 * 16 bits. */
#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q15_t const *) pX; \
    k = count >> 3; \
 \
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY-=8; \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc = vmlaldavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
    } \
    acc = asrl(acc, 15); \
    acc = __SSAT(acc, 16); \
}
+
/* Quad-accumulator Q15 convolution inner loop (MVE/Helium).
 * One backward y vector (gathered through `decrIdxVec`) is reused
 * against four forward x streams starting at pX, pX+1, pX+2, pX+3
 * (acc0..acc3), all over the same `count` samples. Remainder handled
 * with one predicated pass; results rescaled with asrl(.,15) and
 * saturated to 16 bits. */
#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q15_t const *) pX; \
    k = count >> 3; \
 \
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY-=8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vldrhq_s16(&pSrcX[2]); \
        acc2 = vmlaldavaq(acc2, xVec, yVec); \
        xVec = vldrhq_s16(&pSrcX[3]); \
        acc3 = vmlaldavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrhq_s16(&pSrcX[2]); \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrhq_s16(&pSrcX[3]); \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc2 = asrl(acc2, 15); \
    acc3 = asrl(acc3, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
    acc2 = __SSAT(acc2, 16); \
    acc3 = __SSAT(acc3, 16); \
}
+
/* Dual-accumulator Q15 convolution inner loop (MVE/Helium).
 * One backward y vector (gathered through `decrIdxVec`) against two
 * forward x streams: acc0 with x at pX, acc1 with x at pX+1, both over
 * the same `count` samples. (count % 8) remainder handled with a
 * predicated MAC; results rescaled with asrl(.,15) and saturated to
 * 16 bits. */
#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q15_t const *) pX; \
    k = count >> 3; \
 \
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY-=8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
+
/* Dual-accumulator Q15 convolution inner loop, decreasing size
 * (MVE/Helium). acc1 accumulates exactly count-1 products (x at pX+1)
 * and acc0 accumulates count products (x at pX): the bulk loop runs on
 * (count-1)/8 vectors and the unconditional predicated tail uses
 * vctp16q(k) for acc1 and vctp16q(k+1) for acc0. y is read backward
 * via the `decrIdxVec` gather. Results rescaled with asrl(.,15) then
 * saturated to 16 bits. */
#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
 \
    pSrcX = (q15_t const *) pX; \
    k = (count-1) >> 3; \
 \
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY-=8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = (count - 1) % 0x8U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires exact number of sample (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp16q(k); \
    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
    xVec = vldrhq_s16(&pSrcX[1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in final MAC computation */ \
    p0 = vctp16q(k+1); \
    xVec = vld1q(pSrcX); pSrcX += 8; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
 \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
+
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{ \
+ q7_t const *pSrcX, *pSrcY; \
+ q7x16_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q7_t const *) pX; \
+ pSrcY = (q7_t const *) pY; \
+ k = count >> 4; \
+ while (k > 0U) \
+ { \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ yVec = vldrbq_s8(&pSrcY[-1]); \
+ acc1 = vmladavaq(acc1, xVec, yVec); \
+ yVec = vld1q(pSrcY); pSrcY += 16; \
+ acc0 = vmladavaq(acc0, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ k = count % 0x10U; \
+ /* use predication to finalize MAC sum */ \
+ /* acc1 requires 1 additional sample */ \
+ /* so add 1 to unmask an extra lane in final MAC computation */ \
+ mve_pred16_t p0 = vctp8q(k+1); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ yVec = vldrbq_s8(&pSrcY[-1]); \
+ acc1 = vmladavaq_p(acc1, xVec, yVec,p0); \
+ /* acc0 requires exact number of sample */ \
+ /* disable extra lanes in final MAC computation */ \
+ p0 = vctp8q(k); \
+ yVec = vld1q(pSrcY); pSrcY += 16; \
+ acc0 = vmladavaq_p(acc0, xVec, yVec,p0); \
+ \
+ acc0 = (acc0 >> 7); \
+ acc1 = (acc1 >> 7); \
+ acc0 = __SSAT(acc0, 8); \
+ acc1 = __SSAT(acc1, 8); \
+}
+
+#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count)\
+{ \
+ q7_t const *pSrcX, *pSrcY; \
+ q7x16_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q7_t const *) pX; \
+ pSrcY = (q7_t const *) pY; \
+ k = count >> 4; \
+ while (k > 0U) \
+ { \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ yVec = vld1q(pSrcY); pSrcY += 16; \
+ acc = vmladavaq(acc, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ /* tail predication expected here */ \
+ k = count % 0x10U; \
+ if (k > 0U) \
+ { \
+ mve_pred16_t p0 = vctp8q(k); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ yVec = vld1q(pSrcY); pSrcY += 16; \
+ acc = vmladavaq_p(acc, xVec, yVec, p0); \
+ } \
+ acc =(acc >> 7); \
+ acc = __SSAT(acc, 8); \
+}
+
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count)\
+{ \
+ q7_t const *pSrcX, *pSrcY; \
+ q7x16_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q7_t const *) pX; \
+ pSrcY = (q7_t const *) pY; \
+ k = count >> 4; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vld1q(pSrcY); pSrcY += 16; \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq(acc1, xVec, yVec); \
+ xVec = vldrbq_s8(&pSrcX[2]); \
+ acc2 = vmladavaq(acc2, xVec, yVec); \
+ xVec = vldrbq_s8(&pSrcX[3]); \
+ acc3 = vmladavaq(acc3, xVec, yVec); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq(acc0, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ /* loop + tail predication expected here */ \
+ k = count % 0x10U; \
+ if (k > 0U) \
+ { \
+ mve_pred16_t p0 = vctp8q(k); \
+ yVec = vld1q(pSrcY); pSrcY += 16; \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
+ xVec = vldrbq_s8(&pSrcX[2]); \
+ acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \
+ xVec = vldrbq_s8(&pSrcX[3]); \
+ acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
+ } \
+ \
+ acc0 = (acc0 >> 7); \
+ acc1 = (acc1 >> 7); \
+ acc2 = (acc2 >> 7); \
+ acc3 = (acc3 >> 7); \
+ acc0 = __SSAT(acc0, 8); \
+ acc1 = __SSAT(acc1, 8); \
+ acc2 = __SSAT(acc2, 8); \
+ acc3 = __SSAT(acc3, 8); \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{ \
+ q7_t const *pSrcX, *pSrcY; \
+ q7x16_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q7_t const *) pX; \
+ pSrcY = (q7_t const *) pY; \
+ k = count >> 4; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vld1q(pSrcY); pSrcY += 16; \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq(acc1, xVec, yVec); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq(acc0, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ /* loop + tail predication expected here */ \
+ k = count % 0x10U; \
+ if (k > 0U) \
+ { \
+ mve_pred16_t p0 = vctp8q(k); \
+ yVec = vld1q(pSrcY); pSrcY += 16; \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
+ } \
+ \
+ acc0 = (acc0 >> 7); \
+ acc1 = (acc1 >> 7); \
+ acc0 = __SSAT(acc0, 8); \
+ acc1 = __SSAT(acc1, 8); \
+}
+
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{ \
+ q7_t const *pSrcX, *pSrcY; \
+ q7x16_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q7_t const *) pX; \
+ pSrcY = (q7_t const *) pY; \
+ k = (count-1) >> 4; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vld1q(pSrcY); pSrcY += 16; \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq(acc1, xVec, yVec); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq(acc0, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ /* use predication to finalize MAC sum */ \
+ /* acc1 requires exact number of sample (count-1) */ \
+ /* disable extra lanes in final MAC computation */ \
+ k = (count-1) % 0x10U; \
+ mve_pred16_t p0 = vctp8q(k); \
+ yVec = vld1q(pSrcY); pSrcY += 16; \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
+ /* acc0 requires 1 additional sample (count) */ \
+ /* so add 1 to unmask an extra lane in final MAC computation */ \
+ p0 = vctp8q(k+1); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
+ \
+ acc0 = (acc0 >> 7); \
+ acc1 = (acc1 >> 7); \
+ acc0 = __SSAT(acc0, 8); \
+ acc1 = __SSAT(acc1, 8); \
+}
+
+#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count)\
+{ \
+ q7_t const *pSrcX; \
+ const q7_t *pY1 = pY + 1; \
+ q7x16_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q7_t const *) pX; \
+ k = count >> 4; \
+ \
+ while (k > 0U) \
+ { \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
+ pY-=16; \
+ acc0 = vmladavaq(acc0, xVec, yVec); \
+ yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \
+ pY1-=16; \
+ acc1 = vmladavaq(acc1, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ k = count % 0x10U; \
+ /* use predication to finalize MAC sum */ \
+ /* acc0 requires exact number of sample */ \
+ /* disable extra lanes in final MAC computation */ \
+ mve_pred16_t p0 = vctp8q(k); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
+ acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
+ yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \
+ /* acc1 requires 1 additional sample */ \
+ /* so add 1 to unmask an extra lane in final MAC computation */ \
+ p0 = vctp8q(k+1); \
+ acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
+ \
+ acc0 = (acc0 >> 7); \
+ acc1 = (acc1 >> 7); \
+ acc0 = __SSAT(acc0, 8); \
+ acc1 = __SSAT(acc1, 8); \
+}
+
+#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count) \
+{ \
+ q7_t const *pSrcX; \
+ q7x16_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q7_t const *) pX; \
+ k = count >> 4; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
+ pY-=16; \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc = vmladavaq(acc, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ /* Loop with tail predication expected here */ \
+ k = count % 0x10U; \
+ if (k > 0U) \
+ { \
+ mve_pred16_t p0 = vctp8q(k); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
+ acc = vmladavaq_p(acc, xVec, yVec, p0); \
+ } \
+ acc = __SSAT(acc >> 7, 8); \
+}
+
+#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count) \
+{ \
+ q7_t const *pSrcX; \
+ q7x16_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q7_t const *) pX; \
+ k = count >> 4; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
+ pY-=16; \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq(acc1, xVec, yVec); \
+ xVec = vldrbq_s8(&pSrcX[2]); \
+ acc2 = vmladavaq(acc2, xVec, yVec); \
+ xVec = vldrbq_s8(&pSrcX[3]); \
+ acc3 = vmladavaq(acc3, xVec, yVec); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq(acc0, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ /* Loop with tail predication expected here */ \
+ k = count % 0x10U; \
+ if (k > 0U) \
+ { \
+ mve_pred16_t p0 = vctp8q(k); \
+ yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
+ xVec = vldrbq_s8(&pSrcX[2]); \
+ acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \
+ xVec = vldrbq_s8(&pSrcX[3]); \
+ acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
+ } \
+ acc0 = __SSAT(acc0 >> 7, 8); \
+ acc1 = __SSAT(acc1 >> 7, 8); \
+ acc2 = __SSAT(acc2 >> 7, 8); \
+ acc3 = __SSAT(acc3 >> 7, 8); \
+}
+
+#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count) \
+{ \
+ q7_t const *pSrcX; \
+ q7x16_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q7_t const *) pX; \
+ k = count >> 4; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
+ pY-=16; \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq(acc1, xVec, yVec); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq(acc0, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ /* Loop with tail predication expected here */ \
+ k = count % 0x10U; \
+ if (k > 0U) \
+ { \
+ mve_pred16_t p0 = vctp8q(k); \
+ yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
+ } \
+ acc0 = __SSAT(acc0 >> 7, 8); \
+ acc1 = __SSAT(acc1 >> 7, 8); \
+}
+
+
+#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count) \
+{ \
+ q7_t const *pSrcX; \
+ q7x16_t xVec, yVec; \
+ uint32_t k; \
+ \
+ pSrcX = (q7_t const *) pX; \
+ k = (count-1) >> 4; \
+ \
+ while (k > 0U) \
+ { \
+ yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
+ pY-=16; \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq(acc1, xVec, yVec); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq(acc0, xVec, yVec); \
+ /* Decrement the loop counter */ \
+ k--; \
+ } \
+ k = (count - 1) % 0x10U; \
+ /* use predication to finalize MAC sum */ \
+ /* acc1 requires exact number of sample (count-1) */ \
+ /* disable extra lanes in final MAC computation */ \
+ mve_pred16_t p0 = vctp8q(k); \
+ yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
+ xVec = vldrbq_s8(&pSrcX[1]); \
+ acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
+ /* acc0 requires 1 additional sample (count) */ \
+ /* so add 1 to unmask an extra lane in final MAC computation */ \
+ p0 = vctp8q(k+1); \
+ xVec = vld1q(pSrcX); pSrcX += 16; \
+ acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
+ \
+ acc0 = (acc0 >> 7); \
+ acc1 = (acc1 >> 7); \
+ acc0 = __SSAT(acc0, 8); \
+ acc1 = __SSAT(acc1, 8); \
}
#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
diff --git a/CMSIS/DSP/Testing/CMakeLists.txt b/CMSIS/DSP/Testing/CMakeLists.txt
index d132de8..c3d5267 100644
--- a/CMSIS/DSP/Testing/CMakeLists.txt
+++ b/CMSIS/DSP/Testing/CMakeLists.txt
@@ -54,7 +54,13 @@
endif()
list(APPEND output ",${PLATFORMID}")
- list(APPEND output ",${COREID},")
+ if (CONFIGID)
+ # Specific config ID (like M55 with autovectorizer)
+ list(APPEND output ",${CONFIGID},")
+ else()
+ # Core ID is used as config ID when not specified
+ list(APPEND output ",${COREID},")
+ endif()
if (ARMAC6)
list(APPEND output "AC6")
elseif(GCC)
diff --git a/CMSIS/DSP/Testing/TestScripts/Regression/Commands.py b/CMSIS/DSP/Testing/TestScripts/Regression/Commands.py
index 6c970e1..19d1866 100755
--- a/CMSIS/DSP/Testing/TestScripts/Regression/Commands.py
+++ b/CMSIS/DSP/Testing/TestScripts/Regression/Commands.py
@@ -48,14 +48,14 @@
yield delimiter
yield x
-def addToDb(db,desc):
+def addToDb(db,desc,runid):
msg("Add %s to summary database\n" % desc)
- completed = subprocess.run([sys.executable, "addToDB.py","-o",db,desc], timeout=3600)
+ completed = subprocess.run([sys.executable, "addToDB.py","-o",db,"-r",str(runid),desc], timeout=3600)
check(completed)
-def addToRegDb(db,desc):
+def addToRegDb(db,desc,runid):
msg("Add %s to regression database\n" % desc)
- completed = subprocess.run([sys.executable, "addToRegDB.py","-o",db,desc], timeout=3600)
+ completed = subprocess.run([sys.executable, "addToRegDB.py","-o",db,"-r",str(runid),desc], timeout=3600)
check(completed)
@@ -187,7 +187,7 @@
# Launch cmake command.
- def createCMake(self,flags,benchMode,platform):
+ def createCMake(self,configid,flags,benchMode,platform):
with self.buildFolder() as b:
self.saveEnv()
if benchMode:
@@ -198,7 +198,8 @@
cmd += ["-DCMAKE_PREFIX_PATH=%s" % self.compiler(),
"-DCMAKE_TOOLCHAIN_FILE=%s" % toolchainCmake,
"-DARM_CPU=%s" % self.core(),
- "-DPLATFORM=%s" % platform
+ "-DPLATFORM=%s" % platform,
+ "-DCONFIGID=%s" % configid
]
cmd += flags
@@ -373,7 +374,7 @@
else:
return(TESTFAILED)
- def runAndProcess(self,compiler,fvp,sim,benchmode,db,regdb):
+ def runAndProcess(self,compiler,fvp,sim,benchmode,db,regdb,benchid,regid):
# If we can't parse test description we fail all tests
self.processTest()
# Otherwise if only building or those tests are failing, we continue
@@ -393,9 +394,9 @@
if benchmode and (error == NOTESTFAILED):
error = self.computeSummaryStat()
if db is not None:
- addToDb(db,self.testName())
+ addToDb(db,self.testName(),benchid)
if regdb is not None:
- addToRegDb(regdb,self.testName())
+ addToRegDb(regdb,self.testName(),regid)
return(error)
else:
msg("No FVP available")
diff --git a/CMSIS/DSP/Testing/addToDB.py b/CMSIS/DSP/Testing/addToDB.py
index 4d7b668..5b43cff 100755
--- a/CMSIS/DSP/Testing/addToDB.py
+++ b/CMSIS/DSP/Testing/addToDB.py
@@ -22,12 +22,13 @@
MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
MKINTFIELD=['ID', 'CYCLES']
MKDATEFIELD=['DATE']
-MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE']
+MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE','RUN']
MKKEYFIELDID={'CATEGORY':'categoryid',
'PLATFORM':'platformid',
'CORE':'coreid',
'COMPILER':'compilerid',
- 'TYPE':'typeid'}
+ 'TYPE':'typeid',
+ 'RUN':'runid'}
# For table value extraction
VALSTRFIELD=['NAME','VERSION']
@@ -56,7 +57,7 @@
colsToKeep=[]
cols = list(full.columns)
params = list(elem.params.full)
- common = diff(cols + ["TYPE"] , ['OLDID'] + params)
+ common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params)
for field in common:
if field in MKSTRFIELD:
@@ -76,7 +77,7 @@
sql = "CREATE TABLE %s (" % tableName
cols = list(full.columns)
params = list(elem.params.full)
- common = diff(cols + ["TYPE"] , ['OLDID'] + params)
+ common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params)
sql += "%sid INTEGER PRIMARY KEY" % (tableName)
start = ","
@@ -103,6 +104,7 @@
sql += "FOREIGN KEY(platformid) REFERENCES PLATFORM(platformid),"
sql += "FOREIGN KEY(coreid) REFERENCES CORE(coreid),"
sql += "FOREIGN KEY(compilerid) REFERENCES COMPILER(compilerid)"
+ sql += ",FOREIGN KEY(runid) REFERENCES RUN(runid)"
sql += " )"
conn.execute(sql)
@@ -143,7 +145,7 @@
return(None)
-def addRows(conn,elem,tableName,full):
+def addRows(conn,elem,tableName,full,runid=0):
# List of columns we have in DB which is
# different from the columns in the table
compilerid = 0
@@ -211,6 +213,7 @@
keys[field]=row[field]
+ keys['RUN']=runid
# Get foreign keys and create missing data
for field in common:
if field in VALKEYFIELD:
@@ -261,31 +264,34 @@
conn.execute("INSERT INTO CONFIG(compilerid,platformid,coreid,date) VALUES(?,?,?,?)" ,(config['compilerid'],config['platformid'],config['coreid'],fullDate))
conn.commit()
-def addOneBenchmark(elem,fullPath,db,group):
+def getGroup(a):
+ return(re.sub(r'^(.+)(F64|F32|F16|Q31|Q15|Q7|U32|U16|U8|S32|S16|S8)$',r'\1',a))
+
+def addOneBenchmark(elem,fullPath,db,group,runid):
if os.path.isfile(fullPath):
full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
fullDate = datetime.datetime.now()
full['DATE'] = fullDate
if group:
- tableName = group
+ tableName = getGroup(group)
else:
- tableName = elem.data["class"]
+ tableName = getGroup(elem.data["class"])
conn = sqlite3.connect(db)
createTableIfMissing(conn,elem,tableName,full)
- config = addRows(conn,elem,tableName,full)
+ config = addRows(conn,elem,tableName,full,runid)
addConfig(conn,config,fullDate)
conn.close()
-def addToDB(benchmark,dbpath,elem,group):
+def addToDB(benchmark,dbpath,elem,group,runid):
if not elem.data["deprecated"]:
if elem.params:
benchPath = os.path.join(benchmark,elem.fullPath(),"fullBenchmark.csv")
print("Processing %s" % benchPath)
- addOneBenchmark(elem,benchPath,dbpath,group)
+ addOneBenchmark(elem,benchPath,dbpath,group,runid)
for c in elem.children:
- addToDB(benchmark,dbpath,c,group)
+ addToDB(benchmark,dbpath,c,group,runid)
@@ -295,6 +301,7 @@
parser.add_argument('-b', nargs='?',type = str, default="FullBenchmark", help="Full Benchmark dir path")
#parser.add_argument('-e', action='store_true', help="Embedded test")
parser.add_argument('-o', nargs='?',type = str, default="bench.db", help="Benchmark database")
+parser.add_argument('-r', nargs='?',type = int, default=0, help="Run ID")
parser.add_argument('others', nargs=argparse.REMAINDER)
@@ -310,7 +317,7 @@
group=args.others[0]
else:
group=None
- addToDB(args.b,args.o,root,group)
+ addToDB(args.b,args.o,root,group,args.r)
else:
parser.print_help()
\ No newline at end of file
diff --git a/CMSIS/DSP/Testing/addToRegDB.py b/CMSIS/DSP/Testing/addToRegDB.py
index 3e7a54a..cf6297d 100755
--- a/CMSIS/DSP/Testing/addToRegDB.py
+++ b/CMSIS/DSP/Testing/addToRegDB.py
@@ -23,12 +23,13 @@
MKINTFIELD=['ID','MAX']
MKREALFIELD=['MAXREGCOEF']
MKDATEFIELD=['DATE']
-MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE']
+MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE','RUN']
MKKEYFIELDID={'CATEGORY':'categoryid',
'PLATFORM':'platformid',
'CORE':'coreid',
'COMPILER':'compilerid',
- 'TYPE':'typeid'}
+ 'TYPE':'typeid',
+ 'RUN':'runid'}
# For table value extraction
VALSTRFIELD=['NAME','VERSION','Regression']
@@ -58,7 +59,7 @@
colsToKeep=[]
cols = list(full.columns)
params=diff(elem.params.full , elem.params.summary)
- common = diff(cols + ["TYPE"] , ['OLDID'] + params)
+ common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params)
for field in common:
if field in MKSTRFIELD:
@@ -80,7 +81,7 @@
sql = "CREATE TABLE %s (" % tableName
cols = list(full.columns)
params=diff(elem.params.full , elem.params.summary)
- common = diff(cols + ["TYPE"] , ['OLDID'] + params)
+ common = diff(cols + ["TYPE","RUN"] , ['OLDID'] + params)
sql += "%sid INTEGER PRIMARY KEY" % (tableName)
start = ","
@@ -109,6 +110,7 @@
sql += "FOREIGN KEY(platformid) REFERENCES PLATFORM(platformid),"
sql += "FOREIGN KEY(coreid) REFERENCES CORE(coreid),"
sql += "FOREIGN KEY(compilerid) REFERENCES COMPILER(compilerid)"
+ sql += ",FOREIGN KEY(runid) REFERENCES RUN(runid)"
sql += " )"
conn.execute(sql)
@@ -149,7 +151,7 @@
return(None)
-def addRows(conn,elem,tableName,full):
+def addRows(conn,elem,tableName,full,runid=0):
# List of columns we have in DB which is
# different from the columns in the table
compilerid = 0
@@ -218,7 +220,7 @@
if field in VALBOOLFIELD:
keys[field]=row[field]
-
+ keys['RUN']=runid
# Get foreign keys and create missing data
for field in common:
if field in VALKEYFIELD:
@@ -272,31 +274,34 @@
conn.execute("INSERT INTO CONFIG(compilerid,platformid,coreid,date) VALUES(?,?,?,?)" ,(config['compilerid'],config['platformid'],config['coreid'],fullDate))
conn.commit()
-def addOneBenchmark(elem,fullPath,db,group):
+def getGroup(a):
+ return(re.sub(r'^(.+)(F64|F32|F16|Q31|Q15|Q7|U32|U16|U8|S32|S16|S8)$',r'\1',a))
+
+def addOneBenchmark(elem,fullPath,db,group,runid):
if os.path.isfile(fullPath):
full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
fullDate = datetime.datetime.now()
full['DATE'] = fullDate
if group:
- tableName = group
+ tableName = getGroup(group)
else:
- tableName = elem.data["class"]
+ tableName = getGroup(elem.data["class"])
conn = sqlite3.connect(db)
createTableIfMissing(conn,elem,tableName,full)
- config = addRows(conn,elem,tableName,full)
+ config = addRows(conn,elem,tableName,full,runid)
addConfig(conn,config,fullDate)
conn.close()
-def addToDB(benchmark,dbpath,elem,group):
+def addToDB(benchmark,dbpath,elem,group,runid):
if not elem.data["deprecated"]:
if elem.params:
benchPath = os.path.join(benchmark,elem.fullPath(),"regression.csv")
print("Processing %s" % benchPath)
- addOneBenchmark(elem,benchPath,dbpath,group)
+ addOneBenchmark(elem,benchPath,dbpath,group,runid)
for c in elem.children:
- addToDB(benchmark,dbpath,c,group)
+ addToDB(benchmark,dbpath,c,group,runid)
@@ -306,6 +311,7 @@
parser.add_argument('-b', nargs='?',type = str, default="FullBenchmark", help="Full Benchmark dir path")
#parser.add_argument('-e', action='store_true', help="Embedded test")
parser.add_argument('-o', nargs='?',type = str, default="reg.db", help="Regression benchmark database")
+parser.add_argument('-r', nargs='?',type = int, default=0, help="Run ID")
parser.add_argument('others', nargs=argparse.REMAINDER)
@@ -321,7 +327,7 @@
group=args.others[0]
else:
group=None
- addToDB(args.b,args.o,root,group)
+ addToDB(args.b,args.o,root,group,args.r)
else:
parser.print_help()
\ No newline at end of file
diff --git a/CMSIS/DSP/Testing/bench.txt b/CMSIS/DSP/Testing/bench.txt
index c4a9a3a..2a5c83c 100755
--- a/CMSIS/DSP/Testing/bench.txt
+++ b/CMSIS/DSP/Testing/bench.txt
@@ -351,8 +351,8 @@
Output ERR_F32_ID : Err
Params PARAM1_ID = {
- NumTaps = [4,8,16,32,64]
- NB = [16,64,128,256]
+ NumTaps = [16,32,64]
+ NB = [64,128,256]
}
Functions {
@@ -388,8 +388,8 @@
Output ERR_Q31_ID : Err
Params PARAM1_ID = {
- NumTaps = [4,8,16,32,64]
- NB = [16,64,128,256]
+ NumTaps = [16,32,64]
+ NB = [64,128,256]
}
Functions {
@@ -425,8 +425,8 @@
Output ERR_Q15_ID : Err
Params PARAM1_ID = {
- NumTaps = [4,8,16,32,64]
- NB = [16,64,128,256]
+ NumTaps = [16,32,64]
+ NB = [64,128,256]
}
Functions {
@@ -464,8 +464,8 @@
Output OUT_SAMPLES_F32_ID : Output
Params PARAM1_ID = {
- NBA = [4,5,9,16,64]
- NBB = [5,9,16,128]
+ NBA = [9,16,64]
+ NBB = [9,16,128]
}
Functions {
@@ -495,8 +495,8 @@
Output OUT_SAMPLES_Q31_ID : Output
Params PARAM1_ID = {
- NBA = [4,5,9,16,64]
- NBB = [5,9,16,128]
+ NBA = [9,16,64]
+ NBB = [9,16,128]
}
Functions {
@@ -526,8 +526,8 @@
Output OUT_SAMPLES_Q15_ID : Output
Params PARAM1_ID = {
- NBA = [4,5,9,16,64]
- NBB = [5,9,16,128]
+ NBA = [9,16,64]
+ NBB = [9,16,128]
}
Functions {
@@ -557,8 +557,8 @@
Output OUT_SAMPLES_Q7_ID : Output
Params PARAM1_ID = {
- NBA = [4,5,9,16,64]
- NBB = [5,9,16,128]
+ NBA = [9,16,64]
+ NBB = [9,16,128]
}
Functions {
@@ -762,7 +762,7 @@
Output OUT_SAMPLES_F32_ID : Output
Params PARAM1_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
}
Functions {
@@ -792,7 +792,7 @@
Output OUT_SAMPLES_Q31_ID : Output
Params PARAM1_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
}
Functions {
@@ -820,7 +820,7 @@
Output OUT_SAMPLES_Q15_ID : Output
Params PARAM1_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
}
Functions {
@@ -848,7 +848,7 @@
Output OUT_SAMPLES_F32_ID : Output
Params PARAM1_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
}
Functions {
@@ -873,7 +873,7 @@
Output OUT_SAMPLES_Q31_ID : Output
Params PARAM1_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
}
Functions {
@@ -898,7 +898,7 @@
Output OUT_SAMPLES_Q15_ID : Output
Params PARAM1_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
}
Functions {
@@ -962,7 +962,7 @@
Output OUT_SAMPLES_F32_ID : Output
Params PARAM1_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
}
Functions {
@@ -992,7 +992,7 @@
Output OUT_SAMPLES_Q31_ID : Output
Params PARAM1_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
}
Functions {
@@ -1020,7 +1020,7 @@
Output OUT_SAMPLES_Q15_ID : Output
Params PARAM1_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
}
Functions {
@@ -1048,7 +1048,7 @@
Output OUT_SAMPLES_Q7_ID : Output
Params PARAM1_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
}
Functions {
@@ -1292,7 +1292,7 @@
Params CFFT_PARAM_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
IFFT = [0,1]
REV = [0,1]
}
@@ -1304,7 +1304,7 @@
}
Params RFFT_PARAM_ID = {
- NB = [32,64,128,256]
+ NB = [64,128,256]
IFFT = [0,1]
REV = [1]
}
@@ -1342,7 +1342,7 @@
Params CFFT_PARAM_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
IFFT = [0,1]
REV = [0,1]
}
@@ -1354,7 +1354,7 @@
}
Params RFFT_PARAM_ID = {
- NB = [32,64,128,256]
+ NB = [64,128,256]
IFFT = [0,1]
REV = [0,1]
}
@@ -1392,7 +1392,7 @@
Params CFFT_PARAM_ID = {
- NB = [16,64,128,256]
+ NB = [64,128,256]
IFFT = [0,1]
REV = [0,1]
}
@@ -1404,7 +1404,7 @@
}
Params RFFT_PARAM_ID = {
- NB = [32,64,128,256]
+ NB = [64,128,256]
IFFT = [0,1]
REV = [1]
}
diff --git a/CMSIS/DSP/Testing/createDb.sql b/CMSIS/DSP/Testing/createDb.sql
index eca7a75..9285be6 100755
--- a/CMSIS/DSP/Testing/createDb.sql
+++ b/CMSIS/DSP/Testing/createDb.sql
@@ -29,6 +29,11 @@
CREATE INDEX compiler_date_index ON COMPILER(date);
CREATE INDEX compiler_all_index ON COMPILER(compilerkindid,version,date);
+CREATE TABLE RUN (
+ runid INTEGER PRIMARY KEY,
+ date text
+ );
+
CREATE TABLE TYPE (
typeid INTEGER PRIMARY KEY,
type text );
@@ -76,15 +81,23 @@
INSERT INTO CORE VALUES(3,"m3","ARMCM3");
INSERT INTO CORE VALUES(4,"m4","ARMCM4");
INSERT INTO CORE VALUES(5,"m4f","ARMCM4_FP");
-INSERT INTO CORE VALUES(6,"m7","ARMCM7_DP");
+INSERT INTO CORE VALUES(6,"m7d","ARMCM7_DP");
INSERT INTO CORE VALUES(7,"m23","ARMCM23");
-INSERT INTO CORE VALUES(8,"m33","ARMCM33_DSP_FP");
-INSERT INTO CORE VALUES(9,"m35","ARMCM35P_DSP_FP");
+INSERT INTO CORE VALUES(8,"m33f","ARMCM33_DSP_FP");
+INSERT INTO CORE VALUES(9,"m35f","ARMCM35P_DSP_FP");
INSERT INTO CORE VALUES(10,"a5","ARMCA5");
INSERT INTO CORE VALUES(11,"a7","ARMCA7");
INSERT INTO CORE VALUES(12,"a9","ARMCA9");
INSERT INTO CORE VALUES(13,"a15","ARMCA15");
-INSERT INTO CORE VALUES(14,"m55","ARMv81MML_DSP_DP_MVE_FP");
+INSERT INTO CORE VALUES(14,"m55mvef","ARMv81MML_DSP_DP_MVE_FP");
+
+INSERT INTO CORE VALUES(15,"m0","M0");
+INSERT INTO CORE VALUES(16,"m7","M7");
+INSERT INTO CORE VALUES(17,"m33","M33");
+INSERT INTO CORE VALUES(18,"m4","M4");
+INSERT INTO CORE VALUES(19,"m55 mve","M55");
+INSERT INTO CORE VALUES(20,"m55 scalar","M55SCALAR");
+
.quit
diff --git a/CMSIS/DSP/Testing/examples.sql b/CMSIS/DSP/Testing/examples.sql
index e4c201c..660ff20 100755
--- a/CMSIS/DSP/Testing/examples.sql
+++ b/CMSIS/DSP/Testing/examples.sql
@@ -6,7 +6,7 @@
.headers ON
.mode csv
-/*
+
select NB,CATEGORY.category,NAME,CYCLES,PLATFORM.platform,CORE.core,COMPILERKIND.compiler,COMPILER.version,BasicMathsBenchmarksF32.DATE
from BasicMathsBenchmarksF32
INNER JOIN CATEGORY USING(categoryid)
@@ -15,8 +15,9 @@
INNER JOIN COMPILER USING(compilerid)
INNER JOIN COMPILERKIND USING(compilerkindid)
;
-*/
+
+/*
select Regression,MAX,MAXREGCOEF,CATEGORY.category,NAME,PLATFORM.platform,CORE.core,COMPILERKIND.compiler,COMPILER.version,BasicMathsBenchmarksF32.DATE
from BasicMathsBenchmarksF32
INNER JOIN CATEGORY USING(categoryid)
@@ -25,7 +26,7 @@
INNER JOIN COMPILER USING(compilerid)
INNER JOIN COMPILERKIND USING(compilerkindid)
;
-
+*/
/*
Compute the max cycles for a test configuration (category + name)
diff --git a/CMSIS/DSP/Testing/extractDb.py b/CMSIS/DSP/Testing/extractDb.py
new file mode 100755
index 0000000..13595bd
--- /dev/null
+++ b/CMSIS/DSP/Testing/extractDb.py
@@ -0,0 +1,283 @@
+import argparse
+import sqlite3
+import re
+import pandas as pd
+import numpy as np
+
+# Command to get last runid
+lastID="""SELECT runid FROM RUN ORDER BY runid DESC LIMIT 1
+"""
+
+def getLastRunID():
+ r=c.execute(lastID)
+ return(int(r.fetchone()[0]))
+
+
+runid = 1
+
+parser = argparse.ArgumentParser(description='Generate summary benchmarks')
+
+parser.add_argument('-b', nargs='?',type = str, default="bench.db", help="Benchmark database")
+parser.add_argument('-o', nargs='?',type = str, default="full.md", help="Full summary")
+parser.add_argument('-r', action='store_true', help="Regression database")
+
+# For runid or runid range
+parser.add_argument('others', nargs=argparse.REMAINDER)
+
+args = parser.parse_args()
+
+c = sqlite3.connect(args.b)
+
+if args.others:
+ runid=int(args.others[0])
+else:
+ runid=getLastRunID()
+
+# We extract data only from data tables
+# Those tables below are used for descriptions
+REMOVETABLES=['RUN','CORE', 'PLATFORM', 'COMPILERKIND', 'COMPILER', 'TYPE', 'CATEGORY', 'CONFIG']
+
+# This is assuming the database is generated by the regression script
+# So platform is the same for all benchmarks.
+# Category and type come from the test name in the yaml
+# So no need to add this information here
+# Name is removed here because it is added at the beginning
+REMOVECOLUMNS=['runid','NAME','type','platform','category','coredef','OPTIMIZED','HARDFP','FASTMATH','NEON','HELIUM','UNROLL','ROUNDING','DATE','compilerkindid','date','categoryid', 'ID', 'platformid', 'coreid', 'compilerid', 'typeid']
+
+# Get existing benchmark tables
+def getBenchTables():
+ r=c.execute("SELECT name FROM sqlite_master WHERE type='table'")
+ benchtables=[]
+ for table in r:
+ if not table[0] in REMOVETABLES:
+ benchtables.append(table[0])
+ return(benchtables)
+
+# Get existing types in a table
+def getExistingTypes(benchTable):
+ r=c.execute("select distinct typeid from %s" % benchTable).fetchall()
+ result=[x[0] for x in r]
+ return(result)
+
+# Get compilers from specific type and table
+versioncompiler="""select distinct compiler,version from %s
+ INNER JOIN COMPILER USING(compilerid)
+ INNER JOIN COMPILERKIND USING(compilerkindid) WHERE typeid=?"""
+
+# Get existing compiler in a table for a specific type
+# (In case report is structured by types)
+def getExistingCompiler(benchTable,typeid):
+ r=c.execute(versioncompiler % benchTable,(typeid,)).fetchall()
+ return(r)
+
+# Get type name from type id
+def getTypeName(typeid):
+ r=c.execute("select type from TYPE where typeid=?",(typeid,)).fetchone()
+ return(r[0])
+
+# Diff of 2 lists
+def diff(first, second):
+ second = set(second)
+ return [item for item in first if item not in second]
+
+
+# Command to get data for specific compiler
+# and type
+benchCmd="""select %s from %s
+ INNER JOIN CATEGORY USING(categoryid)
+ INNER JOIN PLATFORM USING(platformid)
+ INNER JOIN CORE USING(coreid)
+ INNER JOIN COMPILER USING(compilerid)
+ INNER JOIN COMPILERKIND USING(compilerkindid)
+ INNER JOIN TYPE USING(typeid)
+ WHERE compiler=? AND VERSION=? AND typeid = ? AND runid = ?
+ """
+
+# Command to get test names for specific compiler
+# and type
+benchNames="""select distinct NAME from %s
+ INNER JOIN COMPILER USING(compilerid)
+ INNER JOIN COMPILERKIND USING(compilerkindid)
+ INNER JOIN TYPE USING(typeid)
+ WHERE compiler=? AND VERSION=? AND typeid = ? AND runid = ?
+ """
+
+# Command to get columns for specific table
+benchCmdColumns="""select * from %s
+ INNER JOIN CATEGORY USING(categoryid)
+ INNER JOIN PLATFORM USING(platformid)
+ INNER JOIN CORE USING(coreid)
+ INNER JOIN COMPILER USING(compilerid)
+ INNER JOIN COMPILERKIND USING(compilerkindid)
+ INNER JOIN TYPE USING(typeid)
+ """
+
+def joinit(iterable, delimiter):
+ it = iter(iterable)
+ yield next(it)
+ for x in it:
+ yield delimiter
+ yield x
+
+# True when the column name does not end with "id"
+# (names ending in "id" are often the primary key for the table)
+def isNotIDColumn(col):
+ if re.match(r'^.*id$',col):
+ return(False)
+ else:
+ return(True)
+
+# Get test names
+# for specific typeid and compiler (for the data)
+def getTestNames(benchTable,comp,typeid):
+ vals=(comp[0],comp[1],typeid,runid)
+ result=c.execute(benchNames % benchTable,vals).fetchall()
+ return([x[0] for x in list(result)])
+
+# Get names of columns and data for a table
+# for specific typeid and compiler (for the data)
+def getColNamesAndData(benchTable,comp,typeid):
+ cursor=c.cursor()
+ result=cursor.execute(benchCmdColumns % (benchTable))
+ cols= [member[0] for member in cursor.description]
+ keepCols = ['NAME'] + [c for c in diff(cols , REMOVECOLUMNS) if isNotIDColumn(c)]
+ keepColsStr = "".join(joinit(keepCols,","))
+ vals=(comp[0],comp[1],typeid,runid)
+ result=cursor.execute(benchCmd % (keepColsStr,benchTable),vals)
+ vals =np.array([list(x) for x in list(result)])
+ return(keepCols,vals)
+
+# Write columns in markdown format
+def writeColumns(f,cols):
+ colStr = "".join(joinit(cols,"|"))
+ f.write("|")
+ f.write(colStr)
+ f.write("|\n")
+ sepStr="".join(joinit([":-:" for x in cols],"|"))
+ f.write("|")
+ f.write(sepStr)
+ f.write("|\n")
+
+# Write row in markdown format
+def writeRow(f,row):
+ row=[str(x) for x in row]
+ rowStr = "".join(joinit(row,"|"))
+ f.write("|")
+ f.write(rowStr)
+ f.write("|\n")
+
+PARAMS=["NB","NumTaps", "NBA", "NBB", "Factor", "NumStages","VECDIM","NBR","NBC","NBI","IFFT", "BITREV"]
+
+def regressionTableFor(name,output,ref,toSort,indexCols,field):
+ data=ref.pivot_table(index=indexCols, columns='core',
+ values=[field], aggfunc='first')
+
+ data=data.sort_values(toSort)
+
+ cores = [c[1] for c in list(data.columns)]
+ columns = diff(indexCols,['NAME']) + cores
+
+ writeColumns(output,columns)
+ dataForFunc=data.loc[name]
+ if type(dataForFunc) is pd.DataFrame:
+ for row in dataForFunc.itertuples():
+ row=list(row)
+ if type(row[0]) is int:
+ row=[row[0]] + row[1:]
+ else:
+ row=list(row[0]) + row[1:]
+ writeRow(output,row)
+ else:
+ writeRow(output,dataForFunc)
+
+def formatTableByCore(output,testNames,cols,vals):
+ if vals.size != 0:
+ ref=pd.DataFrame(vals,columns=cols)
+ toSort=["NAME"]
+
+ for param in PARAMS:
+ if param in ref.columns:
+ ref[param]=pd.to_numeric(ref[param])
+ toSort.append(param)
+ if args.r:
+ # Regression table
+ ref['MAX']=pd.to_numeric(ref['MAX'])
+ ref['MAXREGCOEF']=pd.to_numeric(ref['MAXREGCOEF'])
+
+ indexCols=diff(cols,['core','Regression','MAXREGCOEF','MAX','version','compiler'])
+ valList = ['Regression']
+ else:
+ ref['CYCLES']=pd.to_numeric(ref['CYCLES'])
+
+ indexCols=diff(cols,['core','CYCLES','version','compiler'])
+ valList = ['CYCLES']
+
+
+
+ for name in testNames:
+ if args.r:
+ output.write("#### %s\n" % name)
+
+ output.write("##### Regression\n" )
+ regressionTableFor(name,output,ref,toSort,indexCols,'Regression')
+
+ output.write("##### Max cycles\n" )
+ regressionTableFor(name,output,ref,toSort,indexCols,'MAX')
+
+ output.write("##### Max Reg Coef\n" )
+ regressionTableFor(name,output,ref,toSort,indexCols,'MAXREGCOEF')
+
+ else:
+ data=ref.pivot_table(index=indexCols, columns='core',
+ values=valList, aggfunc='first')
+
+ data=data.sort_values(toSort)
+
+ cores = [c[1] for c in list(data.columns)]
+ columns = diff(indexCols,['NAME']) + cores
+
+ output.write("#### %s\n" % name)
+ writeColumns(output,columns)
+ dataForFunc=data.loc[name]
+ if type(dataForFunc) is pd.DataFrame:
+ for row in dataForFunc.itertuples():
+ row=list(row)
+ if type(row[0]) is int:
+ row=[row[0]] + row[1:]
+ else:
+ row=list(row[0]) + row[1:]
+ writeRow(output,row)
+ else:
+ writeRow(output,dataForFunc)
+
+# Add a report for each table
+def addReportFor(output,benchName):
+ print("Process %s\n" % benchName)
+ output.write("# %s\n" % benchName)
+ allTypes = getExistingTypes(benchName)
+ # Add report for each type
+ for aTypeID in allTypes:
+ typeName = getTypeName(aTypeID)
+ output.write("## %s\n" % typeName)
+ ## Add report for each compiler
+ allCompilers = getExistingCompiler(benchName,aTypeID)
+ for compiler in allCompilers:
+ #print(compiler)
+ output.write("### %s (%s)\n" % compiler)
+ cols,vals=getColNamesAndData(benchName,compiler,aTypeID)
+ names=getTestNames(benchName,compiler,aTypeID)
+ formatTableByCore(output,names,cols,vals)
+
+
+
+
+
+try:
+ with open(args.o,"w") as output:
+ benchtables=getBenchTables()
+ for bench in benchtables:
+ addReportFor(output,bench)
+finally:
+ c.close()
+
+
diff --git a/CMSIS/DSP/Testing/runAllBenchmarks.bat b/CMSIS/DSP/Testing/runAllBenchmarks.bat
deleted file mode 100755
index 3074ddc..0000000
--- a/CMSIS/DSP/Testing/runAllBenchmarks.bat
+++ /dev/null
@@ -1,70 +0,0 @@
-@ECHO OFF
-
-echo "Basic Maths"
-python processTests.py -e BasicBenchmarks
-call:runBench
-
-echo "Complex Maths"
-python processTests.py -e ComplexBenchmarks
-call:runBench
-
-echo "FIR"
-python processTests.py -e FIR
-call:runBench
-
-echo "Convolution / Correlation"
-python processTests.py -e MISC
-call:runBench
-
-echo "Decimation / Interpolation"
-python processTests.py -e DECIM
-call:runBench
-
-echo "BiQuad"
-python processTests.py -e BIQUAD
-call:runBench
-
-echo "Controller"
-python processTests.py -e Controller
-call:runBench
-
-echo "Fast Math"
-python processTests.py -e FastMath
-call:runBench
-
-echo "Barycenter"
-python processTests.py -e SupportBarF32
-call:runBench
-
-echo "Support"
-python processTests.py -e Support
-call:runBench
-
-echo "Unary Matrix"
-python processTests.py -e Unary
-call:runBench
-
-echo "Binary Matrix"
-python processTests.py -e Binary
-call:runBench
-
-echo "Transform"
-python processTests.py -e Transform
-call:runBench
-
-EXIT /B
-
-:runBench
-REM pushd build_m7
-REM pushd build_m0
-pushd build_a5
-make
-REM "C:\Program Files\ARM\Development Studio 2019.0\sw\models\bin\FVP_MPS2_Cortex-M7.exe" -a Testing > result.txt
-REM "C:\Program Files\ARM\Development Studio 2019.0\sw\models\bin\FVP_MPS2_Cortex-M0.exe" -a Testing > result.txt
-"C:\Program Files\ARM\Development Studio 2019.0\sw\models\bin\FVP_VE_Cortex-A5x1.exe" -a Testing > result.txt
-popd
-echo "Parse result"
-REM python processResult.py -e -r build_m7\result.txt
-REM python processResult.py -e -r build_m0\result.txt
-python processResult.py -e -r build_a5\result.txt
-goto:eof
\ No newline at end of file
diff --git a/CMSIS/DSP/Testing/runAllBenchmarks.py b/CMSIS/DSP/Testing/runAllBenchmarks.py
deleted file mode 100755
index b62a15c..0000000
--- a/CMSIS/DSP/Testing/runAllBenchmarks.py
+++ /dev/null
@@ -1,105 +0,0 @@
-import os
-import os.path
-import subprocess
-import colorama
-from colorama import init,Fore, Back, Style
-import argparse
-
-GROUPS = [
-"BasicBenchmarks",
-"ComplexBenchmarks",
-"FIR",
-"MISC",
-"DECIM",
-"BIQUAD",
-"Controller",
-"FastMath",
-"SupportBarF32",
-"Support",
-"Unary",
-"Binary",
-"Transform"
-]
-
-init()
-
-def msg(t):
- print(Fore.CYAN + t + Style.RESET_ALL)
-
-def processTest(test):
- subprocess.call(["python","processTests.py","-e",test])
-
-def addToDB(cmd):
- for g in GROUPS:
- msg("Add group %s" % g)
- subprocess.call(["python",cmd,g])
-
-def run(build,fvp,custom=None):
- result = "results.txt"
- resultPath = os.path.join(build,result)
-
- current=os.getcwd()
- try:
- msg("Build" )
- os.chdir(build)
- subprocess.call(["make"])
- msg("Run")
- with open(result,"w") as results:
- if custom:
- subprocess.call([fvp] + custom,stdout=results)
- else:
- subprocess.call([fvp,"-a","Testing"],stdout=results)
- finally:
- os.chdir(current)
-
- msg("Parse result")
- subprocess.call(["python","processResult.py","-e","-r",resultPath])
-
- msg("Regression computations")
- subprocess.call(["python","summaryBench.py","-r",resultPath])
-
- msg("Add results to benchmark database")
- addToDB("addToDB.py")
-
- msg("Add results to regression database")
- addToDB("addToRegDB.py")
-
-
-
-def processAndRun(buildfolder,fvp,custom=None):
- processTest("DSPBenchmarks")
- run(buildfolder,fvp,custom=custom)
-
-parser = argparse.ArgumentParser(description='Parse test description')
-parser.add_argument('-f', nargs='?',type = str, default="build_benchmark_m7", help="Build folder")
-parser.add_argument('-v', nargs='?',type = str, default="C:\\Program Files\\ARM\\Development Studio 2019.0\\sw\\models\\bin\\FVP_MPS2_Cortex-M7.exe", help="Fast Model")
-parser.add_argument('-c', nargs='?',type = str, help="Custom args")
-
-args = parser.parse_args()
-
-if args.f is not None:
- BUILDFOLDER=args.f
-else:
- BUILDFOLDER="build_benchmark_m7"
-
-if args.v is not None:
- FVP=args.v
-else:
- FVP="C:\\Program Files\\ARM\\Development Studio 2019.0\\sw\\models\\bin\\FVP_MPS2_Cortex-M7.exe"
-
-
-if args.c:
- custom = args.c.split()
-else:
- custom = None
-
-print(Fore.RED + "bench.db and reg.db databases must exist before running this script" + Style.RESET_ALL)
-
-msg("Process benchmark description file")
-subprocess.call(["python", "preprocess.py","-f","bench.txt"])
-
-msg("Generate all missing C files")
-subprocess.call(["python","processTests.py", "-e"])
-
-
-processAndRun(BUILDFOLDER,FVP,custom=custom)
diff --git a/CMSIS/DSP/Testing/runAllTests.py b/CMSIS/DSP/Testing/runAllTests.py
index d1455ed..12d1e3e 100755
--- a/CMSIS/DSP/Testing/runAllTests.py
+++ b/CMSIS/DSP/Testing/runAllTests.py
@@ -9,7 +9,29 @@
import sys
import itertools
from pathlib import Path
+import sqlite3
+# Command to get last runid
+lastID="""SELECT runid FROM RUN ORDER BY runid DESC LIMIT 1
+"""
+
+addNewIDCmd="""INSERT INTO RUN VALUES(?,date('now'))
+"""
+
+benchID = 0
+regID = 0
+
+def getLastRunID(c):
+ r=c.execute(lastID)
+ result=r.fetchone()
+ if result is None:
+ return(0)
+ else:
+ return(int(result[0]))
+
+def addNewID(c,newid):
+ c.execute(addNewIDCmd,(newid,))
+ c.commit()
# Small state machine
def updateTestStatus(testStatusForThisBuild,newTestStatus):
@@ -120,10 +142,26 @@
if not os.path.exists(args.db):
createDb(args.sqlite,args.db)
+ conn = sqlite3.connect(args.db)
+ try:
+ currentID = getLastRunID(conn)
+ benchID = currentID + 1
+ addNewID(conn,benchID)
+ finally:
+ conn.close()
+
if args.regdb is not None:
if not os.path.exists(args.regdb):
createDb(args.sqlite,args.regdb)
+ conn = sqlite3.connect(args.regdb)
+ try:
+ currentID = getLastRunID(conn)
+ regID = currentID + 1
+ addNewID(conn,regID)
+ finally:
+ conn.close()
+
with open(args.i,"r") as f:
config=yaml.safe_load(f)
@@ -208,7 +246,7 @@
build.createArchive(flags)
msg("Config " + str(flagConfig) + "\n")
- build.createCMake(flags,args.b,args.p)
+ build.createCMake(core,flags,args.b,args.p)
for test in config["TESTS"]:
msg(test["testName"]+"\n")
testClass=test["testClass"]
@@ -220,7 +258,7 @@
if 'SIM' in config:
if core in config['SIM']:
fvp = config['SIM'][core]
- newTestStatus = test.runAndProcess(compiler,fvp,sim,args.b,args.db,args.regdb)
+ newTestStatus = test.runAndProcess(compiler,fvp,sim,args.b,args.db,args.regdb,benchID,regID)
testStatusForThisBuild = updateTestStatus(testStatusForThisBuild,newTestStatus)
if testStatusForThisBuild != NOTESTFAILED:
failedBuild[buildStr] = testStatusForThisBuild
diff --git a/CMSIS/DSP/Toolchain/AC6.cmake b/CMSIS/DSP/Toolchain/AC6.cmake
index ec98852..f232acd 100644
--- a/CMSIS/DSP/Toolchain/AC6.cmake
+++ b/CMSIS/DSP/Toolchain/AC6.cmake
@@ -17,7 +17,7 @@
get_target_property(DISABLEOPTIM ${PROJECTNAME} DISABLEOPTIMIZATION)
if ((OPTIMIZED) AND (NOT DISABLEOPTIM))
#cmake_print_variables(DISABLEOPTIM)
- target_compile_options(${PROJECTNAME} PRIVATE "-O3")
+ target_compile_options(${PROJECTNAME} PRIVATE "-Ofast")
endif()
if (FASTMATHCOMPUTATIONS)