CMSIS-NN: Refactor, remove unused function arm_nn_mat_mult_kernel_s8_s16_reordered (#1417)

* CMSIS-NN: Remove the unused function arm_nn_mat_mult_kernel_s8_s16_reordered together with its source file arm_nn_mat_mult_kernel_s8_s16_reordered.c, remove the declaration of arm_nn_mat_mult_kernel_q7_q15_reordered from the public header, and move the declaration of arm_nn_mat_mult_kernel_s8_s16 from arm_nnfunctions.h into arm_nnsupportfunctions.h.
diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index 1fba1b3..c1c9994 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -2815,7 +2815,6 @@
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c"/>
-        <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c"/>
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 685e8cc..4d13c2f 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
  * Title:        arm_nnfunctions.h
  * Description:  Public header file for CMSIS NN Library
  *
- * $Date:        24 January 2022
- * $Revision:    V.7.4.0
+ * $Date:        7 February 2022
+ * $Revision:    V.8.0.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -1585,87 +1585,6 @@
                                     const uint16_t out_shift,
                                     const q7_t *bias,
                                     q7_t *pOut);
-/**
- * @brief Matrix-multiplication function for convolution with per-channel requantization.
- * @param[in]       input_a     pointer to operand A
- * @param[in]       input_b     pointer to operand B, always consists of 2 vectors.
- * @param[in]       output_ch   number of rows of A
- * @param[in]       out_shift  pointer to per output channel requantization shift parameter.
- * @param[in]       out_mult   pointer to per output channel requantization multiplier parameter.
- * @param[in]       out_offset      output tensor offset.
- * @param[in]       activation_min   minimum value to clamp the output to. Range : int8
- * @param[in]       activation_max   maximum value to clamp the output to. Range : int8
- * @param[in]       num_col_a   number of columns of A
- * @param[in]       output_bias per output channel bias. Range : int32
- * @param[in,out]   out_0       pointer to output
- * @return     The function returns one of the two
- *              1. The incremented output pointer for a successful operation or
- *              2. NULL if implementation is not available.
- *
- * @details   This function does the matrix multiplication of weight matrix for all output channels
- *            with 2 columns from im2col and produces two elements/output_channel. The outputs are
- *            clamped in the range provided by activation min and max.
- *            Supported framework: TensorFlow Lite micro.
- */
-q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
-                                    const q15_t *input_b,
-                                    const uint16_t output_ch,
-                                    const int32_t *out_shift,
-                                    const int32_t *out_mult,
-                                    const int32_t out_offset,
-                                    const int16_t activation_min,
-                                    const int16_t activation_max,
-                                    const uint16_t num_col_a,
-                                    const int32_t *const output_bias,
-                                    q7_t *out_0);
-
-/**
- * @brief Matrix-multiplication of re-ordered input B with A.
- *
- * @details  For arguments, refer arm_nn_mat_mult_kernel_s8_s16. The re-ordering is a consequence
- *           of sign extension done by the SXTB16 command on input_b. The outputs are clamped in the range
- *           provided by activation min and max.
- *   * @details
- *   - Supported framework : TensorFlow Lite Micro
- *   - The following constrains on the arguments apply
- *      -# num_col_a is a multiple of 4
- *      -# output_ch is a multiple of 2
- *
- */
-q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
-                                              const q15_t *input_b,
-                                              const uint16_t output_ch,
-                                              const int32_t *out_shift,
-                                              const int32_t *out_mult,
-                                              const int32_t out_offset,
-                                              const int16_t activation_min,
-                                              const int16_t activation_max,
-                                              const uint16_t num_col_a,
-                                              const int32_t *const output_bias,
-                                              q7_t *out_0);
-
-/**
- *@brief Matrix-multiplication function for convolution with reordered columns
- *@param[in]       pA          pointer to operand A
- *@param[in]       pInBuffer   pointer to operand B, always conssists of 2 vectors
- *@param[in]       ch_im_out   numRow of A
- *@param[in]       numCol_A    numCol of A
- *@param[in]       bias_shift  amount of left-shift for bias
- *@param[in]       out_shift   amount of right-shift for output
- *@param[in]       bias        the bias
- *@param[in,out]   pOut        pointer to output
- *@return     The function returns the incremented output pointer
- *
- *@details  This function assumes that data in pInBuffer are reordered
- */
-q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA,
-                                              const q15_t *pInBuffer,
-                                              const uint16_t ch_im_out,
-                                              const uint16_t numCol_A,
-                                              const uint16_t bias_shift,
-                                              const uint16_t out_shift,
-                                              const q7_t *bias,
-                                              q7_t *pOut);
 
 #ifdef __cplusplus
 }
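With the hunk above, arm_nnfunctions.h no longer declares arm_nn_mat_mult_kernel_s8_s16; the next file's hunk re-homes that declaration in arm_nnsupportfunctions.h. The declaration of arm_nn_mat_mult_kernel_q7_q15_reordered is dropped from the public header as well (its implementation is not touched by this patch). A minimal include-side sketch for a hypothetical out-of-tree caller of the moved kernel, not part of this patch:

    /* Hypothetical caller: before this change the public header declared the
     * kernel; afterwards the declaration lives in the support-function
     * header, so that is the header such code would need to include. */
    #include "arm_nnsupportfunctions.h" /* declares arm_nn_mat_mult_kernel_s8_s16 */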
diff --git a/CMSIS/NN/Include/arm_nnsupportfunctions.h b/CMSIS/NN/Include/arm_nnsupportfunctions.h
index c99fdff..52c8e04 100644
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
  * Title:        arm_nnsupportfunctions.h
  * Description:  Public header file of support functions for CMSIS NN Library
  *
- * $Date:        3. February 2022
- * $Revision:    V.6.0.1
+ * $Date:        7. February 2022
+ * $Revision:    V.6.1.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -743,6 +743,40 @@
 void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize);
 
 /**
+ * @brief Matrix-multiplication function for convolution with per-channel requantization.
+ * @param[in]       input_a     pointer to operand A
+ * @param[in]       input_b     pointer to operand B, always consists of 2 vectors.
+ * @param[in]       output_ch   number of rows of A
+ * @param[in]       out_shift   pointer to per-output-channel requantization shift parameter.
+ * @param[in]       out_mult    pointer to per-output-channel requantization multiplier parameter.
+ * @param[in]       out_offset  output tensor offset.
+ * @param[in]       activation_min   minimum value to clamp the output to. Range : int8
+ * @param[in]       activation_max   maximum value to clamp the output to. Range : int8
+ * @param[in]       num_col_a   number of columns of A
+ * @param[in]       output_bias per-output-channel bias. Range : int32
+ * @param[in,out]   out_0       pointer to output
+ * @return     The function returns one of the following
+ *              1. The incremented output pointer for a successful operation or
+ *              2. NULL if the implementation is not available.
+ *
+ * @details   This function performs the matrix multiplication of the weight matrix for all output channels
+ *            with 2 columns from im2col and produces two elements per output channel. The outputs are
+ *            clamped to the range provided by activation_min and activation_max.
+ *            Supported framework: TensorFlow Lite Micro.
+ */
+q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
+                                    const q15_t *input_b,
+                                    const uint16_t output_ch,
+                                    const int32_t *out_shift,
+                                    const int32_t *out_mult,
+                                    const int32_t out_offset,
+                                    const int16_t activation_min,
+                                    const int16_t activation_max,
+                                    const uint16_t num_col_a,
+                                    const int32_t *const output_bias,
+                                    q7_t *out_0);
+
+/**
  * @brief macro for adding rounding offset
  */
 #ifndef ARM_NN_TRUNCATE
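The relocated kernel keeps its documented contract: it consumes two im2col columns of q15 data per call and either returns the output pointer advanced past the two written columns or NULL when no optimized implementation is available. A hedged caller sketch follows; all names (mat_mult_two_columns, weights, im2col_buf, and so on) are illustrative, and the wrapper is not CMSIS-NN code:

    #include "arm_nnsupportfunctions.h"

    /* Hedged sketch: run the kernel over one pair of im2col columns.
     * Parameter names mirror the Doxygen block above. */
    static q7_t *mat_mult_two_columns(const q7_t *weights,     /* operand A */
                                      const q15_t *im2col_buf, /* two columns of B */
                                      const uint16_t output_ch,
                                      const uint16_t num_col_a,
                                      const int32_t *out_shift,
                                      const int32_t *out_mult,
                                      const int32_t out_offset,
                                      const int32_t *bias,
                                      q7_t *out)
    {
        q7_t *ret = arm_nn_mat_mult_kernel_s8_s16(weights, im2col_buf, output_ch,
                                                  out_shift, out_mult, out_offset,
                                                  -128, 127, /* int8 clamp range */
                                                  num_col_a, bias, out);
        if (ret == NULL)
        {
            /* No optimized implementation for this target; a real caller
             * would fall back to a generic per-element computation here. */
        }
        return ret; /* advanced past the two freshly written columns */
    }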
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
deleted file mode 100644
index 842a180..0000000
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* ----------------------------------------------------------------------
- * Project:      CMSIS NN Library
- * Title:        arm_nn_mat_mult_kernel_s8_s16_reordered.c
- * Description:  Matrix-multiplication function for convolution with reordered columns
- *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.3
- *
- * Target Processor:  Cortex-M cores
- * -------------------------------------------------------------------- */
-
-#include "arm_nnfunctions.h"
-#include "arm_nnsupportfunctions.h"
-
-/*
- * Matrix-multiplication with re-ordered input and bias inputs for convolution with per-channel
- *        requantization. The re-ordering is a consequence of sign extension is done by the SXTB16 command.
- *
- * Refer header file for details. This function differs from arm_nn_mat_mult_kernel_s8_s16(), in that it uses
- *        read_and_pad_reordered() instead of arm_nn_mat_mult_kernel_s8_s16(). Investigating the cycles impact and
- *        unifying these two functions is a potential future improvement.
- *
- */
-
-q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
-                                              const q15_t *input_b,
-                                              const uint16_t output_ch,
-                                              const int32_t *out_shift,
-                                              const int32_t *out_mult,
-                                              const int32_t out_offset,
-                                              const int16_t activation_min,
-                                              const int16_t activation_max,
-                                              const uint16_t num_col_a,
-                                              const int32_t *const output_bias,
-                                              q7_t *out_0)
-{
-#if defined(ARM_MATH_DSP)
-    /* set up the second output pointers */
-    q7_t *out_1 = out_0 + output_ch;
-    const int32_t *bias = output_bias;
-
-    uint16_t row_count = output_ch / 2;
-    const q7_t *ip_a0 = input_a;
-    /* this loop over rows in A */
-    while (row_count)
-    {
-        /* setup pointers for B */
-        const q15_t *ip_b0 = input_b;
-        const q15_t *ip_b1 = ip_b0 + num_col_a;
-
-        /* align the second pointer for A */
-        const q7_t *ip_a1 = ip_a0 + num_col_a;
-
-        /* Init accumulator with bias for channel N and N + 1 */
-        q31_t ch_0_out_0 = *bias;
-        q31_t ch_0_out_1 = *bias++;
-        q31_t ch_1_out_0 = *bias;
-        q31_t ch_1_out_1 = *bias++;
-
-        uint16_t col_count = num_col_a / 4;
-        /* accumulate over the vector */
-        while (col_count)
-        {
-            q31_t a01, a02, a11, a12;
-            q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
-            q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
-
-            ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
-            ip_a1 = read_and_pad_reordered(ip_a1, &a11, &a12);
-
-            ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
-            ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
-            ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0);
-            ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1);
-
-            b0 = arm_nn_read_q15x2_ia(&ip_b0);
-            b1 = arm_nn_read_q15x2_ia(&ip_b1);
-
-            ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
-            ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
-            ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0);
-            ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1);
-
-            col_count--;
-        } /* while over col_count */
-
-        ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
-        ch_0_out_0 += out_offset;
-        ch_0_out_0 = MAX(ch_0_out_0, activation_min);
-        ch_0_out_0 = MIN(ch_0_out_0, activation_max);
-        *out_0++ = (q7_t)ch_0_out_0;
-
-        ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
-        ch_0_out_1 += out_offset;
-        ch_0_out_1 = MAX(ch_0_out_1, activation_min);
-        ch_0_out_1 = MIN(ch_0_out_1, activation_max);
-        *out_1++ = (q7_t)ch_0_out_1;
-        out_mult++;
-        out_shift++;
-
-        ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift);
-        ch_1_out_0 += out_offset;
-        ch_1_out_0 = MAX(ch_1_out_0, activation_min);
-        ch_1_out_0 = MIN(ch_1_out_0, activation_max);
-        *out_0++ = (q7_t)ch_1_out_0;
-
-        ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift);
-        ch_1_out_1 += out_offset;
-        ch_1_out_1 = MAX(ch_1_out_1, activation_min);
-        ch_1_out_1 = MIN(ch_1_out_1, activation_max);
-        *out_1++ = (q7_t)ch_1_out_1;
-        out_mult++;
-        out_shift++;
-
-        /* skip row */
-        ip_a0 += num_col_a;
-        row_count--;
-    }
-
-    if (output_ch & 1)
-    {
-        /* setup pointers for B */
-        const q15_t *ip_b0 = input_b;
-        const q15_t *ip_b1 = ip_b0 + num_col_a;
-
-        /* Init accumulator with bias for channel N + 1 */
-        q31_t ch_0_out_0 = *bias;
-        q31_t ch_0_out_1 = ch_0_out_0;
-
-        int32_t col_count = num_col_a / 4;
-        while (col_count)
-        {
-            q31_t a01, a02;
-            q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
-            q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
-
-            ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
-
-            ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
-            ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
-
-            b0 = arm_nn_read_q15x2_ia(&ip_b0);
-            b1 = arm_nn_read_q15x2_ia(&ip_b1);
-
-            ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
-            ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
-
-            col_count--;
-        } /* while over col_count */
-
-        ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
-        ch_0_out_0 += out_offset;
-        ch_0_out_0 = MAX(ch_0_out_0, activation_min);
-        ch_0_out_0 = MIN(ch_0_out_0, activation_max);
-        *out_0++ = (q7_t)ch_0_out_0;
-
-        ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
-        ch_0_out_1 += out_offset;
-        ch_0_out_1 = MAX(ch_0_out_1, activation_min);
-        ch_0_out_1 = MIN(ch_0_out_1, activation_max);
-        *out_1++ = (q7_t)ch_0_out_1;
-    }
-
-    out_0 += output_ch;
-
-    /* return the new output pointer with offset */
-    return out_0;
-#else
-    (void)input_a;
-    (void)input_b;
-    (void)output_ch;
-    (void)out_shift;
-    (void)out_mult;
-    (void)out_offset;
-    (void)activation_min;
-    (void)activation_max;
-    (void)num_col_a;
-    (void)output_bias;
-    (void)out_0;
-    /* To be completed */
-    return NULL;
-#endif
-}
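For reference, the reordering that the deleted kernel depended on comes from how SXTB16 unpacks bytes: the instruction sign-extends bytes 0 and 2 of a 32-bit word into two q15 lanes, and a byte rotation picks up bytes 1 and 3, so read_and_pad_reordered() in effect yields the pairs {w0, w2} and {w1, w3} rather than sequential pairs, and operand B had to be stored in the same interleaved order for the __SMLAD dot products to line up. A standalone plain-C sketch of that expansion, for illustration only (the helper emulates the intrinsic and is not CMSIS-NN code):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Plain-C stand-in for the Arm SXTB16 instruction: sign-extend bytes 0
     * and 2 of a word into the two 16-bit halves of the result. */
    static uint32_t sxtb16(uint32_t x)
    {
        uint32_t lo = (uint32_t)(uint16_t)(int16_t)(int8_t)(x & 0xFF);
        uint32_t hi = (uint32_t)(uint16_t)(int16_t)(int8_t)((x >> 16) & 0xFF);
        return lo | (hi << 16);
    }

    int main(void)
    {
        /* Four consecutive q7 weights w0..w3 packed little-endian in one word. */
        uint32_t packed = 0x04030201u; /* w0 = 1, w1 = 2, w2 = 3, w3 = 4 */

        uint32_t even = sxtb16(packed);                         /* {w0, w2} */
        uint32_t odd  = sxtb16((packed >> 8) | (packed << 24)); /* ROR #8 -> {w1, w3} */

        /* The expanded halfword pairs are {w0, w2} and {w1, w3}, not
         * {w0, w1} and {w2, w3}: this interleaving is what "reordered"
         * referred to in the removed kernel's name. */
        printf("even pair: 0x%08" PRIx32 ", odd pair: 0x%08" PRIx32 "\n", even, odd);
        return 0;
    }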