CMSIS-NN: Refactor, remove unused function arm_nn_mat_mult_kernel_s8_s16_reordered (#1417)
* CMSIS-NN: Remove the unused function arm_nn_mat_mult_kernel_s8_s16_reordered() together with its source file arm_nn_mat_mult_kernel_s8_s16_reordered.c and its entry in ARM.CMSIS.pdsc.

* Move the arm_nn_mat_mult_kernel_s8_s16() prototype from the public header arm_nnfunctions.h to arm_nnsupportfunctions.h, and drop the prototype of arm_nn_mat_mult_kernel_q7_q15_reordered() from arm_nnfunctions.h.

* Bump arm_nnfunctions.h to V.8.0.0 and arm_nnsupportfunctions.h to V.6.1.0.
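For callers this is API-surface only: arm_nn_mat_mult_kernel_s8_s16() keeps its
signature and simply moves from the public arm_nnfunctions.h into
arm_nnsupportfunctions.h. A minimal sketch of driving the kernel after this
change; every size and quantization value below is an illustrative placeholder,
not part of this patch:

    #include "arm_nnsupportfunctions.h" /* new home of the prototype */

    #define OUT_CH 4  /* rows of A, i.e. output channels (illustrative) */
    #define COL_A 16  /* columns of A (illustrative) */

    static const q7_t weights[OUT_CH * COL_A];    /* operand A */
    static q15_t im2col_buf[2 * COL_A];           /* operand B: always 2 columns */
    static int32_t mult[OUT_CH], shift[OUT_CH], bias[OUT_CH];
    static q7_t out[2 * OUT_CH];

    void run_kernel_once(void)
    {
        q7_t *out_ptr = arm_nn_mat_mult_kernel_s8_s16(weights,
                                                      im2col_buf,
                                                      OUT_CH,
                                                      shift,
                                                      mult,
                                                      0,    /* out_offset */
                                                      -128, /* activation_min */
                                                      127,  /* activation_max */
                                                      COL_A,
                                                      bias,
                                                      out);
        if (out_ptr == NULL)
        {
            /* No optimized implementation available; fall back to a reference path. */
        }
    }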
diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index 1fba1b3..c1c9994 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -2815,7 +2815,6 @@
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_u8_basic_ver1.c"/>
- <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_separable_conv_HWC_q7.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s8.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c"/>
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 685e8cc..4d13c2f 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
- * $Date: 24 January 2022
- * $Revision: V.7.4.0
+ * $Date: 7 February 2022
+ * $Revision: V.8.0.0
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
@@ -1585,87 +1585,6 @@
const uint16_t out_shift,
const q7_t *bias,
q7_t *pOut);
-/**
- * @brief Matrix-multiplication function for convolution with per-channel requantization.
- * @param[in] input_a pointer to operand A
- * @param[in] input_b pointer to operand B, always consists of 2 vectors.
- * @param[in] output_ch number of rows of A
- * @param[in] out_shift pointer to per output channel requantization shift parameter.
- * @param[in] out_mult pointer to per output channel requantization multiplier parameter.
- * @param[in] out_offset output tensor offset.
- * @param[in] activation_min minimum value to clamp the output to. Range : int8
- * @param[in] activation_max maximum value to clamp the output to. Range : int8
- * @param[in] num_col_a number of columns of A
- * @param[in] output_bias per output channel bias. Range : int32
- * @param[in,out] out_0 pointer to output
- * @return The function returns one of the two
- * 1. The incremented output pointer for a successful operation or
- * 2. NULL if implementation is not available.
- *
- * @details This function does the matrix multiplication of weight matrix for all output channels
- * with 2 columns from im2col and produces two elements/output_channel. The outputs are
- * clamped in the range provided by activation min and max.
- * Supported framework: TensorFlow Lite micro.
- */
-q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
- const q15_t *input_b,
- const uint16_t output_ch,
- const int32_t *out_shift,
- const int32_t *out_mult,
- const int32_t out_offset,
- const int16_t activation_min,
- const int16_t activation_max,
- const uint16_t num_col_a,
- const int32_t *const output_bias,
- q7_t *out_0);
-
-/**
- * @brief Matrix-multiplication of re-ordered input B with A.
- *
- * @details For arguments, refer arm_nn_mat_mult_kernel_s8_s16. The re-ordering is a consequence
- * of sign extension done by the SXTB16 command on input_b. The outputs are clamped in the range
- * provided by activation min and max.
- * * @details
- * - Supported framework : TensorFlow Lite Micro
- * - The following constrains on the arguments apply
- * -# num_col_a is a multiple of 4
- * -# output_ch is a multiple of 2
- *
- */
-q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
- const q15_t *input_b,
- const uint16_t output_ch,
- const int32_t *out_shift,
- const int32_t *out_mult,
- const int32_t out_offset,
- const int16_t activation_min,
- const int16_t activation_max,
- const uint16_t num_col_a,
- const int32_t *const output_bias,
- q7_t *out_0);
-
-/**
- *@brief Matrix-multiplication function for convolution with reordered columns
- *@param[in] pA pointer to operand A
- *@param[in] pInBuffer pointer to operand B, always conssists of 2 vectors
- *@param[in] ch_im_out numRow of A
- *@param[in] numCol_A numCol of A
- *@param[in] bias_shift amount of left-shift for bias
- *@param[in] out_shift amount of right-shift for output
- *@param[in] bias the bias
- *@param[in,out] pOut pointer to output
- *@return The function returns the incremented output pointer
- *
- *@details This function assumes that data in pInBuffer are reordered
- */
-q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA,
- const q15_t *pInBuffer,
- const uint16_t ch_im_out,
- const uint16_t numCol_A,
- const uint16_t bias_shift,
- const uint16_t out_shift,
- const q7_t *bias,
- q7_t *pOut);
#ifdef __cplusplus
}
#endif
diff --git a/CMSIS/NN/Include/arm_nnsupportfunctions.h b/CMSIS/NN/Include/arm_nnsupportfunctions.h
index c99fdff..52c8e04 100644
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
- * $Date: 3. February 2022
- * $Revision: V.6.0.1
+ * $Date: 7. February 2022
+ * $Revision: V.6.1.0
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
@@ -743,6 +743,40 @@
void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize);

/**
+ * @brief Matrix-multiplication function for convolution with per-channel requantization.
+ * @param[in] input_a pointer to operand A
+ * @param[in] input_b pointer to operand B; always consists of 2 vectors.
+ * @param[in] output_ch number of rows of A
+ * @param[in] out_shift pointer to per-output-channel requantization shift parameter.
+ * @param[in] out_mult pointer to per-output-channel requantization multiplier parameter.
+ * @param[in] out_offset output tensor offset.
+ * @param[in] activation_min minimum value to clamp the output to. Range: int8
+ * @param[in] activation_max maximum value to clamp the output to. Range: int8
+ * @param[in] num_col_a number of columns of A
+ * @param[in] output_bias per-output-channel bias. Range: int32
+ * @param[in,out] out_0 pointer to output
+ * @return The function returns one of the following:
+ * 1. The incremented output pointer, for a successful operation, or
+ * 2. NULL, if no implementation is available.
+ *
+ * @details This function multiplies the weight matrix for all output channels
+ * with 2 columns from the im2col buffer, producing two elements per output
+ * channel. The outputs are clamped to the range given by activation_min and activation_max.
+ * Supported framework: TensorFlow Lite Micro.
+ */
+q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
+ const q15_t *input_b,
+ const uint16_t output_ch,
+ const int32_t *out_shift,
+ const int32_t *out_mult,
+ const int32_t out_offset,
+ const int16_t activation_min,
+ const int16_t activation_max,
+ const uint16_t num_col_a,
+ const int32_t *const output_bias,
+ q7_t *out_0);
+
+/**
* @brief macro for adding rounding offset
*/
#ifndef ARM_NN_TRUNCATE
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
deleted file mode 100644
index 842a180..0000000
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* ----------------------------------------------------------------------
- * Project: CMSIS NN Library
- * Title: arm_nn_mat_mult_kernel_s8_s16_reordered.c
- * Description: Matrix-multiplication function for convolution with reordered columns
- *
- * $Date: 09. October 2020
- * $Revision: V.1.0.3
- *
- * Target Processor: Cortex-M cores
- * -------------------------------------------------------------------- */
-
-#include "arm_nnfunctions.h"
-#include "arm_nnsupportfunctions.h"
-
-/*
- * Matrix-multiplication with re-ordered input and bias inputs for convolution with per-channel
- * requantization. The re-ordering is a consequence of sign extension is done by the SXTB16 command.
- *
- * Refer header file for details. This function differs from arm_nn_mat_mult_kernel_s8_s16(), in that it uses
- * read_and_pad_reordered() instead of arm_nn_mat_mult_kernel_s8_s16(). Investigating the cycles impact and
- * unifying these two functions is a potential future improvement.
- *
- */
-
-q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
- const q15_t *input_b,
- const uint16_t output_ch,
- const int32_t *out_shift,
- const int32_t *out_mult,
- const int32_t out_offset,
- const int16_t activation_min,
- const int16_t activation_max,
- const uint16_t num_col_a,
- const int32_t *const output_bias,
- q7_t *out_0)
-{
-#if defined(ARM_MATH_DSP)
- /* set up the second output pointers */
- q7_t *out_1 = out_0 + output_ch;
- const int32_t *bias = output_bias;
-
- uint16_t row_count = output_ch / 2;
- const q7_t *ip_a0 = input_a;
- /* this loop over rows in A */
- while (row_count)
- {
- /* setup pointers for B */
- const q15_t *ip_b0 = input_b;
- const q15_t *ip_b1 = ip_b0 + num_col_a;
-
- /* align the second pointer for A */
- const q7_t *ip_a1 = ip_a0 + num_col_a;
-
- /* Init accumulator with bias for channel N and N + 1 */
- q31_t ch_0_out_0 = *bias;
- q31_t ch_0_out_1 = *bias++;
- q31_t ch_1_out_0 = *bias;
- q31_t ch_1_out_1 = *bias++;
-
- uint16_t col_count = num_col_a / 4;
- /* accumulate over the vector */
- while (col_count)
- {
- q31_t a01, a02, a11, a12;
- q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
- q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
-
- ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
- ip_a1 = read_and_pad_reordered(ip_a1, &a11, &a12);
-
- ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
- ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
- ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0);
- ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1);
-
- b0 = arm_nn_read_q15x2_ia(&ip_b0);
- b1 = arm_nn_read_q15x2_ia(&ip_b1);
-
- ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
- ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
- ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0);
- ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1);
-
- col_count--;
- } /* while over col_count */
-
- ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
- ch_0_out_0 += out_offset;
- ch_0_out_0 = MAX(ch_0_out_0, activation_min);
- ch_0_out_0 = MIN(ch_0_out_0, activation_max);
- *out_0++ = (q7_t)ch_0_out_0;
-
- ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
- ch_0_out_1 += out_offset;
- ch_0_out_1 = MAX(ch_0_out_1, activation_min);
- ch_0_out_1 = MIN(ch_0_out_1, activation_max);
- *out_1++ = (q7_t)ch_0_out_1;
- out_mult++;
- out_shift++;
-
- ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift);
- ch_1_out_0 += out_offset;
- ch_1_out_0 = MAX(ch_1_out_0, activation_min);
- ch_1_out_0 = MIN(ch_1_out_0, activation_max);
- *out_0++ = (q7_t)ch_1_out_0;
-
- ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift);
- ch_1_out_1 += out_offset;
- ch_1_out_1 = MAX(ch_1_out_1, activation_min);
- ch_1_out_1 = MIN(ch_1_out_1, activation_max);
- *out_1++ = (q7_t)ch_1_out_1;
- out_mult++;
- out_shift++;
-
- /* skip row */
- ip_a0 += num_col_a;
- row_count--;
- }
-
- if (output_ch & 1)
- {
- /* setup pointers for B */
- const q15_t *ip_b0 = input_b;
- const q15_t *ip_b1 = ip_b0 + num_col_a;
-
- /* Init accumulator with bias for channel N + 1 */
- q31_t ch_0_out_0 = *bias;
- q31_t ch_0_out_1 = ch_0_out_0;
-
- int32_t col_count = num_col_a / 4;
- while (col_count)
- {
- q31_t a01, a02;
- q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
- q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
-
- ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
-
- ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
- ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
-
- b0 = arm_nn_read_q15x2_ia(&ip_b0);
- b1 = arm_nn_read_q15x2_ia(&ip_b1);
-
- ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
- ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
-
- col_count--;
- } /* while over col_count */
-
- ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
- ch_0_out_0 += out_offset;
- ch_0_out_0 = MAX(ch_0_out_0, activation_min);
- ch_0_out_0 = MIN(ch_0_out_0, activation_max);
- *out_0++ = (q7_t)ch_0_out_0;
-
- ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
- ch_0_out_1 += out_offset;
- ch_0_out_1 = MAX(ch_0_out_1, activation_min);
- ch_0_out_1 = MIN(ch_0_out_1, activation_max);
- *out_1++ = (q7_t)ch_0_out_1;
- }
-
- out_0 += output_ch;
-
- /* return the new output pointer with offset */
- return out_0;
-#else
- (void)input_a;
- (void)input_b;
- (void)output_ch;
- (void)out_shift;
- (void)out_mult;
- (void)out_offset;
- (void)activation_min;
- (void)activation_max;
- (void)num_col_a;
- (void)output_bias;
- (void)out_0;
- /* To be completed */
- return NULL;
-#endif
-}
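For context on why a separate "reordered" kernel existed at all: the deleted
loop expands four packed q7 weights at a time with read_and_pad_reordered(),
which is built on the SXTB16 instruction. SXTB16 sign-extends bytes 0 and 2 of
a word in one go, and bytes 1 and 3 after a rotate, so the expanded q15 pairs
come out interleaved rather than sequential, and the matching im2col data had
to be written in the same interleaved order. A minimal sketch of that
expansion, assuming a Cortex-M core with the DSP extension and the CMSIS-Core
__SXTB16/__ROR intrinsics; the helper name expand_q7x4 is made up for
illustration:

    #include <stdint.h>
    #include "cmsis_compiler.h" /* __SXTB16, __ROR on DSP-capable cores */

    /* packed holds four q7 values b3:b2:b1:b0, b0 in the least significant byte. */
    static void expand_q7x4(uint32_t packed, int32_t *pair_02, int32_t *pair_13)
    {
        *pair_02 = (int32_t)__SXTB16(packed);           /* q15 pair {b2, b0} */
        *pair_13 = (int32_t)__SXTB16(__ROR(packed, 8)); /* q15 pair {b3, b1} */
        /* The q15 lanes are {b0, b2} and {b1, b3}, not {b0, b1} and {b2, b3}.
         * An __SMLAD against these only yields the right dot product if
         * operand B was stored with the same byte interleaving, which is the
         * re-ordering the removed kernel depended on. */
    }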
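The per-channel epilogue above (arm_nn_requantize(), then out_offset, then the
MAX/MIN clamp) is the same one the surviving arm_nn_mat_mult_kernel_s8_s16()
uses, and it boils down to a fixed-point multiply-and-shift. A simplified
model of that arithmetic; this approximates the rounding behaviour for
illustration and is not the library implementation:

    #include <stdint.h>

    /* mult is a Q31 fixed-point multiplier and shift the per-channel shift
     * (negative means right shift), matching the out_mult/out_shift arrays
     * walked by the kernel. */
    static int8_t requantize_clamp(int32_t acc, int32_t mult, int32_t shift,
                                   int32_t out_offset,
                                   int32_t act_min, int32_t act_max)
    {
        /* Doubling high multiply: (2 * acc * mult + 2^31) >> 32. */
        int64_t prod = (int64_t)acc * (int64_t)mult;
        int32_t scaled = (int32_t)((prod + (1LL << 30)) >> 31);

        if (shift < 0)
        {
            /* Rounding right shift. */
            scaled = (scaled + (1 << (-shift - 1))) >> -shift;
        }
        else
        {
            scaled <<= shift;
        }

        scaled += out_offset;
        scaled = (scaled < act_min) ? act_min : scaled;
        scaled = (scaled > act_max) ? act_max : scaled;
        return (int8_t)scaled;
    }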