CMSIS-NN: Optimized convolution for 16x8 data type
Authors: Sebastian-Larsson, oscarandersson8218 (GitHub)
1. An optimized function is added for int16 input data with
int8 filter data. The main constraint is that the kernel volume
(filter width * filter height * input channels) must be less than 512.
2. Unit tests are added as well.
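
For reference, the new wrapper dispatches on the kernel volume. A minimal
sketch of that predicate (cmsis_nn_dims comes from arm_nn_types.h; this
helper is illustrative and not part of the patch):

    #include "arm_nn_types.h"

    /* Sketch: the condition arm_convolve_wrapper_s16 uses on DSP-extension
       targets without MVE to select the fast kernel (see the wrapper diff
       below). */
    static int use_fast_s16_path(const cmsis_nn_dims *input_dims,
                                 const cmsis_nn_dims *filter_dims)
    {
        /* Kernel volume = filter width * filter height * input channels */
        return (filter_dims->w * filter_dims->h * input_dims->c) < 512;
    }
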
Change-Id: I9f0f3067a275c8bf5f9263ebe76cd9a1033c7793
diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index 137f663..eef3f2e 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -11,8 +11,9 @@
<release version="5.8.1">
Active development ...
CMSIS-DSP: 1.10.0 (see revision history for details)
- CMSIS-NN: 3.0.1 (see revision history for details)
- - Support for int16
+ CMSIS-NN: 3.1.0 (see revision history for details)
+ - Support for int16 convolution
+ - Support for DSP extension optimization for int16 convolution
</release>
<release version="5.8.0" date="2021-06-24">
CMSIS-Core(M): 5.5.0 (see revision history for details)
@@ -2815,6 +2816,7 @@
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c"/>
+ <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s16.c"/>
@@ -2853,6 +2855,7 @@
<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c"/>
<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c"/>
<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c"/>
+ <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c"/>
<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c"/>
<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c"/>
<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c"/>
diff --git a/CMSIS/DoxyGen/NN/src/history.txt b/CMSIS/DoxyGen/NN/src/history.txt
index 45d04f3..a5a0f5d 100644
--- a/CMSIS/DoxyGen/NN/src/history.txt
+++ b/CMSIS/DoxyGen/NN/src/history.txt
@@ -7,6 +7,17 @@
<th>Description</th>
</tr>
<tr>
+ <td>V3.1.0</td>
+ <td>
+ <ul>
+ <li> Added arm_convolve_fast_s16 </li>
+ <li> Added arm_nn_mat_mult_kernel_s16 DSP implementation </li>
+ <li> Added conv int16 DSP implementation </li>
+ <li> Added unit tests for int16 DSP conv kernel </li>
+ </ul>
+ </td>
+ </tr>
+ <tr>
<td>V3.0.1</td>
<td>
<ul>
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 3387ce2..4f316f9 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
- * $Date: 14 June 2021
- * $Revision: V.7.1.0
+ * $Date: 11 August 2021
+ * $Revision: V.7.2.0
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
@@ -368,6 +368,47 @@
const int64_t *bias_data,
const cmsis_nn_dims *output_dims,
q15_t *output_data);
+/**
+ * @brief Optimized s16 convolution function
+ * @param[in, out] ctx Function context that contains the additional buffer if required by the function.
+ * arm_convolve_fast_s16_get_buffer_size will return the buffer size if required.
+ * @param[in] conv_params Convolution parameters (e.g. strides, dilations, pads,...).
+ * conv_params->input_offset : Not used
+ * conv_params->output_offset : Not used
+ * @param[in] quant_params Per-channel quantization info.
+ * It contains the multiplier and shift values to be applied to each output channel
+ * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in] input_data Input (activation) data pointer. Data type: int16
+ * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
+ * spatial filter dimensions. (filter_dims->w * filter_dims->h * input_dims->c) must be
+ * less than 512
+ * @param[in] filter_data Filter data pointer. Data type: int8
+ * @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
+ * @param[in] bias_data Optional bias data pointer. Data type: int64
+ * @param[in] output_dims Output tensor dimensions. Format: [N, H, W, C_OUT]
+ * @param[out] output_data Output data pointer. Data type: int16
+ *
+ * @return The function returns <code>ARM_MATH_SUCCESS</code>
+ *
+ * @details
+ * 1. Supported framework: TensorFlow Lite micro
+ * 2. q7/q15 is used as the data type even though the data is s8/s16. This is done to be consistent with existing APIs.
+ * 3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
+ * 4. Implementation supports kernel volumes (filter width * filter height * input channels) < 512.
+ *
+ */
+
+arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
+ const cmsis_nn_conv_params *conv_params,
+ const cmsis_nn_per_channel_quant_params *quant_params,
+ const cmsis_nn_dims *input_dims,
+ const q15_t *input_data,
+ const cmsis_nn_dims *filter_dims,
+ const q7_t *filter_data,
+ const cmsis_nn_dims *bias_dims,
+ const int64_t *bias_data,
+ const cmsis_nn_dims *output_dims,
+ q15_t *output_data);
/**
* @brief Get the required buffer size for s16 convolution function
@@ -381,6 +422,17 @@
int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
/**
+ * @brief Get the required buffer size for fast s16 convolution function
+ *
+ * @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in] filter_dims Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
+ * are the spatial filter dimensions
+ * @return The function returns the required buffer size (bytes)
+ *
+ */
+int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
+
+/**
* @brief Basic Q7 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimension
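
A minimal call sequence for the new API, mirroring the unit tests added
later in this patch (a sketch; the caller is assumed to have populated all
parameter structs, and cmsis_nn_context's buf/size fields come from
arm_nn_types.h):

    #include <stdlib.h>
    #include "arm_nnfunctions.h"

    /* Sketch: allocate the scratch buffer reported by the new
       get_buffer_size function, run the kernel, release the buffer. */
    arm_status run_convolve_fast_s16(const cmsis_nn_conv_params *conv_params,
                                     const cmsis_nn_per_channel_quant_params *quant_params,
                                     const cmsis_nn_dims *input_dims, const q15_t *input_data,
                                     const cmsis_nn_dims *filter_dims, const q7_t *filter_data,
                                     const cmsis_nn_dims *bias_dims, const int64_t *bias_data,
                                     const cmsis_nn_dims *output_dims, q15_t *output_data)
    {
        cmsis_nn_context ctx;
        ctx.size = arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims);
        ctx.buf = malloc(ctx.size);

        arm_status status = arm_convolve_fast_s16(&ctx, conv_params, quant_params,
                                                  input_dims, input_data,
                                                  filter_dims, filter_data,
                                                  bias_dims, bias_data,
                                                  output_dims, output_data);
        free(ctx.buf);
        return status;
    }
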
diff --git a/CMSIS/NN/Include/arm_nnsupportfunctions.h b/CMSIS/NN/Include/arm_nnsupportfunctions.h
index ee59d14..b9db880 100644
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
- * $Date: 20. July 2021
- * $Revision: V.5.7.0
+ * $Date: 12 August 2021
+ * $Revision: V.5.8.0
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
@@ -244,7 +244,37 @@
const uint16_t row_len,
const int32_t *const bias,
q7_t *out);
-
+/**
+ * @brief Matrix-multiplication function for 16-bit convolution with per-channel requantization.
+ * @param[in] input_a pointer to operand A
+ * @param[in] input_b pointer to operand B, always consisting of 2 vectors.
+ * @param[in] output_ch number of rows of A
+ * @param[in] out_shift pointer to per output channel requantization shift parameter.
+ * @param[in] out_mult pointer to per output channel requantization multiplier parameter.
+ * @param[in] activation_min minimum value to clamp the output to. Range : int16
+ * @param[in] activation_max maximum value to clamp the output to. Range : int16
+ * @param[in] num_col_a number of columns of A
+ * @param[in] output_bias per output channel bias. Range : int64
+ * @param[in,out] out_0 pointer to output
+ * @return The function returns one of the two
+ * 1. The incremented output pointer for a successful operation or
+ * 2. NULL if implementation is not available.
+ *
+ * @details This function performs the matrix multiplication of the weight matrix for all output channels
+ * with 2 columns from the im2col buffer and produces two elements per output channel. The outputs are
+ * clamped in the range provided by activation min and max.
+ * Supported framework: TensorFlow Lite micro.
+ */
+q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a,
+ const q15_t *input_b,
+ const int32_t output_ch,
+ const int32_t *out_shift,
+ const int32_t *out_mult,
+ const int16_t activation_min,
+ const int16_t activation_max,
+ const int32_t num_col_a,
+ const int64_t *const output_bias,
+ q15_t *out_0);
/**
* @brief General Matrix-multiplication without requantization for one row & one column
* @param[in] row_elements number of row elements
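
The two-column contract of arm_nn_mat_mult_kernel_s16 can be summarized as
follows (a sketch derived from the documentation above, not library code):

    /* Data layout assumed by arm_nn_mat_mult_kernel_s16:
     *
     *   input_a : output_ch rows of num_col_a q7 weights (row-major)
     *   input_b : two im2col columns of num_col_a q15 values, back to back
     *   out_0   : receives output_ch results for the first column
     *   out_0 + output_ch : receives output_ch results for the second column
     *
     * On success the function returns out_0 advanced past both result
     * columns, i.e. out_0 + 2 * output_ch. */
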
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt b/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt
index 2fa6c28..295a61e 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt
+++ b/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt
@@ -20,7 +20,7 @@
file(GLOB SRC "./*_s8*.c")
add_library(CMSISNNConvolutions STATIC ${SRC})
-target_sources(CMSISNNConvolutions PUBLIC arm_convolve_s16.c arm_convolve_wrapper_s16.c)
+target_sources(CMSISNNConvolutions PUBLIC arm_convolve_s16.c arm_convolve_wrapper_s16.c arm_convolve_fast_s16.c)
### Includes
target_include_directories(CMSISNNConvolutions PUBLIC "${NN}/Include")
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c
new file mode 100644
index 0000000..54c9ae2
--- /dev/null
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_convolve_fast_s16.c
+ * Description: Optimized s16 version of convolution.
+ *
+ * $Date: 12 August 2021
+ * $Revision: V.1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+/*
+ * Optimized s16 convolution function.
+ *
+ * Refer to the header file for details. The optimal use case for the DSP implementation is when input and output
+ * channels are multiples of 4 or at least greater than 4.
+ *
+ */
+
+arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
+ const cmsis_nn_conv_params *conv_params,
+ const cmsis_nn_per_channel_quant_params *quant_params,
+ const cmsis_nn_dims *input_dims,
+ const q15_t *input_data,
+ const cmsis_nn_dims *filter_dims,
+ const q7_t *filter_data,
+ const cmsis_nn_dims *bias_dims,
+ const int64_t *bias_data,
+ const cmsis_nn_dims *output_dims,
+ q15_t *output_data)
+{
+ (void)bias_dims;
+ if (filter_dims->w * filter_dims->h * input_dims->c >= 512)
+ {
+ return ARM_MATH_SIZE_MISMATCH;
+ }
+ q15_t *buffer_a = (q15_t *)ctx->buf;
+
+ const int32_t input_batches = input_dims->n;
+ const int32_t input_x = input_dims->w;
+ const int32_t input_y = input_dims->h;
+ const int32_t input_ch = input_dims->c;
+ const int32_t kernel_x = filter_dims->w;
+ const int32_t kernel_y = filter_dims->h;
+ const int32_t output_x = output_dims->w;
+ const int32_t output_y = output_dims->h;
+ const int32_t output_ch = output_dims->c;
+
+ const int32_t pad_x = conv_params->padding.w;
+ const int32_t pad_y = conv_params->padding.h;
+ const int32_t stride_x = conv_params->stride.w;
+ const int32_t stride_y = conv_params->stride.h;
+
+ const int16_t out_activation_min = conv_params->activation.min;
+ const int16_t out_activation_max = conv_params->activation.max;
+ int32_t *output_mult = quant_params->multiplier;
+ int32_t *output_shift = quant_params->shift;
+
+ for (int i_batch = 0; i_batch < input_batches; i_batch++)
+ {
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+ /* Generate two columns from the input tensor for a GEMM computation */
+ q15_t *two_column_buf = buffer_a;
+ q15_t *out = output_data;
+ /* This part implements the im2col function */
+ for (int32_t i_out_y = 0; i_out_y < output_y; i_out_y++)
+ {
+ for (int32_t i_out_x = 0; i_out_x < output_x; i_out_x++)
+ {
+ for (int32_t i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y;
+ i_ker_y++)
+ {
+ for (int32_t i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x;
+ i_ker_x++)
+ {
+ if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
+ {
+ /* Fill with zeros for out-of-bounds padding */
+ arm_memset_q7((q7_t *)two_column_buf, 0, sizeof(q15_t) * input_ch);
+ }
+ else
+ {
+ arm_memcpy_q7((q7_t *)two_column_buf,
+ (const q7_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch),
+ input_ch * sizeof(q15_t));
+ }
+ two_column_buf += input_ch;
+ }
+ }
+ /* Computation is performed for every 2 columns */
+ if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
+ {
+ out = arm_nn_mat_mult_kernel_s16(filter_data,
+ buffer_a,
+ output_ch,
+ output_shift,
+ output_mult,
+ out_activation_min,
+ out_activation_max,
+ (input_ch * kernel_y * kernel_x),
+ bias_data,
+ out);
+
+ /* Counter reset */
+ two_column_buf = buffer_a;
+ }
+ }
+ }
+
+ /* Handle the left-over column when there is an odd number of output pixels */
+ if (two_column_buf != buffer_a)
+ {
+ const q7_t *ker_a = filter_data;
+ int i;
+
+ for (i = 0; i < output_ch; i++)
+ {
+ /* Init the accumulator */
+ q31_t sum = 0;
+
+ /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
+ const q15_t *ip_as_col = buffer_a;
+
+ /* 4 multiply and accumulates are done in one loop. */
+ uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
+
+ while (col_count)
+ {
+ q31_t ker_a1, ker_a2;
+ q31_t ip_b1, ip_b2;
+
+ ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
+
+ ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
+ sum = __SMLAD(ker_a1, ip_b1, sum);
+ ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
+ sum = __SMLAD(ker_a2, ip_b2, sum);
+
+ col_count--;
+ }
+ /* Handle left-over MACs */
+ col_count = input_ch * kernel_y * kernel_x & 0x3;
+ while (col_count)
+ {
+ q7_t ker_a1 = *ker_a++;
+ q15_t ip_b1 = *ip_as_col++;
+ sum += ker_a1 * ip_b1;
+ col_count--;
+ }
+ if (bias_data)
+ {
+ q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]);
+ q63_t acc_64 = sum + bias_data[i];
+ sum = arm_nn_requantize_s64(acc_64, reduced_multiplier, output_shift[i]);
+ }
+ else
+ {
+ sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
+ }
+ sum = MAX(sum, out_activation_min);
+ sum = MIN(sum, out_activation_max);
+ *out++ = (q15_t)sum;
+ }
+ }
+#else
+ (void)bias_data;
+ (void)filter_data;
+ (void)buffer_a;
+ (void)kernel_x;
+ (void)kernel_y;
+ (void)pad_x;
+ (void)pad_y;
+ (void)stride_x;
+ (void)stride_y;
+ (void)out_activation_min;
+ (void)out_activation_max;
+ (void)output_mult;
+ (void)output_shift;
+ return ARM_MATH_ARGUMENT_ERROR;
+#endif
+ /* Advance to the next batch */
+ input_data += (input_x * input_y * input_ch);
+ output_data += (output_x * output_y * output_ch);
+ }
+
+ /* Return to application */
+ return ARM_MATH_SUCCESS;
+}
+
+int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
+{
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+ return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t);
+#else
+ (void)input_dims;
+ (void)filter_dims;
+ return 0;
+#endif
+}
+
+/**
+ * @} end of NNConv group
+ */
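
As a worked example of the kernel-volume check and buffer-size formula above
(dimensions are hypothetical):

    /* Worked example, hypothetical dimensions: 3x3 kernel, 16 input channels.
     * Kernel volume: 3 * 3 * 16 = 144          -> below 512, fast path eligible.
     * Scratch size : 2 * 144 * sizeof(int16_t) = 576 bytes
     * (two im2col columns of kernel-volume q15 entries each). */
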
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c
index 33c5d43..73902f7 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c
@@ -22,8 +22,8 @@
* Description: s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in
* cmsis-nn to perform the convolution.
*
- * $Date: 14. June 2021
- * $Revision: V.1.0.0
+ * $Date: 11 August 2021
+ * $Revision: V.1.1.0
*
* Target Processor: Cortex-M cores
*
@@ -59,6 +59,36 @@
const cmsis_nn_dims *output_dims,
q15_t *output_data)
{
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+ if (filter_dims->w * filter_dims->h * input_dims->c < 512)
+ {
+ return arm_convolve_fast_s16(ctx,
+ conv_params,
+ quant_params,
+ input_dims,
+ input_data,
+ filter_dims,
+ filter_data,
+ bias_dims,
+ bias_data,
+ output_dims,
+ output_data);
+ }
+ else
+ {
+ return arm_convolve_s16(ctx,
+ conv_params,
+ quant_params,
+ input_dims,
+ input_data,
+ filter_dims,
+ filter_data,
+ bias_dims,
+ bias_data,
+ output_dims,
+ output_data);
+ }
+#else
return arm_convolve_s16(ctx,
conv_params,
quant_params,
@@ -70,6 +100,7 @@
bias_data,
output_dims,
output_data);
+#endif
}
int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
@@ -80,7 +111,14 @@
(void)conv_params;
(void)output_dims;
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+ if (filter_dims->w * filter_dims->h * input_dims->c < 512)
+ return arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims);
+
return arm_convolve_s16_get_buffer_size(input_dims, filter_dims);
+#else
+ return arm_convolve_s16_get_buffer_size(input_dims, filter_dims);
+#endif
}
/**
diff --git a/CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt b/CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt
index 0a5c811..255bce8 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt
+++ b/CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt
@@ -20,7 +20,7 @@
file(GLOB SRC "./*_s8.c")
add_library(CMSISNNSupport STATIC ${SRC})
-target_sources(CMSISNNSupport PUBLIC arm_q7_to_q15_with_offset.c)
+target_sources(CMSISNNSupport PUBLIC arm_q7_to_q15_with_offset.c arm_nn_mat_mul_kernel_s16.c)
### Includes
target_include_directories(CMSISNNSupport PUBLIC "${NN}/Include")
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c
new file mode 100644
index 0000000..41d0bc9
--- /dev/null
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_nn_mat_mul_kernel_s16.c
+ * Description: Matrix-multiplication function for convolution
+ *
+ * $Date: 12 August 2021
+ * $Revision: V.1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/*
+ * Matrix-multiplication function for convolution with per-channel requantization.
+ *
+ * Refer to the header file for details.
+ *
+ */
+
+q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a,
+ const q15_t *input_b,
+ const int32_t output_ch,
+ const int32_t *out_shift,
+ const int32_t *out_mult,
+ const int16_t activation_min,
+ const int16_t activation_max,
+ const int32_t num_col_a,
+ const int64_t *const output_bias,
+ q15_t *out_0)
+{
+
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+ /* set up the second output pointers */
+ q15_t *out_1 = out_0 + output_ch;
+ const int64_t *bias = output_bias;
+ uint16_t row_count = output_ch / 2;
+ const q7_t *ip_a0 = input_a;
+
+ /* this loop iterates over the rows of A */
+ while (row_count)
+ {
+ /* setup pointers for B */
+ const q15_t *ip_b0 = input_b;
+ const q15_t *ip_b1 = ip_b0 + num_col_a;
+
+ /* align the second pointer for A */
+ const q7_t *ip_a1 = ip_a0 + num_col_a;
+
+ /* Init accumulator for channel N and N + 1 */
+ q31_t ch_0_out_0 = 0;
+ q31_t ch_0_out_1 = 0;
+ q31_t ch_1_out_0 = 0;
+ q31_t ch_1_out_1 = 0;
+
+ uint16_t col_count = num_col_a / 4;
+ /* accumulate over the vector */
+ while (col_count)
+ {
+ q31_t a01, a02, a11, a12;
+ q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
+ q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
+
+ ip_a0 = read_and_pad(ip_a0, &a01, &a02);
+ ip_a1 = read_and_pad(ip_a1, &a11, &a12);
+
+ ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
+ ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
+ ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0);
+ ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1);
+
+ b0 = arm_nn_read_q15x2_ia(&ip_b0);
+ b1 = arm_nn_read_q15x2_ia(&ip_b1);
+
+ ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
+ ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
+ ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0);
+ ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1);
+
+ col_count--;
+ } /* while over col_count */
+ col_count = num_col_a & 0x3;
+ while (col_count)
+ {
+ q7_t a0 = *ip_a0++;
+ q15_t b0 = *ip_b0++;
+ q7_t a1 = *ip_a1++;
+ q15_t b1 = *ip_b1++;
+
+ ch_0_out_0 += a0 * b0;
+ ch_0_out_1 += a0 * b1;
+ ch_1_out_0 += a1 * b0;
+ ch_1_out_1 += a1 * b1;
+ col_count--;
+ } /* while over col_count */
+ if (bias)
+ {
+ q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult);
+ q63_t acc_64 = ch_0_out_0 + *bias;
+ ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+ acc_64 = ch_0_out_1 + *bias++;
+ ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+ out_mult++;
+ }
+ else
+ {
+ ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
+ ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
+ out_mult++;
+ }
+ ch_0_out_0 = MAX(ch_0_out_0, activation_min);
+ ch_0_out_0 = MIN(ch_0_out_0, activation_max);
+ *out_0++ = (q15_t)ch_0_out_0;
+
+ ch_0_out_1 = MAX(ch_0_out_1, activation_min);
+ ch_0_out_1 = MIN(ch_0_out_1, activation_max);
+ *out_1++ = (q15_t)ch_0_out_1;
+ out_shift++;
+
+ if (bias)
+ {
+ q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult);
+ q63_t acc_64 = ch_1_out_0 + *bias;
+ ch_1_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+ acc_64 = ch_1_out_1 + *bias++;
+ ch_1_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+ out_mult++;
+ }
+ else
+ {
+ ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift);
+ ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift);
+ out_mult++;
+ }
+ ch_1_out_0 = MAX(ch_1_out_0, activation_min);
+ ch_1_out_0 = MIN(ch_1_out_0, activation_max);
+ *out_0++ = (q15_t)ch_1_out_0;
+
+ ch_1_out_1 = MAX(ch_1_out_1, activation_min);
+ ch_1_out_1 = MIN(ch_1_out_1, activation_max);
+ *out_1++ = (q15_t)ch_1_out_1;
+ out_shift++;
+
+ /* skip the second row, which was already consumed via ip_a1 */
+ ip_a0 += num_col_a;
+ row_count--;
+ }
+
+ /* compute the last row if the number of output channels is odd */
+ if (output_ch & 0x1)
+ {
+ /* setup pointers for B */
+ const q15_t *ip_b0 = input_b;
+ const q15_t *ip_b1 = ip_b0 + num_col_a;
+
+ q31_t ch_0_out_0 = 0;
+ q31_t ch_0_out_1 = 0;
+
+ uint16_t col_count = num_col_a >> 2;
+ while (col_count)
+ {
+ q31_t a01, a02;
+ q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
+ q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
+
+ ip_a0 = read_and_pad(ip_a0, &a01, &a02);
+
+ ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
+ ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
+
+ b0 = arm_nn_read_q15x2_ia(&ip_b0);
+ b1 = arm_nn_read_q15x2_ia(&ip_b1);
+ ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
+ ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
+
+ col_count--;
+ }
+ col_count = num_col_a & 0x3;
+ while (col_count)
+ {
+ q7_t a0 = *ip_a0++;
+ q15_t b0 = *ip_b0++;
+ q15_t b1 = *ip_b1++;
+
+ ch_0_out_0 += a0 * b0;
+ ch_0_out_1 += a0 * b1;
+ col_count--;
+ }
+ if (bias)
+ {
+ q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult);
+ q63_t acc_64 = ch_0_out_0 + *bias;
+ ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+ acc_64 = ch_0_out_1 + *bias++;
+ ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+ }
+ else
+ {
+ ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
+ ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
+ }
+ ch_0_out_0 = MAX(ch_0_out_0, activation_min);
+ ch_0_out_0 = MIN(ch_0_out_0, activation_max);
+ *out_0++ = (q15_t)ch_0_out_0;
+
+ ch_0_out_1 = MAX(ch_0_out_1, activation_min);
+ ch_0_out_1 = MIN(ch_0_out_1, activation_max);
+ *out_1++ = (q15_t)ch_0_out_1;
+ out_mult++;
+ out_shift++;
+ }
+
+ out_0 += output_ch;
+
+ /* return the new output pointer with offset */
+ return out_0;
+#else
+ (void)input_a;
+ (void)input_b;
+ (void)output_ch;
+ (void)out_shift;
+ (void)out_mult;
+ (void)activation_min;
+ (void)activation_max;
+ (void)num_col_a;
+ (void)output_bias;
+ (void)out_0;
+ /* Return NULL to indicate that no implementation is available */
+ return NULL;
+#endif
+}
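
For intuition, a scalar reference of what this kernel computes for a single
output element (illustrative only; 'requantize' is a placeholder for the
per-channel arm_nn_requantize / arm_nn_requantize_s64 paths used above and
is not a real CMSIS-NN symbol):

    #include <stdint.h>

    /* Scalar reference for one output element; not library code. */
    static int16_t conv_out_ref(const int8_t *weights_row, /* num_col_a q7 weights */
                                const int16_t *im2col_col, /* num_col_a q15 inputs */
                                int32_t num_col_a,
                                int64_t bias,
                                int16_t act_min,
                                int16_t act_max,
                                int64_t (*requantize)(int64_t)) /* placeholder */
    {
        int64_t acc = bias;
        for (int32_t i = 0; i < num_col_a; i++)
        {
            acc += (int32_t)weights_row[i] * im2col_col[i]; /* q7 * q15 MAC */
        }
        int64_t out = requantize(acc);
        if (out < act_min) { out = act_min; } /* clamp to activation range */
        if (out > act_max) { out = act_max; }
        return (int16_t)out;
    }
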
diff --git a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
index ffd8f8f..b9afb8b 100644
--- a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
+++ b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
@@ -70,6 +70,7 @@
add_subdirectory(TestCases/test_arm_convolve_1x1_s8_fast)
add_subdirectory(TestCases/test_arm_convolve_s8)
add_subdirectory(TestCases/test_arm_convolve_s16)
+add_subdirectory(TestCases/test_arm_convolve_fast_s16)
add_subdirectory(TestCases/test_arm_depthwise_conv_3x3_s8)
add_subdirectory(TestCases/test_arm_depthwise_conv_s8)
add_subdirectory(TestCases/test_arm_depthwise_conv_s8_opt)
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/CMakeLists.txt
new file mode 100644
index 0000000..1f1a309
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/CMakeLists.txt
@@ -0,0 +1,23 @@
+#
+# Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+add_cmsis_nn_unit_test_executable(test_arm_convolve_fast_s16)
+
+target_sources(test_arm_convolve_fast_s16 PRIVATE
+ Unity/unity_test_arm_convolve_fast_s16.c
+ Unity/TestRunner/unity_test_arm_convolve_fast_s16_runner.c)
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/Unity/unity_test_arm_convolve_fast_s16.c b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/Unity/unity_test_arm_convolve_fast_s16.c
new file mode 100644
index 0000000..f510215
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/Unity/unity_test_arm_convolve_fast_s16.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../test_arm_convolve_fast_s16.c"
+#include "unity.h"
+
+#ifdef USING_FVP_CORSTONE_300
+extern void uart_init(void);
+#endif
+
+/* This function is called from the autogenerated file.
+ * The name must match exactly.
+ */
+void setUp(void)
+{ /* This is run before EACH TEST */
+#ifdef USING_FVP_CORSTONE_300
+ uart_init();
+#endif
+}
+
+/* This function is called from the autogenerated file.
+ * The name must match exactly.
+ */
+void tearDown(void) {}
+
+void test_int16xint8_arm_convolve_fast_s16(void) { int16xint8_arm_convolve_fast_s16(); }
+void test_requantize_s64_arm_convolve_fast_s16(void) { requantize_s64_arm_convolve_fast_s16(); }
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/test_arm_convolve_fast_s16.c b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/test_arm_convolve_fast_s16.c
new file mode 100644
index 0000000..77d6283
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/test_arm_convolve_fast_s16.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h>
+
+#include <arm_nnfunctions.h>
+#include <unity.h>
+
+#include "../TestData/int16xint8/test_data.h"
+#include "../TestData/requantize_s64/test_data.h"
+#include "../Utils/validate.h"
+
+void int16xint8_arm_convolve_fast_s16(void)
+{
+ q15_t output[INT16XINT8_DST_SIZE] = {0};
+
+ cmsis_nn_context ctx;
+ cmsis_nn_conv_params conv_params;
+ cmsis_nn_per_channel_quant_params quant_params;
+ cmsis_nn_dims input_dims;
+ cmsis_nn_dims filter_dims;
+ cmsis_nn_dims bias_dims;
+ cmsis_nn_dims output_dims;
+
+ const q63_t *bias_data = int16xint8_biases;
+ const q7_t *kernel_data = int16xint8_weights;
+ const q15_t *input_data = int16xint8_input;
+ const q15_t *output_ref = int16xint8_output_ref;
+ const int32_t output_ref_size = INT16XINT8_DST_SIZE;
+
+ input_dims.n = INT16XINT8_INPUT_BATCHES;
+ input_dims.w = INT16XINT8_INPUT_W;
+ input_dims.h = INT16XINT8_INPUT_H;
+ input_dims.c = INT16XINT8_IN_CH;
+ filter_dims.w = INT16XINT8_FILTER_X;
+ filter_dims.h = INT16XINT8_FILTER_Y;
+ output_dims.w = INT16XINT8_OUTPUT_W;
+ output_dims.h = INT16XINT8_OUTPUT_H;
+ output_dims.c = INT16XINT8_OUT_CH;
+
+ conv_params.padding.w = INT16XINT8_PAD_X;
+ conv_params.padding.h = INT16XINT8_PAD_Y;
+ conv_params.stride.w = INT16XINT8_STRIDE_X;
+ conv_params.stride.h = INT16XINT8_STRIDE_Y;
+
+ conv_params.input_offset = 0;
+ conv_params.output_offset = 0;
+ conv_params.activation.min = INT16XINT8_OUT_ACTIVATION_MIN;
+ conv_params.activation.max = INT16XINT8_OUT_ACTIVATION_MAX;
+ quant_params.multiplier = (int32_t *)int16xint8_output_mult;
+ quant_params.shift = (int32_t *)int16xint8_output_shift;
+
+ int buf_size = arm_convolve_wrapper_s16_get_buffer_size(&conv_params, &input_dims, &filter_dims, &output_dims);
+ ctx.buf = malloc(buf_size);
+
+ arm_status result = arm_convolve_wrapper_s16(&ctx,
+ &conv_params,
+ &quant_params,
+ &input_dims,
+ input_data,
+ &filter_dims,
+ kernel_data,
+ &bias_dims,
+ bias_data,
+ &output_dims,
+ output);
+ free(ctx.buf);
+
+ TEST_ASSERT_EQUAL(ARM_MATH_SUCCESS, result);
+ TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+
+ buf_size = arm_convolve_fast_s16_get_buffer_size(&input_dims, &filter_dims);
+ ctx.buf = malloc(buf_size);
+
+ result = arm_convolve_fast_s16(&ctx,
+ &conv_params,
+ &quant_params,
+ &input_dims,
+ input_data,
+ &filter_dims,
+ kernel_data,
+ &bias_dims,
+ bias_data,
+ &output_dims,
+ output);
+ free(ctx.buf);
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+ TEST_ASSERT_EQUAL(ARM_MATH_SUCCESS, result);
+ TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+#else
+ TEST_ASSERT_EQUAL(ARM_MATH_ARGUMENT_ERROR, result);
+#endif
+}
+
+void requantize_s64_arm_convolve_fast_s16(void)
+{
+ q15_t output[REQUANTIZE_S64_DST_SIZE] = {0};
+
+ cmsis_nn_context ctx;
+ cmsis_nn_conv_params conv_params;
+ cmsis_nn_per_channel_quant_params quant_params;
+ cmsis_nn_dims input_dims;
+ cmsis_nn_dims filter_dims;
+ cmsis_nn_dims bias_dims;
+ cmsis_nn_dims output_dims;
+
+ const q63_t *bias_data = requantize_s64_biases;
+ const q7_t *kernel_data = requantize_s64_weights;
+ const q15_t *input_data = requantize_s64_input;
+ const q15_t *output_ref = requantize_s64_output_ref;
+ const int32_t output_ref_size = REQUANTIZE_S64_DST_SIZE;
+
+ input_dims.n = REQUANTIZE_S64_INPUT_BATCHES;
+ input_dims.w = REQUANTIZE_S64_INPUT_W;
+ input_dims.h = REQUANTIZE_S64_INPUT_H;
+ input_dims.c = REQUANTIZE_S64_IN_CH;
+ filter_dims.w = REQUANTIZE_S64_FILTER_X;
+ filter_dims.h = REQUANTIZE_S64_FILTER_Y;
+ output_dims.w = REQUANTIZE_S64_OUTPUT_W;
+ output_dims.h = REQUANTIZE_S64_OUTPUT_H;
+ output_dims.c = REQUANTIZE_S64_OUT_CH;
+
+ conv_params.padding.w = REQUANTIZE_S64_PAD_X;
+ conv_params.padding.h = REQUANTIZE_S64_PAD_Y;
+ conv_params.stride.w = REQUANTIZE_S64_STRIDE_X;
+ conv_params.stride.h = REQUANTIZE_S64_STRIDE_Y;
+
+ conv_params.input_offset = REQUANTIZE_S64_INPUT_OFFSET;
+ conv_params.output_offset = REQUANTIZE_S64_OUTPUT_OFFSET;
+ conv_params.activation.min = REQUANTIZE_S64_OUT_ACTIVATION_MIN;
+ conv_params.activation.max = REQUANTIZE_S64_OUT_ACTIVATION_MAX;
+ quant_params.multiplier = (int32_t *)requantize_s64_output_mult;
+ quant_params.shift = (int32_t *)requantize_s64_output_shift;
+
+ int buf_size = arm_convolve_wrapper_s16_get_buffer_size(&conv_params, &input_dims, &filter_dims, &output_dims);
+ ctx.buf = malloc(buf_size);
+
+ arm_status result = arm_convolve_wrapper_s16(&ctx,
+ &conv_params,
+ &quant_params,
+ &input_dims,
+ input_data,
+ &filter_dims,
+ kernel_data,
+ &bias_dims,
+ bias_data,
+ &output_dims,
+ output);
+
+ free(ctx.buf);
+ TEST_ASSERT_EQUAL(ARM_MATH_SUCCESS, result);
+ TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+
+ buf_size = arm_convolve_fast_s16_get_buffer_size(&input_dims, &filter_dims);
+ ctx.buf = malloc(buf_size);
+
+ result = arm_convolve_fast_s16(&ctx,
+ &conv_params,
+ &quant_params,
+ &input_dims,
+ input_data,
+ &filter_dims,
+ kernel_data,
+ &bias_dims,
+ bias_data,
+ &output_dims,
+ output);
+ free(ctx.buf);
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+ TEST_ASSERT_EQUAL(ARM_MATH_SUCCESS, result);
+ TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+#else
+ TEST_ASSERT_EQUAL(ARM_MATH_ARGUMENT_ERROR, result);
+#endif
+}
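
For context, a minimal hand-written Unity runner for these two tests would
look as follows (a sketch; the patch instead ships an autogenerated runner
under Unity/TestRunner/):

    #include "unity.h"

    extern void test_int16xint8_arm_convolve_fast_s16(void);
    extern void test_requantize_s64_arm_convolve_fast_s16(void);

    /* Minimal Unity runner sketch; RUN_TEST invokes setUp/tearDown
       around each test case. */
    int main(void)
    {
        UNITY_BEGIN();
        RUN_TEST(test_int16xint8_arm_convolve_fast_s16);
        RUN_TEST(test_requantize_s64_arm_convolve_fast_s16);
        return UNITY_END();
    }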