CMSIS-NN: Optimized convolution for 16x8 data type

Authors: Sebastian-Larsson, oscarandersson8218 (GitHub)

1. An optimized convolution function is added for int16 input data
and int8 filter data. The main constraint is that the kernel volume
(filter width * filter height * input channels) must be less than 512.

2. Unit tests are added as well.
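
3. For reference, an illustrative call pattern (a sketch only, not part
of this change; the helper name run_conv and all shapes below are
hypothetical, chosen so that the kernel volume 3 * 3 * 8 = 72 stays
below 512):

    #include <stdlib.h>
    #include "arm_nnfunctions.h"

    arm_status run_conv(const q15_t *input, const q7_t *weights,
                        const int64_t *bias, int32_t *mult, int32_t *shift,
                        q15_t *output)
    {
        cmsis_nn_dims input_dims = {.n = 1, .h = 16, .w = 16, .c = 8};
        cmsis_nn_dims filter_dims = {.n = 16, .h = 3, .w = 3, .c = 8};
        cmsis_nn_dims bias_dims = {.c = 16};
        cmsis_nn_dims output_dims = {.n = 1, .h = 14, .w = 14, .c = 16};

        cmsis_nn_conv_params conv_params = {0};
        conv_params.stride.w = 1;
        conv_params.stride.h = 1;
        conv_params.dilation.w = 1;
        conv_params.dilation.h = 1;
        conv_params.activation.min = -32768; /* full int16 range */
        conv_params.activation.max = 32767;

        /* One multiplier/shift entry per output channel */
        cmsis_nn_per_channel_quant_params quant_params = {mult, shift};

        cmsis_nn_context ctx;
        ctx.size = arm_convolve_fast_s16_get_buffer_size(&input_dims, &filter_dims);
        ctx.buf = malloc(ctx.size);

        arm_status status = arm_convolve_fast_s16(&ctx, &conv_params, &quant_params,
                                                  &input_dims, input, &filter_dims,
                                                  weights, &bias_dims, bias,
                                                  &output_dims, output);
        free(ctx.buf);
        return status;
    }

On targets without the DSP extension the call returns
ARM_MATH_ARGUMENT_ERROR; arm_convolve_wrapper_s16 handles the fallback
to arm_convolve_s16 automatically.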

Change-Id: I9f0f3067a275c8bf5f9263ebe76cd9a1033c7793
diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index 137f663..eef3f2e 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -11,8 +11,9 @@
     <release version="5.8.1">
       Active development ...
       CMSIS-DSP: 1.10.0 (see revision history for details)
-      CMSIS-NN: 3.0.1 (see revision history for details)
-       - Support for int16
+      CMSIS-NN: 3.1.0 (see revision history for details)
+       - Support for int16 convolution
+       - Support for DSP extension optimization for int16 convolution
     </release>
     <release version="5.8.0" date="2021-06-24">
       CMSIS-Core(M): 5.5.0 (see revision history for details)
@@ -2815,6 +2816,7 @@
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_basic.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c"/>
+        <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q15_fast_nonsquare.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s16.c"/>
@@ -2853,6 +2855,7 @@
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c"/>
+        <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_core_4x_s8.c"/>
diff --git a/CMSIS/DoxyGen/NN/src/history.txt b/CMSIS/DoxyGen/NN/src/history.txt
index 45d04f3..a5a0f5d 100644
--- a/CMSIS/DoxyGen/NN/src/history.txt
+++ b/CMSIS/DoxyGen/NN/src/history.txt
@@ -7,6 +7,17 @@
     <th>Description</th>
   </tr>
   <tr>
+    <td>V3.1.0</td>
+    <td>
+    <ul>
+      <li> Added arm_convolve_fast_s16 </li>
+      <li> Added arm_nn_mat_mult_kernel_s16 DSP implementation </li>
+      <li> Added conv int16 DSP implementation </li>
+      <li> Added unit tests for int16 DSP conv kernel </li>
+    </ul>
+    </td>
+  </tr>
+  <tr>
     <td>V3.0.1</td>
     <td>
     <ul>
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 3387ce2..4f316f9 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
  * Title:        arm_nnfunctions.h
  * Description:  Public header file for CMSIS NN Library
  *
- * $Date:        14 June 2021
- * $Revision:    V.7.1.0
+ * $Date:        11 August 2021
+ * $Revision:    V.7.2.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -368,6 +368,47 @@
                             const int64_t *bias_data,
                             const cmsis_nn_dims *output_dims,
                             q15_t *output_data);
+/**
+ * @brief Optimized s16 convolution function
+ * @param[in, out] ctx            Function context that contains the additional buffer if required by the function.
+ *                                arm_convolve_fast_s16_get_buffer_size will return the buffer size if required
+ * @param[in]      conv_params    Convolution parameters (e.g. strides, dilations, pads,...).
+ *                                conv_params->input_offset  : Not used
+ *                                conv_params->output_offset : Not used
+ * @param[in]      quant_params   Per-channel quantization info.
+ *                                It contains the multiplier and shift values to be applied to each output channel
+ * @param[in]      input_dims     Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in]      input_data     Input (activation) data pointer. Data type: int16
+ * @param[in]      filter_dims    Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
+ *                                spatial filter dimensions. (filter_dims->w * filter_dims->h * input_dims->c) must
+ *                                be less than 512
+ * @param[in]      filter_data    Filter data pointer. Data type: int8
+ * @param[in]      bias_dims      Bias tensor dimensions. Format: [C_OUT]
+ * @param[in]      bias_data      Optional bias data pointer. Data type: int64
+ * @param[in]      output_dims    Output tensor dimensions. Format: [N, H, W, C_OUT]
+ * @param[out]     output_data    Output data pointer. Data type: int16
+ *
+ * @return     The function returns <code>ARM_MATH_SUCCESS</code> on success,
+ *             <code>ARM_MATH_SIZE_MISMATCH</code> if the kernel volume is not less than 512, or
+ *             <code>ARM_MATH_ARGUMENT_ERROR</code> if no optimized implementation is available
+ *
+ * @details
+ *    1. Supported framework: TensorFlow Lite micro
+ *    2. q7/q15 is used as data type even though it is s8/s16 data. This is done to be consistent with existing APIs.
+ *    3. Additional memory is required for optimization. Refer to argument 'ctx' for details.
+ *    4. Implementation supports kernel volumes (filter width * filter height * input channels) < 512.
+ *
+ */
+
+arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
+                                 const cmsis_nn_conv_params *conv_params,
+                                 const cmsis_nn_per_channel_quant_params *quant_params,
+                                 const cmsis_nn_dims *input_dims,
+                                 const q15_t *input_data,
+                                 const cmsis_nn_dims *filter_dims,
+                                 const q7_t *filter_data,
+                                 const cmsis_nn_dims *bias_dims,
+                                 const int64_t *bias_data,
+                                 const cmsis_nn_dims *output_dims,
+                                 q15_t *output_data);
 
 /**
  * @brief Get the required buffer size for s16 convolution function
@@ -381,6 +422,17 @@
 int32_t arm_convolve_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
 
 /**
+ * @brief Get the required buffer size for fast s16 convolution function
+ *
+ * @param[in]       input_dims            Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
+ * @param[in]       filter_dims           Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
+ * are the spatial filter dimensions
+ * @return          The function returns the required buffer size in bytes
+ *
+ */
+int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);
+
+/**
  * @brief Basic Q7 convolution function
  * @param[in]       Im_in       pointer to input tensor
  * @param[in]       dim_im_in   input tensor dimension
diff --git a/CMSIS/NN/Include/arm_nnsupportfunctions.h b/CMSIS/NN/Include/arm_nnsupportfunctions.h
index ee59d14..b9db880 100644
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -21,8 +21,8 @@
  * Title:        arm_nnsupportfunctions.h
  * Description:  Public header file of support functions for CMSIS NN Library
  *
- * $Date:        20. July 2021
- * $Revision:    V.5.7.0
+ * $Date:        12 August 2021
+ * $Revision:    V.5.8.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -244,7 +244,37 @@
                          const uint16_t row_len,
                          const int32_t *const bias,
                          q7_t *out);
-
+/**
+ * @brief Matrix-multiplication function for convolution with per-channel requantization for 16-bit data.
+ * @param[in]       input_a     pointer to operand A
+ * @param[in]       input_b     pointer to operand B, which always consists of two im2col columns.
+ * @param[in]       output_ch   number of rows of A
+ * @param[in]       out_shift  pointer to per output channel requantization shift parameter.
+ * @param[in]       out_mult   pointer to per output channel requantization multiplier parameter.
+ * @param[in]       activation_min   minimum value to clamp the output to. Range : int16
+ * @param[in]       activation_max   maximum value to clamp the output to. Range : int16
+ * @param[in]       num_col_a   number of columns of A
+ * @param[in]       output_bias per output channel bias. Range : int64
+ * @param[in,out]   out_0       pointer to output
+ * @return     The function returns one of the following:
+ *              1. The incremented output pointer for a successful operation, or
+ *              2. NULL if the implementation is not available.
+ *
+ * @details   This function multiplies the weight matrix for all output channels
+ *            with two columns from the im2col buffer and produces two elements per output channel.
+ *            The outputs are clamped to the range provided by activation_min and activation_max.
+ *            Supported framework: TensorFlow Lite micro.
+ */
+q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a,
+                                  const q15_t *input_b,
+                                  const int32_t output_ch,
+                                  const int32_t *out_shift,
+                                  const int32_t *out_mult,
+                                  const int16_t activation_min,
+                                  const int16_t activation_max,
+                                  const int32_t num_col_a,
+                                  const int64_t *const output_bias,
+                                  q15_t *out_0);
+
 /**
  * @brief General Matrix-multiplication without requantization for one row & one column
  * @param[in]       row_elements  number of row elements
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt b/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt
index 2fa6c28..295a61e 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt
+++ b/CMSIS/NN/Source/ConvolutionFunctions/CMakeLists.txt
@@ -20,7 +20,7 @@
 
 file(GLOB SRC "./*_s8*.c")
 add_library(CMSISNNConvolutions STATIC ${SRC})
-target_sources(CMSISNNConvolutions PUBLIC arm_convolve_s16.c arm_convolve_wrapper_s16.c)
+target_sources(CMSISNNConvolutions PUBLIC arm_convolve_s16.c arm_convolve_wrapper_s16.c arm_convolve_fast_s16.c)
 
 ### Includes
 target_include_directories(CMSISNNConvolutions PUBLIC "${NN}/Include")
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c
new file mode 100644
index 0000000..54c9ae2
--- /dev/null
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_fast_s16.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_convolve_fast_s16.c
+ * Description:  Optimized s16 version of convolution.
+ *
+ * $Date:        12 August 2021
+ * $Revision:    V.1.1.0
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ *  @ingroup groupNN
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+/*
+ * Optimized s16 convolution function.
+ *
+ * Refer to the header file for details. The optimal use case for the DSP implementation is when input and output
+ * channels are multiples of 4, or at least greater than 4.
+ *
+ */
+
+arm_status arm_convolve_fast_s16(const cmsis_nn_context *ctx,
+                                 const cmsis_nn_conv_params *conv_params,
+                                 const cmsis_nn_per_channel_quant_params *quant_params,
+                                 const cmsis_nn_dims *input_dims,
+                                 const q15_t *input_data,
+                                 const cmsis_nn_dims *filter_dims,
+                                 const q7_t *filter_data,
+                                 const cmsis_nn_dims *bias_dims,
+                                 const int64_t *bias_data,
+                                 const cmsis_nn_dims *output_dims,
+                                 q15_t *output_data)
+{
+    (void)bias_dims;
+    if (filter_dims->w * filter_dims->h * input_dims->c >= 512)
+    {
+        return ARM_MATH_SIZE_MISMATCH;
+    }
+    q15_t *buffer_a = (q15_t *)ctx->buf;
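+    /* ctx->buf holds the im2col buffer: room for two columns of kernel-volume q15 elements each */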
+
+    const int32_t input_batches = input_dims->n;
+    const int32_t input_x = input_dims->w;
+    const int32_t input_y = input_dims->h;
+    const int32_t input_ch = input_dims->c;
+    const int32_t kernel_x = filter_dims->w;
+    const int32_t kernel_y = filter_dims->h;
+    const int32_t output_x = output_dims->w;
+    const int32_t output_y = output_dims->h;
+    const int32_t output_ch = output_dims->c;
+
+    const int32_t pad_x = conv_params->padding.w;
+    const int32_t pad_y = conv_params->padding.h;
+    const int32_t stride_x = conv_params->stride.w;
+    const int32_t stride_y = conv_params->stride.h;
+
+    const int16_t out_activation_min = conv_params->activation.min;
+    const int16_t out_activation_max = conv_params->activation.max;
+    int32_t *output_mult = quant_params->multiplier;
+    int32_t *output_shift = quant_params->shift;
+
+    for (int i_batch = 0; i_batch < input_batches; i_batch++)
+    {
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+        /* Generate two columns from the input tensor for a GEMM computation */
+        q15_t *two_column_buf = buffer_a;
+        q15_t *out = output_data;
+        /* This part implements the im2col function */
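+        /* Each output pixel maps to one im2col column of length input_ch * kernel_y * kernel_x;
+         * columns are collected in pairs before calling the matrix-multiplication kernel below */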
+        for (int32_t i_out_y = 0; i_out_y < output_y; i_out_y++)
+        {
+            for (int32_t i_out_x = 0; i_out_x < output_x; i_out_x++)
+            {
+                for (int32_t i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y;
+                     i_ker_y++)
+                {
+                    for (int32_t i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x;
+                         i_ker_x++)
+                    {
+                        if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
+                        {
+                            /* Fill zeros for the out-of-bounds padding region */
+                            arm_memset_q7((q7_t *)two_column_buf, 0, sizeof(q15_t) * input_ch);
+                        }
+                        else
+                        {
+                            arm_memcpy_q7((q7_t *)two_column_buf,
+                                          (const q7_t *)(input_data + (i_ker_y * input_x + i_ker_x) * input_ch),
+                                          input_ch * sizeof(q15_t));
+                        }
+                        two_column_buf += input_ch;
+                    }
+                }
+                /* Computation is performed for every two columns */
+                if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
+                {
+                    out = arm_nn_mat_mult_kernel_s16(filter_data,
+                                                     buffer_a,
+                                                     output_ch,
+                                                     output_shift,
+                                                     output_mult,
+                                                     out_activation_min,
+                                                     out_activation_max,
+                                                     (input_ch * kernel_y * kernel_x),
+                                                     bias_data,
+                                                     out);
+
+                    /* Counter reset */
+                    two_column_buf = buffer_a;
+                }
+            }
+        }
+
+        /* Handle the left-over column due to an odd number of output pixels */
+        if (two_column_buf != buffer_a)
+        {
+            const q7_t *ker_a = filter_data;
+            int i;
+
+            for (i = 0; i < output_ch; i++)
+            {
+                /* Init the accumulator */
+                q31_t sum = 0;
+
+                /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
+                const q15_t *ip_as_col = buffer_a;
+
+                /* 4 multiply and accumulates are done in one loop. */
+                uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
+
+                while (col_count)
+                {
+                    q31_t ker_a1, ker_a2;
+                    q31_t ip_b1, ip_b2;
+
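+                    /* read_and_pad expands four q7 weights into two 32-bit words, each holding
+                     * a pair of sign-extended q15 values for the dual-MAC __SMLAD below */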
+                    ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
+
+                    ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
+                    sum = __SMLAD(ker_a1, ip_b1, sum);
+                    ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
+                    sum = __SMLAD(ker_a2, ip_b2, sum);
+
+                    col_count--;
+                }
+                /* Handle left-over MAC operations */
+                col_count = input_ch * kernel_y * kernel_x & 0x3;
+                while (col_count)
+                {
+                    q7_t ker_a1 = *ker_a++;
+                    q15_t ip_b1 = *ip_as_col++;
+                    sum += ker_a1 * ip_b1;
+                    col_count--;
+                }
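+                /* With an int64 bias, widen the accumulator to 64 bits and requantize with
+                 * arm_nn_requantize_s64; REDUCE_MULTIPLIER narrows the 32-bit multiplier so
+                 * the 64-bit multiply stays in range */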
+                if (bias_data)
+                {
+                    q31_t reduced_multiplier = REDUCE_MULTIPLIER(output_mult[i]);
+                    q63_t acc_64 = sum + bias_data[i];
+                    sum = arm_nn_requantize_s64(acc_64, reduced_multiplier, output_shift[i]);
+                }
+                else
+                {
+                    sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
+                }
+                sum = MAX(sum, out_activation_min);
+                sum = MIN(sum, out_activation_max);
+                *out++ = (q15_t)sum;
+            }
+        }
+#else
+        (void)bias_data;
+        (void)filter_data;
+        (void)buffer_a;
+        (void)kernel_x;
+        (void)kernel_y;
+        (void)pad_x;
+        (void)pad_y;
+        (void)stride_x;
+        (void)stride_y;
+        (void)out_activation_min;
+        (void)out_activation_max;
+        (void)output_mult;
+        (void)output_shift;
+        return ARM_MATH_ARGUMENT_ERROR;
+#endif
+        /* Advance to the next batch */
+        input_data += (input_x * input_y * input_ch);
+        output_data += (output_x * output_y * output_ch);
+    }
+
+    /* Return to application */
+    return ARM_MATH_SUCCESS;
+}
+
+int32_t arm_convolve_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
+{
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
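+    /* Two im2col columns: 2 * (C_IN * KW * KH) int16 elements */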
+    return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t);
+#else
+    (void)input_dims;
+    (void)filter_dims;
+    return 0;
+#endif
+}
+
+/**
+ * @} end of NNConv group
+ */
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c
index 33c5d43..73902f7 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s16.c
@@ -22,8 +22,8 @@
  * Description:  s16 convolution layer wrapper function with the main purpose to call the optimal kernel available in
  * cmsis-nn to perform the convolution.
  *
- * $Date:        14. June 2021
- * $Revision:    V.1.0.0
+ * $Date:        11 August 2021
+ * $Revision:    V.1.1.0
  *
  * Target Processor:  Cortex-M cores
  *
@@ -59,6 +59,36 @@
                                     const cmsis_nn_dims *output_dims,
                                     q15_t *output_data)
 {
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
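+    /* Use the optimized kernel when the kernel volume (KW * KH * C_IN) is within its supported range */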
+    if (filter_dims->w * filter_dims->h * input_dims->c < 512)
+    {
+        return arm_convolve_fast_s16(ctx,
+                                     conv_params,
+                                     quant_params,
+                                     input_dims,
+                                     input_data,
+                                     filter_dims,
+                                     filter_data,
+                                     bias_dims,
+                                     bias_data,
+                                     output_dims,
+                                     output_data);
+    }
+    else
+    {
+        return arm_convolve_s16(ctx,
+                                conv_params,
+                                quant_params,
+                                input_dims,
+                                input_data,
+                                filter_dims,
+                                filter_data,
+                                bias_dims,
+                                bias_data,
+                                output_dims,
+                                output_data);
+    }
+#else
     return arm_convolve_s16(ctx,
                             conv_params,
                             quant_params,
@@ -70,6 +100,7 @@
                             bias_data,
                             output_dims,
                             output_data);
+#endif
 }
 
 int32_t arm_convolve_wrapper_s16_get_buffer_size(const cmsis_nn_conv_params *conv_params,
@@ -80,7 +111,14 @@
     (void)conv_params;
     (void)output_dims;
 
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+    if (filter_dims->w * filter_dims->h * input_dims->c < 512)
+        return arm_convolve_fast_s16_get_buffer_size(input_dims, filter_dims);
+
     return arm_convolve_s16_get_buffer_size(input_dims, filter_dims);
+#else
+    return arm_convolve_s16_get_buffer_size(input_dims, filter_dims);
+#endif
 }
 
 /**
diff --git a/CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt b/CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt
index 0a5c811..255bce8 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt
+++ b/CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt
@@ -20,7 +20,7 @@
 
 file(GLOB SRC "./*_s8.c")
 add_library(CMSISNNSupport STATIC ${SRC})
-target_sources(CMSISNNSupport PUBLIC arm_q7_to_q15_with_offset.c)
+target_sources(CMSISNNSupport PUBLIC arm_q7_to_q15_with_offset.c arm_nn_mat_mul_kernel_s16.c)
 
 ### Includes
 target_include_directories(CMSISNNSupport PUBLIC "${NN}/Include")
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c
new file mode 100644
index 0000000..41d0bc9
--- /dev/null
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_mat_mult_kernel_s16.c
+ * Description:  Matrix-multiplication function for convolution
+ *
+ * $Date:        12 August 2021
+ * $Revision:    V.1.1.0
+ *
+ * Target Processor:  Cortex-M cores
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/*
+ * Matrix-multiplication function for convolution with per-channel requantization.
+ *
+ * Refer to the header file for details.
+ *
+ */
+
+q15_t *arm_nn_mat_mult_kernel_s16(const q7_t *input_a,
+                                  const q15_t *input_b,
+                                  const int32_t output_ch,
+                                  const int32_t *out_shift,
+                                  const int32_t *out_mult,
+                                  const int16_t activation_min,
+                                  const int16_t activation_max,
+                                  const int32_t num_col_a,
+                                  const int64_t *const output_bias,
+                                  q15_t *out_0)
+{
+
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+    /* set up the second output pointer */
+    q15_t *out_1 = out_0 + output_ch;
+    const int64_t *bias = output_bias;
+    uint16_t row_count = output_ch / 2;
+    const q7_t *ip_a0 = input_a;
+
+    /* This loop iterates over the rows of A two at a time */
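+    /* Each pass computes a 2x2 output tile: two output channels against the two im2col columns in B */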
+    while (row_count)
+    {
+        /* setup pointers for B */
+        const q15_t *ip_b0 = input_b;
+        const q15_t *ip_b1 = ip_b0 + num_col_a;
+
+        /* align the second pointer for A */
+        const q7_t *ip_a1 = ip_a0 + num_col_a;
+
+        /* Init accumulator for channel N and N + 1 */
+        q31_t ch_0_out_0 = 0;
+        q31_t ch_0_out_1 = 0;
+        q31_t ch_1_out_0 = 0;
+        q31_t ch_1_out_1 = 0;
+
+        uint16_t col_count = num_col_a / 4;
+        /* accumulate over the vector */
+        while (col_count)
+        {
+            q31_t a01, a02, a11, a12;
+            q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
+            q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
+
+            ip_a0 = read_and_pad(ip_a0, &a01, &a02);
+            ip_a1 = read_and_pad(ip_a1, &a11, &a12);
+
+            ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
+            ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
+            ch_1_out_0 = __SMLAD(a11, b0, ch_1_out_0);
+            ch_1_out_1 = __SMLAD(a11, b1, ch_1_out_1);
+
+            b0 = arm_nn_read_q15x2_ia(&ip_b0);
+            b1 = arm_nn_read_q15x2_ia(&ip_b1);
+
+            ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
+            ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
+            ch_1_out_0 = __SMLAD(a12, b0, ch_1_out_0);
+            ch_1_out_1 = __SMLAD(a12, b1, ch_1_out_1);
+
+            col_count--;
+        } /* while over col_count */
+        col_count = num_col_a & 0x3;
+        while (col_count)
+        {
+            q7_t a0 = *ip_a0++;
+            q15_t b0 = *ip_b0++;
+            q7_t a1 = *ip_a1++;
+            q15_t b1 = *ip_b1++;
+
+            ch_0_out_0 += a0 * b0;
+            ch_0_out_1 += a0 * b1;
+            ch_1_out_0 += a1 * b0;
+            ch_1_out_1 += a1 * b1;
+            col_count--;
+        } /* while over col_count */
+        if (bias)
+        {
+            q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult);
+            q63_t acc_64 = ch_0_out_0 + *bias;
+            ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+            acc_64 = ch_0_out_1 + *bias++;
+            ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+            out_mult++;
+        }
+        else
+        {
+            ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
+            ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
+            out_mult++;
+        }
+        ch_0_out_0 = MAX(ch_0_out_0, activation_min);
+        ch_0_out_0 = MIN(ch_0_out_0, activation_max);
+        *out_0++ = (q15_t)ch_0_out_0;
+
+        ch_0_out_1 = MAX(ch_0_out_1, activation_min);
+        ch_0_out_1 = MIN(ch_0_out_1, activation_max);
+        *out_1++ = (q15_t)ch_0_out_1;
+        out_shift++;
+
+        if (bias)
+        {
+            q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult);
+            q63_t acc_64 = ch_1_out_0 + *bias;
+            ch_1_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+            acc_64 = ch_1_out_1 + *bias++;
+            ch_1_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+            out_mult++;
+        }
+        else
+        {
+            ch_1_out_0 = arm_nn_requantize(ch_1_out_0, *out_mult, *out_shift);
+            ch_1_out_1 = arm_nn_requantize(ch_1_out_1, *out_mult, *out_shift);
+            out_mult++;
+        }
+        ch_1_out_0 = MAX(ch_1_out_0, activation_min);
+        ch_1_out_0 = MIN(ch_1_out_0, activation_max);
+        *out_0++ = (q15_t)ch_1_out_0;
+
+        ch_1_out_1 = MAX(ch_1_out_1, activation_min);
+        ch_1_out_1 = MIN(ch_1_out_1, activation_max);
+        *out_1++ = (q15_t)ch_1_out_1;
+        out_shift++;
+
+        /* Skip the second row, already consumed through ip_a1 */
+        ip_a0 += num_col_a;
+        row_count--;
+    }
+
+    /* Compute the last row if the number of output channels is odd */
+    if (output_ch & 0x1)
+    {
+        /* setup pointers for B */
+        const q15_t *ip_b0 = input_b;
+        const q15_t *ip_b1 = ip_b0 + num_col_a;
+
+        q31_t ch_0_out_0 = 0;
+        q31_t ch_0_out_1 = 0;
+
+        uint16_t col_count = num_col_a >> 2;
+        while (col_count)
+        {
+            q31_t a01, a02;
+            q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
+            q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
+
+            ip_a0 = read_and_pad(ip_a0, &a01, &a02);
+
+            ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
+            ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
+
+            b0 = arm_nn_read_q15x2_ia(&ip_b0);
+            b1 = arm_nn_read_q15x2_ia(&ip_b1);
+            ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
+            ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
+
+            col_count--;
+        }
+        col_count = num_col_a & 0x3;
+        while (col_count)
+        {
+            q7_t a0 = *ip_a0++;
+            q15_t b0 = *ip_b0++;
+            q15_t b1 = *ip_b1++;
+
+            ch_0_out_0 += a0 * b0;
+            ch_0_out_1 += a0 * b1;
+            col_count--;
+        }
+        if (bias)
+        {
+            q31_t reduced_multiplier = REDUCE_MULTIPLIER(*out_mult);
+            q63_t acc_64 = ch_0_out_0 + *bias;
+            ch_0_out_0 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+            acc_64 = ch_0_out_1 + *bias++;
+            ch_0_out_1 = arm_nn_requantize_s64(acc_64, reduced_multiplier, *out_shift);
+        }
+        else
+        {
+            ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
+            ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
+        }
+        ch_0_out_0 = MAX(ch_0_out_0, activation_min);
+        ch_0_out_0 = MIN(ch_0_out_0, activation_max);
+        *out_0++ = (q15_t)ch_0_out_0;
+
+        ch_0_out_1 = MAX(ch_0_out_1, activation_min);
+        ch_0_out_1 = MIN(ch_0_out_1, activation_max);
+        *out_1++ = (q15_t)ch_0_out_1;
+        out_mult++;
+        out_shift++;
+    }
+
+    out_0 += output_ch;
+
+    /* return the new output pointer with offset */
+    return out_0;
+#else
+    (void)input_a;
+    (void)input_b;
+    (void)output_ch;
+    (void)out_shift;
+    (void)out_mult;
+    (void)activation_min;
+    (void)activation_max;
+    (void)num_col_a;
+    (void)output_bias;
+    (void)out_0;
+    /* To be completed */
+    return NULL;
+#endif
+}
diff --git a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
index ffd8f8f..b9afb8b 100644
--- a/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
+++ b/CMSIS/NN/Tests/UnitTest/CMakeLists.txt
@@ -70,6 +70,7 @@
 add_subdirectory(TestCases/test_arm_convolve_1x1_s8_fast)
 add_subdirectory(TestCases/test_arm_convolve_s8)
 add_subdirectory(TestCases/test_arm_convolve_s16)
+add_subdirectory(TestCases/test_arm_convolve_fast_s16)
 add_subdirectory(TestCases/test_arm_depthwise_conv_3x3_s8)
 add_subdirectory(TestCases/test_arm_depthwise_conv_s8)
 add_subdirectory(TestCases/test_arm_depthwise_conv_s8_opt)
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/CMakeLists.txt b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/CMakeLists.txt
new file mode 100644
index 0000000..1f1a309
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/CMakeLists.txt
@@ -0,0 +1,23 @@
+#
+# Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+add_cmsis_nn_unit_test_executable(test_arm_convolve_fast_s16)
+
+target_sources(test_arm_convolve_fast_s16 PRIVATE
+    Unity/unity_test_arm_convolve_fast_s16.c
+    Unity/TestRunner/unity_test_arm_convolve_fast_s16_runner.c)
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/Unity/unity_test_arm_convolve_fast_s16.c b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/Unity/unity_test_arm_convolve_fast_s16.c
new file mode 100644
index 0000000..f510215
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/Unity/unity_test_arm_convolve_fast_s16.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../test_arm_convolve_fast_s16.c"
+#include "unity.h"
+
+#ifdef USING_FVP_CORSTONE_300
+extern void uart_init(void);
+#endif
+
+/* This function is called from the autogenerated file.
+ * The name must match exactly.
+ */
+void setUp(void)
+{ /* This is run before EACH TEST */
+#ifdef USING_FVP_CORSTONE_300
+    uart_init();
+#endif
+}
+
+/* This function is called from the autogenerated file.
+ * The name must match exactly.
+ */
+void tearDown(void) {}
+
+void test_int16xint8_arm_convolve_fast_s16(void) { int16xint8_arm_convolve_fast_s16(); }
+void test_requantize_s64_arm_convolve_fast_s16(void) { requantize_s64_arm_convolve_fast_s16(); }
diff --git a/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/test_arm_convolve_fast_s16.c b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/test_arm_convolve_fast_s16.c
new file mode 100644
index 0000000..77d6283
--- /dev/null
+++ b/CMSIS/NN/Tests/UnitTest/TestCases/test_arm_convolve_fast_s16/test_arm_convolve_fast_s16.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h>
+
+#include <arm_nnfunctions.h>
+#include <unity.h>
+
+#include "../TestData/int16xint8/test_data.h"
+#include "../TestData/requantize_s64/test_data.h"
+#include "../Utils/validate.h"
+
+void int16xint8_arm_convolve_fast_s16(void)
+{
+    q15_t output[INT16XINT8_DST_SIZE] = {0};
+
+    cmsis_nn_context ctx;
+    cmsis_nn_conv_params conv_params;
+    cmsis_nn_per_channel_quant_params quant_params;
+    cmsis_nn_dims input_dims;
+    cmsis_nn_dims filter_dims;
+    cmsis_nn_dims bias_dims;
+    cmsis_nn_dims output_dims;
+
+    const q63_t *bias_data = int16xint8_biases;
+    const q7_t *kernel_data = int16xint8_weights;
+    const q15_t *input_data = int16xint8_input;
+    const q15_t *output_ref = int16xint8_output_ref;
+    const int32_t output_ref_size = INT16XINT8_DST_SIZE;
+
+    input_dims.n = INT16XINT8_INPUT_BATCHES;
+    input_dims.w = INT16XINT8_INPUT_W;
+    input_dims.h = INT16XINT8_INPUT_H;
+    input_dims.c = INT16XINT8_IN_CH;
+    filter_dims.w = INT16XINT8_FILTER_X;
+    filter_dims.h = INT16XINT8_FILTER_Y;
+    output_dims.w = INT16XINT8_OUTPUT_W;
+    output_dims.h = INT16XINT8_OUTPUT_H;
+    output_dims.c = INT16XINT8_OUT_CH;
+
+    conv_params.padding.w = INT16XINT8_PAD_X;
+    conv_params.padding.h = INT16XINT8_PAD_Y;
+    conv_params.stride.w = INT16XINT8_STRIDE_X;
+    conv_params.stride.h = INT16XINT8_STRIDE_Y;
+
+    conv_params.input_offset = 0;
+    conv_params.output_offset = 0;
+    conv_params.activation.min = INT16XINT8_OUT_ACTIVATION_MIN;
+    conv_params.activation.max = INT16XINT8_OUT_ACTIVATION_MAX;
+    quant_params.multiplier = (int32_t *)int16xint8_output_mult;
+    quant_params.shift = (int32_t *)int16xint8_output_shift;
+
+    int buf_size = arm_convolve_wrapper_s16_get_buffer_size(&conv_params, &input_dims, &filter_dims, &output_dims);
+    ctx.buf = malloc(buf_size);
+
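+    /* The wrapper dispatches to arm_convolve_fast_s16 when DSP is enabled and the
+     * kernel volume is below 512; otherwise it falls back to arm_convolve_s16 */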
+    arm_status result = arm_convolve_wrapper_s16(&ctx,
+                                                 &conv_params,
+                                                 &quant_params,
+                                                 &input_dims,
+                                                 input_data,
+                                                 &filter_dims,
+                                                 kernel_data,
+                                                 &bias_dims,
+                                                 bias_data,
+                                                 &output_dims,
+                                                 output);
+    free(ctx.buf);
+
+    TEST_ASSERT_EQUAL(ARM_MATH_SUCCESS, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+
+    buf_size = arm_convolve_fast_s16_get_buffer_size(&input_dims, &filter_dims);
+    ctx.buf = malloc(buf_size);
+
+    result = arm_convolve_fast_s16(&ctx,
+                                   &conv_params,
+                                   &quant_params,
+                                   &input_dims,
+                                   input_data,
+                                   &filter_dims,
+                                   kernel_data,
+                                   &bias_dims,
+                                   bias_data,
+                                   &output_dims,
+                                   output);
+    free(ctx.buf);
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+    TEST_ASSERT_EQUAL(ARM_MATH_SUCCESS, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+#else
+    TEST_ASSERT_EQUAL(ARM_MATH_ARGUMENT_ERROR, result);
+#endif
+}
+
+void requantize_s64_arm_convolve_fast_s16(void)
+{
+    q15_t output[REQUANTIZE_S64_DST_SIZE] = {0};
+
+    cmsis_nn_context ctx;
+    cmsis_nn_conv_params conv_params;
+    cmsis_nn_per_channel_quant_params quant_params;
+    cmsis_nn_dims input_dims;
+    cmsis_nn_dims filter_dims;
+    cmsis_nn_dims bias_dims;
+    cmsis_nn_dims output_dims;
+
+    const q63_t *bias_data = requantize_s64_biases;
+    const q7_t *kernel_data = requantize_s64_weights;
+    const q15_t *input_data = requantize_s64_input;
+    const q15_t *output_ref = requantize_s64_output_ref;
+    const int32_t output_ref_size = REQUANTIZE_S64_DST_SIZE;
+
+    input_dims.n = REQUANTIZE_S64_INPUT_BATCHES;
+    input_dims.w = REQUANTIZE_S64_INPUT_W;
+    input_dims.h = REQUANTIZE_S64_INPUT_H;
+    input_dims.c = REQUANTIZE_S64_IN_CH;
+    filter_dims.w = REQUANTIZE_S64_FILTER_X;
+    filter_dims.h = REQUANTIZE_S64_FILTER_Y;
+    output_dims.w = REQUANTIZE_S64_OUTPUT_W;
+    output_dims.h = REQUANTIZE_S64_OUTPUT_H;
+    output_dims.c = REQUANTIZE_S64_OUT_CH;
+
+    conv_params.padding.w = REQUANTIZE_S64_PAD_X;
+    conv_params.padding.h = REQUANTIZE_S64_PAD_Y;
+    conv_params.stride.w = REQUANTIZE_S64_STRIDE_X;
+    conv_params.stride.h = REQUANTIZE_S64_STRIDE_Y;
+
+    conv_params.input_offset = REQUANTIZE_S64_INPUT_OFFSET;
+    conv_params.output_offset = REQUANTIZE_S64_OUTPUT_OFFSET;
+    conv_params.activation.min = REQUANTIZE_S64_OUT_ACTIVATION_MIN;
+    conv_params.activation.max = REQUANTIZE_S64_OUT_ACTIVATION_MAX;
+    quant_params.multiplier = (int32_t *)requantize_s64_output_mult;
+    quant_params.shift = (int32_t *)requantize_s64_output_shift;
+
+    int buf_size = arm_convolve_wrapper_s16_get_buffer_size(&conv_params, &input_dims, &filter_dims, &output_dims);
+    ctx.buf = malloc(buf_size);
+
+    arm_status result = arm_convolve_wrapper_s16(&ctx,
+                                                 &conv_params,
+                                                 &quant_params,
+                                                 &input_dims,
+                                                 input_data,
+                                                 &filter_dims,
+                                                 kernel_data,
+                                                 &bias_dims,
+                                                 bias_data,
+                                                 &output_dims,
+                                                 output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(ARM_MATH_SUCCESS, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+
+    buf_size = arm_convolve_fast_s16_get_buffer_size(&input_dims, &filter_dims);
+    ctx.buf = malloc(buf_size);
+
+    result = arm_convolve_fast_s16(&ctx,
+                                   &conv_params,
+                                   &quant_params,
+                                   &input_dims,
+                                   input_data,
+                                   &filter_dims,
+                                   kernel_data,
+                                   &bias_dims,
+                                   bias_data,
+                                   &output_dims,
+                                   output);
+    free(ctx.buf);
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
+    TEST_ASSERT_EQUAL(ARM_MATH_SUCCESS, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+#else
+    TEST_ASSERT_EQUAL(ARM_MATH_ARGUMENT_ERROR, result);
+#endif
+}