Add implementation of SVDF kernel.
Change-Id: Id16637f8c124df728b7731d8f7a1b257d4db70fa
diff --git a/CMSIS/DoxyGen/NN/src/history.txt b/CMSIS/DoxyGen/NN/src/history.txt
index 226af35..7ae0678 100644
--- a/CMSIS/DoxyGen/NN/src/history.txt
+++ b/CMSIS/DoxyGen/NN/src/history.txt
@@ -9,6 +9,10 @@
<tr>
<td>V1.x.0</td>
<td>
+      Added the following function for the int8 SVDF operator:<br>
+ <ul>
+ <li>arm_svdf_s8</li>
+ </ul>
<ul>
<li>arm_depthwise_conv_wrapper_s8</li>
<li>arm_convolve_wrapper_s8</li>
diff --git a/CMSIS/NN/Include/arm_nn_types.h b/CMSIS/NN/Include/arm_nn_types.h
index 4b216c9..4e825c5 100644
--- a/CMSIS/NN/Include/arm_nn_types.h
+++ b/CMSIS/NN/Include/arm_nn_types.h
@@ -115,6 +115,16 @@
cmsis_nn_activation activation;
} cmsis_nn_fc_params;
+/** CMSIS-NN object for SVDF layer parameters */
+typedef struct
+{
+  int32_t rank;                          /**< Number of feature filters accumulated into each output unit */
+  int32_t input_offset;                  /**< Zero value for the input tensor */
+  int32_t output_offset;                 /**< Zero value for the output tensor */
+  cmsis_nn_activation input_activation;  /**< Min/max clamp applied to the intermediate (state) values */
+  cmsis_nn_activation output_activation; /**< Min/max clamp applied to the output values */
+} cmsis_nn_svdf_params;
+
#endif // _ARM_NN_TYPES_H
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index e4b95a1..921e9e5 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
- * $Date: July 27, 2020
- * $Revision: V.6.0.2
+ * $Date: August 21, 2020
+ * $Revision: V.6.5.2
*
* Target Processor: Cortex-M CPUs
* -------------------------------------------------------------------- */
@@ -41,6 +41,7 @@
* - Convolution Functions
* - Activation Functions
* - Fully-connected Layer Functions
+ * - SVDF Layer Functions
* - Pooling Functions
* - Softmax Functions
* - Basic math Functions
@@ -2061,6 +2062,61 @@
const uint16_t input_w,
int8_t *output,
const uint32_t offset_w);
+/**
+ * @defgroup SVDF SVDF Layer Functions
+ *
+ */
+
+ /**
+ * @brief s8 SVDF function
+ *
+ * @param[in] input_ctx Temporary scratch buffer
+ * @param[in] output_ctx Temporary output scratch buffer
+ * @param[in] svdf_params SVDF Parameters
+ *                           Range of svdf_params->input_offset  : [-128, 127]
+ *                           Range of svdf_params->output_offset : [-128, 127]
+ * @param[in] input_quant_params Input quantization parameters
+ * @param[in] output_quant_params Output quantization parameters
+ * @param[in] input_dims Input tensor dimensions
+ * @param[in] input_data Pointer to input tensor
+ * @param[in] state_dims State tensor dimensions
+ * @param[in] state_data Pointer to state tensor
+ * @param[in] weights_feature_dims Weights (feature) tensor dimensions
+ * @param[in] weights_feature_data Pointer to the weights (feature) tensor
+ * @param[in] weights_time_dims Weights (time) tensor dimensions
+ * @param[in] weights_time_data Pointer to the weights (time) tensor
+ * @param[in] bias_dims Bias tensor dimensions
+ * @param[in] bias_data Pointer to bias tensor
+ * @param[in] output_dims Output tensor dimensions
+ * @param[out] output_data Pointer to the output tensor
+ *
+ * @return The function returns <code>ARM_MATH_SUCCESS</code>
+ *
+ * @details
+ * 1. Supported framework: TensorFlow Lite micro
+ *    2. The q7 data type is used even though the data is s8. This is done to be consistent with existing APIs.
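+ *    3. An illustrative call is sketched below. The shapes, scratch buffers and
+ *       quantization values are assumptions made for this example; only the argument
+ *       order and the scratch-buffer sizes (which follow from the implementation)
+ *       are fixed by the API.
+ * @code
+ *    // Example shapes: 2 batches, input size 10, 8 feature filters,
+ *    // time (memory) size 6, rank 2 -> 8 / 2 = 4 output units per batch.
+ *    const cmsis_nn_dims input_dims = {.n = 2, .h = 10};
+ *    const cmsis_nn_dims weights_feature_dims = {.n = 8};
+ *    const cmsis_nn_dims weights_time_dims = {.h = 6};
+ *    const cmsis_nn_dims state_dims = {0}, bias_dims = {0}, output_dims = {0}; // not read
+ *
+ *    // Scratch: input_ctx holds batches * filters q31_t values,
+ *    // output_ctx holds batches * (filters / rank) q31_t values.
+ *    q31_t scratch_in[2 * 8];
+ *    q31_t scratch_out[2 * 4];
+ *    const cmsis_nn_context input_ctx = {.buf = scratch_in, .size = sizeof(scratch_in)};
+ *    const cmsis_nn_context output_ctx = {.buf = scratch_out, .size = sizeof(scratch_out)};
+ *
+ *    const cmsis_nn_svdf_params svdf_params = {.rank = 2,
+ *                                              .input_offset = 0,
+ *                                              .output_offset = 0,
+ *                                              .input_activation = {.min = INT16_MIN, .max = INT16_MAX},
+ *                                              .output_activation = {.min = INT8_MIN, .max = INT8_MAX}};
+ *    const cmsis_nn_per_tensor_quant_params in_quant = {.multiplier = 1073741824, .shift = -4};
+ *    const cmsis_nn_per_tensor_quant_params out_quant = {.multiplier = 1073741824, .shift = -4};
+ *
+ *    // input_data, state_data, weights_feature_data, weights_time_data, bias_data
+ *    // and output_data are the caller's tensors (see the parameter list above).
+ *    arm_status status = arm_svdf_s8(&input_ctx, &output_ctx, &svdf_params,
+ *                                    &in_quant, &out_quant,
+ *                                    &input_dims, input_data,
+ *                                    &state_dims, state_data,
+ *                                    &weights_feature_dims, weights_feature_data,
+ *                                    &weights_time_dims, weights_time_data,
+ *                                    &bias_dims, bias_data,
+ *                                    &output_dims, output_data);
+ * @endcode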
+ *
+ */
+ arm_status
+ arm_svdf_s8(const cmsis_nn_context *input_ctx,
+ const cmsis_nn_context *output_ctx,
+ const cmsis_nn_svdf_params *svdf_params,
+ const cmsis_nn_per_tensor_quant_params *input_quant_params,
+ const cmsis_nn_per_tensor_quant_params *output_quant_params,
+ const cmsis_nn_dims *input_dims,
+ const q7_t *input_data,
+ const cmsis_nn_dims *state_dims,
+ q15_t *state_data,
+ const cmsis_nn_dims *weights_feature_dims,
+ const q7_t *weights_feature_data,
+ const cmsis_nn_dims *weights_time_dims,
+ const q15_t *weights_time_data,
+ const cmsis_nn_dims *bias_dims,
+ const q31_t *bias_data,
+ const cmsis_nn_dims *output_dims,
+ q7_t *output_data);
+
+
#ifdef __cplusplus
}
#endif
diff --git a/CMSIS/NN/README.md b/CMSIS/NN/README.md
index 980d93e..6c02150 100644
--- a/CMSIS/NN/README.md
+++ b/CMSIS/NN/README.md
@@ -36,6 +36,8 @@
||arm_softmax_q7()| SOFTMAX | None | None | Yes | No | Not bit exact to TFLu but can be up to 70x faster |
||arm_softmax_s8()| SOFTMAX | None | None | No | Yes | Bit exact to TFLu |
||arm_softmax_u8()| SOFTMAX | None | None | No | No | Bit exact to TFLu |
+|[SVDF](https://arm-software.github.io/CMSIS_5/NN/html/group__SVDF.html)||||| | ||
+||arm_svdf_s8()| SVDF | None | None | Yes | No | Bit exact to TFLu |
|[Misc](https://arm-software.github.io/CMSIS_5/NN/html/group__groupNN.html)||||| | ||
||arm_reshape_s8()| SOFTMAX | None | None | No | No | |
||arm_elementwise_add_s8()| ELEMENTWISE ADD | None | None | Yes| Yes| Reshape is not done in this function <br/> Only minor improvements are expected |
diff --git a/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c b/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c
new file mode 100644
index 0000000..11ea7fb
--- /dev/null
+++ b/CMSIS/NN/Source/SVDFunctions/arm_svdf_s8.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_svdf_s8.c
+ * Description: S8 basic SVDF layer function
+ *
+ * $Date: 17. August 2020
+ * $Revision: V.1.0.0
+ *
+ * Target Processor: Cortex-M processors
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+#include "arm_nn_types.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup SVDF
+ * @{
+ */
+
+/*
+ * S8 SVDF layer function for TensorFlow Lite
+ *
+ * Refer to header file for details.
+ *
+ */
+
+arm_status
+arm_svdf_s8(const cmsis_nn_context *input_ctx,
+ const cmsis_nn_context *output_ctx,
+ const cmsis_nn_svdf_params *svdf_params,
+ const cmsis_nn_per_tensor_quant_params *input_quant_params,
+ const cmsis_nn_per_tensor_quant_params *output_quant_params,
+ const cmsis_nn_dims *input_dims,
+ const q7_t *input_data,
+ const cmsis_nn_dims *state_dims,
+ q15_t *state_data,
+ const cmsis_nn_dims *weights_feature_dims,
+ const q7_t *weights_feature_data,
+ const cmsis_nn_dims *weights_time_dims,
+ const q15_t *weights_time_data,
+ const cmsis_nn_dims *bias_dims,
+ const q31_t *bias_data,
+ const cmsis_nn_dims *output_dims,
+ q7_t *output_data)
+{
+ (void)bias_dims;
+ (void)state_dims;
+ (void)output_dims;
+
+ const q31_t multiplier_in = input_quant_params->multiplier;
+ const q31_t shift_in = input_quant_params->shift;
+ const q31_t multiplier_out = output_quant_params->multiplier;
+    const q31_t shift_out = output_quant_params->shift;
+ const int32_t zp_in = svdf_params->input_offset;
+ const int32_t zp_out = svdf_params->output_offset;
+ const int32_t in_activation_min = svdf_params->input_activation.min;
+ const int32_t in_activation_max = svdf_params->input_activation.max;
+ const int32_t out_activation_min = svdf_params->output_activation.min;
+ const int32_t out_activation_max = svdf_params->output_activation.max;
+ const int16_t rank = svdf_params->rank;
+
+ int32_t zp_32 = (-zp_in & 0xffff) |
+ ((-zp_in & 0xffff) << 16);
+
+ const int32_t input_batches = input_dims->n;
+ const int32_t input_height = input_dims->h;
+ const int32_t feature_batches = weights_feature_dims->n;
+ const int32_t time_batches = weights_time_dims->h;
+ const int32_t unit_cnt = feature_batches / rank;
+
+ q31_t *buffer_a = (q31_t *)input_ctx->buf;
+ q31_t *buffer_b = (q31_t *)output_ctx->buf;
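+    // Left-shift the state buffer by one time step so that the newest feature
+    // projection can be written at the end of each filter's window below.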
+
+    memmove((q15_t *)state_data, (q15_t *)state_data + 1,
+            (size_t)((input_batches * feature_batches * time_batches - 1) * (int32_t)sizeof(int16_t)));
+
+ q15_t *res_ptr = state_data + (time_batches - 1);
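+    // Stage 1: for every batch, project the input onto each feature filter,
+    // requantize the dot product and store it as the newest state entry.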
+ for (int i_batch = 0; i_batch < input_batches; i_batch++)
+ {
+ const q7_t *buffer_1 = weights_feature_data;
+ for (int r = 0; r < feature_batches; r++)
+ {
+ q31_t dot_prod = 0;
+
+ const q7_t *buffer_2 = input_data + i_batch * input_height;
+
+#if defined(ARM_MATH_DSP)
+ int c = 0;
+ int32_t block_count = input_height >> 2;
+ for (int i = 0; i < block_count; i++)
+ {
+ c += 4;
+
+ q31_t r1 = arm_nn_read_q7x4_ia(&buffer_1);
+ q31_t r1_a = __SXTB16(r1);
+ q31_t r1_b = __SXTB16(__ROR((uint32_t)r1, 8));
+
+ q31_t r2 = arm_nn_read_q7x4_ia(&buffer_2);
+ q31_t r2_a = __SXTAB16(zp_32, r2);
+ q31_t r2_b = __SXTAB16(zp_32, __ROR((uint32_t)r2, 8));
+
+ dot_prod = __SMLAD(r1_a, r2_a, dot_prod);
+ dot_prod = __SMLAD(r1_b, r2_b, dot_prod);
+ }
+
+ for (; c < input_height; c++)
+ {
+ dot_prod += *buffer_1 * (*buffer_2 - zp_in);
+ buffer_1++;
+ buffer_2++;
+ }
+#else
+ for (int c = 0; c < input_height; c++)
+ {
+ dot_prod += *buffer_1 * (*buffer_2 - zp_in);
+ buffer_1++;
+ buffer_2++;
+ }
+#endif
+
+ dot_prod = arm_nn_requantize(dot_prod,
+ multiplier_in,
+ shift_in);
+ dot_prod = CLAMP(dot_prod, in_activation_max, in_activation_min);
+ *res_ptr = dot_prod;
+ res_ptr += time_batches;
+ }
+ }
+
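+    // Stage 2: for every filter, take the dot product of its state window with
+    // the time weights and store the q31 result in the scratch buffer.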
+ for (int i_batch = 0; i_batch < input_batches; i_batch++)
+ {
+ q31_t *ptr_a = buffer_a + i_batch * feature_batches;
+
+ const q15_t *v1 = weights_time_data;
+ const q15_t *v2 = state_data + i_batch * time_batches * feature_batches;
+ for (int i_feature_batch = 0; i_feature_batch < feature_batches; i_feature_batch++)
+ {
+ *ptr_a = 0;
+
+ int32_t sum = 0;
+#if defined(ARM_MATH_DSP)
+ int j = 0;
+ int32_t block_count = time_batches >> 1;
+ for (int i = 0; i < block_count; i++)
+ {
+ j += 2;
+ q31_t r1 = arm_nn_read_q15x2_ia(&v1);
+ q31_t r2 = arm_nn_read_q15x2_ia(&v2);
+
+ sum = __SMLAD(r1, r2, sum);
+ }
+
+ // Process the remaining data
+ for (; j < time_batches; j++)
+ {
+ sum += *v1 * *v2;
+ v1++;
+ v2++;
+ }
+#else
+ for (int j = 0; j < time_batches; j++)
+ {
+ sum += *v1 * *v2;
+ v1++;
+ v2++;
+ }
+#endif
+
+ *ptr_a = sum;
+ ptr_a++;
+ }
+ }
+
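+    // Stage 3: add the bias and accumulate each group of 'rank' filter results
+    // into a single output unit.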
+ for (int i_batch = 0; i_batch < input_batches; i_batch++)
+ {
+ q31_t *output_data_temp = buffer_b + i_batch * unit_cnt;
+ q31_t *ptr_a = buffer_a + i_batch * feature_batches;
+
+ for (int i = 0; i < unit_cnt; i++)
+ {
+ output_data_temp[i] = bias_data[i];
+ for (int j = 0; j < rank; j++)
+ {
+ output_data_temp[i] += *ptr_a;
+ ptr_a++;
+ }
+ }
+ }
+
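+    // Stage 4: requantize to int8, add the output offset and clamp to the
+    // output activation range.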
+ for (int i = 0; i < input_batches * unit_cnt; i++)
+ {
+        output_data[i] = (q7_t)CLAMP(arm_nn_requantize(buffer_b[i], multiplier_out, shift_out) + zp_out,
+                                     out_activation_max, out_activation_min);
+ }
+
+ return (ARM_MATH_SUCCESS);
+}
+
+/**
+ * @} end of SVDF group
+ */