Merge branch 'develop' of https://github.com/ARM-software/CMSIS_5 into develop
diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index 8a027a9..cd18337 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -2628,6 +2628,7 @@
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c"/>
+ <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c"/>
<file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c"/>
@@ -2641,6 +2642,8 @@
<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c"/>
<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nntables.c"/>
<file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c"/>
+ <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c"/>
+ <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c"/>
<file category="source" name="CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c"/>
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 89123ab..c6ec83a 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -21,7 +21,7 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
- * $Date: 17. January 2018
+ * $Date: 13. July 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
@@ -159,6 +159,52 @@
q7_t * bufferB);
/**
+ * @brief Basic Q7 convolution function (non-sqaure shape)
+ * @param[in] Im_in pointer to input tensor
+ * @param[in] dim_im_in_x input tensor dimention x
+ * @param[in] dim_im_in_y input tensor dimention y
+ * @param[in] ch_im_in number of input tensor channels
+ * @param[in] wt pointer to kernel weights
+ * @param[in] ch_im_out number of filters, i.e., output tensor channels
+ * @param[in] dim_kernel_x filter kernel size x
+ * @param[in] dim_kernel_y filter kernel size y
+ * @param[in] padding_x padding size x
+ * @param[in] padding_y padding size y
+ * @param[in] stride_x convolution stride x
+ * @param[in] stride_y convolution stride y
+ * @param[in] bias pointer to bias
+ * @param[in] bias_shift amount of left-shift for bias
+ * @param[in] out_shift amount of right-shift for output
+ * @param[in,out] Im_out pointer to output tensor
+ * @param[in] dim_im_out_x output tensor dimension x
+ * @param[in] dim_im_out_y output tensor dimension y
+ * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferB pointer to buffer space for output
+ * @return The function returns <code>ARM_MATH_SUCCESS</code>
+ */
+
+ arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
+ const uint16_t dim_im_in_x,
+ const uint16_t dim_im_in_y,
+ const uint16_t ch_im_in,
+ const q7_t * wt,
+ const uint16_t ch_im_out,
+ const uint16_t dim_kernel_x,
+ const uint16_t dim_kernel_y,
+ const uint16_t padding_x,
+ const uint16_t padding_y,
+ const uint16_t stride_x,
+ const uint16_t stride_y,
+ const q7_t * bias,
+ const uint16_t bias_shift,
+ const uint16_t out_shift,
+ q7_t * Im_out,
+ const uint16_t dim_im_out_x,
+ const uint16_t dim_im_out_y,
+ q15_t * bufferA,
+ q7_t * bufferB);
+
+ /**
* @brief Basic Q15 convolution function
* @param[in] Im_in pointer to input tensor
* @param[in] dim_im_in input tensor dimention
diff --git a/CMSIS/NN/Include/arm_nnsupportfunctions.h b/CMSIS/NN/Include/arm_nnsupportfunctions.h
index 88cb536..8460190 100644
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -21,7 +21,7 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
- * $Date: 17. January 2018
+ * $Date: 13. July 2018
* $Revision: V.1.0.0
*
* Target Processor: Cortex-M cores
@@ -136,6 +136,57 @@
#endif
/**
+ * @defgroup NNBasicMath Basic Math Functions for Neural Network Computation
+ *
+ * Basic Math Functions for Neural Network Computation
+ *
+ */
+
+/**
+ * @brief Q7 vector multiplication with variable output shifts
+ * @param[in] *pSrcA pointer to the first input vector
+ * @param[in] *pSrcB pointer to the second input vector
+ * @param[out] *pDst pointer to the output vector
+ * @param[in] out_shift amount of right-shift for output
+ * @param[in] blockSize number of samples in each vector
+ * @return none.
+ *
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ */
+
+void arm_nn_mult_q15(
+ q15_t * pSrcA,
+ q15_t * pSrcB,
+ q15_t * pDst,
+ const uint16_t out_shift,
+ uint32_t blockSize);
+
+/**
+ * @brief Q7 vector multiplication with variable output shifts
+ * @param[in] *pSrcA pointer to the first input vector
+ * @param[in] *pSrcB pointer to the second input vector
+ * @param[out] *pDst pointer to the output vector
+ * @param[in] out_shift amount of right-shift for output
+ * @param[in] blockSize number of samples in each vector
+ * @return none.
+ *
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ */
+
+void arm_nn_mult_q7(
+ q7_t * pSrcA,
+ q7_t * pSrcB,
+ q7_t * pDst,
+ const uint16_t out_shift,
+ uint32_t blockSize);
+
+/**
* @brief defition to adding rouding offset
*/
#ifndef ARM_NN_TRUNCATE
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_nn_mult_ref.c b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_nn_mult_ref.c
new file mode 100644
index 0000000..2cc6b72
--- /dev/null
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_nn_mult_ref.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_nnfunctions.h"
+
+void arm_nn_mult_q7_ref(q7_t * pSrcA,
+ q7_t * pSrcB,
+ q7_t * pDst,
+ const uint16_t out_shift,
+ uint32_t blockSize) {
+ uint16_t i;
+
+for (i = 0; i < blockSize; i++)
+ {
+ q31_t product = pSrcA[i] * pSrcB[i];
+#ifndef ARM_NN_TRUNCATE
+ pDst[i] = (q7_t)__SSAT((product + (0x1 << (out_shift - 1)))>>out_shift, 8);
+#else
+ pDst[i] = (q7_t)__SSAT(product >> out_shift, 8);
+#endif
+ }
+}
+
+void arm_nn_mult_q15_ref(q15_t * pSrcA,
+ q15_t * pSrcB,
+ q15_t * pDst,
+ const uint16_t out_shift,
+ uint32_t blockSize) {
+ uint16_t i;
+
+for (i = 0; i < blockSize; i++)
+ {
+ q31_t product = pSrcA[i] * pSrcB[i];
+#ifndef ARM_NN_TRUNCATE
+ pDst[i] = (q15_t)__SSAT((product + (0x1 << (out_shift - 1)))>>out_shift, 16);
+#else
+ pDst[i] = (q15_t)__SSAT(product >> out_shift, 16);
+#endif
+
+
+ }
+}
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
index b6eb6b9..4a0647a 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
@@ -239,6 +239,10 @@
void arm_relu_q15_ref(q15_t * data, uint16_t size);
+ void arm_nn_mult_q7_ref(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst, const uint16_t out_shift, uint32_t blockSize);
+
+ void arm_nn_mult_q15_ref(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, const uint16_t out_shift, uint32_t blockSize);
+
#ifdef __cplusplus
}
#endif
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
index 93b168d..5cf72a2 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
@@ -45,6 +45,7 @@
#define TEST_IP
#define TEST_CONV
#define TEST_NONSQUARE
+#define TEST_NNMULT
int test_index = 0;
q7_t test_flags[50];
@@ -67,6 +68,51 @@
}
test_index = 0;
+#ifdef TEST_NNMULT
+#define NNMULT_DIM 128
+ test1 = new q7_t[NNMULT_DIM*2];
+ test2 = new q15_t[NNMULT_DIM*2];
+ test3 = new q7_t[NNMULT_DIM*2];
+ test4 = new q15_t[NNMULT_DIM*2];
+
+ q7_t * mult_out_q7 = test3;
+ q7_t * mult_ref_q7 = test3 + NNMULT_DIM;
+ q15_t * mult_out_q15 = test4;
+ q15_t * mult_ref_q15 = test4 + NNMULT_DIM;
+
+ for (int i=0;i<NNMULT_DIM*2;i++) {
+ test1[i] = (rand() % 256 - 128);
+ test2[i] = (rand() % 65536 - 32768);
+ }
+
+ // Test q7
+ arm_nn_mult_q7(test1, test1+NNMULT_DIM, mult_out_q7, 5, NNMULT_DIM);
+
+ arm_nn_mult_q7_ref(test1, test1+NNMULT_DIM, mult_ref_q7, 5, NNMULT_DIM);
+
+ verify_results_q7(mult_out_q7, mult_ref_q7, NNMULT_DIM);
+
+ arm_nn_mult_q7(test1, test1+NNMULT_DIM, mult_out_q7, 9, NNMULT_DIM);
+
+ arm_nn_mult_q7_ref(test1, test1+NNMULT_DIM, mult_ref_q7, 9, NNMULT_DIM);
+
+ verify_results_q7(mult_out_q7, mult_ref_q7, NNMULT_DIM);
+
+ // Test q15
+ arm_nn_mult_q15(test2, test2+NNMULT_DIM, mult_out_q15, 13, NNMULT_DIM);
+
+ arm_nn_mult_q15_ref(test2, test2+NNMULT_DIM, mult_ref_q15, 13, NNMULT_DIM);
+
+ verify_results_q15(mult_out_q15, mult_ref_q15, NNMULT_DIM);
+
+ arm_nn_mult_q15(test2, test2+NNMULT_DIM, mult_out_q15, 18, NNMULT_DIM);
+
+ arm_nn_mult_q15_ref(test2, test2+NNMULT_DIM, mult_ref_q15, 18, NNMULT_DIM);
+
+ verify_results_q15(mult_out_q15, mult_ref_q15, NNMULT_DIM);
+
+#endif
+
#ifdef TEST_SIGMOID
#define SIGMOID_DIM 128
@@ -488,6 +534,22 @@
initialize_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
+ printf("start conv q7 nonsquare ref implementation\n");
+ arm_convolve_HWC_q7_ref_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
+ RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y, RCONV_PADDING_X, RCONV_PADDING_Y,
+ RCONV_STRIDE_X, RCONV_STRIDE_Y, rconv_bias_q7, 1, 7, rconv_im_out_ref_q7,
+ RCONV_OUT_DIM_X, RCONV_OUT_DIM_Y, rconv_buf, NULL);
+
+ printf("start conv q7 nonsquare basic implementation\n");
+ arm_convolve_HWC_q7_basic_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
+ RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y, RCONV_PADDING_X, RCONV_PADDING_Y,
+ RCONV_STRIDE_X, RCONV_STRIDE_Y, rconv_bias_q7, 1, 7, rconv_im_out_opt_q7,
+ RCONV_OUT_DIM_X, RCONV_OUT_DIM_Y, rconv_buf, NULL);
+
+ verify_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
+
+ initialize_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
+
printf("start 1x1 conv q7 nonsquare fast implementation\n");
arm_convolve_HWC_q7_fast_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
RCONV_OUT_CH, 1, 1, 0, 0, RCONV_STRIDE_X,
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvoptx b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvoptx
index 090a603..bcc0ff2 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvoptx
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvoptx
@@ -236,7 +236,7 @@
<OPTFL>
<tvExp>1</tvExp>
<tvExpOptDlg>0</tvExpOptDlg>
- <IsCurrentTarget>1</IsCurrentTarget>
+ <IsCurrentTarget>0</IsCurrentTarget>
</OPTFL>
<CpuCode>7</CpuCode>
<DebugOpt>
@@ -566,7 +566,7 @@
<OPTFL>
<tvExp>1</tvExp>
<tvExpOptDlg>0</tvExpOptDlg>
- <IsCurrentTarget>0</IsCurrentTarget>
+ <IsCurrentTarget>1</IsCurrentTarget>
</OPTFL>
<CpuCode>7</CpuCode>
<DebugOpt>
@@ -1525,6 +1525,54 @@
<RteFlg>0</RteFlg>
<bShared>0</bShared>
</File>
+ <File>
+ <GroupNumber>1</GroupNumber>
+ <FileNumber>44</FileNumber>
+ <FileType>1</FileType>
+ <tvExp>0</tvExp>
+ <tvExpOptDlg>0</tvExpOptDlg>
+ <bDave2>0</bDave2>
+ <PathWithFileName>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</PathWithFileName>
+ <FilenameWithoutPath>arm_convolve_HWC_q7_basic_nonsquare.c</FilenameWithoutPath>
+ <RteFlg>0</RteFlg>
+ <bShared>0</bShared>
+ </File>
+ <File>
+ <GroupNumber>1</GroupNumber>
+ <FileNumber>45</FileNumber>
+ <FileType>1</FileType>
+ <tvExp>0</tvExp>
+ <tvExpOptDlg>0</tvExpOptDlg>
+ <bDave2>0</bDave2>
+ <PathWithFileName>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</PathWithFileName>
+ <FilenameWithoutPath>arm_nn_mult_q7.c</FilenameWithoutPath>
+ <RteFlg>0</RteFlg>
+ <bShared>0</bShared>
+ </File>
+ <File>
+ <GroupNumber>1</GroupNumber>
+ <FileNumber>46</FileNumber>
+ <FileType>1</FileType>
+ <tvExp>0</tvExp>
+ <tvExpOptDlg>0</tvExpOptDlg>
+ <bDave2>0</bDave2>
+ <PathWithFileName>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</PathWithFileName>
+ <FilenameWithoutPath>arm_nn_mult_q15.c</FilenameWithoutPath>
+ <RteFlg>0</RteFlg>
+ <bShared>0</bShared>
+ </File>
+ <File>
+ <GroupNumber>1</GroupNumber>
+ <FileNumber>47</FileNumber>
+ <FileType>1</FileType>
+ <tvExp>0</tvExp>
+ <tvExpOptDlg>0</tvExpOptDlg>
+ <bDave2>0</bDave2>
+ <PathWithFileName>.\Ref_Implementations\arm_nn_mult_ref.c</PathWithFileName>
+ <FilenameWithoutPath>arm_nn_mult_ref.c</FilenameWithoutPath>
+ <RteFlg>0</RteFlg>
+ <bShared>0</bShared>
+ </File>
</Group>
<Group>
@@ -1535,7 +1583,7 @@
<RteFlg>0</RteFlg>
<File>
<GroupNumber>2</GroupNumber>
- <FileNumber>44</FileNumber>
+ <FileNumber>48</FileNumber>
<FileType>5</FileType>
<tvExp>0</tvExp>
<tvExpOptDlg>0</tvExpOptDlg>
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvprojx b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvprojx
index 1b1e251..a698fe1 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvprojx
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvprojx
@@ -595,6 +595,26 @@
<FileType>1</FileType>
<FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
</File>
+ <File>
+ <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q7.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q15.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_ref.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+ </File>
</Files>
</Group>
<Group>
@@ -1207,6 +1227,26 @@
<FileType>1</FileType>
<FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
</File>
+ <File>
+ <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q7.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q15.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_ref.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+ </File>
</Files>
</Group>
<Group>
@@ -1819,6 +1859,26 @@
<FileType>1</FileType>
<FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
</File>
+ <File>
+ <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q7.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q15.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_ref.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+ </File>
</Files>
</Group>
<Group>
@@ -2431,6 +2491,26 @@
<FileType>1</FileType>
<FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
</File>
+ <File>
+ <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q7.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q15.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_ref.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+ </File>
</Files>
</Group>
<Group>
@@ -3043,6 +3123,26 @@
<FileType>1</FileType>
<FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
</File>
+ <File>
+ <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q7.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q15.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_ref.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+ </File>
</Files>
</Group>
<Group>
@@ -3655,6 +3755,26 @@
<FileType>1</FileType>
<FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
</File>
+ <File>
+ <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q7.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_q15.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+ </File>
+ <File>
+ <FileName>arm_nn_mult_ref.c</FileName>
+ <FileType>1</FileType>
+ <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+ </File>
</Files>
</Group>
<Group>
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
new file mode 100644
index 0000000..b426b92
--- /dev/null
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_convolve_HWC_q7_basic.c
+ * Description: Q7 version of convolution
+ *
+ * $Date: 13. July 2018
+ * $Revision: V.1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+#include "arm_math.h"
+#include "arm_nnfunctions.h"
+
+/**
+ * @ingroup groupNN
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+ /**
+ * @brief Basic Q7 convolution function (non-sqaure shape)
+ * @param[in] Im_in pointer to input tensor
+ * @param[in] dim_im_in_x input tensor dimention x
+ * @param[in] dim_im_in_y input tensor dimention y
+ * @param[in] ch_im_in number of input tensor channels
+ * @param[in] wt pointer to kernel weights
+ * @param[in] ch_im_out number of filters, i.e., output tensor channels
+ * @param[in] dim_kernel_x filter kernel size x
+ * @param[in] dim_kernel_y filter kernel size y
+ * @param[in] padding_x padding size x
+ * @param[in] padding_y padding size y
+ * @param[in] stride_x convolution stride x
+ * @param[in] stride_y convolution stride y
+ * @param[in] bias pointer to bias
+ * @param[in] bias_shift amount of left-shift for bias
+ * @param[in] out_shift amount of right-shift for output
+ * @param[in,out] Im_out pointer to output tensor
+ * @param[in] dim_im_out_x output tensor dimension x
+ * @param[in] dim_im_out_y output tensor dimension y
+ * @param[in,out] bufferA pointer to buffer space for input
+ * @param[in,out] bufferB pointer to buffer space for output
+ * @return The function returns <code>ARM_MATH_SUCCESS</code>
+ */
+
+arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
+ const uint16_t dim_im_in_x,
+ const uint16_t dim_im_in_y,
+ const uint16_t ch_im_in,
+ const q7_t * wt,
+ const uint16_t ch_im_out,
+ const uint16_t dim_kernel_x,
+ const uint16_t dim_kernel_y,
+ const uint16_t padding_x,
+ const uint16_t padding_y,
+ const uint16_t stride_x,
+ const uint16_t stride_y,
+ const q7_t * bias,
+ const uint16_t bias_shift,
+ const uint16_t out_shift,
+ q7_t * Im_out,
+ const uint16_t dim_im_out_x,
+ const uint16_t dim_im_out_y,
+ q15_t * bufferA,
+ q7_t * bufferB)
+{
+
+#if defined (ARM_MATH_DSP)
+ /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+ int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
+
+ /*
+ * Here we use bufferA as q15_t internally as computation are done with q15_t level
+ * im2col are done to output in q15_t format from q7_t input
+ */
+ q15_t *pBuffer = bufferA;
+ q7_t *pOut = Im_out;
+
+ /* This part implements the im2col function */
+ for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
+ {
+ for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
+ {
+ for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; i_ker_y++)
+ {
+ for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; i_ker_x++)
+ {
+ if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
+ {
+ /* Filling 0 for out-of-bound paddings */
+ /* arm_fill_q15(0, pBuffer, ch_im_in); */
+ memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
+ } else
+ {
+ /* Copying the pixel data to column */
+ arm_q7_to_q15_no_shift((q7_t *)
+ Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
+ }
+ pBuffer += ch_im_in;
+ }
+ }
+
+ /* Computation is filed for every 2 columns */
+ if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_y * dim_kernel_x)
+ {
+ pOut =
+ arm_nn_mat_mult_kernel_q7_q15(wt, bufferA,
+ ch_im_out,
+ ch_im_in *
+ dim_kernel_y * dim_kernel_x, bias_shift, out_shift, bias, pOut);
+
+ /* counter reset */
+ pBuffer = bufferA;
+ }
+ }
+ }
+
+ /* left-over because odd number of output pixels */
+ if (pBuffer != bufferA)
+ {
+ const q7_t *pA = wt;
+ int i;
+
+ for (i = 0; i < ch_im_out; i++)
+ {
+ /* Load the accumulator with bias first */
+ q31_t sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
+
+ /* Point to the beging of the im2col buffer */
+ q15_t *pB = bufferA;
+
+ /* Each time it process 4 entries */
+ uint16_t colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 2;
+
+ while (colCnt)
+ {
+ q31_t inA1, inA2;
+ q31_t inB1, inB2;
+
+ pA = (q7_t *) read_and_pad((void *)pA, &inA1, &inA2);
+
+ inB1 = *__SIMD32(pB)++;
+ sum = __SMLAD(inA1, inB1, sum);
+ inB2 = *__SIMD32(pB)++;
+ sum = __SMLAD(inA2, inB2, sum);
+
+ colCnt--;
+ }
+ colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
+ while (colCnt)
+ {
+ q7_t inA1 = *pA++;
+ q15_t inB1 = *pB++;
+ sum += inA1 * inB1;
+ colCnt--;
+ }
+ *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
+ }
+ }
+#else
+ /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+ uint16_t i, j, k, l, m, n;
+ int conv_out;
+ signed char in_row, in_col;
+
+ for (i = 0; i < ch_im_out; i++)
+ {
+ for (j = 0; j < dim_im_out_y; j++)
+ {
+ for (k = 0; k < dim_im_out_x; k++)
+ {
+ conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
+ for (m = 0; m < dim_kernel_y; m++)
+ {
+ for (n = 0; n < dim_kernel_x; n++)
+ {
+ // if-for implementation
+ in_row = stride_y * j + m - padding_y;
+ in_col = stride_x * k + n - padding_x;
+ if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
+ {
+ for (l = 0; l < ch_im_in; l++)
+ {
+ conv_out +=
+ Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] *
+ wt[i * ch_im_in * dim_kernel_y * dim_kernel_x +
+ (m * dim_kernel_x + n) * ch_im_in + l];
+ }
+ }
+ }
+ }
+ Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
+ }
+ }
+ }
+
+#endif /* ARM_MATH_DSP */
+
+ /* Return to application */
+ return ARM_MATH_SUCCESS;
+}
+
+/**
+ * @} end of NNConv group
+ */
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
new file mode 100644
index 0000000..5a60459
--- /dev/null
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_nn_mult_q15.c
+ * Description: Q15 vector multiplication with variable output shifts
+ *
+ * $Date: 13. July 2018
+ * $Revision: V.1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup NNBasicMath
+ * @{
+ */
+
+
+/**
+ * @brief Q7 vector multiplication with variable output shifts
+ * @param[in] *pSrcA pointer to the first input vector
+ * @param[in] *pSrcB pointer to the second input vector
+ * @param[out] *pDst pointer to the output vector
+ * @param[in] out_shift amount of right-shift for output
+ * @param[in] blockSize number of samples in each vector
+ * @return none.
+ *
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ */
+
+void arm_nn_mult_q15(
+ q15_t * pSrcA,
+ q15_t * pSrcB,
+ q15_t * pDst,
+ const uint16_t out_shift,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+
+#if defined (ARM_MATH_DSP)
+
+/* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t inA1, inA2, inB1, inB2; /* temporary input variables */
+ q15_t out1, out2, out3, out4; /* temporary output variables */
+ q31_t mul1, mul2, mul3, mul4; /* temporary variables */
+
+ /* loop Unrolling */
+ blkCnt = blockSize >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* read two samples at a time from sourceA */
+ inA1 = *__SIMD32(pSrcA)++;
+ /* read two samples at a time from sourceB */
+ inB1 = *__SIMD32(pSrcB)++;
+ /* read two samples at a time from sourceA */
+ inA2 = *__SIMD32(pSrcA)++;
+ /* read two samples at a time from sourceB */
+ inB2 = *__SIMD32(pSrcB)++;
+
+ /* multiply mul = sourceA * sourceB */
+ mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
+ mul2 = (q31_t) ((q15_t) inA1 * (q15_t) inB1);
+ mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
+ mul4 = (q31_t) ((q15_t) inA2 * (q15_t) inB2);
+
+ /* saturate result to 16 bit */
+ out1 = (q15_t) __SSAT((mul1 + NN_ROUND(out_shift)) >> out_shift, 16);
+ out2 = (q15_t) __SSAT((mul2 + NN_ROUND(out_shift)) >> out_shift, 16);
+ out3 = (q15_t) __SSAT((mul3 + NN_ROUND(out_shift)) >> out_shift, 16);
+ out4 = (q15_t) __SSAT((mul4 + NN_ROUND(out_shift)) >> out_shift, 16);
+
+ /* store the result */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
+ *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
+
+#else
+
+ *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
+ *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Decrement the blockSize loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * B */
+ /* Multiply the inputs and store the result in the destination buffer */
+ *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 16);
+
+ /* Decrement the blockSize loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ * @} end of NNBasicMath group
+ */
+
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
new file mode 100644
index 0000000..3735c04
--- /dev/null
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project: CMSIS NN Library
+ * Title: arm_nn_mult_q7.c
+ * Description: Q7 vector multiplication with variable output shifts
+ *
+ * $Date: 13. July 2018
+ * $Revision: V.1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup NNBasicMath
+ * @{
+ */
+
+/**
+ * @brief Q7 vector multiplication with variable output shifts
+ * @param[in] *pSrcA pointer to the first input vector
+ * @param[in] *pSrcB pointer to the second input vector
+ * @param[out] *pDst pointer to the output vector
+ * @param[in] out_shift amount of right-shift for output
+ * @param[in] blockSize number of samples in each vector
+ * @return none.
+ *
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ */
+
+void arm_nn_mult_q7(
+ q7_t * pSrcA,
+ q7_t * pSrcB,
+ q7_t * pDst,
+ const uint16_t out_shift,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+
+#if defined (ARM_MATH_DSP)
+
+/* Run the below code for Cortex-M4 and Cortex-M3 */
+ q7_t out1, out2, out3, out4; /* Temporary variables to store the product */
+
+ /* loop Unrolling */
+ blkCnt = blockSize >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = A * B */
+ /* Multiply the inputs and store the results in temporary variables */
+ out1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
+ out2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
+ out3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
+ out4 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
+
+ /* Store the results of 4 inputs in the destination buffer in single cycle by packing */
+ *__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
+
+ /* Decrement the blockSize loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * B */
+ /* Multiply the inputs and store the result in the destination buffer */
+ *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
+
+ /* Decrement the blockSize loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ * @} end of NNBasicMath group
+ */
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
index fa58a2c..264e760 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
@@ -130,5 +130,5 @@
}
/**
- * @} end of q7_to_x group
+ * @} end of nndata_convert group
*/