Merge branch 'develop' of https://github.com/ARM-software/CMSIS_5 into develop
diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index 8a027a9..cd18337 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -2628,6 +2628,7 @@
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c"/>
+        <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c"/>
         <file category="source" name="CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c"/>
         
@@ -2641,6 +2642,8 @@
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nntables.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c"/>
+        <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c"/>
+        <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c"/>
 
         <file category="source" name="CMSIS/NN/Source/PoolingFunctions/arm_pool_q7_HWC.c"/>
         
diff --git a/CMSIS/NN/Include/arm_nnfunctions.h b/CMSIS/NN/Include/arm_nnfunctions.h
index 89123ab..c6ec83a 100644
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -21,7 +21,7 @@
  * Title:        arm_nnfunctions.h
  * Description:  Public header file for CMSIS NN Library
  *
- * $Date:        17. January 2018
+ * $Date:        13. July 2018
  * $Revision:    V.1.0.0
  *
  * Target Processor:  Cortex-M cores
@@ -159,6 +159,52 @@
                                          q7_t * bufferB);
 
   /**
+   * @brief Basic Q7 convolution function (non-square shape)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in_x  input tensor dimension x
+   * @param[in]       dim_im_in_y  input tensor dimension y
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel_x filter kernel size x
+   * @param[in]       dim_kernel_y filter kernel size y
+   * @param[in]       padding_x    padding size x
+   * @param[in]       padding_y    padding size y
+   * @param[in]       stride_x     convolution stride x
+   * @param[in]       stride_y     convolution stride y
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out_x output tensor dimension x
+   * @param[in]       dim_im_out_y output tensor dimension y
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The function returns <code>ARM_MATH_SUCCESS</code> 
+   */
+
+    arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
+                                                  const uint16_t dim_im_in_x,
+                                                  const uint16_t dim_im_in_y,
+                                                  const uint16_t ch_im_in,
+                                                  const q7_t * wt,
+                                                  const uint16_t ch_im_out,
+                                                  const uint16_t dim_kernel_x,
+                                                  const uint16_t dim_kernel_y,
+                                                  const uint16_t padding_x,
+                                                  const uint16_t padding_y,
+                                                  const uint16_t stride_x,
+                                                  const uint16_t stride_y,
+                                                  const q7_t * bias,
+                                                  const uint16_t bias_shift,
+                                                  const uint16_t out_shift,
+                                                  q7_t * Im_out,
+                                                  const uint16_t dim_im_out_x,
+                                                  const uint16_t dim_im_out_y,
+                                                  q15_t * bufferA,
+                                                  q7_t * bufferB);
+
+  /**
    * @brief Basic Q15 convolution function
    * @param[in]       Im_in       pointer to input tensor
    * @param[in]       dim_im_in   input tensor dimention
diff --git a/CMSIS/NN/Include/arm_nnsupportfunctions.h b/CMSIS/NN/Include/arm_nnsupportfunctions.h
index 88cb536..8460190 100644
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -21,7 +21,7 @@
  * Title:        arm_nnsupportfunctions.h
  * Description:  Public header file of support functions for CMSIS NN Library
  *
- * $Date:        17. January 2018
+ * $Date:        13. July 2018
  * $Revision:    V.1.0.0
  *
  * Target Processor:  Cortex-M cores
@@ -136,6 +136,57 @@
 #endif
 
 /**
+ * @defgroup NNBasicMath Basic Math Functions for Neural Network Computation
+ *
+ * Basic Math Functions for Neural Network Computation
+ *
+ */
+
+/**
+ * @brief           Q15 vector multiplication with variable output shifts
+ * @param[in]       *pSrcA        pointer to the first input vector
+ * @param[in]       *pSrcB        pointer to the second input vector
+ * @param[out]      *pDst         pointer to the output vector
+ * @param[in]       out_shift     amount of right-shift for output
+ * @param[in]       blockSize     number of samples in each vector
+ * @return none.
+ *
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ */
+
+void arm_nn_mult_q15(
+  q15_t * pSrcA,
+  q15_t * pSrcB,
+  q15_t * pDst,
+  const uint16_t out_shift,
+  uint32_t blockSize);
+  
+/**
+ * @brief           Q7 vector multiplication with variable output shifts
+ * @param[in]       *pSrcA        pointer to the first input vector
+ * @param[in]       *pSrcB        pointer to the second input vector
+ * @param[out]      *pDst         pointer to the output vector
+ * @param[in]       out_shift     amount of right-shift for output
+ * @param[in]       blockSize     number of samples in each vector
+ * @return none.
+ *
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ */
+
+void arm_nn_mult_q7(
+  q7_t * pSrcA,
+  q7_t * pSrcB,
+  q7_t * pDst,
+  const uint16_t out_shift,
+  uint32_t blockSize);
+ 
+/**
  * @brief defition to adding rouding offset
  */
 #ifndef ARM_NN_TRUNCATE
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_nn_mult_ref.c b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_nn_mult_ref.c
new file mode 100644
index 0000000..2cc6b72
--- /dev/null
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/arm_nn_mult_ref.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+#include "arm_nnfunctions.h"
+
+/* Reference Q7 element-wise multiply used to check arm_nn_mult_q7:
+ * pDst[i] = sat8((pSrcA[i] * pSrcB[i] + rounding) >> out_shift); no rounding if ARM_NN_TRUNCATE. */
+void      arm_nn_mult_q7_ref(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst,
+                             const uint16_t out_shift, uint32_t blockSize) {
+    /* Index is as wide as blockSize: a uint16_t index wraps and never reaches blockSize > 65535 */
+    uint32_t  i;
+    for (i = 0; i < blockSize; i++)
+    {
+        q31_t product = pSrcA[i] * pSrcB[i];
+#ifndef ARM_NN_TRUNCATE
+        /* (1 << s) >> 1 equals 1 << (s - 1) for s >= 1 and is 0 for s == 0 (no shift by -1) */
+        pDst[i] = (q7_t)__SSAT((product + (q31_t)((0x1u << out_shift) >> 1)) >> out_shift, 8);
+#else
+        pDst[i] = (q7_t)__SSAT(product >> out_shift, 8);
+#endif
+    }
+}
+
+/*
+ * Reference Q15 element-wise multiply used to check arm_nn_mult_q15:
+ * pDst[i] = sat16((pSrcA[i] * pSrcB[i] + rounding) >> out_shift); no rounding if ARM_NN_TRUNCATE.
+ */
+void     arm_nn_mult_q15_ref(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst,
+                             const uint16_t out_shift, uint32_t blockSize) {
+    /* Index is as wide as blockSize: a uint16_t index wraps and never reaches blockSize > 65535 */
+    uint32_t  i;
+    for (i = 0; i < blockSize; i++)
+    {
+        q31_t product = pSrcA[i] * pSrcB[i];
+#ifndef ARM_NN_TRUNCATE
+        /* (1 << s) >> 1 equals 1 << (s - 1) for s >= 1 and is 0 for s == 0 (no shift by -1) */
+        pDst[i] = (q15_t)__SSAT((product + (q31_t)((0x1u << out_shift) >> 1)) >> out_shift, 16);
+#else
+        pDst[i] = (q15_t)__SSAT(product >> out_shift, 16);
+#endif
+    }
+}
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
index b6eb6b9..4a0647a 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/Ref_Implementations/ref_functions.h
@@ -239,6 +239,10 @@
 
     void      arm_relu_q15_ref(q15_t * data, uint16_t size);
 
+    void      arm_nn_mult_q7_ref(q7_t * pSrcA, q7_t * pSrcB, q7_t * pDst, const uint16_t out_shift, uint32_t blockSize);
+
+    void      arm_nn_mult_q15_ref(q15_t * pSrcA, q15_t * pSrcB, q15_t * pDst, const uint16_t out_shift, uint32_t blockSize);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
index 93b168d..5cf72a2 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.cpp
@@ -45,6 +45,7 @@
 #define TEST_IP
 #define TEST_CONV
 #define TEST_NONSQUARE
+#define TEST_NNMULT
 
 int test_index = 0;
 q7_t test_flags[50];
@@ -67,6 +68,51 @@
     }
     test_index = 0;
 
+#ifdef TEST_NNMULT
+#define NNMULT_DIM 128
+    test1 = new q7_t[NNMULT_DIM*2];
+    test2 = new q15_t[NNMULT_DIM*2];
+    test3 = new q7_t[NNMULT_DIM*2];
+    test4 = new q15_t[NNMULT_DIM*2];
+
+    q7_t * mult_out_q7 = test3;
+    q7_t * mult_ref_q7 = test3 + NNMULT_DIM;
+    q15_t * mult_out_q15 = test4;
+    q15_t * mult_ref_q15 = test4 + NNMULT_DIM;
+
+    for (int i=0;i<NNMULT_DIM*2;i++) {
+        test1[i] = (rand() % 256 - 128);
+        test2[i] = (rand() % 65536 - 32768);
+    }
+
+    // Test q7
+    arm_nn_mult_q7(test1, test1+NNMULT_DIM, mult_out_q7, 5, NNMULT_DIM);
+
+    arm_nn_mult_q7_ref(test1, test1+NNMULT_DIM, mult_ref_q7, 5, NNMULT_DIM);
+
+    verify_results_q7(mult_out_q7, mult_ref_q7, NNMULT_DIM);
+
+    arm_nn_mult_q7(test1, test1+NNMULT_DIM, mult_out_q7, 9, NNMULT_DIM);
+
+    arm_nn_mult_q7_ref(test1, test1+NNMULT_DIM, mult_ref_q7, 9, NNMULT_DIM);
+
+    verify_results_q7(mult_out_q7, mult_ref_q7, NNMULT_DIM);
+
+    // Test q15
+    arm_nn_mult_q15(test2, test2+NNMULT_DIM, mult_out_q15, 13, NNMULT_DIM);
+
+    arm_nn_mult_q15_ref(test2, test2+NNMULT_DIM, mult_ref_q15, 13, NNMULT_DIM);
+
+    verify_results_q15(mult_out_q15, mult_ref_q15, NNMULT_DIM);
+
+    arm_nn_mult_q15(test2, test2+NNMULT_DIM, mult_out_q15, 18, NNMULT_DIM);
+
+    arm_nn_mult_q15_ref(test2, test2+NNMULT_DIM, mult_ref_q15, 18, NNMULT_DIM);
+
+    verify_results_q15(mult_out_q15, mult_ref_q15, NNMULT_DIM);
+
+#endif
+
 #ifdef TEST_SIGMOID
 
 #define SIGMOID_DIM 128
@@ -488,6 +534,22 @@
 
     initialize_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
 
+    printf("start conv q7 nonsquare ref implementation\n");
+    arm_convolve_HWC_q7_ref_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
+                                      RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y, RCONV_PADDING_X, RCONV_PADDING_Y,
+                                      RCONV_STRIDE_X, RCONV_STRIDE_Y, rconv_bias_q7, 1, 7, rconv_im_out_ref_q7,
+                                      RCONV_OUT_DIM_X, RCONV_OUT_DIM_Y, rconv_buf, NULL);
+
+    printf("start conv q7 nonsquare basic implementation\n");
+    arm_convolve_HWC_q7_basic_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
+                                       RCONV_OUT_CH, RCONV_KER_DIM_X, RCONV_KER_DIM_Y, RCONV_PADDING_X, RCONV_PADDING_Y,
+                                       RCONV_STRIDE_X, RCONV_STRIDE_Y, rconv_bias_q7, 1, 7, rconv_im_out_opt_q7,
+                                       RCONV_OUT_DIM_X, RCONV_OUT_DIM_Y, rconv_buf, NULL);
+
+    verify_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
+
+    initialize_results_q7(rconv_im_out_ref_q7, rconv_im_out_opt_q7, RCONV_OUT_DIM_Y * RCONV_OUT_DIM_X * RCONV_OUT_CH);
+
     printf("start 1x1 conv q7 nonsquare fast implementation\n");
     arm_convolve_HWC_q7_fast_nonsquare(rconv_im_in_q7, RCONV_IM_DIM_X, RCONV_IM_DIM_Y, RCONV_IM_CH, rconv_weight_q7,
                                        RCONV_OUT_CH, 1, 1, 0, 0, RCONV_STRIDE_X,
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvoptx b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvoptx
index 090a603..bcc0ff2 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvoptx
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvoptx
@@ -236,7 +236,7 @@
       <OPTFL>
         <tvExp>1</tvExp>
         <tvExpOptDlg>0</tvExpOptDlg>
-        <IsCurrentTarget>1</IsCurrentTarget>
+        <IsCurrentTarget>0</IsCurrentTarget>
       </OPTFL>
       <CpuCode>7</CpuCode>
       <DebugOpt>
@@ -566,7 +566,7 @@
       <OPTFL>
         <tvExp>1</tvExp>
         <tvExpOptDlg>0</tvExpOptDlg>
-        <IsCurrentTarget>0</IsCurrentTarget>
+        <IsCurrentTarget>1</IsCurrentTarget>
       </OPTFL>
       <CpuCode>7</CpuCode>
       <DebugOpt>
@@ -1525,6 +1525,54 @@
       <RteFlg>0</RteFlg>
       <bShared>0</bShared>
     </File>
+    <File>
+      <GroupNumber>1</GroupNumber>
+      <FileNumber>44</FileNumber>
+      <FileType>1</FileType>
+      <tvExp>0</tvExp>
+      <tvExpOptDlg>0</tvExpOptDlg>
+      <bDave2>0</bDave2>
+      <PathWithFileName>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</PathWithFileName>
+      <FilenameWithoutPath>arm_convolve_HWC_q7_basic_nonsquare.c</FilenameWithoutPath>
+      <RteFlg>0</RteFlg>
+      <bShared>0</bShared>
+    </File>
+    <File>
+      <GroupNumber>1</GroupNumber>
+      <FileNumber>45</FileNumber>
+      <FileType>1</FileType>
+      <tvExp>0</tvExp>
+      <tvExpOptDlg>0</tvExpOptDlg>
+      <bDave2>0</bDave2>
+      <PathWithFileName>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</PathWithFileName>
+      <FilenameWithoutPath>arm_nn_mult_q7.c</FilenameWithoutPath>
+      <RteFlg>0</RteFlg>
+      <bShared>0</bShared>
+    </File>
+    <File>
+      <GroupNumber>1</GroupNumber>
+      <FileNumber>46</FileNumber>
+      <FileType>1</FileType>
+      <tvExp>0</tvExp>
+      <tvExpOptDlg>0</tvExpOptDlg>
+      <bDave2>0</bDave2>
+      <PathWithFileName>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</PathWithFileName>
+      <FilenameWithoutPath>arm_nn_mult_q15.c</FilenameWithoutPath>
+      <RteFlg>0</RteFlg>
+      <bShared>0</bShared>
+    </File>
+    <File>
+      <GroupNumber>1</GroupNumber>
+      <FileNumber>47</FileNumber>
+      <FileType>1</FileType>
+      <tvExp>0</tvExp>
+      <tvExpOptDlg>0</tvExpOptDlg>
+      <bDave2>0</bDave2>
+      <PathWithFileName>.\Ref_Implementations\arm_nn_mult_ref.c</PathWithFileName>
+      <FilenameWithoutPath>arm_nn_mult_ref.c</FilenameWithoutPath>
+      <RteFlg>0</RteFlg>
+      <bShared>0</bShared>
+    </File>
   </Group>
 
   <Group>
@@ -1535,7 +1583,7 @@
     <RteFlg>0</RteFlg>
     <File>
       <GroupNumber>2</GroupNumber>
-      <FileNumber>44</FileNumber>
+      <FileNumber>48</FileNumber>
       <FileType>5</FileType>
       <tvExp>0</tvExp>
       <tvExpOptDlg>0</tvExpOptDlg>
diff --git a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvprojx b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvprojx
index 1b1e251..a698fe1 100644
--- a/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvprojx
+++ b/CMSIS/NN/NN_Lib_Tests/nn_test/arm_nnexamples_nn_test.uvprojx
@@ -595,6 +595,26 @@
               <FileType>1</FileType>
               <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
             </File>
+            <File>
+              <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q7.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q15.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_ref.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+            </File>
           </Files>
         </Group>
         <Group>
@@ -1207,6 +1227,26 @@
               <FileType>1</FileType>
               <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
             </File>
+            <File>
+              <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q7.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q15.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_ref.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+            </File>
           </Files>
         </Group>
         <Group>
@@ -1819,6 +1859,26 @@
               <FileType>1</FileType>
               <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
             </File>
+            <File>
+              <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q7.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q15.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_ref.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+            </File>
           </Files>
         </Group>
         <Group>
@@ -2431,6 +2491,26 @@
               <FileType>1</FileType>
               <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
             </File>
+            <File>
+              <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q7.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q15.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_ref.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+            </File>
           </Files>
         </Group>
         <Group>
@@ -3043,6 +3123,26 @@
               <FileType>1</FileType>
               <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
             </File>
+            <File>
+              <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q7.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q15.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_ref.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+            </File>
           </Files>
         </Group>
         <Group>
@@ -3655,6 +3755,26 @@
               <FileType>1</FileType>
               <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q15_fast_nonsquare.c</FilePath>
             </File>
+            <File>
+              <FileName>arm_convolve_HWC_q7_basic_nonsquare.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\ConvolutionFunctions\arm_convolve_HWC_q7_basic_nonsquare.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q7.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q7.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_q15.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>..\..\Source\NNSupportFunctions\arm_nn_mult_q15.c</FilePath>
+            </File>
+            <File>
+              <FileName>arm_nn_mult_ref.c</FileName>
+              <FileType>1</FileType>
+              <FilePath>.\Ref_Implementations\arm_nn_mult_ref.c</FilePath>
+            </File>
           </Files>
         </Group>
         <Group>
diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
new file mode 100644
index 0000000..b426b92
--- /dev/null
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_convolve_HWC_q7_basic_nonsquare.c
+ * Description:	 Q7 version of convolution
+ *
+ * $Date:        13. July 2018
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+#include "arm_math.h"
+#include "arm_nnfunctions.h"
+
+/**
+ *  @ingroup groupNN
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+  /**
+   * @brief Basic Q7 convolution function (non-square shape)
+   * @param[in]       Im_in        pointer to input tensor
+   * @param[in]       dim_im_in_x  input tensor dimension x
+   * @param[in]       dim_im_in_y  input tensor dimension y
+   * @param[in]       ch_im_in     number of input tensor channels
+   * @param[in]       wt           pointer to kernel weights
+   * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
+   * @param[in]       dim_kernel_x filter kernel size x
+   * @param[in]       dim_kernel_y filter kernel size y
+   * @param[in]       padding_x    padding size x
+   * @param[in]       padding_y    padding size y
+   * @param[in]       stride_x     convolution stride x
+   * @param[in]       stride_y     convolution stride y
+   * @param[in]       bias         pointer to bias
+   * @param[in]       bias_shift   amount of left-shift for bias
+   * @param[in]       out_shift    amount of right-shift for output
+   * @param[in,out]   Im_out       pointer to output tensor
+   * @param[in]       dim_im_out_x output tensor dimension x
+   * @param[in]       dim_im_out_y output tensor dimension y
+   * @param[in,out]   bufferA      pointer to buffer space for input
+   * @param[in,out]   bufferB      pointer to buffer space for output
+   * @return     The function returns <code>ARM_MATH_SUCCESS</code>
+   */
+
+arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
+                                               const uint16_t dim_im_in_x,
+                                               const uint16_t dim_im_in_y,
+                                               const uint16_t ch_im_in,
+                                               const q7_t * wt,
+                                               const uint16_t ch_im_out,
+                                               const uint16_t dim_kernel_x,
+                                               const uint16_t dim_kernel_y,
+                                               const uint16_t padding_x,
+                                               const uint16_t padding_y,
+                                               const uint16_t stride_x,
+                                               const uint16_t stride_y,
+                                               const q7_t * bias,
+                                               const uint16_t bias_shift,
+                                               const uint16_t out_shift,
+                                               q7_t * Im_out,
+                                               const uint16_t dim_im_out_x,
+                                               const uint16_t dim_im_out_y,
+                                               q15_t * bufferA,
+                                               q7_t * bufferB)
+{
+
+#if defined (ARM_MATH_DSP)
+    /* Run the following code for Cortex-M4 and Cortex-M7 */
+
+    int16_t   i_out_y, i_out_x, i_ker_y, i_ker_x;
+
+    /* 
+     *  Here we use bufferA as q15_t internally as computation are done with q15_t level
+     *  im2col are done to output in q15_t format from q7_t input
+     */
+    q15_t    *pBuffer = bufferA;
+    q7_t     *pOut = Im_out;
+
+    /* This part implements the im2col function */
+    for (i_out_y = 0; i_out_y < dim_im_out_y; i_out_y++)
+    {
+        for (i_out_x = 0; i_out_x < dim_im_out_x; i_out_x++)
+        {
+            for (i_ker_y = i_out_y * stride_y - padding_y; i_ker_y < i_out_y * stride_y - padding_y + dim_kernel_y; i_ker_y++)
+            {
+                for (i_ker_x = i_out_x * stride_x - padding_x; i_ker_x < i_out_x * stride_x - padding_x + dim_kernel_x; i_ker_x++)
+                {
+                    if (i_ker_y < 0 || i_ker_y >= dim_im_in_y || i_ker_x < 0 || i_ker_x >= dim_im_in_x)
+                    {
+                        /* Filling 0 for out-of-bound paddings */
+                        /* arm_fill_q15(0, pBuffer, ch_im_in); */
+                        memset(pBuffer, 0, sizeof(q15_t)*ch_im_in);
+                    } else
+                    {
+                        /* Copying the pixel data to column */
+                        arm_q7_to_q15_no_shift((q7_t *)
+                                               Im_in + (i_ker_y * dim_im_in_x + i_ker_x) * ch_im_in, pBuffer, ch_im_in);
+                    }
+                    pBuffer += ch_im_in;
+                }
+            }
+
+            /* Computation is performed once two im2col columns have been collected */
+            if (pBuffer == bufferA + 2 * ch_im_in * dim_kernel_y * dim_kernel_x)
+            {
+                pOut =
+                    arm_nn_mat_mult_kernel_q7_q15(wt, bufferA,
+                                                  ch_im_out,
+                                                  ch_im_in *
+                                                  dim_kernel_y * dim_kernel_x, bias_shift, out_shift, bias, pOut);
+
+                /* counter reset */
+                pBuffer = bufferA;
+            }
+        }
+    }
+
+    /* left-over because odd number of output pixels */
+    if (pBuffer != bufferA)
+    {
+        const q7_t *pA = wt;
+        int       i;
+
+        for (i = 0; i < ch_im_out; i++)
+        {
+            /* Load the accumulator with bias first */
+            q31_t     sum = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
+
+            /* Point to the beginning of the im2col buffer */
+            q15_t    *pB = bufferA;
+
+            /* Each time it process 4 entries */
+            uint16_t  colCnt = ch_im_in * dim_kernel_y * dim_kernel_x >> 2;
+
+            while (colCnt)
+            {
+                q31_t     inA1, inA2;
+                q31_t     inB1, inB2;
+
+                pA = (q7_t *) read_and_pad((void *)pA, &inA1, &inA2);
+
+                inB1 = *__SIMD32(pB)++;
+                sum = __SMLAD(inA1, inB1, sum);
+                inB2 = *__SIMD32(pB)++;
+                sum = __SMLAD(inA2, inB2, sum);
+
+                colCnt--;
+            }
+            colCnt = ch_im_in * dim_kernel_y * dim_kernel_x & 0x3;
+            while (colCnt)
+            {
+                q7_t      inA1 = *pA++;
+                q15_t     inB1 = *pB++;
+                sum += inA1 * inB1;
+                colCnt--;
+            }
+            *pOut++ = (q7_t) __SSAT((sum >> out_shift), 8);
+        }
+    }
+#else
+    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
+
+    uint16_t  i, j, k, l, m, n;
+    int       conv_out;
+    int32_t   in_row, in_col;   /* signed (padding gives negatives) and wide: signed char wrapped above 127 */
+
+    for (i = 0; i < ch_im_out; i++)
+    {
+        for (j = 0; j < dim_im_out_y; j++)
+        {
+            for (k = 0; k < dim_im_out_x; k++)
+            {
+                conv_out = ((q31_t)bias[i] << bias_shift) + NN_ROUND(out_shift);
+                for (m = 0; m < dim_kernel_y; m++)
+                {
+                    for (n = 0; n < dim_kernel_x; n++)
+                    {
+                        /* input coordinate for this tap; taps outside the image contribute 0 (padding) */
+                        in_row = stride_y * j + m - padding_y;
+                        in_col = stride_x * k + n - padding_x;
+                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_y && in_col < dim_im_in_x)
+                        {
+                            for (l = 0; l < ch_im_in; l++)
+                            {
+                                conv_out +=
+                                    Im_in[(in_row * dim_im_in_x + in_col) * ch_im_in + l] * 
+                                         wt[i * ch_im_in * dim_kernel_y * dim_kernel_x + 
+                                         (m * dim_kernel_x + n) * ch_im_in + l];
+                            }
+                        }
+                    }
+                }
+                Im_out[i + (j * dim_im_out_x + k) * ch_im_out] = (q7_t) __SSAT((conv_out >> out_shift), 8);
+            }
+        }
+    }
+
+#endif                          /* ARM_MATH_DSP */
+
+    /* Return to application */
+    return ARM_MATH_SUCCESS;
+}
+
+/**
+ * @} end of NNConv group
+ */
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
new file mode 100644
index 0000000..5a60459
--- /dev/null
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q15.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_mult_q15.c
+ * Description:  Q15 vector multiplication with variable output shifts
+ *
+ * $Date:        13. July 2018
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+
+/**    
+ * @ingroup groupSupport    
+ */
+
+/**
+ * @addtogroup NNBasicMath
+ * @{
+ */
+
+
+/**
+ * @brief           Q15 vector multiplication with variable output shifts
+ * @param[in]       *pSrcA        pointer to the first input vector
+ * @param[in]       *pSrcB        pointer to the second input vector
+ * @param[out]      *pDst         pointer to the output vector
+ * @param[in]       out_shift     number of bits each rounded product is shifted right
+ * @param[in]       blockSize     number of samples in each vector
+ * @return none.
+ *
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * Each element-wise product has NN_ROUND(out_shift) added and is shifted right by out_shift.
+ * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ */
+
+void arm_nn_mult_q15(
+  q15_t * pSrcA,
+  q15_t * pSrcB,
+  q15_t * pDst,
+  const uint16_t out_shift,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* number of loop iterations still to run */
+
+#if defined (ARM_MATH_DSP)
+
+/* Run the below code for cores with the DSP extension (e.g. Cortex-M4); Cortex-M3 uses the #else path */
+  q31_t inA1, inA2, inB1, inB2;                  /* 32-bit words, each holding two packed q15 inputs */
+  q15_t out1, out2, out3, out4;                  /* shifted and saturated q15 results */
+  q31_t mul1, mul2, mul3, mul4;                  /* full-width products before rounding and shifting */
+
+  /* loop unrolling: one iteration per group of four samples */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** The common loop after the #endif computes the remaining 0 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* read two packed q15 samples (one 32-bit word) from sourceA */
+    inA1 = *__SIMD32(pSrcA)++;
+    /* read two packed q15 samples (one 32-bit word) from sourceB */
+    inB1 = *__SIMD32(pSrcB)++;
+    /* read the next two packed q15 samples from sourceA */
+    inA2 = *__SIMD32(pSrcA)++;
+    /* read the next two packed q15 samples from sourceB */
+    inB2 = *__SIMD32(pSrcB)++;
+
+    /* multiply matching halfwords: mul1/mul3 use the upper halves, mul2/mul4 the lower halves */
+    mul1 = (q31_t) ((q15_t) (inA1 >> 16) * (q15_t) (inB1 >> 16));
+    mul2 = (q31_t) ((q15_t) inA1 * (q15_t) inB1);
+    mul3 = (q31_t) ((q15_t) (inA2 >> 16) * (q15_t) (inB2 >> 16));
+    mul4 = (q31_t) ((q15_t) inA2 * (q15_t) inB2);
+
+    /* add the rounding term, shift right by out_shift, then saturate the result to 16 bit */
+    out1 = (q15_t) __SSAT((mul1 + NN_ROUND(out_shift)) >> out_shift, 16);
+    out2 = (q15_t) __SSAT((mul2 + NN_ROUND(out_shift)) >> out_shift, 16);
+    out3 = (q15_t) __SSAT((mul3 + NN_ROUND(out_shift)) >> out_shift, 16);
+    out4 = (q15_t) __SSAT((mul4 + NN_ROUND(out_shift)) >> out_shift, 16);
+
+    /* store two results per 32-bit word; the upper-halfword products (out1, out3) are the shifted __PKHBT operand */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+    *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
+    *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
+
+#else /* NOTE(review): same statements as the little-endian branch; loads and stores appear to use the same halfword order - confirm */
+
+    *__SIMD32(pDst)++ = __PKHBT(out2, out1, 16);
+    *__SIMD32(pDst)++ = __PKHBT(out4, out3, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /* Decrement the unrolled-loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples
+   ** in the common loop below.  No loop unrolling is used there. */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Run the below code for cores without the DSP extension (e.g. Cortex-M0, Cortex-M3) */
+
+  /* Initialize blkCnt with number of samples: the loop below handles every element */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A * B + NN_ROUND(out_shift)) >> out_shift, saturated to 16 bit */
+    /* Multiply the inputs and store the result in the destination buffer */
+    *pDst++ = (q15_t) __SSAT((((q31_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 16);
+
+    /* Decrement the sample loop counter */
+    blkCnt--;
+  }
+}
+
+/**
+ * @} end of NNBasicMath group
+ */
+
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
new file mode 100644
index 0000000..3735c04
--- /dev/null
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_mult_q7.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_mult_q7.c
+ * Description:  Q7 vector multiplication with variable output shifts
+ *
+ * $Date:        13. July 2018
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M cores
+ *
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnfunctions.h"
+
+/**    
+ * @ingroup groupSupport    
+ */
+
+/**
+ * @addtogroup NNBasicMath
+ * @{
+ */
+
+/**
+ * @brief           Q7 vector multiplication with variable output shifts
+ * @param[in]       *pSrcA        pointer to the first input vector
+ * @param[in]       *pSrcB        pointer to the second input vector
+ * @param[out]      *pDst         pointer to the output vector
+ * @param[in]       out_shift     number of bits each rounded product is shifted right
+ * @param[in]       blockSize     number of samples in each vector
+ * @return none.
+ *
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * Each element-wise product has NN_ROUND(out_shift) added and is shifted right by out_shift.
+ * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ */
+
+void arm_nn_mult_q7(
+  q7_t * pSrcA,
+  q7_t * pSrcB,
+  q7_t * pDst,
+  const uint16_t out_shift,
+  uint32_t blockSize)
+{
+  uint32_t blkCnt;                               /* number of loop iterations still to run */
+
+#if defined (ARM_MATH_DSP)
+
+/* Run the below code for cores with the DSP extension (e.g. Cortex-M4); Cortex-M3 uses the #else path */
+  q7_t out1, out2, out3, out4;                   /* Shifted and saturated results of one group of four */
+
+  /* loop unrolling: one iteration per group of four samples */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** The common loop after the #endif computes the remaining 0 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A * B + NN_ROUND(out_shift)) >> out_shift, saturated to 8 bit */
+    /* NOTE(review): the (q15_t) cast applies to A only; the product is presumably evaluated in int after promotion */
+    out1 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
+    out2 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
+    out3 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
+    out4 = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
+
+    /* Pack the four q7 results into one 32-bit word and write it to the destination with a single store */
+    *__SIMD32(pDst)++ = __PACKq7(out1, out2, out3, out4);
+
+    /* Decrement the unrolled-loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples
+   ** in the common loop below.  No loop unrolling is used there. */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Run the below code for cores without the DSP extension (e.g. Cortex-M0, Cortex-M3) */
+
+  /* Initialize blkCnt with number of samples: the loop below handles every element */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A * B + NN_ROUND(out_shift)) >> out_shift, saturated to 8 bit */
+    /* Multiply the inputs and store the result in the destination buffer */
+    *pDst++ = (q7_t) __SSAT((((q15_t) (*pSrcA++) * (*pSrcB++) + NN_ROUND(out_shift)) >> out_shift), 8);
+
+    /* Decrement the sample loop counter */
+    blkCnt--;
+  }
+}
+
+/**
+ * @} end of NNBasicMath group
+ */
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
index fa58a2c..264e760 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
@@ -130,5 +130,5 @@
 }
 
 /**    
- * @} end of q7_to_x group    
+ * @} end of nndata_convert group   
  */