CMSIS-DSP: Added support for Helium.
Only arm_dot_prod_f32 currently provides a Helium implementation.
diff --git a/CMSIS/DSP/Include/arm_helium_utils.h b/CMSIS/DSP/Include/arm_helium_utils.h
new file mode 100755
index 0000000..fd4c707
--- /dev/null
+++ b/CMSIS/DSP/Include/arm_helium_utils.h
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------

+ * Project:      CMSIS DSP Library

+ * Title:        arm_helium_utils.h

+ * Description:  Utility functions for Helium development

+ *

+ * $Date:        09. September 2019

+ * $Revision:    V.1.5.1

+ *

+ * Target Processor: Cortex-M cores

+ * -------------------------------------------------------------------- */

+/*

+ * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.

+ *

+ * SPDX-License-Identifier: Apache-2.0

+ *

+ * Licensed under the Apache License, Version 2.0 (the License); you may

+ * not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ * www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT

+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+

+#ifndef _ARM_UTILS_HELIUM_H_

+#define _ARM_UTILS_HELIUM_H_

+

+#if defined (ARM_MATH_HELIUM)

+

+#define nbLanes(sz)             (128/(sz)) /* number of sz-bit lanes in a 128-bit vector; argument parenthesized so expressions expand safely */

+

+#define VEC_LANES_F32       nbLanes(32)

+#define VEC_LANES_F16       nbLanes(16)

+#define VEC_LANES_Q63       nbLanes(64)

+#define VEC_LANES_Q31       nbLanes(32)

+#define VEC_LANES_Q15       nbLanes(16)

+#define VEC_LANES_Q7        nbLanes(8)

+

+#define nb_vec_lanes(ptr) _Generic((ptr), /* lane count chosen from the static type of ptr */ \

+               uint32_t *: VEC_LANES_Q31, \

+               uint16_t *: VEC_LANES_Q15, \

+                uint8_t *: VEC_LANES_Q7,  \

+                  q31_t *: VEC_LANES_Q31, \

+                  q15_t *: VEC_LANES_Q15, \

+                   q7_t *: VEC_LANES_Q7,  \

+               float32_t*: VEC_LANES_F32, \

+               float16_t*: VEC_LANES_F16, \

+            const q31_t *: VEC_LANES_Q31, \

+            const q15_t *: VEC_LANES_Q15, \

+             const q7_t *: VEC_LANES_Q7,  \

+         const float32_t*: VEC_LANES_F32, \

+         const float16_t*: VEC_LANES_F16, \

+                  default: "err") /* unlisted pointer types yield a string, not a count; presumably intended to break compilation at the use site - TODO confirm */

+

+__STATIC_FORCEINLINE float32_t vecAddAcrossF32Mve(float32x4_t in)

+{

+    /* Horizontal add: fold lanes 0..3 of the vector, left to right. */

+    float32_t lowPair = vgetq_lane(in, 0) + vgetq_lane(in, 1);

+    float32_t sum = lowPair + vgetq_lane(in, 2);

+

+    sum += vgetq_lane(in, 3);

+    return sum;

+}

+

+#define post_incr_vec_size(ptr)         ((ptr) += nb_vec_lanes(ptr)) /* advance ptr by one vector's worth of elements; fully parenthesized */

+

+#endif

+

+#endif
\ No newline at end of file
diff --git a/CMSIS/DSP/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c b/CMSIS/DSP/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c
index b373ffc..eccf725 100755
--- a/CMSIS/DSP/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c
+++ b/CMSIS/DSP/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c
@@ -128,6 +128,7 @@
   __set_MSPLIM((uint32_t)(&__STACK_LIMIT));
 
   SystemInit();                             /* CMSIS System Initialization */
+
   __PROGRAM_START();                        /* Enter PreMain (C library entry point) */
 }
 
diff --git a/CMSIS/DSP/Platforms/FVP/ARMv81MML/system_ARMv81MML.c b/CMSIS/DSP/Platforms/FVP/ARMv81MML/system_ARMv81MML.c
index 2919ec0..75e5164 100644
--- a/CMSIS/DSP/Platforms/FVP/ARMv81MML/system_ARMv81MML.c
+++ b/CMSIS/DSP/Platforms/FVP/ARMv81MML/system_ARMv81MML.c
@@ -40,6 +40,11 @@
 
 #define  SYSTEM_CLOCK    (5U * XTAL)
 
+#define DEBUG_DEMCR  (*((volatile unsigned int *)0xE000EDFC)) /* memory-mapped register: volatile so accesses are not optimized away */
+#define DEBUG_TRCENA (1U << 24) /* DEMCR.TRCENA: trace enable (DWT/ITM), cleared in SystemInit */
+
+#define CCR      (*((volatile unsigned int *)0xE000ED14)) /* memory-mapped Configuration and Control Register */
+#define CCR_DL   (1U << 19) /* CCR bit 19, set in SystemInit to enable the "DL branch cache" */
 
 /*----------------------------------------------------------------------------
   Externals
@@ -88,4 +93,10 @@
 
   SystemCoreClock = SYSTEM_CLOCK;
 
+  //Disable debug
+  DEBUG_DEMCR &=~ DEBUG_TRCENA;
+
+  // enable DL branch cache
+  CCR |= CCR_DL;
+
 }
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f32.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f32.c
index 3eee3b9..99ca7dc 100644
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f32.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f32.c
@@ -59,6 +59,46 @@
   @return        none
  */
 
+#if defined (ARM_MATH_HELIUM)
+
+#include "arm_mve.h"
+#include "arm_helium_utils.h"
+
+void arm_dot_prod_f32(
+    const float32_t * pSrcA,
+    const float32_t * pSrcB,
+    uint32_t    blockSize,
+    float32_t * result)
+{
+    float32x4_t vecA, vecB;
+    float32x4_t vecSum;
+    vecSum = vdupq_n_f32(0.0); /* per-lane partial sums start at zero */
+
+    do { /* do-while: the body runs at least once, including when blockSize is 0 */
+        /*
+         * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
+         * p from vctp32q(blockSize) is applied to both loads and to the multiply-accumulate into vecSum.
+         */
+        mve_pred16_t p = vctp32q(blockSize); /* NOTE(review): presumably masks lanes beyond blockSize on the last pass - confirm */
+
+        vecA = vldrwq_z_f32(pSrcA, p);
+        vecB = vldrwq_z_f32(pSrcB, p);
+        vecSum = vfmaq_m(vecSum, vecA, vecB, p);
+        /*
+         * Advance both source pointers by one vector (post_incr_vec_size),
+         * then decrement the blockSize loop counter by VEC_LANES_F32.
+         */
+        post_incr_vec_size(pSrcA);
+        post_incr_vec_size(pSrcB);
+        blockSize -= VEC_LANES_F32;
+    }
+    while ((int32_t) blockSize > 0); /* signed compare: the unsigned counter can wrap below zero on the final pass */
+
+    *result = vecAddAcrossF32Mve(vecSum); /* add the four lanes of vecSum into the scalar result */
+}
+
+#else
+
 void arm_dot_prod_f32(
   const float32_t * pSrcA,
   const float32_t * pSrcB,
@@ -158,6 +198,7 @@
   *result = sum;
 }
 
+#endif /* ARM_MATH_HELIUM */
 /**
   @} end of BasicDotProd group
  */
diff --git a/CMSIS/DSP/Source/CMakeLists.txt b/CMSIS/DSP/Source/CMakeLists.txt
index ac56523..8248c21 100755
--- a/CMSIS/DSP/Source/CMakeLists.txt
+++ b/CMSIS/DSP/Source/CMakeLists.txt
@@ -17,6 +17,7 @@
 option(LOOPUNROLL "Loop unrolling" ON)
 option(ROUNDING "Rounding" OFF)
 option(MATRIXCHECK "Matrix Checks" OFF)
+option(HELIUM "Helium acceleration" OFF)
 
 # Select which parts of the CMSIS-DSP must be compiled.
 # There are some dependencies between the parts but they are not tracked
diff --git a/CMSIS/DSP/Testing/CMakeLists.txt b/CMSIS/DSP/Testing/CMakeLists.txt
index de00fe2..be96fef 100644
--- a/CMSIS/DSP/Testing/CMakeLists.txt
+++ b/CMSIS/DSP/Testing/CMakeLists.txt
@@ -7,7 +7,7 @@
 
 function(writeConfig path)
   set(output "")
-  list(APPEND output "OPTIMIZED,HARDFP,FASTMATH,NEON,UNROLL,ROUNDING,PLATFORM,CORE,COMPILER,VERSION\n")
+  list(APPEND output "OPTIMIZED,HARDFP,FASTMATH,NEON,HELIUM,UNROLL,ROUNDING,PLATFORM,CORE,COMPILER,VERSION\n")
 
   if (OPTIMIZED)
     list(APPEND output "1")
@@ -33,6 +33,12 @@
     list(APPEND output ",0")
   endif()
 
+  if (HELIUM)
+    list(APPEND output ",1")
+  else()
+    list(APPEND output ",0")
+  endif()
+
   if (LOOPUNROLL)
     list(APPEND output ",1")
   else()
diff --git a/CMSIS/DSP/Testing/addToDB.py b/CMSIS/DSP/Testing/addToDB.py
index 514df54..bdf570a 100755
--- a/CMSIS/DSP/Testing/addToDB.py
+++ b/CMSIS/DSP/Testing/addToDB.py
@@ -19,7 +19,7 @@
 
 # For table creation
 MKSTRFIELD=['NAME']
-MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED']
+MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
 MKINTFIELD=['ID', 'CYCLES']
 MKDATEFIELD=['DATE']
 MKKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE']
@@ -31,7 +31,7 @@
 
 # For table value extraction
 VALSTRFIELD=['NAME','VERSION']
-VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED']
+VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
 VALINTFIELD=['ID', 'CYCLES']
 VALDATEFIELD=['DATE']
 VALKEYFIELD=['CATEGORY', 'PLATFORM', 'CORE', 'COMPILER','TYPE']
@@ -246,16 +246,17 @@
    conn.commit() 
 
 def addOneBenchmark(elem,fullPath,db,group):
-   full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
-   full['DATE'] = datetime.datetime.now()
-   if group:
-      tableName = group
-   else:
-      tableName = elem.data["class"]
-   conn = sqlite3.connect(db)
-   createTableIfMissing(conn,elem,tableName,full)
-   addRows(conn,elem,tableName,full)
-   conn.close()
+   if os.path.isfile(fullPath):
+      full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
+      full['DATE'] = datetime.datetime.now()
+      if group:
+         tableName = group
+      else:
+         tableName = elem.data["class"]
+      conn = sqlite3.connect(db)
+      createTableIfMissing(conn,elem,tableName,full)
+      addRows(conn,elem,tableName,full)
+      conn.close()
 
 
 def addToDB(benchmark,dbpath,elem,group):
diff --git a/CMSIS/DSP/Testing/addToRegDB.py b/CMSIS/DSP/Testing/addToRegDB.py
index 50c3e76..acbf3fd 100755
--- a/CMSIS/DSP/Testing/addToRegDB.py
+++ b/CMSIS/DSP/Testing/addToRegDB.py
@@ -19,7 +19,7 @@
 
 # For table creation
 MKSTRFIELD=['NAME','Regression']
-MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED']
+MKBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
 MKINTFIELD=['ID','MAX']
 MKREALFIELD=['MAXREGCOEF']
 MKDATEFIELD=['DATE']
@@ -32,7 +32,7 @@
 
 # For table value extraction
 VALSTRFIELD=['NAME','VERSION','Regression']
-VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'UNROLL', 'ROUNDING','OPTIMIZED']
+VALBOOLFIELD=['HARDFP', 'FASTMATH', 'NEON', 'HELIUM','UNROLL', 'ROUNDING','OPTIMIZED']
 VALINTFIELD=['ID', 'MAX']
 VALREALFIELD=['MAXREGCOEF']
 VALDATEFIELD=['DATE']
@@ -257,16 +257,17 @@
    conn.commit() 
 
 def addOneBenchmark(elem,fullPath,db,group):
-   full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
-   full['DATE'] = datetime.datetime.now()
-   if group:
-      tableName = group
-   else:
-      tableName = elem.data["class"]
-   conn = sqlite3.connect(db)
-   createTableIfMissing(conn,elem,tableName,full)
-   addRows(conn,elem,tableName,full)
-   conn.close()
+   if os.path.isfile(fullPath):
+      full=pd.read_csv(fullPath,dtype={'OLDID': str} ,keep_default_na = False)
+      full['DATE'] = datetime.datetime.now()
+      if group:
+         tableName = group
+      else:
+         tableName = elem.data["class"]
+      conn = sqlite3.connect(db)
+      createTableIfMissing(conn,elem,tableName,full)
+      addRows(conn,elem,tableName,full)
+      conn.close()
 
 
 def addToDB(benchmark,dbpath,elem,group):
diff --git a/CMSIS/DSP/Testing/createDb.sql b/CMSIS/DSP/Testing/createDb.sql
index 64737b3..2c9b161 100755
--- a/CMSIS/DSP/Testing/createDb.sql
+++ b/CMSIS/DSP/Testing/createDb.sql
@@ -70,3 +70,4 @@
 INSERT INTO CORE VALUES(10,"a7","ARMCA7");
 INSERT INTO CORE VALUES(11,"a9","ARMCA9");
 INSERT INTO CORE VALUES(12,"a15","ARMCA15");
+INSERT INTO CORE VALUES(13,"helium","ARMv81MML_DSP_DP_MVE_FP");
diff --git a/CMSIS/DSP/Testing/summaryBench.py b/CMSIS/DSP/Testing/summaryBench.py
index 14cb2be..ff758bc 100644
--- a/CMSIS/DSP/Testing/summaryBench.py
+++ b/CMSIS/DSP/Testing/summaryBench.py
@@ -57,38 +57,40 @@
 
 def summaryBenchmark(resultPath,elem,path):
    regressionPath=os.path.join(os.path.dirname(path),"regression.csv")
-   print("  Generating %s" % regressionPath)
-   full=pd.read_csv(path,dtype={'OLDID': str} ,keep_default_na = False)
-   #print(full)
-   
-   csvheaders = []
-   with open(os.path.join(resultPath,'currentConfig.csv'), 'r') as f:
-        reader = csv.reader(f)
-        csvheaders = next(reader, None)
 
-   groupList = list(set(elem.params.full) - set(elem.params.summary))
-   #grouped=full.groupby(list(elem.params.summary) + ['ID','CATEGORY']).max()
-   #grouped.reset_index(level=grouped.index.names, inplace=True)
-   #print(grouped)
-   #print(grouped.columns)
+   if os.path.isfile(path):
+      print("  Generating %s" % regressionPath)
+      full=pd.read_csv(path,dtype={'OLDID': str} ,keep_default_na = False)
+      #print(full)
+      
+      csvheaders = []
+      with open(os.path.join(resultPath,'currentConfig.csv'), 'r') as f:
+           reader = csv.reader(f)
+           csvheaders = next(reader, None)
+   
+      groupList = list(set(elem.params.full) - set(elem.params.summary))
+      #grouped=full.groupby(list(elem.params.summary) + ['ID','CATEGORY']).max()
+      #grouped.reset_index(level=grouped.index.names, inplace=True)
+      #print(grouped)
+      #print(grouped.columns)
 
   
-   def reg(d):
-    m=d["CYCLES"].max()
-    results = smf.ols('CYCLES ~ ' + elem.params.formula, data=d).fit()
-    f=joinit([formatProd(a,b) for (a,b) in zip(results.params.index,results.params.values)]," + ")
-    f="".join(f)
-    f = re.sub(r':','*',f)
-    #print(results.summary())
-    return(pd.Series({'Regression':"%s" % f,'MAX' : m,'MAXREGCOEF' : results.params.values[-1]}))
-
-   regList = ['ID','OLDID','CATEGORY','NAME'] + csvheaders + groupList 
+      def reg(d):
+       m=d["CYCLES"].max()
+       results = smf.ols('CYCLES ~ ' + elem.params.formula, data=d).fit()
+       f=joinit([formatProd(a,b) for (a,b) in zip(results.params.index,results.params.values)]," + ")
+       f="".join(f)
+       f = re.sub(r':','*',f)
+       #print(results.summary())
+       return(pd.Series({'Regression':"%s" % f,'MAX' : m,'MAXREGCOEF' : results.params.values[-1]}))
    
-   regression=full.groupby(regList).apply(reg)
-   regression.reset_index(level=regression.index.names, inplace=True)
-   renamingDict = { a : b for (a,b) in zip(elem.params.full,elem.params.paramNames)}
-   regression = regression.rename(columns=renamingDict)
-   regression.to_csv(regressionPath,index=False,quoting=csv.QUOTE_NONNUMERIC)
+      regList = ['ID','OLDID','CATEGORY','NAME'] + csvheaders + groupList 
+      
+      regression=full.groupby(regList).apply(reg)
+      regression.reset_index(level=regression.index.names, inplace=True)
+      renamingDict = { a : b for (a,b) in zip(elem.params.full,elem.params.paramNames)}
+      regression = regression.rename(columns=renamingDict)
+      regression.to_csv(regressionPath,index=False,quoting=csv.QUOTE_NONNUMERIC)
 
 
 def extractBenchmarks(resultPath,benchmark,elem):
diff --git a/CMSIS/DSP/Testing/testmain.cpp b/CMSIS/DSP/Testing/testmain.cpp
index eb5014a..51d7dd1 100644
--- a/CMSIS/DSP/Testing/testmain.cpp
+++ b/CMSIS/DSP/Testing/testmain.cpp
@@ -18,9 +18,13 @@
 #include "Patterns.h"
 
 
+
 int testmain()
 {
     char *memoryBuf=NULL;
+
+
+
   
     memoryBuf = (char*)malloc(MEMSIZE);
     if (memoryBuf !=NULL)
diff --git a/CMSIS/DSP/configCore.cmake b/CMSIS/DSP/configCore.cmake
index eda57ec..f7a24f6 100644
--- a/CMSIS/DSP/configCore.cmake
+++ b/CMSIS/DSP/configCore.cmake
@@ -180,7 +180,6 @@
   #
     
   if (NEON AND NOT CORTEXM)
-    #target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON __FPU_PRESENT)
     target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON)
   endif()
   
@@ -189,6 +188,10 @@
     target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_NEON_EXPERIMENTAL)
   endif()
 
+  if (HELIUM AND CORTEXM)
+    target_compile_definitions(${PROJECTNAME} PRIVATE ARM_MATH_HELIUM)
+  endif()
+
   compilerSpecificCompileOptions(${PROJECTNAME} ${ROOT})
 
 endfunction()
\ No newline at end of file