CMSIS-DSP: Some improvements to a few f16 functions
diff --git a/CMSIS/DSP/Include/arm_helium_utils.h b/CMSIS/DSP/Include/arm_helium_utils.h
index e39a886..7a6fa9c 100755
--- a/CMSIS/DSP/Include/arm_helium_utils.h
+++ b/CMSIS/DSP/Include/arm_helium_utils.h
@@ -63,10 +63,10 @@
     return acc;
 }
 
-__STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in)
+__STATIC_FORCEINLINE _Float16 vecAddAcrossF16Mve(float16x8_t in)
 {
     float16x8_t tmpVec;
-    float16_t acc;
+    _Float16 acc;
 
     tmpVec = (float16x8_t) vrev32q_s16((int16x8_t) in);
     in = vaddq_f16(tmpVec, in);
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f16.c
index fca2440..fd61160 100755
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f16.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_dot_prod_f16.c
@@ -126,7 +126,7 @@
         float16_t * result)
 {
         uint32_t blkCnt;                               /* Loop counter */
-        float16_t sum = 0.0f;                          /* Temporary return variable */
+        _Float16 sum = 0.0f;                          /* Temporary return variable */
 
 
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
@@ -141,13 +141,13 @@
     /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
 
     /* Calculate dot product and store result in a temporary buffer. */
-    sum += (*pSrcA++) * (*pSrcB++);
+    sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
 
-    sum += (*pSrcA++) * (*pSrcB++);
+    sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
 
-    sum += (*pSrcA++) * (*pSrcB++);
+    sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
 
-    sum += (*pSrcA++) * (*pSrcB++);
+    sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
 
     /* Decrement loop counter */
     blkCnt--;
@@ -168,7 +168,7 @@
     /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
 
     /* Calculate dot product and store result in a temporary buffer. */
-    sum += (*pSrcA++) * (*pSrcB++);
+    sum += (_Float16)(*pSrcA++) * (_Float16)(*pSrcB++);
 
     /* Decrement loop counter */
     blkCnt--;
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c
index c73a852..c4cbac2 100755
--- a/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c
@@ -50,6 +50,7 @@
 
 #include "arm_helium_utils.h"
 
+#include <stdio.h>
 void arm_svm_linear_predict_f16(
     const arm_svm_linear_instance_f16 *S,
     const float16_t * in,
@@ -65,7 +66,7 @@
     uint32_t         row;
     uint32_t         blkCnt;     /* loop counters */
     const float16_t *pDualCoef = S->dualCoefficients;
-    float16_t       sum = S->intercept;
+    _Float16       sum = S->intercept;
     row = numRows;
 
     /*
@@ -145,10 +146,10 @@
         /*
          * Sum the partial parts
          */
-        sum += *pDualCoef++ * vecAddAcrossF16Mve(acc0);
-        sum += *pDualCoef++ * vecAddAcrossF16Mve(acc1);
-        sum += *pDualCoef++ * vecAddAcrossF16Mve(acc2);
-        sum += *pDualCoef++ * vecAddAcrossF16Mve(acc3);
+        sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0);
+        sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc1);
+        sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc2);
+        sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc3);
 
         pSrcA += numCols * 4;
         /*
@@ -215,8 +216,8 @@
         /*
          * Sum the partial parts
          */
-        sum += *pDualCoef++ * vecAddAcrossF16Mve(acc0);
-        sum += *pDualCoef++ * vecAddAcrossF16Mve(acc1);
+        sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0);
+        sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc1);
 
         pSrcA += numCols * 2;
         row -= 2;
@@ -269,7 +270,7 @@
         /*
          * Sum the partial parts
          */
-        sum += *pDualCoef++ * vecAddAcrossF16Mve(acc0);
+        sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0);
 
     }
 
diff --git a/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c b/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
index 93bf97d..31848cf 100755
--- a/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
+++ b/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
@@ -67,7 +67,7 @@
     uint32_t         row;
     uint32_t         blkCnt;     /* loop counters */
     const float16_t *pDualCoef = S->dualCoefficients;
-    float16_t       sum = S->intercept;
+    _Float16       sum = S->intercept;
     f16x8_t         vSum = vdupq_n_f16(0.0f);
 
     row = numRows;
diff --git a/CMSIS/DSP/Testing/TestScripts/doc/Format.py b/CMSIS/DSP/Testing/TestScripts/doc/Format.py
index dffb339..d38182d 100755
--- a/CMSIS/DSP/Testing/TestScripts/doc/Format.py
+++ b/CMSIS/DSP/Testing/TestScripts/doc/Format.py
@@ -11,7 +11,7 @@
         yield x
 
 # To format, in HTML, the cores in the right order.
-# First we order tje categories
+# First we order the categories
 # Then we order the cores in each category
 # The final ORDEREDCORES is what is used
-# to order tjhe values
+# to order the values
@@ -33,6 +33,8 @@
     quit()
   ORDEREDCORES += cores
 
+ORDEREDTYPES=["q7","q15","q31","u32","f16","f32","f64"]
+
 class Markdown:
   def __init__(self,output):
     self._id=0
@@ -485,13 +487,14 @@
     return(result)
 
 class HTML:
-  def __init__(self,output,regMode):
+  def __init__(self,output,regMode,reorder):
     self._id=0
     self._sectionID = 0
     self._barID = 0
     self._histID = 0
     self._output = output
     self._regMode = regMode
+    self._reorder = reorder
 
   def visitBarChart(self,bar):
       data=bar.data
@@ -545,7 +548,10 @@
         self._output.write(str(col))
         self._output.write("</th>\n")
 
-      perm,restricted=permutation(ORDEREDCORES,table.cores)
+      if self._reorder:
+         perm,restricted=permutation(ORDEREDCORES,table.cores)
+      else:
+         restricted = table.cores
 
       for col in restricted:
         if firstCore:
@@ -570,7 +576,10 @@
         params=row[0:nbParams]
         values=row[nbParams:]
 
-        row = params + reorder(perm,values)
+        if self._reorder:
+          row = params + reorder(perm,values)
+        else:
+          row = params + values
 
         for elem in row:
             if i < nbParams:
diff --git a/CMSIS/DSP/Testing/extractDb.py b/CMSIS/DSP/Testing/extractDb.py
index 3b9daaf..a723035 100755
--- a/CMSIS/DSP/Testing/extractDb.py
+++ b/CMSIS/DSP/Testing/extractDb.py
@@ -55,6 +55,7 @@
 parser.add_argument('-details', action='store_true', help="Details about runids")
 parser.add_argument('-lastid', action='store_true', help="Get last ID")
 parser.add_argument('-comments', nargs='?',type = str, default="comments.txt", help="Comment section")
+parser.add_argument('-byd', action='store_true', help="Result oganized by datatype")
 
 # For runid or runid range
 parser.add_argument('others', nargs=argparse.REMAINDER,help="Run ID")
@@ -685,7 +686,10 @@
           if args.t=="md":
              document.accept(Markdown(output))
           if args.t=="html":
-             document.accept(HTML(output,args.r))
+             reorder=True
+             if args.byc:
+               reorder=False
+             document.accept(HTML(output,args.r,reorder))
 
 finally:
      c.close()