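CMSIS-DSP: Improve tests on matrix inversions

Also fix an internal pointer bug in the pivot search code: the row index is
applied as an offset from the current pivot row pointer, so the loop must
iterate from 1 to numRows - l - 1 instead of from l + 1 to numRows - 1.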
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c
index 3f0a923..f40dcb3 100755
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c
@@ -201,7 +201,7 @@
                 /*
                  * Loop over the number rows present below
                  */
-                for (i = (l + 1U); i < numRows; i++)
+                for (i = 1U; i < numRows-l; i++)
                 {
                     /*
                      * Update the input and destination pointers
@@ -680,7 +680,7 @@
       {
         /* Loop over the number rows present below */
 
-        for (i = (l + 1U); i < numRows; i++)
+        for (i = 1U; i < numRows-l; i++)
         {
           /* Update the input and destination pointers */
           pInT2 = pInT1 + (numCols * i);
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c
index b897240..b0ef760 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c
@@ -219,7 +219,7 @@
                 /*
                  * Loop over the number rows present below
                  */
-                for (i = (l + 1U); i < numRows; i++)
+                for (i = 1U; i < numRows-l; i++)
                 {
                     /*
                      * Update the input and destination pointers
@@ -698,7 +698,7 @@
       if (*pInT1 == 0.0f)
       {
         /* Loop over the number rows present below */
-        for (i = (l + 1U); i < numRows; i++)
+        for (i = 1U; i < numRows - l; i++)
         {
           /* Update the input and destination pointers */
           pInT2 = pInT1 + (numCols * i);
@@ -1105,7 +1105,7 @@
       {
         /* Loop over the number rows present below */
 
-        for (i = (l + 1U); i < numRows; i++)
+        for (i = 1U; i < numRows - l; i++)
         {
           /* Update the input and destination pointers */
           pInT2 = pInT1 + (numCols * i);
@@ -1407,7 +1407,7 @@
       if (*pInT1 == 0.0f)
       {
         /* Loop over the number rows present below */
-        for (i = (l + 1U); i < numRows; i++)
+        for (i = 1U; i < numRows-l; i++)
         {
           /* Update the input and destination pointers */
           pInT2 = pInT1 + (numCols * i);
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c
index 141dff0..bf99a79 100644
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c
@@ -63,7 +63,7 @@
 #if defined (ARM_MATH_DSP)
 
   float64_t Xchg, in = 0.0, in1;                /* Temporary input values  */
-  uint32_t i, rowCnt, flag = 0U, j, loopCnt,k, l;      /* loop counters */
+  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;      /* loop counters */
   arm_status status;                             /* status of matrix inverse */
 
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -174,11 +174,14 @@
       /* Temporary variable to hold the pivot value */
       in = *pInT1;
 
+    
+
       /* Check if the pivot element is zero */
       if (*pInT1 == 0.0)
       {
         /* Loop over the number rows present below */
-        for (i = (l + 1U); i < numRows; i++)
+
+        for (i = 1U; i < numRows - l; i++)
         {
           /* Update the input and destination pointers */
           pInT2 = pInT1 + (numCols * i);
@@ -224,6 +227,8 @@
             break;
           }
 
+
+          /* Loop counter i is incremented by the for statement */
         }
       }
 
@@ -474,13 +479,11 @@
       /* Temporary variable to hold the pivot value */
       in = *pInT1;
 
-
-
       /* Check if the pivot element is zero */
       if (*pInT1 == 0.0)
       {
         /* Loop over the number rows present below */
-        for (i = (l + 1U); i < numRows; i++)
+        for (i = 1U; i < numRows-l; i++)
         {
           /* Update the input and destination pointers */
           pInT2 = pInT1 + (numCols * i);
@@ -513,11 +516,10 @@
             /* Break after exchange is done */
             break;
           }
-
- 
         }
       }
 
+
       /* Update the status if the matrix is singular */
       if ((flag != 1U) && (in == 0.0))
       {
diff --git a/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF16.cpp b/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF16.cpp
index 7504005..2b3745a 100755
--- a/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF16.cpp
+++ b/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF16.cpp
@@ -299,6 +299,22 @@
 
     }
 
+static void refInnerTail(float16_t *b) /* Write a known 4-value pattern at b[0..3] */
+{ /* The test calls this with outp+(rows*columns), i.e. the address just past the current output matrix */
+    b[0] = 1.0f16;
+    b[1] = -2.0f16;
+    b[2] = 3.0f16;
+    b[3] = -4.0f16;
+}
+
+static void checkInnerTail(float16_t *b) /* Assert b[0..3] still hold the pattern written by refInnerTail */
+{ /* Called after the inversion, once outp has been advanced by rows*columns, so b is the address refInnerTail filled */
+    ASSERT_TRUE(b[0] == 1.0f16);
+    ASSERT_TRUE(b[1] == -2.0f16);
+    ASSERT_TRUE(b[2] == 3.0f16);
+    ASSERT_TRUE(b[3] == -4.0f16);
+}
+
 void UnaryTestsF16::test_mat_inverse_f16()
     {     
       const float16_t *inp1=input1.ptr();    
@@ -319,15 +335,18 @@
 
           PREPAREDATA1(false);
 
+          refInnerTail(outp+(rows * columns));
+
           status=arm_mat_inverse_f16(&this->in1,&this->out);
           ASSERT_TRUE(status==ARM_MATH_SUCCESS);
 
           outp += (rows * columns);
           inp1 += (rows * columns);
 
+          checkInnerTail(outp);
+
       }
 
-      ASSERT_EMPTY_TAIL(output);
 
       ASSERT_SNR(output,ref,(float16_t)SNR_THRESHOLD_INV);
 
diff --git a/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF32.cpp b/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF32.cpp
index a5120fe..a024beb 100755
--- a/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF32.cpp
+++ b/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF32.cpp
@@ -323,6 +323,24 @@
 
     }
 
+static void refInnerTail(float32_t *b) /* Write a known 4-value pattern at b[0..3] */
+{ /* The test calls this with outp+(rows*columns), i.e. the address just past the current output matrix */
+    b[0] = 1.0f;
+    b[1] = -2.0f;
+    b[2] = 3.0f;
+    b[3] = -4.0f;
+}
+
+static void checkInnerTail(float32_t *b) /* Assert b[0..3] still hold the pattern written by refInnerTail */
+{ /* Called after the inversion, once outp has been advanced by rows*columns, so b is the address refInnerTail filled */
+    ASSERT_TRUE(b[0] == 1.0f);
+    ASSERT_TRUE(b[1] == -2.0f);
+    ASSERT_TRUE(b[2] == 3.0f);
+    ASSERT_TRUE(b[3] == -4.0f);
+}
+
+
+
 void UnaryTestsF32::test_mat_inverse_f32()
     {     
       const float32_t *inp1=input1.ptr();    
@@ -343,15 +361,18 @@
 
           PREPAREDATA1(false);
 
+          refInnerTail(outp+(rows * columns));
+
           status=arm_mat_inverse_f32(&this->in1,&this->out);
           ASSERT_TRUE(status==ARM_MATH_SUCCESS);
 
           outp += (rows * columns);
           inp1 += (rows * columns);
 
+          checkInnerTail(outp);
+
       }
 
-      ASSERT_EMPTY_TAIL(output);
 
       ASSERT_SNR(output,ref,(float32_t)SNR_THRESHOLD_INV);
 
diff --git a/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF64.cpp b/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF64.cpp
index a839fad..1073c63 100755
--- a/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF64.cpp
+++ b/CMSIS/DSP/Testing/Source/Tests/UnaryTestsF64.cpp
@@ -171,6 +171,24 @@
 
 }
 
+/*
+
+The test framework only adds 16 bytes of free memory after the end of a buffer,
+so the out-of-buffer write check is limited to 2 float64 values.
+*/
+static void refInnerTail(float64_t *b) /* Write a known 2-value pattern at b[0..1] */
+{ /* The test calls this with outp+(rows*columns), i.e. the address just past the current output matrix */
+    b[0] = 1.0;
+    b[1] = -2.0;
+}
+
+static void checkInnerTail(float64_t *b) /* Assert b[0..1] still hold the pattern written by refInnerTail */
+{ /* Called after the inversion, once outp has been advanced by rows*columns, so b is the address refInnerTail filled */
+    ASSERT_TRUE(b[0] == 1.0);
+    ASSERT_TRUE(b[1] == -2.0);
+}
+
+
 void UnaryTestsF64::test_mat_inverse_f64()
     {     
       const float64_t *inp1=input1.ptr();    
@@ -183,7 +201,7 @@
       int rows,columns;                      
       int i;
       arm_status status;
-
+      
       for(i=0;i < nbMatrixes ; i ++)
       {
           rows = *dimsp++;
@@ -191,15 +209,18 @@
 
           PREPAREDATA1(false);
 
+          refInnerTail(outp+(rows * columns));
+
           status=arm_mat_inverse_f64(&this->in1,&this->out);
           ASSERT_TRUE(status==ARM_MATH_SUCCESS);
 
           outp += (rows * columns);
           inp1 += (rows * columns);
 
+          checkInnerTail(outp);
+
       }
 
-      ASSERT_EMPTY_TAIL(output);
 
       ASSERT_SNR(output,ref,(float64_t)SNR_THRESHOLD);