CMSIS-NN: update MVE intrinsics usage and predication (#1554)

diff --git a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c
index adfa702..22bba53 100644
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2021 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_mat_mult_s8.c
  * Description:  General Matrix-multiplication function
  *
- * $Date:        27. October 2021
- * $Revision:    V.2.0.6
+ * $Date:        16 August 2022
+ * $Revision:    V.2.0.7
  *
  * Target Processor:  Cortex-M cores
  * -------------------------------------------------------------------- */
@@ -73,7 +73,7 @@
             for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
             {
                 mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
-                const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
+                const int16x8_t offset = vdupq_x_n_s16(col_offset, p);
                 row_len_tmp -= 8;
 
                 int16x8_t c0 = vldrbq_s16(ip_c0);
@@ -133,7 +133,7 @@
                 for (int i_row_loop = 0; i_row_loop < row_loop_cnt; i_row_loop++)
                 {
                     const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
-                    const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
+                    const int16x8_t offset = vdupq_x_n_s16(col_offset, p);
                     row_len_tmp -= 8;
 
                     int16x8_t c0 = vldrbq_s16(ip_c0);
diff --git a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
index 608019a..d708b34 100644
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_vec_mat_mult_t_s8
  * Description:  s8 vector by matrix (transposed) multiplication
  *
- * $Date:        2 May 2022
- * $Revision:    V.4.0.1
+ * $Date:        16 August 2022
+ * $Revision:    V.4.0.2
  *
  * Target Processor:  Cortex-M
  *
@@ -115,7 +115,7 @@
         if (bias)
         {
             int32x4_t b = vldrwq_z_s32(bias, p);
-            acc = vaddq_m_s32(vuninitializedq_s32(), acc, b, p);
+            acc = vaddq_x_s32(acc, b, p);
             bias += 3;
         }
         const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0};
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
index 5fbe1f9..a9886b7 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
@@ -21,8 +21,8 @@
  * Title:        arm_max_pool_s16.c
  * Description:  Pooling function implementations
  *
- * $Date:        20 June 2022
- * $Revision:    V.2.1.0
+ * $Date:        16 August 2022
+ * $Revision:    V.2.1.1
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -94,8 +94,8 @@
         mve_pred16_t p = vctp16q((uint32_t)length);
         length -= 8;
         const int16x8_t src = vldrhq_z_s16(source, p);
-        int16x8_t res = vmaxq_m_s16(vuninitializedq_s16(), src, min, p);
-        res = vminq_m_s16(vuninitializedq_s16(), res, max, p);
+        int16x8_t res = vmaxq_x_s16(src, min, p);
+        res = vminq_x_s16(res, max, p);
         vstrhq_p_s16(source, res, p);
         source += 8;
     }
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
index b5cdc87..6ebc788 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s8.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_max_pool_s8.c
  * Description:  Pooling function implementations
  *
- * $Date:        19 April 2022
- * $Revision:    V.3.0.0
+ * $Date:        16 August 2022
+ * $Revision:    V.3.0.1
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -40,7 +40,7 @@
         mve_pred16_t p = vctp8q((uint32_t)length);
         const int8x16_t op_1 = vldrbq_z_s8(base, p);
         const int8x16_t op_2 = vldrbq_z_s8(target, p);
-        const int8x16_t max = vmaxq_m_s8(vuninitializedq_s8(), op_1, op_2, p);
+        const int8x16_t max = vmaxq_x_s8(op_1, op_2, p);
         vstrbq_p_s8(base, max, p);
         base += 16;
         target += 16;
@@ -98,15 +98,16 @@
 {
 #if defined(ARM_MATH_MVEI)
     int32_t loop_count = (length + 15) / 16;
+    const int8x16_t vmin = vdupq_n_s8((int8_t)act_min);
+    const int8x16_t vmax = vdupq_n_s8((int8_t)act_max);
+
     for (int i = 0; i < loop_count; i++)
     {
         mve_pred16_t p = vctp8q((uint32_t)length);
         length -= 16;
         const int8x16_t src = vldrbq_z_s8(source, p);
-        const int8x16_t predicated_min = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_min, p);
-        const int8x16_t predicated_max = vdupq_m_n_s8(vuninitializedq_s8(), (int8_t)act_max, p);
-        int8x16_t res = vmaxq_m_s8(vuninitializedq_s8(), src, predicated_min, p);
-        res = vminq_m_s8(vuninitializedq_s8(), res, predicated_max, p);
+        int8x16_t res = vmaxq_x_s8(src, vmin, p);
+        res = vminq_x_s8(res, vmax, p);
         vstrbq_p_s8(source, res, p);
         source += 16;
     }