CMSIS-NN: Add MVEI support for int16 max pooling (#1509)

Change-Id: Ia75af0245dbe471aae629110b53b0d0998afca2b
diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index ce57db3..63a1d97 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -14,6 +14,7 @@
        - Changed return types of all API's
        - Support for int16 average pooling DSP implementation
        - Support for DSP extension optimization for int16 depthwise_conv
+       - Support for MVEI extension optimization for int16 max pooling
     </release>
     <release version="5.9.0" date="2022-05-02">
       CMSIS-Core(M): 5.6.0
diff --git a/CMSIS/DoxyGen/NN/src/history.txt b/CMSIS/DoxyGen/NN/src/history.txt
index 466ee6b..61e8170 100644
--- a/CMSIS/DoxyGen/NN/src/history.txt
+++ b/CMSIS/DoxyGen/NN/src/history.txt
@@ -13,6 +13,7 @@
       <li> Replaced arm_status with arm_cmsis_nn_status struct </li>
       <li> Added DSP support in arm_avgpool_s16.c </li>
       <li> Added support for DSP extension optimization for int16 depthwise_conv </li>
+      <li> Added support for MVEI extension optimization for int16 max pooling </li>
       </ul>
     </td>
   </tr>
diff --git a/CMSIS/NN/README.md b/CMSIS/NN/README.md
index 6b5fd04..6621912 100644
--- a/CMSIS/NN/README.md
+++ b/CMSIS/NN/README.md
@@ -46,7 +46,7 @@
 || arm_avgpool_s8() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| Yes| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
 || arm_avgpool_s16() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| No| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
 || arm_maxpool_s8() | MAX POOL | None | None | Yes| Yes|  |
-|| arm_maxpool_s16() | MAX POOL | None | None | No| No|  |
+|| arm_maxpool_s16() | MAX POOL | None | None | No| Yes|  |
 |[Softmax](https://arm-software.github.io/CMSIS_5/NN/html/group__Softmax.html)||||| |  ||
 ||arm_softmax_q7()| SOFTMAX | None | None | Yes | No | Not bit exact to TFLu but can be up to 70x faster |
 ||arm_softmax_s8()| SOFTMAX | None | None | No | Yes | Bit exact to TFLu |
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
index ef60571..5fbe1f9 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_max_pool_s16.c
  * Description:  Pooling function implementations
  *
- * $Date:        19 April 2022
- * $Revision:    V.2.0.0
+ * $Date:        20 June 2022
+ * $Revision:    V.2.1.0
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -33,6 +33,20 @@
 
 static void compare_and_replace_if_larger(int16_t *base, const int16_t *target, int32_t length)
 {
+#if defined(ARM_MATH_MVEI)
+    int32_t loop_count = (length + 7) / 8;
+    for (int i = 0; i < loop_count; i++)
+    {
+        mve_pred16_t p = vctp16q((uint32_t)length);
+        const int16x8_t op_1 = vldrhq_z_s16(base, p);
+        const int16x8_t op_2 = vldrhq_z_s16(target, p);
+        const int16x8_t max = vmaxq_s16(op_1, op_2);
+        vstrhq_p_s16(base, max, p);
+        base += 8;
+        target += 8;
+        length -= 8;
+    }
+#else
     q15_t *dst = base;
     const q15_t *src = target;
     union arm_nnword ref_max;
@@ -65,10 +79,27 @@
             *dst = *src;
         }
     }
+#endif
 }
 
 static void clamp_output(int16_t *source, int32_t length, const int16_t act_min, const int16_t act_max)
 {
+#if defined(ARM_MATH_MVEI)
+    const int16x8_t min = vdupq_n_s16((int16_t)act_min);
+    const int16x8_t max = vdupq_n_s16((int16_t)act_max);
+
+    int32_t loop_count = (length + 7) / 8;
+    for (int i = 0; i < loop_count; i++)
+    {
+        mve_pred16_t p = vctp16q((uint32_t)length);
+        length -= 8;
+        const int16x8_t src = vldrhq_z_s16(source, p);
+        int16x8_t res = vmaxq_m_s16(vuninitializedq_s16(), src, min, p);
+        res = vminq_m_s16(vuninitializedq_s16(), res, max, p);
+        vstrhq_p_s16(source, res, p);
+        source += 8;
+    }
+#else
     union arm_nnword in;
     int32_t cnt = length >> 1;
 
@@ -92,6 +123,7 @@
         comp = MIN(comp, act_max);
         *source = comp;
     }
+#endif
 }
 
 /**