CMSIS-NN: Add MVEI support for int16 max pooling (#1509)
Change-Id: Ia75af0245dbe471aae629110b53b0d0998afca2b
diff --git a/ARM.CMSIS.pdsc b/ARM.CMSIS.pdsc
index ce57db3..63a1d97 100644
--- a/ARM.CMSIS.pdsc
+++ b/ARM.CMSIS.pdsc
@@ -14,6 +14,7 @@
- Changed return types of all API's
- Support for int16 average pooling DSP implementation
- Support for DSP extension optimization for int16 depthwise_conv
+ - Support for MVEI extension optimization for int16 max pooling
</release>
<release version="5.9.0" date="2022-05-02">
CMSIS-Core(M): 5.6.0
diff --git a/CMSIS/DoxyGen/NN/src/history.txt b/CMSIS/DoxyGen/NN/src/history.txt
index 466ee6b..61e8170 100644
--- a/CMSIS/DoxyGen/NN/src/history.txt
+++ b/CMSIS/DoxyGen/NN/src/history.txt
@@ -13,6 +13,7 @@
<li> Replaced arm_status with arm_cmsis_nn_status struct </li>
<li> Added DSP support in arm_avgpool_s16.c </li>
<li> Added support for DSP extension optimization for int16 depthwise_conv </li>
+ <li> Added support for MVEI extension optimization for int16 max pooling </li>
</ul>
</td>
</tr>
diff --git a/CMSIS/NN/README.md b/CMSIS/NN/README.md
index 6b5fd04..6621912 100644
--- a/CMSIS/NN/README.md
+++ b/CMSIS/NN/README.md
@@ -46,7 +46,7 @@
|| arm_avgpool_s8() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| Yes| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
|| arm_avgpool_s16() | AVERAGE POOL | None | input_ch * 4<br/>(DSP only) | Yes| No| Best case is when channels are multiple of 4 or <br/> at the least >= 4 |
|| arm_maxpool_s8() | MAX POOL | None | None | Yes| Yes| |
-|| arm_maxpool_s16() | MAX POOL | None | None | No| No| |
+|| arm_maxpool_s16() | MAX POOL | None | None | No| Yes| |
|[Softmax](https://arm-software.github.io/CMSIS_5/NN/html/group__Softmax.html)||||| | ||
||arm_softmax_q7()| SOFTMAX | None | None | Yes | No | Not bit exact to TFLu but can be up to 70x faster |
||arm_softmax_s8()| SOFTMAX | None | None | No | Yes | Bit exact to TFLu |
diff --git a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
index ef60571..5fbe1f9 100644
--- a/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_max_pool_s16.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
*
* SPDX-License-Identifier: Apache-2.0
*
@@ -21,8 +21,8 @@
* Title: arm_max_pool_s16.c
* Description: Pooling function implementations
*
- * $Date: 19 April 2022
- * $Revision: V.2.0.0
+ * $Date: 20 June 2022
+ * $Revision: V.2.1.0
*
* Target Processor: Cortex-M CPUs
*
@@ -33,6 +33,20 @@
static void compare_and_replace_if_larger(int16_t *base, const int16_t *target, int32_t length)
{
+#if defined(ARM_MATH_MVEI)
+ int32_t loop_count = (length + 7) / 8;
+ for (int i = 0; i < loop_count; i++)
+ {
+ mve_pred16_t p = vctp16q((uint32_t)length);
+ const int16x8_t op_1 = vldrhq_z_s16(base, p);
+ const int16x8_t op_2 = vldrhq_z_s16(target, p);
+ const int16x8_t max = vmaxq_s16(op_1, op_2);
+ vstrhq_p_s16(base, max, p);
+ base += 8;
+ target += 8;
+ length -= 8;
+ }
+#else
q15_t *dst = base;
const q15_t *src = target;
union arm_nnword ref_max;
@@ -65,10 +79,27 @@
*dst = *src;
}
}
+#endif
}
static void clamp_output(int16_t *source, int32_t length, const int16_t act_min, const int16_t act_max)
{
+#if defined(ARM_MATH_MVEI)
+ const int16x8_t min = vdupq_n_s16((int16_t)act_min);
+ const int16x8_t max = vdupq_n_s16((int16_t)act_max);
+
+ int32_t loop_count = (length + 7) / 8;
+ for (int i = 0; i < loop_count; i++)
+ {
+ mve_pred16_t p = vctp16q((uint32_t)length);
+ length -= 8;
+ const int16x8_t src = vldrhq_z_s16(source, p);
+ int16x8_t res = vmaxq_m_s16(vuninitializedq_s16(), src, min, p);
+ res = vminq_m_s16(vuninitializedq_s16(), res, max, p);
+ vstrhq_p_s16(source, res, p);
+ source += 8;
+ }
+#else
union arm_nnword in;
int32_t cnt = length >> 1;
@@ -92,6 +123,7 @@
comp = MIN(comp, act_max);
*source = comp;
}
+#endif
}
/**