CMSIS-DSP: Improvements for building with gcc on M55.

Solves most of the f16 issues, but some build issues still remain
with gcc10q4.

Two functions fall back to their scalar versions when built with gcc on M55
(since the Helium versions of those functions do not build).
diff --git a/CMSIS/DSP/Include/arm_helium_utils.h b/CMSIS/DSP/Include/arm_helium_utils.h
index df49734..93645a3 100755
--- a/CMSIS/DSP/Include/arm_helium_utils.h
+++ b/CMSIS/DSP/Include/arm_helium_utils.h
@@ -122,7 +122,7 @@
      *  re0+re1 | im0+im1 | re0+re1 | im0+im1
      *  re2+re3 | im2+im3 | re2+re3 | im2+im3
      */
-    vecTmp = vaddq(vecTmp, vecIn);
+    vecTmp = vaddq_f16(vecTmp, vecIn);
     vecOut = vecTmp;
     /*
      * shift left, random tmp insertion in bottom
@@ -133,7 +133,7 @@
      *    DONTCARE     |    DONTCARE     | re0+re1+re0+re1 |im0+im1+im0+im1
      * re0+re1+re2+re3 | im0+im1+im2+im3 | re2+re3+re2+re3 |im2+im3+im2+im3
      */
-    vecOut = vaddq(vecOut, vecTmp);
+    vecOut = vaddq_f16(vecOut, vecTmp);
     /*
      * Cmplx sum is in 4rd & 5th f16 elt
      * return full vector
diff --git a/CMSIS/DSP/Include/arm_math.h b/CMSIS/DSP/Include/arm_math.h
index 404ee91..98bc9f9 100644
--- a/CMSIS/DSP/Include/arm_math.h
+++ b/CMSIS/DSP/Include/arm_math.h
@@ -49,6 +49,14 @@
    * The library has generally separate functions for operating on 8-bit integers, 16-bit integers,
    * 32-bit integer and 32-bit floating-point values.
    *
+   * The library provides vectorized versions of most algorithms for Helium
+   * and of most f32 algorithms for Neon.
+   *
+   * When using a vectorized version, provide a small amount of padding (3 words) after
+   * the end of each buffer, because the vectorized code may read slightly past the end
+   * of a buffer. You don't have to modify your buffers; just ensure that the
+   * end of buffer + padding is not outside of a memory region.
+   *
    * \section using Using the Library
    *
    * The library installer contains prebuilt versions of the libraries in the <code>Lib</code> folder.
diff --git a/CMSIS/DSP/Include/arm_math_types.h b/CMSIS/DSP/Include/arm_math_types.h
index 7c87b36..01e18a7 100755
--- a/CMSIS/DSP/Include/arm_math_types.h
+++ b/CMSIS/DSP/Include/arm_math_types.h
@@ -110,10 +110,7 @@
     #define ARM_MATH_MVEF
   #endif
   #if !defined(ARM_MATH_MVE_FLOAT16)
-  /* HW Float16 not yet well supported on gcc for M55 */
-    #if !defined(__CMSIS_GCC_H)
        #define ARM_MATH_MVE_FLOAT16
-    #endif
   #endif
 #endif
 
@@ -130,10 +127,7 @@
   #endif
 
   #if !defined(ARM_MATH_MVE_FLOAT16)
-    /* HW Float16 not yet well supported on gcc for M55 */
-    #if !defined(__CMSIS_GCC_H)
        #define ARM_MATH_MVE_FLOAT16
-    #endif
   #endif
 #endif
 
diff --git a/CMSIS/DSP/PythonWrapper/setup.py b/CMSIS/DSP/PythonWrapper/setup.py
index defb900..2d6b8cc 100644
--- a/CMSIS/DSP/PythonWrapper/setup.py
+++ b/CMSIS/DSP/PythonWrapper/setup.py
@@ -11,11 +11,8 @@
 
 if sys.platform == 'win32':
   cflags = ["-DWIN",config.cflags,"-DUNALIGNED_SUPPORT_DISABLE"] 
-  # Custom because a customized arm_math.h is required to build on windows
-  # since the visual compiler and the win platform are
-  # not supported by default in arm_math.h
 else:
-  cflags = ["-Wno-unused-variable","-Wno-implicit-function-declaration",config.cflags,"-D__GNUC_PYTHON__"]
+  cflags = ["-Wno-attributes","-Wno-unused-function","-Wno-unused-variable","-Wno-implicit-function-declaration",config.cflags,"-D__GNUC_PYTHON__"]
 
 transform = glob.glob(os.path.join(ROOT,"Source","TransformFunctions","*.c"))
 #transform.remove(os.path.join(ROOT,"Source","TransformFunctions","arm_dct4_init_q15.c"))
@@ -69,18 +66,18 @@
 allsrcs = allsrcs + controller + transform + modulesrc + common+ interpolation
 
 def notf16(number):
-  if re.match(r'^.*_f16.c$',number):
+  if re.search(r'f16',number):
      return(False)
-  else:
-     return(True)
+  if re.search(r'F16',number):
+     return(False)
+  return(True)
 
-# If there are too many files, the linker command in failing on Windows.
+# If there are too many files, the linker command is failing on Windows.
 # So f16 functions are removed since they are not currently available in the wrapper.
 # A next version will have to structure this wrapper more cleanly so that the
 # build can work even with more functions
 srcs = list(filter(notf16, allsrcs))
 
-
 module1 = Extension(config.extensionName,
                     sources = (srcs
                               )
diff --git a/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_f16.c b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_f16.c
index e4df864..cb52180 100755
--- a/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_f16.c
+++ b/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_f16.c
@@ -56,7 +56,8 @@
   @return        none
  */
 
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) 
 
 #include "arm_helium_utils.h"
 
diff --git a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
index 11f4e6e..0c3da12 100755
--- a/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
+++ b/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
@@ -46,7 +46,12 @@
   @param[in]     blockSize number of samples to process
   @return        none
  */
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_biquad_cascade_stereo_df2T_f16 built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
 void arm_biquad_cascade_stereo_df2T_f16(
   const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
   const float16_t * pSrc,
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
index 977cba7..358dde3 100755
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
@@ -50,7 +50,12 @@
                    - \ref ARM_MATH_SUCCESS       : Operation successful
                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
  */
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_mat_cmplx_mult_f16 built. Helium version has build issues with gcc."
+#endif 
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) &&  !defined(__CMSIS_GCC_H)
 
 #include "arm_helium_utils.h"
 
@@ -382,7 +387,7 @@
     uint16_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
     uint16_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
     uint16_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
-    uint16_t  col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
+    uint16_t  col, i = 0U, row = numRowsA;  /* loop counters */
     arm_status status;          /* status of matrix multiplication */
     uint16x8_t vecOffs, vecColBOffs;
     uint32_t  blkCnt,rowCnt;           /* loop counters */
@@ -466,7 +471,6 @@
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
             float16_t const *pInA0 = pInA;
@@ -612,7 +616,6 @@
             /*
              * Matrix A columns number of MAC operations are to be performed
              */
-            colCnt = numColsA;
 
             float16_t const *pSrcA0Vec;
             float16_t const *pInA0 = pInA;
diff --git a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f16.c b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f16.c
index e15a6b3..1530e79 100755
--- a/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f16.c
+++ b/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f16.c
@@ -87,7 +87,7 @@
     /*
      * move to 2nd column of matrix A
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
     /*
      * load {a01 a01 a11 a11 x x x x}
      */
@@ -95,7 +95,7 @@
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 2);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 2);
     /*
      * load {b10, b11, b10, b11, x x x x }
      */
@@ -157,7 +157,7 @@
     /*
      * move to 2nd column of matrix A
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
     /*
      * load {a01 a01 a01 a11 a11 a11 a21 a21}
      */
@@ -165,7 +165,7 @@
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
     /*
      * load {b10, b11, b12, b10, b11, b12, b10, b11}
      */
@@ -179,7 +179,7 @@
     /*
      * move to 3rd column of matrix A
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 1);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
     /*
      * load {a02 a02 a02 a12 a12 a12 a22 a22}
      */
@@ -187,7 +187,7 @@
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 3);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
     /*
      * load {b20, b21, b22, b20, b21, b22, b20, b21}
      */
@@ -253,7 +253,7 @@
     /*
      * jump 2 x A rows (2nd half of matrix)
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
     /*
      * load {a20 a20 a20 a20 a30 a30 a30 a30}
      */
@@ -274,7 +274,7 @@
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
     /*
      * load {b10, b11, b12, b13, b10, b11, b12, b13}
      */
@@ -287,7 +287,7 @@
     /*
      * jump 2 x A rows (2nd half of matrix)
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
     /*
      * load {a21 a21 a21 a21 a31 a31 a31 a31}
      */
@@ -309,7 +309,7 @@
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
     /*
      * load {b20, b21, b22, b23, b20, b21, b22, b23}
      */
@@ -322,7 +322,7 @@
     /*
      * jump 2 x A rows
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
 
     /*
      * load {a22 a22 a22 a22 a32 a32 a32 a32}
@@ -345,7 +345,7 @@
     /*
      * move to next B row
      */
-    vecOffsB = vaddq(vecOffsB, (uint16_t) 4);
+    vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
     /*
      * load {b30, b31, b32, b33, b30, b31, b32, b33}
      */
@@ -358,7 +358,7 @@
     /*
      * jump 2 x A rows
      */
-    vecOffsA = vaddq(vecOffsA, (uint16_t) 8);
+    vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
     /*
      * load {a23 a23 a23 a23 a33 a33 a33 a33}
      */
diff --git a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c
index 7dcce7f..738cd43 100755
--- a/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c
+++ b/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c
@@ -275,7 +275,7 @@
 
         /*  Initializations of Instance structure depending on the FFT length */
         switch (S->fftLen) {
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
             /*  Initializations of structure parameters for 4096 point FFT */
         case 4096U:
             /*  Initialise the bit reversal table modifier */
@@ -283,7 +283,7 @@
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
             /*  Initializations of structure parameters for 2048 point FFT */
         case 2048U:
             /*  Initialise the bit reversal table modifier */
@@ -292,7 +292,7 @@
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
             /*  Initializations of structure parameters for 1024 point FFT */
         case 1024U:
             /*  Initialise the bit reversal table modifier */
@@ -301,7 +301,7 @@
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
             /*  Initializations of structure parameters for 512 point FFT */
         case 512U:
             /*  Initialise the bit reversal table modifier */
@@ -309,31 +309,31 @@
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
         case 256U:
             FFTINIT(q15,256);
             break;
 #endif
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
         case 128U:
             FFTINIT(q15,128);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
         case 64U:
             FFTINIT(q15,64);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
         case 32U:
             FFTINIT(q15,32);
             break;
 #endif 
 
-#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
         case 16U:
             /*  Initializations of structure parameters for 16 point FFT */
             FFTINIT(q15,16);
diff --git a/CMSIS/DSP/Testing/PatternGeneration/Tools.py b/CMSIS/DSP/Testing/PatternGeneration/Tools.py
index abda8d4..70e6b2a 100755
--- a/CMSIS/DSP/Testing/PatternGeneration/Tools.py
+++ b/CMSIS/DSP/Testing/PatternGeneration/Tools.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 def normalize(a):
-  return(a/max(np.abs(a)))
+  return(a/np.max(np.abs(a)))
 
 TAILONLY = 1
 BODYONLY = 2
diff --git a/CMSIS/DSP/Toolchain/GCC.cmake b/CMSIS/DSP/Toolchain/GCC.cmake
index 4e0dcf1..c0e051d 100644
--- a/CMSIS/DSP/Toolchain/GCC.cmake
+++ b/CMSIS/DSP/Toolchain/GCC.cmake
@@ -43,6 +43,7 @@
 
   # Need to add other gcc config for other cortex-m cores
   if (ARM_CPU STREQUAL "cortex-m55" )
+     target_compile_options(${PROJECTNAME} PUBLIC "-march=armv8.1-m.main+mve.fp+fp.dp")
      target_compile_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16")
      target_link_options(${PROJECTNAME} PUBLIC "-mfpu=fpv5-d16")
   endif()
diff --git a/CMSIS/DSP/gcc.cmake b/CMSIS/DSP/gcc.cmake
index 3f7b6e6..9cb2f42 100644
--- a/CMSIS/DSP/gcc.cmake
+++ b/CMSIS/DSP/gcc.cmake
@@ -6,26 +6,25 @@
 
 
 
-#SET(tools "C:/PROGRA~2/GNUTOO~1/82018-~1")
 
-#SET(CMAKE_C_COMPILER "${tools}/bin/arm-none-eabi-gcc.exe")
-#SET(CMAKE_CXX_COMPILER "${tools}/bin/arm-none-eabi-g++.exe")
-#SET(CMAKE_ASM_COMPILER "${tools}/bin/arm-none-eabi-gcc.exe")
+#SET(CMAKE_C_COMPILER "${tools}/bin/arm-none-eabi-gcc")
+#SET(CMAKE_CXX_COMPILER "${tools}/bin/arm-none-eabi-g++")
+#SET(CMAKE_ASM_COMPILER "${tools}/bin/arm-none-eabi-gcc")
 
 find_program(CMAKE_C_COMPILER NAMES arm-none-eabi-gcc arm-none-eabi-gcc.exe)
 find_program(CMAKE_CXX_COMPILER NAMES arm-none-eabi-g++ arm-none-eabi-g++.exe)
 find_program(CMAKE_ASM_COMPILER NAMES arm-none-eabi-gcc arm-none-eabi-gcc.exe)
 
-#SET(CMAKE_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")
-find_program(CMAKE_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
-find_program(CMAKE_CXX_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
-find_program(CMAKE_C_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
+SET(CMAKE_AR "${tools}/bin/ar")
+SET(CMAKE_CXX_COMPILER_AR "${tools}/bin/ar")
+SET(CMAKE_C_COMPILER_AR "${tools}/bin/ar")
+
+#find_program(CMAKE_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe )
+#find_program(CMAKE_CXX_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe )
+#find_program(CMAKE_C_COMPILER_AR NAMES arm-none-eabi-gcc-ar arm-none-eabi-gcc-ar.exe)
 
 
-#SET(CMAKE_CXX_COMPILER_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")
-#SET(CMAKE_C_COMPILER_AR "${tools}/bin/arm-none-eabi-gcc-ar.exe")
-
-#SET(CMAKE_LINKER "${tools}/bin/arm-none-eabi-g++.exe")
+#SET(CMAKE_LINKER "${tools}/bin/arm-none-eabi-g++")
 find_program(CMAKE_LINKER NAMES arm-none-eabi-g++ arm-none-eabi-g++.exe)
 
 SET(CMAKE_C_LINK_EXECUTABLE "<CMAKE_LINKER> <LINK_FLAGS> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>")
@@ -48,10 +47,17 @@
     )
 endif(NOT ARM_CPU)
 
+if (ARM_CPU STREQUAL "cortex-m55")
+SET(CMAKE_C_FLAGS "-ffunction-sections -fdata-sections -march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "C compiler common flags")
+SET(CMAKE_CXX_FLAGS "-ffunction-sections -fdata-sections -march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "C compiler common flags")
+SET(CMAKE_ASM_FLAGS "-march=armv8.1-m.main+mve.fp+fp.dp" CACHE INTERNAL "ASM compiler common flags")
+SET(CMAKE_EXE_LINKER_FLAGS "-fno-use-linker-plugin -march=armv8.1-m.main+mve.fp+fp.dp"  CACHE INTERNAL "linker flags")
+else()
 SET(CMAKE_C_FLAGS "-ffunction-sections -fdata-sections -mcpu=${ARM_CPU}" CACHE INTERNAL "C compiler common flags")
 SET(CMAKE_CXX_FLAGS "-ffunction-sections -fdata-sections -mcpu=${ARM_CPU}" CACHE INTERNAL "C compiler common flags")
 SET(CMAKE_ASM_FLAGS "-mcpu=${ARM_CPU}" CACHE INTERNAL "ASM compiler common flags")
 SET(CMAKE_EXE_LINKER_FLAGS "-mcpu=${ARM_CPU}"  CACHE INTERNAL "linker flags")
+endif()
 
 get_property(IS_IN_TRY_COMPILE GLOBAL PROPERTY IN_TRY_COMPILE)
 if(IS_IN_TRY_COMPILE)