TFTF: Fix AArch32 zeromem() function

This patch fixes the following bugs in zeromem() Aarch32 variant:
- Removed the ASM_ASSERT for the length parameter passed in the R1
  register. It was causing an assertion failure in tftf_entrypoint() code
 	ldr	r0, =__BSS_START__
	ldr	r1, =__BSS_SIZE__
	bl	zeromem
  when __BSS_SIZE__ is not 4-byte aligned:
  0x000000000000cced __BSS_SIZE__ = SIZEOF (.bss)
- With ENABLE_ASSERTIONS = 0 for RELEASE builds and R1 not 4-byte
  aligned, zeromem() was writing 0 into all of memory in an infinite
  z_loop, because of the code:
  z_loop:
	cmp	r2, r0
	beq	z_end
	str	r1, [r0], #4
	b	z_loop
  with R0 being increased by 4 on each iteration; when the length in R1
  is not a multiple of 4, the R2 = R0 exit condition would never be met.
- A BLT (signed less-than) instruction was used to compare the
  'unsigned int length' value in R1; changed to BLO (unsigned lower).
This patch also fixes the same BLT-vs-BLO signed comparison bug in
'memcpy4()' and optimises the function itself.

Signed-off-by: Alexei Fedorov <Alexei.Fedorov@arm.com>
Change-Id: I8128399681def8ba80241882e355c3ca2778605f
diff --git a/lib/aarch32/misc_helpers.S b/lib/aarch32/misc_helpers.S
index ab37be9..6cabea2 100644
--- a/lib/aarch32/misc_helpers.S
+++ b/lib/aarch32/misc_helpers.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, Arm Limited. All rights reserved.
+ * Copyright (c) 2018-2020, Arm Limited. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
@@ -16,24 +16,29 @@
  * void zeromem(void *mem, unsigned int length);
  *
  * Initialise a memory region to 0.
- * The memory address and length must be 4-byte aligned.
+ * The memory address must be 4-byte aligned.
  * -----------------------------------------------------------------------
  */
 func zeromem
 #if ENABLE_ASSERTIONS
 	tst	r0, #0x3
 	ASM_ASSERT(eq)
-	tst	r1, #0x3
-	ASM_ASSERT(eq)
 #endif
-	add	r2, r0, r1
-	mov	r1, #0
-z_loop:
-	cmp	r2, r0
-	beq	z_end
-	str	r1, [r0], #4
-	b	z_loop
-z_end:
+	mov	r2, #0
+/* zero 4 bytes at a time */
+z_loop4:
+	cmp	r1, #4
+	blo	z_loop1
+	str	r2, [r0], #4
+	subs	r1, r1, #4
+	bne	z_loop4
+	bx	lr
+
+/* zero byte per byte */
+z_loop1:
+	strb	r2, [r0], #1
+	subs	r1, r1, #1
+	bne	z_loop1
 	bx	lr
 endfunc zeromem
 
@@ -54,20 +59,19 @@
 /* copy 4 bytes at a time */
 m_loop4:
 	cmp	r2, #4
-	blt	m_loop1
+	blo	m_loop1
 	ldr	r3, [r1], #4
 	str	r3, [r0], #4
-	sub	r2, r2, #4
-	b	m_loop4
+	subs	r2, r2, #4
+	bne	m_loop4
+	bx	lr
+
 /* copy byte per byte */
 m_loop1:
-	cmp	r2,#0
-	beq	m_end
 	ldrb	r3, [r1], #1
 	strb	r3, [r0], #1
 	subs	r2, r2, #1
 	bne	m_loop1
-m_end:
 	bx	lr
 endfunc memcpy4