SPMC: save other world FP state at S-EL2

This change saves the FP/NEON/SIMD state of the incoming world in the
other world VM vCPU when entering the SPMC, and restores the other world
state when exiting. This avoids doing the save/restore at EL3/SPMD and
later having to cater for the larger EL3 footprint needed by SVE.

Change-Id: I93923340db60a2262c4deb5d1ca1f391434d1a4d
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/src/arch/aarch64/hypervisor/exceptions.S b/src/arch/aarch64/hypervisor/exceptions.S
index 0bc59c0..bba56ea 100644
--- a/src/arch/aarch64/hypervisor/exceptions.S
+++ b/src/arch/aarch64/hypervisor/exceptions.S
@@ -96,6 +96,28 @@
 .endm
 
 /**
+ * Save/restore q0-q31 with op (stp/ldp) at [reg]; reg ends 512 bytes further.
+ */
+.macro simd_op_vectors op reg
+	\op q0, q1, [\reg], #32	/* Post-indexed: base advances 32 bytes per pair. */
+	\op q2, q3, [\reg], #32
+	\op q4, q5, [\reg], #32
+	\op q6, q7, [\reg], #32
+	\op q8, q9, [\reg], #32
+	\op q10, q11, [\reg], #32
+	\op q12, q13, [\reg], #32
+	\op q14, q15, [\reg], #32
+	\op q16, q17, [\reg], #32
+	\op q18, q19, [\reg], #32
+	\op q20, q21, [\reg], #32
+	\op q22, q23, [\reg], #32
+	\op q24, q25, [\reg], #32
+	\op q26, q27, [\reg], #32
+	\op q28, q29, [\reg], #32
+	\op q30, q31, [\reg], #32	/* Base now points just past the q31 slot. */
+.endm
+
+/**
  * The following is the exception table. A pointer to it will be stored in
  * register vbar_el2.
  */
@@ -296,25 +318,10 @@
 	/* Save floating point registers. */
 	/* Use x28 as the base. */
 	add x28, x1, #VCPU_FREGS
-	stp q0, q1, [x28], #32
-	stp q2, q3, [x28], #32
-	stp q4, q5, [x28], #32
-	stp q6, q7, [x28], #32
-	stp q8, q9, [x28], #32
-	stp q10, q11, [x28], #32
-	stp q12, q13, [x28], #32
-	stp q14, q15, [x28], #32
-	stp q16, q17, [x28], #32
-	stp q18, q19, [x28], #32
-	stp q20, q21, [x28], #32
-	stp q22, q23, [x28], #32
-	stp q24, q25, [x28], #32
-	stp q26, q27, [x28], #32
-	stp q28, q29, [x28], #32
-	stp q30, q31, [x28], #32
+	simd_op_vectors stp, x28
 	mrs x3, fpsr
 	mrs x4, fpcr
-	stp x3, x4, [x28], #32
+	stp x3, x4, [x28]
 
 	/* Save new vCPU pointer in non-volatile register. */
 	mov x19, x0
@@ -343,9 +350,17 @@
 
 other_world_loop:
 	/*
-	 * Prepare arguments from other world VM vCPU.
 	 * x19 holds the other world VM vCPU pointer.
 	 */
+
+	/* Restore the other world SIMD context from the other world VM vCPU. */
+	add x18, x19, #VCPU_FREGS
+	simd_op_vectors ldp, x18
+	ldp x0, x1, [x18]
+	msr fpsr, x0
+	msr fpcr, x1
+
+	/* Prepare arguments from other world VM vCPU. */
 	ldp x0, x1, [x19, #VCPU_REGS + 8 * 0]
 	ldp x2, x3, [x19, #VCPU_REGS + 8 * 2]
 	ldp x4, x5, [x19, #VCPU_REGS + 8 * 4]
@@ -364,6 +379,13 @@
 	stp x4, x5, [x19, #VCPU_REGS + 8 * 4]
 	stp x6, x7, [x19, #VCPU_REGS + 8 * 6]
 
+	/* Save the other world SIMD context to the other world VM vCPU. */
+	add x18, x19, #VCPU_FREGS
+	simd_op_vectors stp, x18
+	mrs x0, fpsr
+	mrs x1, fpcr
+	stp x0, x1, [x18]
+
 	/*
 	 * Stack is at top and execution can restart straight into C code.
 	 * Handle the FF-A call from other world.
@@ -394,28 +416,10 @@
 
 	/*
 	 * Restore floating point registers.
-	 *
-	 * Offset is too large, so start from a new base.
 	 */
 	add x2, x0, #VCPU_FREGS
-	ldp q0, q1, [x2, #32 * 0]
-	ldp q2, q3, [x2, #32 * 1]
-	ldp q4, q5, [x2, #32 * 2]
-	ldp q6, q7, [x2, #32 * 3]
-	ldp q8, q9, [x2, #32 * 4]
-	ldp q10, q11, [x2, #32 * 5]
-	ldp q12, q13, [x2, #32 * 6]
-	ldp q14, q15, [x2, #32 * 7]
-	ldp q16, q17, [x2, #32 * 8]
-	ldp q18, q19, [x2, #32 * 9]
-	ldp q20, q21, [x2, #32 * 10]
-	ldp q22, q23, [x2, #32 * 11]
-	ldp q24, q25, [x2, #32 * 12]
-	ldp q26, q27, [x2, #32 * 13]
-	ldp q28, q29, [x2, #32 * 14]
-	/* Offset becomes too large, so move the base. */
-	ldp q30, q31, [x2, #32 * 15]!
-	ldp x3, x4, [x2, #32 * 1]
+	simd_op_vectors ldp, x2
+	ldp x3, x4, [x2]
 	msr fpsr, x3
 
 	/*