feat(lib/arch): add support for NS SME context

This feature adds support for the Scalable Matrix Extension (SME) in RMM
to handle the Non-secure (NS) SME state. If the CPU supports SME and a
Realm accesses FPU/SVE functionality, the NS SME state is saved so that
the Realm can use the FPU/SVE register state.

Within SME, only the Streaming SVE register state is managed by RMM, as
it shares the register state with FPU/SVE. As Realms do not support SME,
the ZA register state is never managed.

This patch adds necessary changes to support the configuration where
only SME is implemented in the CPU and SVE is not implemented.

This change also caters to the use case of RMM using FPU at REL2 when
built with RMM_FPU_USE_AT_REL2=ON.

Note: SME is not supported for Realms yet.

Signed-off-by: Arunachalam Ganapathy <arunachalam.ganapathy@arm.com>
Change-Id: I127e9aa2e6203ddfe48551443e76e95df476cc35
diff --git a/lib/arch/include/arch.h b/lib/arch/include/arch.h
index 757841c..8d0ed42 100644
--- a/lib/arch/include/arch.h
+++ b/lib/arch/include/arch.h
@@ -278,6 +278,7 @@
 #define ESR_EL2_EC_SMC		INPLACE(ESR_EL2_EC, UL(23))
 #define ESR_EL2_EC_SYSREG	INPLACE(ESR_EL2_EC, UL(24))
 #define ESR_EL2_EC_SVE		INPLACE(ESR_EL2_EC, UL(25))
+#define ESR_EL2_EC_SME		INPLACE(ESR_EL2_EC, UL(29))
 #define ESR_EL2_EC_INST_ABORT		INPLACE(ESR_EL2_EC, UL(32))
 #define ESR_EL2_EC_INST_ABORT_SEL	INPLACE(ESR_EL2_EC, UL(33))
 #define ESR_EL2_EC_DATA_ABORT		INPLACE(ESR_EL2_EC, UL(36))
@@ -468,6 +469,12 @@
 #define ID_AA64PFR1_EL1_MTE_SHIFT	UL(8)
 #define ID_AA64PFR1_EL1_MTE_WIDTH	UL(4)
 
+#define ID_AA64PFR1_EL1_SME_SHIFT		UL(24)
+#define ID_AA64PFR1_EL1_SME_WIDTH		UL(4)
+#define ID_AA64PFR1_EL1_SME_NOT_IMPLEMENTED	UL(0)
+#define ID_AA64PFR1_EL1_SME_IMPLEMENTED		UL(1)
+#define ID_AA64PFR1_EL1_SME2_IMPLEMENTED	UL(2)
+
 /* ID_AA64MMFR0_EL1 definitions */
 #define ID_AA64MMFR0_EL1_PARANGE_SHIFT	U(0)
 #define ID_AA64MMFR0_EL1_PARANGE_WIDTH	UL(4)
@@ -536,6 +543,9 @@
 /* SVE Feature ID register 0 */
 #define ID_AA64ZFR0_EL1				S3_0_C0_C4_4
 
+/* SME Feature ID register 0 */
+#define ID_AA64SMFR0_EL1			S3_0_C0_C4_5
+
 /* HPFAR_EL2 definitions */
 #define HPFAR_EL2_FIPA_SHIFT		4
 #define HPFAR_EL2_FIPA_WIDTH		U(40)
@@ -595,6 +605,24 @@
 
 #define ZCR_EL12			S3_5_C1_C2_0
 
+/* SME Control Register */
+#define SMCR_EL2			S3_4_C1_C2_6
+#define SMCR_EL2_LEN_SHIFT		UL(0)
+#define SMCR_EL2_LEN_WIDTH		UL(4)
+/*
+ * SMCR_EL2_RAZ_LEN is defined to find the architecturally permitted SVL. This
+ * is a combination of RAZ and LEN bit fields.
+ */
+#define SMCR_EL2_RAZ_LEN_SHIFT		UL(0)
+#define SMCR_EL2_RAZ_LEN_WIDTH		UL(9)
+#define SMCR_EL2_EZT0_BIT		(UL(1) << 30)
+#define SMCR_EL2_FA64_BIT		(UL(1) << 31)
+
+/* Streaming Vector Control register */
+#define SVCR				S3_3_C4_C2_2
+#define SVCR_SM_BIT			(UL(1) << 0)
+#define SVCR_ZA_BIT			(UL(1) << 1)
+
 /* VTCR definitions */
 #define VTCR_T0SZ_SHIFT		0
 #define VTCR_T0SZ_WIDTH		U(6)
@@ -747,9 +775,16 @@
 #define CPTR_EL2_VHE_ZEN_TRAP_ALL_00	UL(0x0)
 #define CPTR_EL2_VHE_ZEN_NO_TRAP_11	UL(0x3)
 
-/* Trap all AMU, trace, FPU, SVE accesses */
+#define CPTR_EL2_VHE_SMEN_SHIFT		UL(24)
+#define CPTR_EL2_VHE_SMEN_WIDTH		UL(2)
+#define CPTR_EL2_VHE_SMEN_TRAP_ALL_00	UL(0x0)
+#define CPTR_EL2_VHE_SMEN_NO_TRAP_11	UL(0x3)
+
+/* Trap all AMU, trace, FPU, SVE, SME accesses */
 #define CPTR_EL2_VHE_INIT		((CPTR_EL2_VHE_ZEN_TRAP_ALL_00 << \
 					  CPTR_EL2_VHE_ZEN_SHIFT)	| \
+					 (CPTR_EL2_VHE_SMEN_TRAP_ALL_00 << \
+					  CPTR_EL2_VHE_SMEN_SHIFT)	| \
 					 (CPTR_EL2_VHE_FPEN_TRAP_ALL_00 << \
 					  CPTR_EL2_VHE_FPEN_SHIFT)	| \
 					 CPTR_EL2_VHE_TTA		| \
@@ -880,6 +915,7 @@
 #define ESR_EL2_SYSREG_ID_AA64PFR0_EL1	SYSREG_ESR(3, 0, 0, 4, 0)
 #define ESR_EL2_SYSREG_ID_AA64PFR1_EL1	SYSREG_ESR(3, 0, 0, 4, 1)
 #define ESR_EL2_SYSREG_ID_AA64ZFR0_EL1	SYSREG_ESR(3, 0, 0, 4, 4)
+#define ESR_EL2_SYSREG_ID_AA64SMFR0_EL1	SYSREG_ESR(3, 0, 0, 4, 5)
 
 #define ESR_EL2_SYSREG_ID_AA64DFR0_EL1	SYSREG_ESR(3, 0, 0, 5, 0)
 #define ESR_EL2_SYSREG_ID_AA64DFR1_EL1	SYSREG_ESR(3, 0, 0, 5, 1)
diff --git a/lib/arch/include/arch_features.h b/lib/arch/include/arch_features.h
index 75d7aa0..386b2ac 100644
--- a/lib/arch/include/arch_features.h
+++ b/lib/arch/include/arch_features.h
@@ -28,6 +28,18 @@
 }
 
 /*
+ * Check if SME is implemented (any non-zero ID_AA64PFR1_EL1.SME value)
+ * ID_AA64PFR1_EL1.SME, bits [27:24]:
+ * 0b0000 SME architectural state and programmers' model are not implemented.
+ * 0b0001 SME architectural state and programmers' model are implemented.
+ * 0b0010 SME2 implemented. As 0b0001, plus the SME2 ZT0 register.
+ */
+static inline bool is_feat_sme_present(void)
+{
+	return (EXTRACT(ID_AA64PFR1_EL1_SME, read_id_aa64pfr1_el1()) != 0UL);
+}
+
+/*
  * Check if RNDR is available
  */
 static inline bool is_feat_rng_present(void)
diff --git a/lib/arch/include/arch_helpers.h b/lib/arch/include/arch_helpers.h
index 42e161f..c51635f 100644
--- a/lib/arch/include/arch_helpers.h
+++ b/lib/arch/include/arch_helpers.h
@@ -106,6 +106,12 @@
 			  CPTR_EL2_VHE_ZEN_NO_TRAP_11)
 
 /*******************************************************************************
+ * SME management
+ ******************************************************************************/
+#define is_smen_enabled() (EXTRACT(CPTR_EL2_VHE_SMEN, read_cptr_el2()) == \
+			   CPTR_EL2_VHE_SMEN_NO_TRAP_11)
+
+/*******************************************************************************
  * Misc. accessor prototypes
  ******************************************************************************/
 
@@ -279,6 +285,8 @@
 DEFINE_SYSREG_RW_FUNCS(cpacr_el12)
 DEFINE_RENAME_SYSREG_RW_FUNCS(zcr_el2, ZCR_EL2)
 DEFINE_RENAME_SYSREG_RW_FUNCS(zcr_el12, ZCR_EL12)
+DEFINE_RENAME_SYSREG_RW_FUNCS(smcr_el2, SMCR_EL2)
+DEFINE_RENAME_SYSREG_RW_FUNCS(svcr, SVCR)
 DEFINE_SYSREG_RW_FUNCS(ttbr0_el12)
 DEFINE_SYSREG_RW_FUNCS(ttbr1_el12)
 DEFINE_SYSREG_RW_FUNCS(tcr_el12)
@@ -316,6 +324,7 @@
 DEFINE_SYSREG_READ_FUNC(id_aa64dfr0_el1)
 DEFINE_SYSREG_READ_FUNC(id_aa64dfr1_el1)
 DEFINE_RENAME_SYSREG_READ_FUNC(id_aa64zfr0_el1, ID_AA64ZFR0_EL1)
+DEFINE_RENAME_SYSREG_READ_FUNC(id_aa64smfr0_el1, ID_AA64SMFR0_EL1)
 DEFINE_SYSREG_READ_FUNC(id_aa64isar0_el1)
 DEFINE_SYSREG_READ_FUNC(id_aa64isar1_el1)
 DEFINE_SYSREG_READ_FUNC(id_aa64mmfr0_el1)
diff --git a/lib/arch/include/simd.h b/lib/arch/include/simd.h
index 81c4e22..4b7bb41 100644
--- a/lib/arch/include/simd.h
+++ b/lib/arch/include/simd.h
@@ -51,6 +51,7 @@
 
 /* Flags for SIMD type */
 #define SIMD_TFLAG_SVE		(U(1) << 0)
+#define SIMD_TFLAG_SME		(U(1) << 1)
 
 /* Flags for SIMD status */
 #define SIMD_SFLAG_INIT_DONE	(U(1) << 0)
@@ -98,6 +99,9 @@
 	 * 0xf - 2048 bits VL (arch max)
 	 */
 	uint32_t sve_vq;
+
+	/* SME enabled flag */
+	bool sme_en;
 };
 
 /* This structure holds the SIMD related EL2 registers */
@@ -119,6 +123,12 @@
 	uint64_t cptr_el2;
 
 	/*
+	 * SME specific Streaming vector control register. Contains CPU global
+	 * PSTATE.SM and PSTATE.ZA flags.
+	 */
+	uint64_t svcr;
+
+	/*
 	 * EL2 config registers.
 	 * SIMD_OWNER_NWD: RMM intializes based on CPU supported configuration
 	 * SIMD_OWNER_REL1: RMM intializes based on Realm configuration
diff --git a/lib/arch/src/include/simd_private.h b/lib/arch/src/include/simd_private.h
index 7902cd2..9adf97e 100644
--- a/lib/arch/src/include/simd_private.h
+++ b/lib/arch/src/include/simd_private.h
@@ -45,4 +45,16 @@
 /* Clear SVE P and FFR registers */
 void sve_clear_p_ffr_registers(void);
 
+/* Returns 'true' when SVCR.SM is set (CPU in Streaming SVE mode), else 'false' */
+static inline bool is_sme_sm(void)
+{
+	return ((read_svcr() & SVCR_SM_BIT) != 0U);
+}
+
+/* Returns 'true' when SMCR_EL2.FA64 is set (FEAT_SME_FA64 enabled), else 'false' */
+static inline bool sme_feat_fa64_enabled(void)
+{
+	return ((read_smcr_el2() & SMCR_EL2_FA64_BIT) != 0U);
+}
+
 #endif /* SIMD_PRIVATE_H */
diff --git a/lib/arch/src/simd.c b/lib/arch/src/simd.c
index 48fd265..624443b 100644
--- a/lib/arch/src/simd.c
+++ b/lib/arch/src/simd.c
@@ -24,11 +24,24 @@
 static bool g_simd_state_saved[MAX_CPUS];
 
 #define simd_has_sve(sc)	(((sc)->tflags & SIMD_TFLAG_SVE) != 0U)
+#define simd_has_sme(sc)	(((sc)->tflags & SIMD_TFLAG_SME) != 0U)
 
 #define is_ctx_init_done(sc)	(((sc)->sflags & SIMD_SFLAG_INIT_DONE) != 0U)
 #define is_ctx_saved(sc)	(((sc)->sflags & SIMD_SFLAG_SAVED) != 0U)
 #define is_ctx_sve_hint_set(sc)	(((sc)->sflags & SIMD_SFLAG_SVE_HINT) != 0U)
 
+/* Check whether the context has SME and the CPU is in Streaming SVE mode */
+#define is_sve_mode_streaming(sc)	(simd_has_sme(sc) && is_sme_sm())
+
+/*
+ * Check whether SVE is being used in normal (non-streaming) mode by the
+ * context: the context has SVE, its SVE hint bit is not set (so the full
+ * normal SVE register context is in use) and it is not in Streaming mode.
+ */
+#define is_sve_mode_normal(sc)		(simd_has_sve(sc) &&		\
+					 !is_ctx_sve_hint_set(sc) &&	\
+					 !is_sve_mode_streaming(sc))
+
 /*
  * Returns 'true' if the current CPU's SIMD (FPU/SVE) live state is saved in
  * memory else 'false'.
@@ -65,10 +78,26 @@
 	 */
 	restore_simd_el2_config(ctx, &ctx->el2_regs);
 
-	/* Save SVE vector registers */
-	if (simd_has_sve(ctx) && !is_ctx_sve_hint_set(ctx)) {
+	/*
+	 * For SME, save the Streaming Vector Control Register that contains CPU
+	 * global PSTATE.SM and PSTATE.ZA mode bits before saving the context.
+	 */
+	if (simd_has_sme(ctx)) {
+		ctx->svcr = read_svcr();
+	}
+
+	/* Save SVE vector registers for normal or streaming SVE mode */
+	if (is_sve_mode_normal(ctx) || is_sve_mode_streaming(ctx)) {
+		bool include_ffr;
+
+		if (is_sve_mode_streaming(ctx)) {
+			include_ffr = sme_feat_fa64_enabled();
+		} else {
+			include_ffr = true;
+		}
+
 		/* Saving SVE Z registers emcompasses FPU Q registers */
-		sve_save_vector_registers(&ctx->vregs.sve, true);
+		sve_save_vector_registers(&ctx->vregs.sve, include_ffr);
 	} else {
 		/* Save FPU Q registers */
 		fpu_save_registers(&ctx->vregs.fpu);
@@ -80,6 +109,20 @@
 	}
 	ctx->ctl_status_regs.fpsr = read_fpsr();
 	ctx->ctl_status_regs.fpcr = read_fpcr();
+
+	/*
+	 * Exit streaming mode after saving context by setting PSTATE.SM to 0.
+	 * This will set each implemented bit of SVE registers Z0-Z31, P0-P15,
+	 * and FFR to zero. Even if Normal SVE implements a greater vector
+	 * length than Streaming SVE, always the whole state is cleared.
+	 *
+	 * Exiting Streaming SVE mode doesn't have any impact on SME ZA storage
+	 * or, if implemented, ZT0 storage.
+	 */
+	if (is_sve_mode_streaming(ctx)) {
+		write_svcr(ctx->svcr & ~SVCR_SM_BIT);
+		isb();
+	}
 }
 
 static void restore_simd_context(struct simd_context *ctx)
@@ -90,10 +133,36 @@
 	 */
 	restore_simd_el2_config(ctx, &ctx->el2_regs);
 
-	/* Restore SVE vector registers */
-	if (simd_has_sve(ctx) && !is_ctx_sve_hint_set(ctx)) {
+	/*
+	 * For SME, restore the Streaming Vector Control Register that contains
+	 * CPU global PSTATE.SM and PSTATE.ZA mode bits before restoring the
+	 * context.
+	 *
+	 * If PSTATE.SM is set to 1, then CPU enters Streaming SVE mode and each
+	 * implemented bit of SVE registers Z0-Z31, P0-P15, and FFR in the new
+	 * mode is set to zero. Even if Normal SVE implements a greater vector
+	 * length than Streaming SVE, always the whole state is cleared.
+	 *
+	 * Entering Streaming SVE mode doesn't have any impact on SME ZA storage
+	 * or, if implemented, ZT0 storage.
+	 */
+	if (simd_has_sme(ctx)) {
+		write_svcr(ctx->svcr);
+		isb();
+	}
+
+	/* Restore SVE vector registers for normal or streaming SVE mode */
+	if (is_sve_mode_normal(ctx) || is_sve_mode_streaming(ctx)) {
+		bool include_ffr;
+
+		if (is_sve_mode_streaming(ctx)) {
+			include_ffr = sme_feat_fa64_enabled();
+		} else {
+			include_ffr = true;
+		}
+
 		/* Restoring SVE Z registers emcompasses FPU Q registers */
-		sve_restore_vector_registers(&ctx->vregs.sve, true);
+		sve_restore_vector_registers(&ctx->vregs.sve, include_ffr);
 	} else {
 		/* Restore FPU Q registers */
 		fpu_restore_registers(&ctx->vregs.fpu);
@@ -210,6 +279,10 @@
 		}
 	}
 
+	if (simd_cfg->sme_en && !g_simd_cfg.sme_en) {
+		return -1;
+	}
+
 	return 0;
 }
 
@@ -283,6 +356,15 @@
 					      CPTR_EL2_VHE_ZEN_NO_TRAP_11);
 	}
 
+	/* Initialize SME related fields */
+	if (simd_cfg->sme_en) {
+		simd_ctx->tflags |= SIMD_TFLAG_SME;
+		simd_ctx->svcr = 0UL;
+
+		simd_ctx->cptr_el2 |= INPLACE(CPTR_EL2_VHE_SMEN,
+					      CPTR_EL2_VHE_SMEN_NO_TRAP_11);
+	}
+
 	simd_ctx->sflags |= SIMD_SFLAG_INIT_DONE;
 
 	return 0;
@@ -357,6 +439,54 @@
 	g_simd_cfg.sve_vq = sve_max_vq;
 }
 
+/* Find the architecturally permitted Streaming Vector Length (SVL) */
+static void sme_init_once(void)
+{
+	uint32_t __unused sme_svq_arch_max;
+	unsigned long cptr_new, cptr_saved;
+	uint64_t smcr_val;
+
+	cptr_saved = read_cptr_el2();
+
+	/* Enable access to FPU, SVE, SME */
+	cptr_new = cptr_saved |
+		INPLACE(CPTR_EL2_VHE_FPEN, CPTR_EL2_VHE_FPEN_NO_TRAP_11) |
+		INPLACE(CPTR_EL2_VHE_ZEN, CPTR_EL2_VHE_ZEN_NO_TRAP_11) |
+		INPLACE(CPTR_EL2_VHE_SMEN, CPTR_EL2_VHE_SMEN_NO_TRAP_11);
+	write_cptr_el2(cptr_new);
+	isb();
+
+	/*
+	 * Arm SME supplement recommends to request an out of range vector
+	 * length of 8192 bytes by writing 0x1ff to SMCR_ELx[8:0]. Reading back
+	 * the register will give the max architecturally permitted SVQ.
+	 *
+	 * The old smcr_el2 is not restored as this is called only once during
+	 * cold boot.
+	 */
+	smcr_val = read_smcr_el2();
+	smcr_val |= MASK(SMCR_EL2_RAZ_LEN);
+	write_smcr_el2(smcr_val);
+	isb();
+	sme_svq_arch_max = (uint32_t)EXTRACT(SMCR_EL2_RAZ_LEN, read_smcr_el2());
+
+	/*
+	 * Streaming SVE shares SVE register set Z/P/FFR. RMM will save and
+	 * restore Streaming SVE state in 'sve_state', and 'sve_state' can hold
+	 * vector registers for up to 2048 bits (vq: 15). If Streaming SVE mode
+	 * reports a larger value than SVE_VQ_ARCH_MAX, then assert. Hopefully
+	 * we won't hit this condition in the near future.
+	 */
+	assert(sme_svq_arch_max <= SVE_VQ_ARCH_MAX);
+
+	/* Restore the CPTR_EL2 value saved on entry */
+	write_cptr_el2(cptr_saved);
+	isb();
+
+	/* Mark SME as enabled in the global system SIMD config */
+	g_simd_cfg.sme_en = true;
+}
+
 /*
  * This function initializes the SIMD layer depending on SIMD capability of the
  * CPU (FPU/SVE). If CPU supports SVE, the max VQ will be discovered. Global
@@ -373,5 +503,9 @@
 		sve_init_once();
 	}
 
+	if (is_feat_sme_present()) {
+		sme_init_once();
+	}
+
 	g_simd_init_done = true;
 }