feat(runtime): map only each CPU's own stack

This commit changes the memory mapping of the stack area. Instead of
flat-mapping the whole stack area on every CPU, map only the stack of
the current CPU, in the high VA region.

For this to work, the following changes are made:
 - move the mapping of the high region out of the realm lib into the
   xlat library (the resulting per-PE boot path is sketched after this
   list).
 - move the build-time config option RMM_NUM_PAGES_PER_STACK to the
   xlat library.
 - add a new mm region to the high VA space for the stack of the
   current CPU.
 - reorder the segments in the linker script so that the per-CPU stack
   segment (renamed from `.percpu` to `.stack`) comes last, and change
   the flat-mapping configuration so that this segment is not included.
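
With this change, cold boot no longer performs any slot-buffer work and
the per-PE warm boot path reduces to a single call. A minimal sketch,
mirroring the plat_common_init.c hunk below (error handling abbreviated):

	/* Warm boot, per PE: map this CPU's stack and the slot buffers
	 * in the high VA region and program the MMU registers for it. */
	ret = xlat_high_va_setup();
	if (ret != 0) {
		ERROR("%s (%u): Failed to setup high VA for CPU[%u]\n",
			__func__, __LINE__, my_cpuid());
		return ret;
	}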

Change-Id: I997c1b5ff2b972807d75692b5efc9075c5a29a30
Signed-off-by: Mate Toth-Pal <mate.toth-pal@arm.com>
diff --git a/cmake/CommonConfigs.cmake b/cmake/CommonConfigs.cmake
index ee896ac..b8709ac 100644
--- a/cmake/CommonConfigs.cmake
+++ b/cmake/CommonConfigs.cmake
@@ -31,6 +31,13 @@
     DEFAULT 0x0)
 
 arm_config_option(
+    NAME RMM_NUM_PAGES_PER_STACK
+    HELP "Number of pages to use per CPU stack"
+    TYPE STRING
+    DEFAULT 5
+    ADVANCED)
+
+arm_config_option(
     NAME RMM_DOCS
     HELP "RMM Documentation build"
     TYPE BOOL
@@ -76,6 +83,9 @@
 target_compile_definitions(rmm-common
     INTERFACE "RMM_MAX_GRANULES=U(${RMM_MAX_GRANULES})")
 
+target_compile_definitions(rmm-common
+    INTERFACE "RMM_NUM_PAGES_PER_STACK=UL(${RMM_NUM_PAGES_PER_STACK})")
+
 if(RMM_FPU_USE_AT_REL2)
     target_compile_definitions(rmm-common
         INTERFACE "RMM_FPU_USE_AT_REL2=1")
diff --git a/docs/getting_started/build-options.rst b/docs/getting_started/build-options.rst
index 6667fe8..b863cd5 100644
--- a/docs/getting_started/build-options.rst
+++ b/docs/getting_started/build-options.rst
@@ -246,7 +246,7 @@
    RMM_UART_ADDR		,			,0x0			,"Base addr of UART to be used for RMM logs"
    PLAT_CMN_CTX_MAX_XLAT_TABLES ,			,0			,"Maximum number of translation tables used by the runtime context"
    PLAT_CMN_EXTRA_MMAP_REGIONS	,			,0			,"Extra platform mmap regions that need to be mapped in S1 xlat tables"
-   RMM_NUM_PAGES_PER_STACK	,			,3			,"Number of pages to use per CPU stack"
+   RMM_NUM_PAGES_PER_STACK	,			,5			,"Number of pages to use per CPU stack"
    MBEDTLS_ECP_MAX_OPS		,248 -			,1000			,"Number of max operations per ECC signing iteration"
    RMM_FPU_USE_AT_REL2		,ON | OFF		,OFF(fake_host) ON(aarch64),"Enable FPU/SIMD usage in RMM."
    RMM_MAX_GRANULES		,			,0			,"Maximum number of memory granules available to the system"
diff --git a/lib/realm/include/buffer.h b/lib/realm/include/buffer.h
index 8bab8bc..af2306e 100644
--- a/lib/realm/include/buffer.h
+++ b/lib/realm/include/buffer.h
@@ -55,24 +55,6 @@
 		     void *src);
 
 /*
- * Initializes and enables the VMSA for the slot buffer mechanism.
- *
- * Create an empty translation context for the current PE.
- * If the context already exists (e.g. current PE was previously
- * turned on and therefore the context is already in memory),
- * nothing happens.
- */
-void slot_buf_setup_xlat(void);
-
-/*
- * Initializes the slot buffer components common to all PEs. This function
- * must only be called once during cold boot initialization.
- *
- * Returns 0 on success and a negative POSIX error code otherwise.
- */
-int slot_buf_coldboot_init(void);
-
-/*
  * Finishes initializing the slot buffer mechanism.
  * This function should be called after the MMU is enabled, during the
  * warmboot path.
diff --git a/lib/realm/src/buffer.c b/lib/realm/src/buffer.c
index 5f86fdd..a4b1728 100644
--- a/lib/realm/src/buffer.c
+++ b/lib/realm/src/buffer.c
@@ -16,59 +16,12 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <table.h>
+#include <utils_def.h>
 #include <xlat_contexts.h>
+#include <xlat_high_va.h>
 #include <xlat_tables.h>
 
-/*
- * All the slot buffers for a given PE must be mapped by a single translation
- * table, which means the max VA size should be <= 4KB * 512
- */
-COMPILER_ASSERT((RMM_SLOT_BUF_VA_SIZE) <= (GRANULE_SIZE * XLAT_TABLE_ENTRIES));
-
-/*
- * For all translation stages if FEAT_TTST is implemented, while
- * the PE is executing in AArch64 state and is using 4KB
- * translation granules, the min address space size is 64KB
- */
-COMPILER_ASSERT((RMM_SLOT_BUF_VA_SIZE) >= (1 << 16U));
-
-#define RMM_SLOT_BUF_MMAP	MAP_REGION_TRANSIENT(			\
-					SLOT_VIRT,			\
-					RMM_SLOT_BUF_VA_SIZE,		\
-					PAGE_SIZE)
-
-#define SLOT_BUF_MMAP_REGIONS		UL(1)
-
-/*
- * Attributes for a buffer slot page descriptor.
- * Note that the AF bit on the descriptor is handled by the translation
- * library (it assumes that access faults are not handled) so it does not
- * need to be specified here.
- */
-#define SLOT_DESC_ATTR		(MT_RW_DATA | MT_NG)
-
-/*
- * The base tables for all the contexts are manually allocated as a continous
- * block of memory (one L3 table per PE).
- */
-static uint64_t slot_buf_s1tt[XLAT_TABLE_ENTRIES * MAX_CPUS]
-				    __aligned(XLAT_TABLES_ALIGNMENT);
-
-/* Allocate per-cpu xlat_ctx_tbls */
-static struct xlat_ctx_tbls slot_buf_tbls[MAX_CPUS];
-
-/* Allocate xlat_ctx_cfg for high VA which will be common to all PEs */
-static struct xlat_ctx_cfg slot_buf_xlat_ctx_cfg;
-
-/* context definition */
-static struct xlat_ctx slot_buf_xlat_ctx[MAX_CPUS];
-
-/*
- * Allocate a cache to store the last level table info where the slot buffers
- * are mapped to avoid needing to perform a table walk every time a buffer
- * slot operation has to be done.
- */
-static struct xlat_llt_info llt_info_cache[MAX_CPUS];
+COMPILER_ASSERT(XLAT_HIGH_VA_SLOT_NUM >= NR_CPU_SLOTS);
 
 uintptr_t slot_to_va(enum buffer_slot slot)
 {
@@ -77,13 +30,15 @@
 	return (SLOT_VIRT + (GRANULE_SIZE * (unsigned long)slot));
 }
 
-static inline struct xlat_ctx *get_slot_buf_xlat_ctx(void)
-{
-	return &slot_buf_xlat_ctx[my_cpuid()];
-}
-
 struct xlat_llt_info *get_cached_llt_info(void)
 {
+	/*
+	 * Cache holding the last-level table info for the tables where the
+	 * slot buffers are mapped. It is kept to avoid having to perform a
+	 * table walk every time a buffer slot operation has to be done.
+	 */
+	static struct xlat_llt_info llt_info_cache[MAX_CPUS];
+
 	return &llt_info_cache[my_cpuid()];
 }
 
@@ -96,57 +51,6 @@
 	return xlat_read_tte(entry);
 }
 
-int slot_buf_coldboot_init(void)
-{
-	static struct xlat_mmap_region slot_buf_regions[] = {
-		RMM_SLOT_BUF_MMAP,
-	};
-
-	/*
-	 * Initialize the common configuration used for all
-	 * translation contexts
-	 */
-	return xlat_ctx_cfg_init(&slot_buf_xlat_ctx_cfg, VA_HIGH_REGION,
-				 &slot_buf_regions[0], 1U,
-				 RMM_SLOT_BUF_VA_SIZE);
-}
-
-/*
- * Setup xlat table for slot buffer mechanism for each PE.
- * Must be called for every PE in the system
- */
-void slot_buf_setup_xlat(void)
-{
-	unsigned int cpuid = my_cpuid();
-	struct xlat_ctx *slot_buf_ctx = get_slot_buf_xlat_ctx();
-
-	/*
-	 * Initialize the translation tables for the current context.
-	 * This is done on the first boot of each PE.
-	 */
-	int ret = xlat_ctx_init(slot_buf_ctx,
-				&slot_buf_xlat_ctx_cfg,
-				&slot_buf_tbls[cpuid],
-				&slot_buf_s1tt[XLAT_TABLE_ENTRIES * cpuid], 1U);
-
-	if (!((ret == 0) || (ret == -EALREADY))) {
-		/*
-		 * If the context was already created, carry on with the
-		 * initialization. If it cannot be created, panic.
-		 */
-		ERROR("%s (%u): Failed to initialize the xlat context for the slot buffers (-%i)\n",
-					__func__, __LINE__, ret);
-		panic();
-	}
-
-	/* Configure MMU registers */
-	if (xlat_arch_setup_mmu_cfg(get_slot_buf_xlat_ctx())) {
-		ERROR("%s (%u): MMU registers failed to initialize\n",
-					__func__, __LINE__);
-		panic();
-	}
-}
-
 /*
  * Finishes initializing the slot buffer mechanism.
  * This function must be called after the MMU is enabled.
@@ -162,7 +66,7 @@
 	 */
 	if ((get_cached_llt_info())->table == NULL) {
 		if (xlat_get_llt_from_va(get_cached_llt_info(),
-					 get_slot_buf_xlat_ctx(),
+					 xlat_get_high_va_xlat_ctx(),
 					 slot_to_va(SLOT_NS)) != 0) {
 			ERROR("%s (%u): Failed to initialize table entry cache for CPU %u\n",
 					__func__, __LINE__, my_cpuid());
@@ -313,7 +217,13 @@
 
 void *buffer_map_internal(enum buffer_slot slot, unsigned long addr)
 {
-	uint64_t attr = SLOT_DESC_ATTR;
+	/*
+	 * Attributes for a buffer slot page descriptor.
+	 * Note that the AF bit on the descriptor is handled by the translation
+	 * library (it assumes that access faults are not handled) so it does not
+	 * need to be specified here.
+	 */
+	uint64_t attr = XLAT_NG_DATA_ATTR;
 	uintptr_t va = slot_to_va(slot);
 	struct xlat_llt_info *entry = get_cached_llt_info();
 
diff --git a/lib/realm/src/include/buffer_private.h b/lib/realm/src/include/buffer_private.h
index cf0bb1c..5e0f92b 100644
--- a/lib/realm/src/include/buffer_private.h
+++ b/lib/realm/src/include/buffer_private.h
@@ -6,19 +6,6 @@
 #include <buffer.h>
 #include <xlat_tables.h>
 
-/*
- * The VA space size for the high region, which maps the slot buffers,
- * needs to be a power of two, so round NR_CPU_SLOTS up to the closest
- * power of two.
- */
-#define ROUNDED_NR_CPU_SLOTS	(1ULL << (64U - \
-	(unsigned int)__builtin_clzll(((unsigned long long)NR_CPU_SLOTS) - 1U)))
-
-#define RMM_SLOT_BUF_VA_SIZE	((ROUNDED_NR_CPU_SLOTS) * (GRANULE_SIZE))
-
-#define SLOT_VIRT		((ULL(0xffffffffffffffff) - \
-				 RMM_SLOT_BUF_VA_SIZE + ULL(1)))
-
 struct xlat_llt_info *get_cached_llt_info(void);
 uintptr_t slot_to_va(enum buffer_slot slot);
 bool memcpy_ns_read(void *dest, const void *ns_src, size_t size);
diff --git a/lib/realm/tests/buffer.cpp b/lib/realm/tests/buffer.cpp
index 7f97e6e..4be126e 100644
--- a/lib/realm/tests/buffer.cpp
+++ b/lib/realm/tests/buffer.cpp
@@ -1303,14 +1303,6 @@
 	test_helpers_fail_if_no_assert_failed();
 }
 
-TEST(slot_buffer, slot_buf_setup_xlat_TC1)
-{
-	/*
-	 * slot_buf_setup_xlat() has already been used during initialization
-	 * for all tests, so skip it.
-	 */
-}
-
 TEST(slot_buffer, slot_buf_finish_warmboot_init_TC1)
 {
 	/*
diff --git a/lib/xlat/CMakeLists.txt b/lib/xlat/CMakeLists.txt
index 567e688..f4b4c00 100644
--- a/lib/xlat/CMakeLists.txt
+++ b/lib/xlat/CMakeLists.txt
@@ -21,14 +21,17 @@
     PRIVATE "src/xlat_tables_core.c"
             "src/xlat_tables_utils.c"
             "src/xlat_tables_arch.c"
-            "src/xlat_contexts.c")
+            "src/xlat_contexts.c"
+            "src/xlat_high_va.c")
 
 if(RMM_ARCH STREQUAL fake_host)
     target_sources(rmm-lib-xlat
-        PRIVATE "src/fake_host/enable_mmu.c")
+        PRIVATE "src/fake_host/enable_mmu.c"
+                "src/fake_host/xlat_helpers.c")
 else()
     target_sources(rmm-lib-xlat
-        PRIVATE "src/aarch64/enable_mmu.S")
+        PRIVATE "src/aarch64/enable_mmu.S"
+                "src/aarch64/xlat_helper.S")
 endif()
 
 arm_config_option(
diff --git a/lib/xlat/include/xlat_defs.h b/lib/xlat/include/xlat_defs.h
index 30a5281..1a45998 100644
--- a/lib/xlat/include/xlat_defs.h
+++ b/lib/xlat/include/xlat_defs.h
@@ -33,7 +33,7 @@
 
 /* Values for number of entries in each MMU translation table */
 #define XLAT_TABLE_ENTRIES_SHIFT (XLAT_TABLE_SIZE_SHIFT - XLAT_ENTRY_SIZE_SHIFT)
-#define XLAT_TABLE_ENTRIES	(U(1) << XLAT_TABLE_ENTRIES_SHIFT)
+#define XLAT_TABLE_ENTRIES	(UL(1) << XLAT_TABLE_ENTRIES_SHIFT)
 #define XLAT_TABLE_ENTRIES_MASK	(XLAT_TABLE_ENTRIES - U(1))
 
 /* Values for number of entries in a MMU translation table at level -1 */
diff --git a/lib/xlat/include/xlat_high_va.h b/lib/xlat/include/xlat_high_va.h
new file mode 100644
index 0000000..af9e16c
--- /dev/null
+++ b/lib/xlat/include/xlat_high_va.h
@@ -0,0 +1,48 @@
+/*
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SPDX-FileCopyrightText: Copyright TF-RMM Contributors.
+ */
+
+#ifndef XLAT_HIGH_VA_H
+#define XLAT_HIGH_VA_H
+
+#include <utils_def.h>
+#include <xlat_tables.h>
+
+/* The per-CPU high VA space is used for the slot buffers and the per-CPU stack mapping */
+
+#define XLAT_HIGH_VA_SLOT_NUM	(U(25))
+/* Base VA of the slot buffer area */
+#define SLOT_VIRT		(((UL(~0)) - (XLAT_HIGH_VA_SLOT_NUM * GRANULE_SIZE)) + 1U)
+
+#define RMM_CPU_STACK_SIZE	(RMM_NUM_PAGES_PER_STACK * PAGE_SIZE)
+/* Leave some pages of gap above the stack top */
+#define GAP_PAGE_COUNT		(U(1))
+#define CPU_STACK_GAP		(GAP_PAGE_COUNT * PAGE_SIZE)
+#define RMM_CPU_STACK_END_VA	(SLOT_VIRT - CPU_STACK_GAP)
+#define CPU_STACK_VIRT		(RMM_CPU_STACK_END_VA - RMM_CPU_STACK_SIZE)
+
+/* TTE attributes for per-CPU private VA (nG) data mappings */
+#define XLAT_NG_DATA_ATTR	(MT_RW_DATA | MT_NG)
+
+#if !(defined(__ASSEMBLER__) || defined(__LINKER__))
+
+/* Return a pointer to the High VA xlat context for the current CPU */
+struct xlat_ctx *xlat_get_high_va_xlat_ctx(void);
+
+/*
+ * Initializes and enables the VMSA for the high VA region.
+ *
+ * Create an empty translation context for the current PE.
+ * If the context already exists (e.g. current PE was previously
+ * turned on and therefore the context is already in memory),
+ * nothing happens.
+ */
+int xlat_high_va_setup(void);
+
+/* Return the stack physical address for a given PE index */
+uintptr_t rmm_get_my_stack(unsigned long cpuid);
+
+#endif /* !(defined(__ASSEMBLER__) || defined(__LINKER__)) */
+
+#endif /* XLAT_HIGH_VA_H */
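
With the defaults in this patch (4KB granule, RMM_NUM_PAGES_PER_STACK=5,
XLAT_HIGH_VA_SLOT_NUM=25 and one gap page), the macros above work out as
in the following sketch, which could be checked at compile time:

	#include <xlat_high_va.h>

	/* Worked example of the high VA layout arithmetic, assuming the
	 * default build options stated above. */
	COMPILER_ASSERT(SLOT_VIRT            == 0xFFFFFFFFFFFE7000UL);
	COMPILER_ASSERT(RMM_CPU_STACK_END_VA == 0xFFFFFFFFFFFE6000UL);
	COMPILER_ASSERT(CPU_STACK_VIRT       == 0xFFFFFFFFFFFE1000UL);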
diff --git a/lib/xlat/src/aarch64/enable_mmu.S b/lib/xlat/src/aarch64/enable_mmu.S
index 1fd2cfd..f723d89 100644
--- a/lib/xlat/src/aarch64/enable_mmu.S
+++ b/lib/xlat/src/aarch64/enable_mmu.S
@@ -7,6 +7,7 @@
 /* This file is derived from xlat_table_v2 library in TF-A project */
 
 #include <asm_macros.S>
+#include <xlat_high_va.h>
 #include <xlat_tables.h>
 
 	.global	xlat_enable_mmu_el2
@@ -35,6 +36,13 @@
 		orr	x4, x4, x5
 		msr	sctlr_el2, x4
 
+		/*
+		 * Update SP to point to the VA of the stack bottom for this
+		 * CPU after enabling MMU
+		 */
+		mov_imm	x5, RMM_CPU_STACK_END_VA
+		mov	sp, x5
+
 		isb
 
 		ret
diff --git a/lib/xlat/src/aarch64/xlat_helper.S b/lib/xlat/src/aarch64/xlat_helper.S
new file mode 100644
index 0000000..cfad870
--- /dev/null
+++ b/lib/xlat/src/aarch64/xlat_helper.S
@@ -0,0 +1,31 @@
+/*
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SPDX-FileCopyrightText: Copyright TF-RMM Contributors.
+ */
+
+#include <asm_macros.S>
+#include <xlat_defs.h>
+#include <xlat_high_va.h>
+
+.globl rmm_get_my_stack
+
+/*
+ * Return the stack bottom for a given PE index in x0
+ * rmm_stack_start			     rmm_stack_end
+ *       o--sz---o....o--sz---o--sz---o--sz---o
+ *       ^\_____/^....^\_____/^\_____/^\_____/^
+ * id = (MAX_CPUS-1)     2       1       0
+ * Arg : x0 - CPU position
+ * sz : RMM_CPU_STACK_SIZE bytes
+ */
+func rmm_get_my_stack
+#ifndef NDEBUG
+	cmp	x0, #MAX_CPUS
+	ASM_ASSERT lo
+#endif
+	adrp	x1, rmm_stack_end
+	add	x1, x1, :lo12:rmm_stack_end
+	mov_imm	x2, RMM_CPU_STACK_SIZE	/* stack size per CPU */
+	umsubl	x0, w0, w2, x1
+	ret
+endfunc rmm_get_my_stack
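
For reference, the computation above is equivalent to the following C
sketch (`stack_bottom` is a hypothetical helper; `rmm_stack_end` is the
symbol defined by runtime/linker.lds later in this patch; stacks grow
downwards, so the returned address is the initial SP for that CPU):

	#include <stdint.h>
	#include <xlat_high_va.h>	/* RMM_CPU_STACK_SIZE */

	/* Stack bottom (highest address) for a given CPU index; CPU 0
	 * owns the RMM_CPU_STACK_SIZE bytes just below rmm_stack_end. */
	static uintptr_t stack_bottom(unsigned long cpuid)
	{
		extern char rmm_stack_end[];	/* linker-provided symbol */

		return (uintptr_t)rmm_stack_end -
			(cpuid * RMM_CPU_STACK_SIZE);
	}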
diff --git a/lib/xlat/src/fake_host/xlat_helpers.c b/lib/xlat/src/fake_host/xlat_helpers.c
new file mode 100644
index 0000000..bf7f171
--- /dev/null
+++ b/lib/xlat/src/fake_host/xlat_helpers.c
@@ -0,0 +1,11 @@
+/*
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SPDX-FileCopyrightText: Copyright TF-RMM Contributors.
+ */
+
+#include <xlat_high_va.h>
+
+uintptr_t rmm_get_my_stack(unsigned long cpuid)
+{
+	return 0x10000UL;	/* fixed dummy stack address; cpuid is unused on the fake host */
+}
diff --git a/lib/xlat/src/xlat_high_va.c b/lib/xlat/src/xlat_high_va.c
new file mode 100644
index 0000000..ab132fc
--- /dev/null
+++ b/lib/xlat/src/xlat_high_va.c
@@ -0,0 +1,122 @@
+/*
+ * SPDX-License-Identifier: BSD-3-Clause
+ * SPDX-FileCopyrightText: Copyright TF-RMM Contributors.
+ */
+
+#include <assert.h>
+#include <cpuid.h>
+#include <debug.h>
+#include <errno.h>
+#include <import_sym.h>
+#include <sizes.h>
+#include <stdint.h>
+#include <string.h>
+#include <utils_def.h>
+#include <xlat_contexts.h>
+#include <xlat_high_va.h>
+#include <xlat_tables.h>
+
+#define RMM_SLOT_BUF_SIZE	((XLAT_HIGH_VA_SLOT_NUM) * (GRANULE_SIZE))
+#define HIGH_VA_SIZE		(XLAT_TABLE_ENTRIES * PAGE_SIZE)
+
+#define RMM_STACK_MMAP_IDX	0U
+#define RMM_STACK_MMAP		MAP_REGION_FULL_SPEC(				\
+					0UL, /* PA is different for each CPU */	\
+					CPU_STACK_VIRT,				\
+					RMM_CPU_STACK_SIZE,			\
+					XLAT_NG_DATA_ATTR,			\
+					PAGE_SIZE)
+
+/* The slot buffers are mapped/unmapped dynamically, hence define a TRANSIENT mmap region */
+#define RMM_SLOT_BUF_MMAP	MAP_REGION_TRANSIENT(		\
+					SLOT_VIRT,		\
+					RMM_SLOT_BUF_SIZE,	\
+					PAGE_SIZE)
+
+#define MMAP_REGION_COUNT	2U
+
+/*
+ * A single L3 table is used to map the slot buffers as well as the stack, so
+ * enforce here that they fit within the entries of a single L3 table.
+ */
+COMPILER_ASSERT((RMM_NUM_PAGES_PER_STACK + GAP_PAGE_COUNT + XLAT_HIGH_VA_SLOT_NUM) <=
+	XLAT_TABLE_ENTRIES);
+
+/* Per-CPU high VA translation contexts */
+static struct xlat_ctx high_va_xlat_ctx[MAX_CPUS];
+
+struct xlat_ctx *xlat_get_high_va_xlat_ctx(void)
+{
+	return &high_va_xlat_ctx[my_cpuid()];
+}
+
+/*
+ * Setup xlat table for the high VA region for each PE.
+ * Must be called for every PE in the system.
+ * The layout of the High VA space:
+ *
+ * +-----------------+  0xFFFFFFFFFFFFFFFF (VA max)
+ * |                 |
+ * |   Slot buffer   |  XLAT_HIGH_VA_SLOT_NUM * GRANULE_SIZE bytes
+ * |                 |
+ * +-----------------+
+ * |                 |
+ * |   Stack guard   |  CPU_STACK_GAP bytes, Unmapped
+ * |                 |
+ * +-----------------+
+ * |                 |
+ * |    RMM stack    |  RMM_CPU_STACK_SIZE bytes
+ * |                 |
+ * +-----------------+
+ * |                 |
+ * |     Unmapped    |
+ * |                 |
+ * +-----------------+  0xFFFFFFFFFFE00000
+ */
+int xlat_high_va_setup(void)
+{
+	/* Allocate the xlat_ctx_cfg for the high VA region, specific to each PE */
+	static struct xlat_ctx_cfg high_va_xlat_ctx_cfgs[MAX_CPUS];
+	/* Allocate per-cpu xlat_ctx_tbls */
+	static struct xlat_ctx_tbls high_va_tbls[MAX_CPUS];
+	/* Allocate the high VA xlat_mmap_regions, specific to each PE */
+	static struct xlat_mmap_region mm_regions_array[MAX_CPUS][MMAP_REGION_COUNT] = {
+		[0 ... MAX_CPUS-1] = {RMM_STACK_MMAP, RMM_SLOT_BUF_MMAP}};
+	/*
+	 * The base tables for all the contexts are manually allocated as a contiguous
+	 * block of memory (one L3 table per PE).
+	 */
+	static uint64_t high_va_tts[XLAT_TABLE_ENTRIES * MAX_CPUS] __aligned(XLAT_TABLES_ALIGNMENT);
+
+	unsigned int cpuid = my_cpuid();
+	int ret;
+
+	/* Set stack PA for this CPU */
+	mm_regions_array[cpuid][RMM_STACK_MMAP_IDX].base_pa =
+		rmm_get_my_stack(cpuid) - RMM_CPU_STACK_SIZE;
+
+	/* Initialize the context configuration for this CPU */
+	ret = xlat_ctx_cfg_init(&high_va_xlat_ctx_cfgs[cpuid], VA_HIGH_REGION,
+				 &mm_regions_array[cpuid][0U],
+				 MMAP_REGION_COUNT,
+				 HIGH_VA_SIZE);
+	if (!((ret == 0) || (ret == -EALREADY))) {
+		return ret;
+	}
+
+	/*
+	 * Initialize the translation tables for the current context.
+	 * This is done on the first boot of each PE.
+	 */
+	ret = xlat_ctx_init(&high_va_xlat_ctx[cpuid],
+				&high_va_xlat_ctx_cfgs[cpuid],
+				&high_va_tbls[cpuid],
+				&high_va_tts[XLAT_TABLE_ENTRIES * cpuid], 1U);
+
+	if (!((ret == 0) || (ret == -EALREADY))) {
+		return ret;
+	}
+
+	/* Configure MMU registers */
+	return xlat_arch_setup_mmu_cfg(&high_va_xlat_ctx[cpuid]);
+}
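
A worked example of the base_pa fix-up above, with the default 0x5000-byte
stacks and a hypothetical link-time rmm_stack_end of 0x88040000:

	rmm_get_my_stack(0);	/* 0x88040000: stack bottom of CPU 0 */
	/* base_pa for CPU 0: 0x88040000 - 0x5000 = 0x8803B000, i.e. the
	 * lowest mapped page of the CPU 0 stack */
	rmm_get_my_stack(1);	/* 0x8803B000: stack bottom of CPU 1 */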
diff --git a/plat/common/src/plat_common_init.c b/plat/common/src/plat_common_init.c
index 2a2afa1..2e51d7b 100644
--- a/plat/common/src/plat_common_init.c
+++ b/plat/common/src/plat_common_init.c
@@ -16,6 +16,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <xlat_contexts.h>
+#include <xlat_high_va.h>
 #include <xlat_tables.h>
 
 
@@ -179,8 +180,7 @@
 	/* Read supported GIC virtualization features and init GIC variables */
 	gic_get_virt_features();
 
-	/* Perform coold boot initialization of the slot buffer mechanism */
-	return slot_buf_coldboot_init();
+	return 0;
 }
 
 /*
@@ -202,8 +202,13 @@
 		return ret;
 	}
 
-	/* Setup the MMU cfg for the slot buffer context (high region) */
-	slot_buf_setup_xlat();
+	/* Perform warm boot initialization of the high VA region */
+	ret = xlat_high_va_setup();
+	if (ret != 0) {
+		ERROR("%s (%u): Failed to setup high VA for CPU[%u]\n",
+			__func__, __LINE__, my_cpuid());
+		return ret;
+	}
 
 	VERBOSE("xlat tables configured for CPU[%u]\n", my_cpuid());
 	return 0;
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 2df8a60..14cdd37 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -19,16 +19,6 @@
 target_compile_definitions(rmm-runtime
     PRIVATE "RSI_LOG_LEVEL=${RSI_LOG_LEVEL}")
 
-arm_config_option(
-    NAME RMM_NUM_PAGES_PER_STACK
-    HELP "Number of pages to use per CPU stack"
-    TYPE STRING
-    DEFAULT 5
-    ADVANCED)
-
-target_compile_definitions(rmm-runtime
-    PRIVATE "RMM_NUM_PAGES_PER_STACK=${RMM_NUM_PAGES_PER_STACK}")
-
 target_link_libraries(rmm-runtime
     PRIVATE rmm-lib
             rmm-platform)
diff --git a/runtime/core/aarch64/head.S b/runtime/core/aarch64/head.S
index 7d253e1..e6341d0 100644
--- a/runtime/core/aarch64/head.S
+++ b/runtime/core/aarch64/head.S
@@ -6,9 +6,6 @@
 #include <arch.h>
 #include <asm_macros.S>
 #include <rmm_el3_ifc.h>
-#include <xlat_tables.h>
-
-#define RMM_STACK_SIZE		(SZ_4K * RMM_NUM_PAGES_PER_STACK)
 
 .globl rmm_entry
 
@@ -55,7 +52,15 @@
 	mov	x0, x20
 	bl	rmm_el3_ifc_validate_cpuid
 
-	/* Setup stack on this CPU. X0 already contains the CPU Id */
+	/*
+	 * Set up the stack on this CPU. x0 already contains the CPU Id. The
+	 * stack address set here is the physical address of the stack buffer.
+	 * During RMM boot, the stack of this CPU is mapped in its high VA
+	 * space and SP is updated to the VA of the stack bottom once the MMU
+	 * is enabled. After that, accessing the physical stack address causes
+	 * an exception, as the stack buffer is not flat-mapped in the VA
+	 * space of the CPU.
+	 */
 	bl	rmm_get_my_stack
 	mov	sp, x0
 
@@ -151,24 +156,3 @@
 cold_boot_flag:
 	.dword		1
 endfunc rmm_entry
-
-/*
- * Return the stack for a given PE index in x0
- * stack-start				     stack_end
- *       o--sz---o....o--sz---o--sz---o--sz---o
- *       ^\_____/^....^\_____/^\_____/^\_____/^
- * id = (MAX_CPU-1)      2       1       0
- * Arg : x0 - CPU position
- * sz: RMM_STACK_SIZE bytes.
- */
-func rmm_get_my_stack
-#ifndef NDEBUG
-	cmp	x0, #MAX_CPUS
-	ASM_ASSERT lo
-#endif
-	adrp	x1, stack_end
-	add	x1, x1, :lo12:stack_end
-	mov	x2, #(RMM_STACK_SIZE)	/* stack size per CPU */
-	umsubl	x0, w0, w2, x1
-	ret
-endfunc rmm_get_my_stack
diff --git a/runtime/linker.lds b/runtime/linker.lds
index a850530..a8bef69 100644
--- a/runtime/linker.lds
+++ b/runtime/linker.lds
@@ -5,6 +5,10 @@
 
 #include <sizes.h>
 
+#if ((RMM_NUM_PAGES_PER_STACK == 0) || (GRANULE_SIZE == 0) || (MAX_CPUS == 0))
+#error "Required config options not set for the linker script."
+#endif
+
 ENTRY(rmm_entry)
 
 MEMORY {
@@ -67,12 +71,6 @@
 		rmm_rela_end = .;
 	} >RAM
 
-	.percpu ALIGN(GRANULE_SIZE) (NOLOAD) : {
-		stack_start = .;
-		. = . + (RMM_NUM_PAGES_PER_STACK * GRANULE_SIZE * MAX_CPUS);
-		stack_end = .;
-	} >RAM
-
 	.bss ALIGN(16) (NOLOAD) : {
 		bss_start = .;
 		*(.bss*)
@@ -81,10 +79,19 @@
 
 	. = ALIGN(GRANULE_SIZE);
 	rmm_rw_end = .;
-	rmm_end = rmm_rw_end;
 
 	ASSERT(rmm_rw_end == ALIGN(GRANULE_SIZE), "rmm_rw_end is not page aligned")
 
+	.stack ALIGN(GRANULE_SIZE) (NOLOAD) : {
+		rmm_stack_start = .;
+		. = . + (RMM_NUM_PAGES_PER_STACK * GRANULE_SIZE * MAX_CPUS);
+		rmm_stack_end = .;
+	} >RAM
+
+	rmm_end = .;
+
+	ASSERT(rmm_end == ALIGN(GRANULE_SIZE), "rmm_end is not page aligned")
+
 	/DISCARD/ : { *(.dynstr*) }
 	/DISCARD/ : { *(.dynsym*) }
 	/DISCARD/ : { *(.dynamic*) }
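
The net effect on the image layout is sketched below (per the commit
message, the flat map is configured to stop at rmm_rw_end, so the stacks
are reachable only through each CPU's high VA mapping):

	... .text/.rodata/.data/.bss ...	flat-mapped
	rmm_rw_end == rmm_stack_start		flat map ends here
	.stack (NOLOAD)				MAX_CPUS * RMM_NUM_PAGES_PER_STACK
						pages, not flat-mapped
	rmm_stack_end == rmm_end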