feat(rdv3): enable numa aware per-cpu for RD-V3-Cfg2

RD-V3-Cfg2, being a quad-chip platform, can make use of NUMA-aware
allocation within the per-cpu framework. With NUMA-aware allocation, the
platform can place per-cpu objects in memory that is local to a
particular node. On RD-V3-Cfg2, the per-cpu objects are distributed
across the different SRAMs present on the system.

Introduce platform-specific helper functions for the per-cpu framework.
Add a helper function that zero-initializes the per-cpu sections,
ensuring clean initialization of per-cpu data. Add a function that
returns the base address of the per-cpu section for a given CPU,
facilitating efficient access to per-cpu data structures. Together these
let the per-cpu framework handle platform-specific requirements.

These additions are crucial for maintaining the integrity and performance
of per-cpu operations.

Change-Id: If08169ba0de8fd7263db07d1587e598cffcf959a
Signed-off-by: Sammit Joshi <sammit.joshi@arm.com>
Signed-off-by: Rohit Mathew <rohit.mathew@arm.com>
diff --git a/plat/arm/board/neoverse_rd/common/include/nrd3/nrd_css_fw_def3.h b/plat/arm/board/neoverse_rd/common/include/nrd3/nrd_css_fw_def3.h
index bbfbe01..e4cccaa 100644
--- a/plat/arm/board/neoverse_rd/common/include/nrd3/nrd_css_fw_def3.h
+++ b/plat/arm/board/neoverse_rd/common/include/nrd3/nrd_css_fw_def3.h
@@ -118,6 +118,10 @@
 			BL32_LIMIT - BL32_BASE,				\
 			MT_MEMORY | MT_RW | MT_SECURE)
 #endif
+/*******************************************************************************
+ * Helper macros: NRD_CSS_ALIGN_TO_4K rounds a size up to a multiple of 4KB
+ ******************************************************************************/
+#define NRD_CSS_ALIGN_TO_4K(size) (((size) + 0xFFF) & ~0xFFF)
 
 #if RESET_TO_BL31
 /*******************************************************************************
diff --git a/plat/arm/board/neoverse_rd/common/include/nrd3/nrd_plat_arm_def3.h b/plat/arm/board/neoverse_rd/common/include/nrd3/nrd_plat_arm_def3.h
index 4936344..5d56cba 100644
--- a/plat/arm/board/neoverse_rd/common/include/nrd3/nrd_plat_arm_def3.h
+++ b/plat/arm/board/neoverse_rd/common/include/nrd3/nrd_plat_arm_def3.h
@@ -30,6 +30,8 @@
 					NRD_MAX_CPUS_PER_CLUSTER *	\
 					NRD_MAX_PE_PER_CPU)
 
+#define PLATFORM_NODE_CORE_COUNT	(PLATFORM_CORE_COUNT/NRD_CHIP_COUNT)
+
 /*******************************************************************************
  * PA/VA config
  ******************************************************************************/
@@ -56,8 +58,8 @@
  * chips are accessed - secure ram, css device and soc device regions.
  */
 #if defined(IMAGE_BL31)
-#  define PLAT_ARM_MMAP_ENTRIES		(10 + ((NRD_CHIP_COUNT - 1) * 3))
-#  define MAX_XLAT_TABLES		(10 + ((NRD_CHIP_COUNT - 1) * 3))
+#  define PLAT_ARM_MMAP_ENTRIES		(11 + ((NRD_CHIP_COUNT - 1) * 3))
+#  define MAX_XLAT_TABLES		(11 + ((NRD_CHIP_COUNT - 1) * 3))
 #elif defined(IMAGE_BL32)
 # define PLAT_ARM_MMAP_ENTRIES		U(8)
 # define MAX_XLAT_TABLES		U(5)
diff --git a/plat/arm/board/neoverse_rd/platform/rdv3/platform.mk b/plat/arm/board/neoverse_rd/platform/rdv3/platform.mk
index c587a55..394fe9b 100644
--- a/plat/arm/board/neoverse_rd/platform/rdv3/platform.mk
+++ b/plat/arm/board/neoverse_rd/platform/rdv3/platform.mk
@@ -53,6 +53,10 @@
 # RD-V3 uses MHUv3
 PLAT_MHU := MHUv3
 
+ifeq (${NRD_PLATFORM_VARIANT}, 2)
+override NUMA_AWARE_PER_CPU	:= 1
+endif
+
 include plat/arm/board/neoverse_rd/common/nrd-common.mk
 include drivers/arm/rse/rse_comms.mk
 include drivers/auth/mbedtls/mbedtls_common.mk
@@ -116,6 +120,7 @@
 			${RDV3_BASE}/rdv3_topology.c		\
 			${RDV3_BASE}/rdv3_plat_attest_token.c	\
 			${RDV3_BASE}/rdv3_realm_attest_key.c	\
+			${RDV3_BASE}/rdv3_per_cpu.S			\
 			drivers/arm/smmu/smmu_v3.c			\
 			drivers/cfi/v2m/v2m_flash.c			\
 			lib/psa/cca_attestation.c			\
diff --git a/plat/arm/board/neoverse_rd/platform/rdv3/rdv3_bl31_setup.c b/plat/arm/board/neoverse_rd/platform/rdv3/rdv3_bl31_setup.c
index e32e761..fec855a 100644
--- a/plat/arm/board/neoverse_rd/platform/rdv3/rdv3_bl31_setup.c
+++ b/plat/arm/board/neoverse_rd/platform/rdv3/rdv3_bl31_setup.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, Arm Limited and Contributors. All rights reserved.
+ * Copyright (c) 2024-2025, Arm Limited and Contributors. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
@@ -8,9 +8,11 @@
 #include <drivers/arm/gic600_multichip.h>
 #include <drivers/arm/rse_comms.h>
 #include <drivers/arm/smmu_v3.h>
-
+#include <lib/per_cpu/per_cpu.h>
 #include <plat/arm/common/plat_arm.h>
 #include <plat/common/platform.h>
+
+#include <nrd_css_fw_def3.h>
 #include <nrd_plat.h>
 #include <nrd_variant.h>
 #include <rdv3_rse_comms.h>
@@ -132,6 +134,29 @@
 };
 #endif /* NRD_PLATFORM_VARIANT == 2 */
 
+/* Arm arch setup, then map each remote chip's per-cpu region (NUMA only). */
+void __init bl31_plat_arch_setup(void)
+{
+	/* Run the common Arm setup on every variant, not only on Cfg2. */
+	arm_bl31_plat_arch_setup();
+
+#if (NRD_PLATFORM_VARIANT == 2) && NUMA_AWARE_PER_CPU
+	/* Node 0 uses the local per-cpu section; map chips 1..N-1, VA == PA. */
+	for (unsigned int i = 1U; i < NRD_CHIP_COUNT; i++) {
+		int ret = mmap_add_dynamic_region(
+				NRD_REMOTE_CHIP_MEM_OFFSET(i),
+				NRD_REMOTE_CHIP_MEM_OFFSET(i),
+				NRD_CSS_ALIGN_TO_4K(PER_CPU_SIZE),
+				MT_MEMORY | MT_RW | EL3_PAS);
+
+		if (ret != 0) {
+			ERROR("Failed to add per-cpu mmap (ret=%d)\n", ret);
+			panic();
+		}
+	}
+#endif /* (NRD_PLATFORM_VARIANT == 2) && NUMA_AWARE_PER_CPU */
+}
+
 void bl31_platform_setup(void)
 {
 	/*
@@ -183,6 +208,16 @@
 	}
 }
 
+/* Per-cpu section base per node: [0] is local, [1..3] are remote chips */
+const uintptr_t per_cpu_remote_section_base[] = {
+	(uintptr_t)PER_CPU_START,
+#if NUMA_AWARE_PER_CPU
+	(uintptr_t)NRD_REMOTE_CHIP_MEM_OFFSET(1),
+	(uintptr_t)NRD_REMOTE_CHIP_MEM_OFFSET(2),
+	(uintptr_t)NRD_REMOTE_CHIP_MEM_OFFSET(3)
+#endif
+};
+
 #if RESET_TO_BL31
 /*
  * The GPT library might modify the gpt regions structure to optimize
diff --git a/plat/arm/board/neoverse_rd/platform/rdv3/rdv3_per_cpu.S b/plat/arm/board/neoverse_rd/platform/rdv3/rdv3_per_cpu.S
new file mode 100644
index 0000000..f723a70
--- /dev/null
+++ b/plat/arm/board/neoverse_rd/platform/rdv3/rdv3_per_cpu.S
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2025, Arm Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef RDV3_PER_CPU_S
+#define RDV3_PER_CPU_S
+
+#include <asm_macros.S>
+#include <lib/per_cpu/per_cpu_defs.h>
+
+.globl plat_zero_init_per_cpu_section
+.globl plat_per_cpu_section_base
+
+/* ----------------------------------------------------------------------------
+ * Zero-initialise the .per_cpu section of every remote node, i.e. entries
+ * 1 .. NRD_CHIP_COUNT - 1 of per_cpu_remote_section_base[]; entry 0 is not
+ * touched here.
+ *
+ * The loop is unrolled at assembly time so that no loop state has to stay
+ * live across the zeromem calls: nothing in x0 - x3 is assumed to survive
+ * them, and only x10 (saved link register) is needed besides.
+ *
+ * Clobbers : x0 - x3, x10, x30
+ * ----------------------------------------------------------------------------
+ */
+func plat_zero_init_per_cpu_section
+	/* Stash the return address; bl below overwrites x30 */
+	mov	x10, x30
+
+	.set	.Lpcpu_node, 1
+	.rept	(NRD_CHIP_COUNT - 1)
+	/* x0 = per_cpu_remote_section_base[node], assuming 8-byte uintptr_t */
+	adrp	x2, per_cpu_remote_section_base
+	add	x2, x2, :lo12:per_cpu_remote_section_base
+	ldr	x0, [x2, #(.Lpcpu_node * 8)]
+
+	/* Zeromem requires x1 to be populated with the size */
+	ldr	x1, =__PER_CPU_SIZE__
+	bl	zeromem
+
+	.set	.Lpcpu_node, .Lpcpu_node + 1
+	.endr
+
+	mov	x30, x10
+	ret
+endfunc plat_zero_init_per_cpu_section
+
+/* ----------------------------------------------------------------------------
+ * Return the base address of the per-cpu section serving a given cpu.
+ * In       : x0 = cpu index
+ * Out      : x0 = per_cpu_remote_section_base[x0 / PLATFORM_NODE_CORE_COUNT]
+ * Clobbers : x0 - x1
+ * ----------------------------------------------------------------------------
+ */
+func plat_per_cpu_section_base
+	/* Node index: x0 = cpu index / cores per node (unsigned divide) */
+	mov	x1, #PLATFORM_NODE_CORE_COUNT
+	udiv	x0, x0, x1
+
+	/* x1 = &per_cpu_remote_section_base[0] */
+	adrp	x1, per_cpu_remote_section_base
+	add	x1, x1, :lo12:per_cpu_remote_section_base
+
+	/*
+	 * x0 = base[node index]; entries assumed 8-byte uintptr_t, hence lsl #3
+	 */
+	ldr	x0, [x1, x0, lsl #3]
+	ret
+endfunc plat_per_cpu_section_base
+
+#endif /* RDV3_PER_CPU_S*/