refactor(psci): unify coherency exit between AArch64 and AArch32

The procedure is fairly simple: if we have hardware assisted coherency,
call into the cpu driver and let it do its thing. If we don't, then we
must turn data caches off, handle the confusion that causes with the
stack, and call into the cpu driver which will flush the caches that
need flushing.

On AArch32 the above happens in common code. On AArch64, however, the
turning off of the caches happens in the cpu driver. Since we're dealing
with the stack, we must exercise control over it and implement this in
assembly. But as the two implementations are nominally different (in the
ordering of operations), the part that is in assembly is quite large as
jumping back to C to handle the difference might involve the stack.

Presumably, the AArch difference was introduced in order to cater for a
possible implementation where turning off the caches requires an IMP DEF
sequence. Well, Arm no longer makes cores without hardware assisted
coherency, so this eventuality is not possible.

So take this part out of the cpu driver and put it into common code,
just like in AArch32. With this, there is no longer a need to call
prepare_cpu_pwr_dwn() in a different order either - we can delay it a
bit to happen after the stack management. So the AArch64 and AArch32
flows become identical. We can convert prepare_cpu_pwr_dwn() to C and
leave psci_do_pwrdown_cache_maintenance() only to exercise control over
the stack.

Change-Id: Ie4759ebe20bb74b60533c6a47dbc2b101875900f
Signed-off-by: Boyan Karatotev <boyan.karatotev@arm.com>
diff --git a/lib/cpus/aarch32/cpu_helpers.S b/lib/cpus/aarch32/cpu_helpers.S
index 83e3e49..863448c 100644
--- a/lib/cpus/aarch32/cpu_helpers.S
+++ b/lib/cpus/aarch32/cpu_helpers.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2023, Arm Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2025, Arm Limited and Contributors. All rights reserved.
  *
  * SPDX-License-Identifier: BSD-3-Clause
  */
@@ -47,46 +47,7 @@
 
 #endif
 
-#ifdef IMAGE_BL32 /* The power down core and cluster is needed only in  BL32 */
-	/*
-	 * void prepare_cpu_pwr_dwn(unsigned int power_level)
-	 *
-	 * Prepare CPU power down function for all platforms. The function takes
-	 * a domain level to be powered down as its parameter. After the cpu_ops
-	 * pointer is retrieved from cpu_data, the handler for requested power
-	 * level is called.
-	 */
-	.globl	prepare_cpu_pwr_dwn
-func prepare_cpu_pwr_dwn
-	/*
-	 * If the given power level exceeds CPU_MAX_PWR_DWN_OPS, we call the
-	 * power down handler for the last power level
-	 */
-	mov	r2, #(CPU_MAX_PWR_DWN_OPS - 1)
-	cmp	r0, r2
-	movhi	r0, r2
-
-	push	{r0, lr}
-	bl	_cpu_data
-	pop	{r2, lr}
-
-	ldr	r0, [r0, #CPU_DATA_CPU_OPS_PTR]
-#if ENABLE_ASSERTIONS
-	cmp	r0, #0
-	ASM_ASSERT(ne)
-#endif
-
-	/* Get the appropriate power down handler */
-	mov	r1, #CPU_PWR_DWN_OPS
-	add	r1, r1, r2, lsl #2
-	ldr	r1, [r0, r1]
-#if ENABLE_ASSERTIONS
-	cmp	r1, #0
-	ASM_ASSERT(ne)
-#endif
-	bx	r1
-endfunc prepare_cpu_pwr_dwn
-
+#ifdef IMAGE_BL32
 	/*
 	 * Initializes the cpu_ops_ptr if not already initialized
 	 * in cpu_data. This must only be called after the data cache
diff --git a/lib/cpus/aarch64/aem_generic.S b/lib/cpus/aarch64/aem_generic.S
index 9843943..243f657 100644
--- a/lib/cpus/aarch64/aem_generic.S
+++ b/lib/cpus/aarch64/aem_generic.S
@@ -12,15 +12,6 @@
 
 func aem_generic_core_pwr_dwn
 	/* ---------------------------------------------
-	 * Disable the Data Cache.
-	 * ---------------------------------------------
-	 */
-	mrs	x1, sctlr_el3
-	bic	x1, x1, #SCTLR_C_BIT
-	msr	sctlr_el3, x1
-	isb
-
-	/* ---------------------------------------------
 	 * AEM model supports L3 caches in which case L2
 	 * will be private per core caches and flush
 	 * from L1 to L2 is not sufficient.
@@ -60,15 +51,6 @@
 
 func aem_generic_cluster_pwr_dwn
 	/* ---------------------------------------------
-	 * Disable the Data Cache.
-	 * ---------------------------------------------
-	 */
-	mrs	x1, sctlr_el3
-	bic	x1, x1, #SCTLR_C_BIT
-	msr	sctlr_el3, x1
-	isb
-
-	/* ---------------------------------------------
 	 * Flush all caches to PoC.
 	 * ---------------------------------------------
 	 */
diff --git a/lib/cpus/aarch64/cortex_a35.S b/lib/cpus/aarch64/cortex_a35.S
index 40e6200..bb354df 100644
--- a/lib/cpus/aarch64/cortex_a35.S
+++ b/lib/cpus/aarch64/cortex_a35.S
@@ -13,16 +13,6 @@
 
 cpu_reset_prologue cortex_a35
 	/* ---------------------------------------------
-	 * Disable L1 data cache and unified L2 cache
-	 * ---------------------------------------------
-	 */
-func cortex_a35_disable_dcache
-	sysreg_bit_clear sctlr_el3, SCTLR_C_BIT
-	isb
-	ret
-endfunc cortex_a35_disable_dcache
-
-	/* ---------------------------------------------
 	 * Disable intra-cluster coherency
 	 * ---------------------------------------------
 	 */
@@ -55,12 +45,6 @@
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	cortex_a35_disable_dcache
-
-	/* ---------------------------------------------
 	 * Flush L1 caches.
 	 * ---------------------------------------------
 	 */
@@ -79,12 +63,6 @@
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	cortex_a35_disable_dcache
-
-	/* ---------------------------------------------
 	 * Flush L1 caches.
 	 * ---------------------------------------------
 	 */
diff --git a/lib/cpus/aarch64/cortex_a53.S b/lib/cpus/aarch64/cortex_a53.S
index dbfff87..e3b69ab 100644
--- a/lib/cpus/aarch64/cortex_a53.S
+++ b/lib/cpus/aarch64/cortex_a53.S
@@ -15,16 +15,6 @@
 cpu_reset_prologue cortex_a53
 
 	/* ---------------------------------------------
-	 * Disable L1 data cache and unified L2 cache
-	 * ---------------------------------------------
-	 */
-func cortex_a53_disable_dcache
-	sysreg_bit_clear sctlr_el3, SCTLR_C_BIT
-	isb
-	ret
-endfunc cortex_a53_disable_dcache
-
-	/* ---------------------------------------------
 	 * Disable intra-cluster coherency
 	 * ---------------------------------------------
 	 */
@@ -144,12 +134,6 @@
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	cortex_a53_disable_dcache
-
-	/* ---------------------------------------------
 	 * Flush L1 caches.
 	 * ---------------------------------------------
 	 */
@@ -168,12 +152,6 @@
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	cortex_a53_disable_dcache
-
-	/* ---------------------------------------------
 	 * Flush L1 caches.
 	 * ---------------------------------------------
 	 */
diff --git a/lib/cpus/aarch64/cortex_a57.S b/lib/cpus/aarch64/cortex_a57.S
index 4a61187..18521a2 100644
--- a/lib/cpus/aarch64/cortex_a57.S
+++ b/lib/cpus/aarch64/cortex_a57.S
@@ -16,16 +16,6 @@
 cpu_reset_prologue cortex_a57
 
 	/* ---------------------------------------------
-	 * Disable L1 data cache and unified L2 cache
-	 * ---------------------------------------------
-	 */
-func cortex_a57_disable_dcache
-	sysreg_bit_clear sctlr_el3, SCTLR_C_BIT
-	isb
-	ret
-endfunc cortex_a57_disable_dcache
-
-	/* ---------------------------------------------
 	 * Disable all types of L2 prefetches.
 	 * ---------------------------------------------
 	 */
@@ -200,12 +190,6 @@
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	cortex_a57_disable_dcache
-
-	/* ---------------------------------------------
 	 * Disable the L2 prefetches.
 	 * ---------------------------------------------
 	 */
@@ -240,12 +224,6 @@
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	cortex_a57_disable_dcache
-
-	/* ---------------------------------------------
 	 * Disable the L2 prefetches.
 	 * ---------------------------------------------
 	 */
diff --git a/lib/cpus/aarch64/cortex_a72.S b/lib/cpus/aarch64/cortex_a72.S
index 23b27ab..f35f867 100644
--- a/lib/cpus/aarch64/cortex_a72.S
+++ b/lib/cpus/aarch64/cortex_a72.S
@@ -18,18 +18,6 @@
 cpu_reset_prologue cortex_a72
 
 	/* ---------------------------------------------
-	 * Disable L1 data cache and unified L2 cache
-	 * ---------------------------------------------
-	 */
-func cortex_a72_disable_dcache
-	mrs	x1, sctlr_el3
-	bic	x1, x1, #SCTLR_C_BIT
-	msr	sctlr_el3, x1
-	isb
-	ret
-endfunc cortex_a72_disable_dcache
-
-	/* ---------------------------------------------
 	 * Disable all types of L2 prefetches.
 	 * ---------------------------------------------
 	 */
@@ -177,12 +165,6 @@
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	cortex_a72_disable_dcache
-
-	/* ---------------------------------------------
 	 * Disable the L2 prefetches.
 	 * ---------------------------------------------
 	 */
@@ -223,12 +205,6 @@
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	cortex_a72_disable_dcache
-
-	/* ---------------------------------------------
 	 * Disable the L2 prefetches.
 	 * ---------------------------------------------
 	 */
diff --git a/lib/cpus/aarch64/cortex_a73.S b/lib/cpus/aarch64/cortex_a73.S
index 9cc6fdb..14f1ef8 100644
--- a/lib/cpus/aarch64/cortex_a73.S
+++ b/lib/cpus/aarch64/cortex_a73.S
@@ -13,16 +13,6 @@
 cpu_reset_prologue cortex_a73
 
 	/* ---------------------------------------------
-	 * Disable L1 data cache
-	 * ---------------------------------------------
-	 */
-func cortex_a73_disable_dcache
-	sysreg_bit_clear sctlr_el3, SCTLR_C_BIT
-	isb
-	ret
-endfunc cortex_a73_disable_dcache
-
-	/* ---------------------------------------------
 	 * Disable intra-cluster coherency
 	 * ---------------------------------------------
 	 */
@@ -123,12 +113,6 @@
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	cortex_a73_disable_dcache
-
-	/* ---------------------------------------------
 	 * Flush L1 caches.
 	 * ---------------------------------------------
 	 */
@@ -147,12 +131,6 @@
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	cortex_a73_disable_dcache
-
-	/* ---------------------------------------------
 	 * Flush L1 caches.
 	 * ---------------------------------------------
 	 */
diff --git a/lib/cpus/aarch64/cpu_helpers.S b/lib/cpus/aarch64/cpu_helpers.S
index 105da5c..1b20d5c 100644
--- a/lib/cpus/aarch64/cpu_helpers.S
+++ b/lib/cpus/aarch64/cpu_helpers.S
@@ -14,44 +14,6 @@
 #include <lib/cpus/errata.h>
 #include <lib/el3_runtime/cpu_data.h>
 
-#ifdef IMAGE_BL31 /* The power down core and cluster is needed only in  BL31 */
-	/*
-	 * void prepare_cpu_pwr_dwn(unsigned int power_level)
-	 *
-	 * Prepare CPU power down function for all platforms. The function takes
-	 * a domain level to be powered down as its parameter. After the cpu_ops
-	 * pointer is retrieved from cpu_data, the handler for requested power
-	 * level is called.
-	 */
-	.globl	prepare_cpu_pwr_dwn
-func prepare_cpu_pwr_dwn
-	/*
-	 * If the given power level exceeds CPU_MAX_PWR_DWN_OPS, we call the
-	 * power down handler for the last power level
-	 */
-	mov_imm	x2, (CPU_MAX_PWR_DWN_OPS - 1)
-	cmp	x0, x2
-	csel	x2, x2, x0, hi
-
-	mrs	x1, tpidr_el3
-	ldr	x0, [x1, #CPU_DATA_CPU_OPS_PTR]
-#if ENABLE_ASSERTIONS
-	cmp	x0, #0
-	ASM_ASSERT(ne)
-#endif
-
-	/* Get the appropriate power down handler */
-	mov	x1, #CPU_PWR_DWN_OPS
-	add	x1, x1, x2, lsl #3
-	ldr	x1, [x0, x1]
-#if ENABLE_ASSERTIONS
-	cmp	x1, #0
-	ASM_ASSERT(ne)
-#endif
-	br	x1
-endfunc prepare_cpu_pwr_dwn
-
-
 	/*
 	 * Initializes the cpu_ops_ptr if not already initialized
 	 * in cpu_data. This can be called without a runtime stack, but may
@@ -70,7 +32,6 @@
 1:
 	ret
 endfunc init_cpu_ops
-#endif /* IMAGE_BL31 */
 
 #if defined(IMAGE_BL31) && CRASH_REPORTING
 	/*
diff --git a/lib/cpus/aarch64/generic.S b/lib/cpus/aarch64/generic.S
index 0a10eed..c59575c 100644
--- a/lib/cpus/aarch64/generic.S
+++ b/lib/cpus/aarch64/generic.S
@@ -13,28 +13,10 @@
 
 cpu_reset_prologue generic
 
-	/* ---------------------------------------------
-	 * Disable L1 data cache and unified L2 cache
-	 * ---------------------------------------------
-	 */
-func generic_disable_dcache
-	mrs	x1, sctlr_el3
-	bic	x1, x1, #SCTLR_C_BIT
-	msr	sctlr_el3, x1
-	isb
-	ret
-endfunc generic_disable_dcache
-
 func generic_core_pwr_dwn
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	generic_disable_dcache
-
-	/* ---------------------------------------------
 	 * Flush L1 caches.
 	 * ---------------------------------------------
 	 */
@@ -48,12 +30,6 @@
 	mov	x18, x30
 
 	/* ---------------------------------------------
-	 * Turn off caches.
-	 * ---------------------------------------------
-	 */
-	bl	generic_disable_dcache
-
-	/* ---------------------------------------------
 	 * Flush L1 caches.
 	 * ---------------------------------------------
 	 */
diff --git a/lib/cpus/aarch64/qemu_max.S b/lib/cpus/aarch64/qemu_max.S
index a727379..7980066 100644
--- a/lib/cpus/aarch64/qemu_max.S
+++ b/lib/cpus/aarch64/qemu_max.S
@@ -12,15 +12,6 @@
 
 func qemu_max_core_pwr_dwn
 	/* ---------------------------------------------
-	 * Disable the Data Cache.
-	 * ---------------------------------------------
-	 */
-	mrs	x1, sctlr_el3
-	bic	x1, x1, #SCTLR_C_BIT
-	msr	sctlr_el3, x1
-	isb
-
-	/* ---------------------------------------------
 	 * Flush L1 cache to L2.
 	 * ---------------------------------------------
 	 */
@@ -33,15 +24,6 @@
 
 func qemu_max_cluster_pwr_dwn
 	/* ---------------------------------------------
-	 * Disable the Data Cache.
-	 * ---------------------------------------------
-	 */
-	mrs	x1, sctlr_el3
-	bic	x1, x1, #SCTLR_C_BIT
-	msr	sctlr_el3, x1
-	isb
-
-	/* ---------------------------------------------
 	 * Flush all caches to PoC.
 	 * ---------------------------------------------
 	 */
diff --git a/lib/psci/aarch32/psci_helpers.S b/lib/psci/aarch32/psci_helpers.S
index 929af8c..493715a 100644
--- a/lib/psci/aarch32/psci_helpers.S
+++ b/lib/psci/aarch32/psci_helpers.S
@@ -12,41 +12,45 @@
 	.globl	psci_do_pwrup_cache_maintenance
 
 /* -----------------------------------------------------------------------
- * void psci_do_pwrdown_cache_maintenance(unsigned int power level);
+ * void psci_do_pwrdown_cache_maintenance(void);
  *
- * This function performs cache maintenance for the specified power
- * level. The levels of cache affected are determined by the power
- * level which is passed as the argument i.e. level 0 results
- * in a flush of the L1 cache. Both the L1 and L2 caches are flushed
- * for a higher power level.
- *
- * Additionally, this function also ensures that stack memory is correctly
- * flushed out to avoid coherency issues due to a change in its memory
- * attributes after the data cache is disabled.
+ * This function turns off data caches and also ensures that stack memory
+ * is correctly flushed out to avoid coherency issues due to a change in
+ * its memory attributes.
  * -----------------------------------------------------------------------
  */
 func psci_do_pwrdown_cache_maintenance
 	push	{r4, lr}
+	bl	plat_get_my_stack
 
-	/* ----------------------------------------------
-	 * Turn OFF cache and do stack maintenance
-	 * prior to cpu operations . This sequence is
-	 * different from AArch64 because in AArch32 the
-	 * assembler routines for cpu operations utilize
-	 * the stack whereas in AArch64 it doesn't.
-	 * ----------------------------------------------
-	 */
-	mov	r4, r0
-	bl	do_stack_maintenance
+	/* Turn off the D-cache */
+	ldcopr	r1, SCTLR
+	bic	r1, #SCTLR_C_BIT
+	stcopr	r1, SCTLR
+	isb
 
 	/* ---------------------------------------------
-	 * Invoke CPU-specifc power down operations for
-	 * the appropriate level
+	 * Calculate and store the size of the used
+	 * stack memory in r1.
 	 * ---------------------------------------------
 	 */
-	mov	r0, r4
-	pop	{r4, lr}
-	b	prepare_cpu_pwr_dwn
+	mov	r4, r0
+	mov	r1, sp
+	sub	r1, r0, r1
+	mov	r0, sp
+	bl	flush_dcache_range
+
+	/* ---------------------------------------------
+	 * Calculate and store the size of the unused
+	 * stack memory in r1. Calculate and store the
+	 * stack base address in r0.
+	 * ---------------------------------------------
+	 */
+	sub	r0, r4, #PLATFORM_STACK_SIZE
+	sub	r1, sp, r0
+	bl	inv_dcache_range
+
+	pop	{r4, pc}
 endfunc psci_do_pwrdown_cache_maintenance
 
 
@@ -92,44 +96,3 @@
 
 	pop	{r12, pc}
 endfunc psci_do_pwrup_cache_maintenance
-
-	/* ---------------------------------------------
-	 * void do_stack_maintenance(void)
-	 * Do stack maintenance by flushing the used
-	 * stack to the main memory and invalidating the
-	 * remainder.
-	 * ---------------------------------------------
-	 */
-func do_stack_maintenance
-	push	{r4, lr}
-	bl	plat_get_my_stack
-
-	/* Turn off the D-cache */
-	ldcopr	r1, SCTLR
-	bic	r1, #SCTLR_C_BIT
-	stcopr	r1, SCTLR
-	isb
-
-	/* ---------------------------------------------
-	 * Calculate and store the size of the used
-	 * stack memory in r1.
-	 * ---------------------------------------------
-	 */
-	mov	r4, r0
-	mov	r1, sp
-	sub	r1, r0, r1
-	mov	r0, sp
-	bl	flush_dcache_range
-
-	/* ---------------------------------------------
-	 * Calculate and store the size of the unused
-	 * stack memory in r1. Calculate and store the
-	 * stack base address in r0.
-	 * ---------------------------------------------
-	 */
-	sub	r0, r4, #PLATFORM_STACK_SIZE
-	sub	r1, sp, r0
-	bl	inv_dcache_range
-
-	pop	{r4, pc}
-endfunc do_stack_maintenance
diff --git a/lib/psci/aarch64/psci_helpers.S b/lib/psci/aarch64/psci_helpers.S
index 4da2f69..ce8adc2 100644
--- a/lib/psci/aarch64/psci_helpers.S
+++ b/lib/psci/aarch64/psci_helpers.S
@@ -14,29 +14,22 @@
 	.globl	psci_do_pwrup_cache_maintenance
 
 /* -----------------------------------------------------------------------
- * void psci_do_pwrdown_cache_maintenance(unsigned int power level);
+ * void psci_do_pwrdown_cache_maintenance(void);
  *
- * This function performs cache maintenance for the specified power
- * level. The levels of cache affected are determined by the power
- * level which is passed as the argument i.e. level 0 results
- * in a flush of the L1 cache. Both the L1 and L2 caches are flushed
- * for a higher power level.
- *
- * Additionally, this function also ensures that stack memory is correctly
- * flushed out to avoid coherency issues due to a change in its memory
- * attributes after the data cache is disabled.
+ * This function turns off data caches and also ensures that stack memory
+ * is correctly flushed out to avoid coherency issues due to a change in
+ * its memory attributes.
  * -----------------------------------------------------------------------
  */
 func psci_do_pwrdown_cache_maintenance
 	stp     x29, x30, [sp,#-16]!
 	stp     x19, x20, [sp,#-16]!
 
-	/* ---------------------------------------------
-	 * Invoke CPU-specific power down operations for
-	 * the appropriate level
-	 * ---------------------------------------------
-	 */
-	bl	prepare_cpu_pwr_dwn
+	/* Disable L1 data cache and unified L2 cache */
+	mrs	x1, sctlr_el3
+	bic	x1, x1, #SCTLR_C_BIT
+	msr	sctlr_el3, x1
+	isb
 
 	/* ---------------------------------------------
 	 * Do stack maintenance by flushing the used
diff --git a/lib/psci/psci_common.c b/lib/psci/psci_common.c
index 7c83a79..15a32f1 100644
--- a/lib/psci/psci_common.c
+++ b/lib/psci/psci_common.c
@@ -1196,6 +1196,26 @@
 	return (n_valid > 1U) ? 1 : 0;
 }
 
+/* Run the cpu driver's power down handler for the given power level. */
+static void call_cpu_pwr_dwn(unsigned int power_level)
+{
+	struct cpu_ops *ops = get_cpu_data(cpu_ops_ptr);
+
+	/* Levels beyond the last handler use the last available handler */
+	if (power_level > CPU_MAX_PWR_DWN_OPS - 1) {
+		power_level = CPU_MAX_PWR_DWN_OPS - 1;
+	}
+	assert(ops != NULL);
+	assert(ops->pwr_dwn_ops[power_level] != NULL);
+
+	/* This function is void: call the handler, don't 'return' it */
+	ops->pwr_dwn_ops[power_level]();
+}
+
+/* Entry point of the power down path; forwards to call_cpu_pwr_dwn(). */
+static void prepare_cpu_pwr_dwn(unsigned int power_level)
+{
+	call_cpu_pwr_dwn(power_level);
+}
+
 /*******************************************************************************
  * Initiate power down sequence, by calling power down operations registered for
  * this CPU.
@@ -1213,26 +1233,24 @@
 		PMF_CACHE_MAINT);
 #endif
 
-#if HW_ASSISTED_COHERENCY
+#if !HW_ASSISTED_COHERENCY
 	/*
-	 * With hardware-assisted coherency, the CPU drivers only initiate the
-	 * power down sequence, without performing cache-maintenance operations
-	 * in software. Data caches enabled both before and after this call.
-	 */
-	prepare_cpu_pwr_dwn(power_level);
-#else
-	/*
-	 * Without hardware-assisted coherency, the CPU drivers disable data
-	 * caches, then perform cache-maintenance operations in software.
+	 * Disable data caching and handle the stack's cache maintenance.
 	 *
-	 * This also calls prepare_cpu_pwr_dwn() to initiate power down
-	 * sequence, but that function will return with data caches disabled.
-	 * We must ensure that the stack memory is flushed out to memory before
-	 * we start popping from it again.
+	 * If the core can't automatically exit coherency, the cpu driver needs
+	 * to flush caches and exit coherency. We can't do this with data caches
+	 * enabled. The cpu driver will decide which caches to flush based on
+	 * the power level.
+	 *
+	 * If automatic coherency management is possible, we can keep data
+	 * caches on until the very end and let hardware do cache maintenance.
 	 */
-	psci_do_pwrdown_cache_maintenance(power_level);
+	psci_do_pwrdown_cache_maintenance();
 #endif
 
+	/* Initiate the power down sequence by calling into the cpu driver. */
+	prepare_cpu_pwr_dwn(power_level);
+
 #if ENABLE_RUNTIME_INSTRUMENTATION
 	PMF_CAPTURE_TIMESTAMP(rt_instr_svc,
 		RT_INSTR_EXIT_CFLUSH,
diff --git a/lib/psci/psci_private.h b/lib/psci/psci_private.h
index f3f5a5c..d4c6415 100644
--- a/lib/psci/psci_private.h
+++ b/lib/psci/psci_private.h
@@ -323,13 +323,6 @@
 bool psci_is_last_on_cpu(unsigned int my_idx);
 int psci_spd_migrate_info(u_register_t *mpidr);
 
-/*
- * CPU power down is directly called only when HW_ASSISTED_COHERENCY is
- * available. Otherwise, this needs post-call stack maintenance, which is
- * handled in assembly.
- */
-void prepare_cpu_pwr_dwn(unsigned int power_level);
-
 /* This function applies various CPU errata during power down. */
 void apply_cpu_pwr_dwn_errata(void);
 
@@ -351,7 +344,7 @@
 void psci_cpu_suspend_to_powerdown_finish(unsigned int cpu_idx, unsigned int max_off_lvl, const psci_power_state_t *state_info);
 
 /* Private exported functions from psci_helpers.S */
-void psci_do_pwrdown_cache_maintenance(unsigned int pwr_level);
+void psci_do_pwrdown_cache_maintenance(void);
 void psci_do_pwrup_cache_maintenance(void);
 
 /* Private exported functions from psci_system_off.c */