Emit speculation barriers after ERETs

According to Linux commit 679db70801da9fda91d26caf13bf5b5ccc74e8e8
some ARM64 CPUs may speculate past an ERET. This could be used as part
of a side-channel attack.

To mitigate the issue, emit DSB/ISB barriers after every ERET.

Add a build step which dumps the generated ELF and checks that this
holds for every ERET in the binary, to prevent regressions.

Bug: 146490856
Change-Id: Idf1c2690637a7edb4a366d30fec26ed444069f5e
diff --git a/src/arch/aarch64/exception_macros.S b/src/arch/aarch64/exception_macros.S
index ef820b7..fee454a 100644
--- a/src/arch/aarch64/exception_macros.S
+++ b/src/arch/aarch64/exception_macros.S
@@ -15,6 +15,26 @@
  */
 
 /**
+ * From Linux commit 679db70801da9fda91d26caf13bf5b5ccc74e8e8:
+ * "Some CPUs can speculate past an ERET instruction and potentially perform
+ * speculative accesses to memory before processing the exception return.
+ * Since the register state is often controlled by a lower privilege level
+ * at the point of an ERET, this could potentially be used as part of a
+ * side-channel attack."
+ *
+ * This macro emits a speculation barrier after the ERET to prevent the CPU
+ * from speculating past the exception return.
+ *
+ * ARMv8.5 introduces a dedicated SB speculative barrier instruction.
+ * Use a DSB/ISB pair on older platforms.
+ */
+.macro eret_with_sb
+	eret
+	dsb	nsh
+	isb
+.endm
+
+/**
  * Saves the volatile registers onto the stack. This currently takes 14
  * instructions, so it can be used in exception handlers with 18 instructions
  * left, 2 of which in the same cache line (assuming a 16-byte cache line).
@@ -81,16 +101,18 @@
  * Switching to SPx and calling the C handler takes 16 instructions, so it's not
  * possible to add a branch to a common exit path without going into the next
  * cache line (assuming 16-byte cache lines). Additionally, to restore and
- * return we need an additional 16 instructions, so we implement the whole
- * handler within the allotted 32 instructions.
+ * return we need an additional 16 instructions, so we could implement the whole
+ * handler within the allotted 32 instructions. However, since we want to emit
+ * a speculation barrier after each ERET, we are forced to move the ERET to
+ * a shared exit path.
  */
-.macro current_exception_sp0 elx:req handler:req
+.macro current_exception_sp0 elx:req handler:req eret_label:req
 	msr spsel, #1
 	save_volatile_to_stack \elx
 	bl \handler
 	restore_volatile_from_stack \elx
 	msr spsel, #0
-	eret
+	b \eret_label
 .endm
 
 /**
diff --git a/src/arch/aarch64/hftest/exceptions.S b/src/arch/aarch64/hftest/exceptions.S
index b54e4d2..0203110 100644
--- a/src/arch/aarch64/hftest/exceptions.S
+++ b/src/arch/aarch64/hftest/exceptions.S
@@ -25,7 +25,7 @@
 
 .balign 0x80
 irq_cur_sp0:
-	current_exception_sp0 el1 irq_current
+	current_exception_sp0 el1 irq_current exception_handler_return
 
 .balign 0x80
 fiq_cur_sp0:
@@ -113,8 +113,10 @@
 skip_elr:
 	/* Restore register spsr_el1 using x1 as scratch. */
 	ldr x1, [sp, #8 * 23]
-        msr spsr_el1, x1
+	msr spsr_el1, x1
 
 	/* Restore x0 & x1, and release stack space. */
 	ldp x0, x1, [sp], #8 * 24
-	eret
+
+exception_handler_return:
+	eret_with_sb
diff --git a/src/arch/aarch64/hypervisor/exceptions.S b/src/arch/aarch64/hypervisor/exceptions.S
index fe89da8..17a5b63 100644
--- a/src/arch/aarch64/hypervisor/exceptions.S
+++ b/src/arch/aarch64/hypervisor/exceptions.S
@@ -518,7 +518,7 @@
 	/* Restore x0..x3, which we have used as scratch before. */
 	ldp x2, x3, [x0, #VCPU_REGS + 8 * 2]
 	ldp x0, x1, [x0, #VCPU_REGS + 8 * 0]
-	eret
+	eret_with_sb
 
 .balign 0x40
 /**
@@ -526,4 +526,4 @@
  */
 restore_from_stack_and_return:
 	restore_volatile_from_stack el2
-	eret
+	eret_with_sb