Improve spinlock implementation for ARMv8.0

The current implementation of spinlocks, using C11 atomics, compiles
into an LDAXR/STXR pair which we observe does not always make forward
progress on the Cortex-A72 (the concurrent_save_restore test livelocks).

This patch inserts a WFE instruction when the LDAXR observes that the
lock is currently taken, pausing the thread until an event is signalled
and easing contention on the lock. This is observed to help forward
progress on the A72 and should also reduce the power consumption of our
spinlocks.

Note that there is still no guarantee of forward progress and threads
can be shown to livelock on the A72 if the loops are extremely tight.
This should be revisited in the future, possibly by leveraging ARMv8.1
LSE atomics.
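
For illustration only (a rough sketch, not part of this change), with
LSE available the lock word could stay a C11 atomic and the acquire
loop should then compile to a single SWPA instruction instead of an
LDAXR/STXR retry loop:

    #include <stdatomic.h>
    #include <stdint.h>

    struct spinlock {
            _Atomic uint32_t v;
    };

    static inline void sl_lock(struct spinlock *l)
    {
            /*
             * With -march=armv8.1-a (or +lse) this exchange should
             * compile to SWPA: the swap always completes, so there is
             * no exclusive-monitor retry loop to livelock in.
             */
            while (atomic_exchange_explicit(&l->v, 1,
                                            memory_order_acquire)) {
                    /* A WFE-based pause could still be added here. */
            }
    }

Spinning while the lock is genuinely held would remain, but the atomic
operation itself would no longer depend on an exclusive reservation
succeeding.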

Test: vcpu_state.concurrent_save_restore
Bug: 141087046
Change-Id: Ic838ba4d287ef4bfecc7dd84c883770d9c445b1d
diff --git a/inc/hf/spinlock.h b/inc/hf/spinlock.h
index dbd7890..80d1cfc 100644
--- a/inc/hf/spinlock.h
+++ b/inc/hf/spinlock.h
@@ -1,5 +1,5 @@
 /*
- * Copyright 2018 The Hafnium Authors.
+ * Copyright 2019 The Hafnium Authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,27 +16,18 @@
 
 #pragma once
 
-#include <stdatomic.h>
-
-struct spinlock {
-	atomic_flag v;
-};
-
-#define SPINLOCK_INIT                 \
-	{                             \
-		.v = ATOMIC_FLAG_INIT \
-	}
+/*
+ * Includes the arch-specific definition of 'struct spinlock' and
+ * implementations of:
+ *  - SPINLOCK_INIT
+ *  - sl_lock()
+ *  - sl_unlock()
+ */
+#include "hf/arch/spinlock.h"
 
 static inline void sl_init(struct spinlock *l)
 {
-	*l = (struct spinlock)SPINLOCK_INIT;
-}
-
-static inline void sl_lock(struct spinlock *l)
-{
-	while (atomic_flag_test_and_set_explicit(&l->v, memory_order_acquire)) {
-		/* do nothing */
-	}
+	*l = SPINLOCK_INIT;
 }
 
 /**
@@ -53,8 +44,3 @@
 		sl_lock(a);
 	}
 }
-
-static inline void sl_unlock(struct spinlock *l)
-{
-	atomic_flag_clear_explicit(&l->v, memory_order_release);
-}
diff --git a/src/arch/aarch64/inc/hf/arch/spinlock.h b/src/arch/aarch64/inc/hf/arch/spinlock.h
new file mode 100644
index 0000000..96b01c8
--- /dev/null
+++ b/src/arch/aarch64/inc/hf/arch/spinlock.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2019 The Hafnium Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/**
+ * Spinlock implementation using an ARMv8.0 LDAXR/STXR pair and a WFE pause.
+ *
+ * An implementation using C11 atomics also generates an LDAXR/STXR pair but
+ * no WFE. Without it, we observe that the Cortex-A72 can easily livelock and
+ * fail to make forward progress.
+ *
+ * TODO(b/141087046): Forward progress is still not guaranteed: even with WFE,
+ * the A72 can livelock in extremely tight loops. We should investigate the
+ * guarantees provided by the atomic instructions introduced in ARMv8.1 LSE.
+ */
+
+#include <stdint.h>
+
+#include "hf/arch/types.h"
+
+struct spinlock {
+	volatile uint32_t v;
+};
+
+#define SPINLOCK_INIT ((struct spinlock){.v = 0})
+
+static inline void sl_lock(struct spinlock *l)
+{
+	register uintreg_t tmp1;
+	register uintreg_t tmp2;
+
+	/*
+	 * Acquire the lock with a LDAXR/STXR pair (acquire semantics on the
+	 * load instruction). Pause using WFE if the lock is currently taken.
+	 * This is NOT guaranteed to make progress.
+	 */
+	__asm__ volatile(
+		"	mov	%w2, #1\n"
+		"	sevl\n" /* set event bit */
+		"1:	wfe\n"  /* wait for event, clear event bit */
+		"2:	ldaxr	%w1, [%0]\n"      /* load lock value */
+		"	cbnz	%w1, 1b\n"	/* if lock taken, goto WFE */
+		"	stxr	%w1, %w2, [%0]\n" /* try to take lock */
+		"	cbnz	%w1, 2b\n"	/* loop if unsuccessful */
+		: "+r"(l), "=&r"(tmp1), "=&r"(tmp2)
+		:
+		: "cc", "memory");
+}
+
+static inline void sl_unlock(struct spinlock *l)
+{
+	/*
+	 * Store zero to the lock's value with release semantics. This triggers
+	 * an event waking other threads waiting on the lock (no SEV needed).
+	 */
+	__asm__ volatile("stlr wzr, [%0]" : "+r"(l)::"cc", "memory");
+}
diff --git a/src/arch/fake/inc/hf/arch/spinlock.h b/src/arch/fake/inc/hf/arch/spinlock.h
new file mode 100644
index 0000000..db3b45c
--- /dev/null
+++ b/src/arch/fake/inc/hf/arch/spinlock.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2018 The Hafnium Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/**
+ * Generic implementation of a spinlock using C11 atomics.
+ * Forward progress is not guaranteed under contention.
+ */
+
+#include <stdatomic.h>
+
+struct spinlock {
+	atomic_flag v;
+};
+
+#define SPINLOCK_INIT ((struct spinlock){.v = ATOMIC_FLAG_INIT})
+
+static inline void sl_lock(struct spinlock *l)
+{
+	while (atomic_flag_test_and_set_explicit(&l->v, memory_order_acquire)) {
+		/* do nothing */
+	}
+}
+
+static inline void sl_unlock(struct spinlock *l)
+{
+	atomic_flag_clear_explicit(&l->v, memory_order_release);
+}