Export policy for selecting message recipient.

Previously this was a first come first served policy but the scheduler
may be in the position to make a better choice. This also conforms to
our intent of exporting policy outside of the hypervisor.

Change-Id: I8cee6ce9b976e5ed990616c896cd53ecd0f083c8
diff --git a/driver/linux b/driver/linux
index 2bc0a32..71f5736 160000
--- a/driver/linux
+++ b/driver/linux
@@ -1 +1 @@
-Subproject commit 2bc0a3251c74126960108bfeaae300b73e0a9238
+Subproject commit 71f57364e99605d8c881649a34bbf4890051ed71
diff --git a/inc/hf/cpu.h b/inc/hf/cpu.h
index 6b4705a..6f358cd 100644
--- a/inc/hf/cpu.h
+++ b/inc/hf/cpu.h
@@ -63,11 +63,6 @@
 	uint32_t enabled_and_pending_count;
 };
 
-struct retval_state {
-	uintptr_t value;
-	bool force;
-};
-
 struct vcpu_fault_info {
 	ipaddr_t ipaddr;
 	vaddr_t vaddr;
@@ -78,19 +73,18 @@
 
 struct vcpu {
 	struct spinlock lock;
-	enum vcpu_state state;
-	struct cpu *cpu;
-	struct vm *vm;
-	struct vcpu *mailbox_next;
-	struct arch_regs regs;
-	struct interrupts interrupts;
 
 	/*
-	 * The following field is used to force a return value to be set the
-	 * next time a vCPU belonging to a secondary VM runs. For primary VMs,
-	 * 'regs' can be set directly.
+	 * The state is only changed in the context of the vCPU being run. This
+	 * ensures the scheduler can easily keep track of the vCPU state as
+	 * transitions are indicated by the return code from the run call.
 	 */
-	struct retval_state retval;
+	enum vcpu_state state;
+
+	struct cpu *cpu;
+	struct vm *vm;
+	struct arch_regs regs;
+	struct interrupts interrupts;
 
 	/*
 	 * Determine whether the 'regs' field is available for use. This is set
diff --git a/inc/hf/vm.h b/inc/hf/vm.h
index 0eb8324..b3b7492 100644
--- a/inc/hf/vm.h
+++ b/inc/hf/vm.h
@@ -57,7 +57,6 @@
 	int16_t recv_bytes;
 	void *recv;
 	const void *send;
-	struct vcpu *recv_waiter;
 
 	/**
 	 * List of wait_entry structs representing VMs that want to be notified
diff --git a/inc/vmapi/hf/abi.h b/inc/vmapi/hf/abi.h
index b51d3a5..c24700c 100644
--- a/inc/vmapi/hf/abi.h
+++ b/inc/vmapi/hf/abi.h
@@ -36,34 +36,38 @@
 	/**
 	 * The vCPU is blocked waiting for an interrupt. The scheduler MUST take
 	 * it off the run queue and not call `hf_vcpu_run` on the vCPU until it
-	 * has injected an interrupt, sent it a message, or got a
-	 * `HF_VCPU_RUN_WAKE_UP` for it from another vCPU.
+	 * has injected an interrupt, received `HF_VCPU_RUN_WAKE_UP` for it
+	 * from another vCPU or the timeout provided in
+	 * `hf_vcpu_run_return.sleep` is not `HF_SLEEP_INDEFINITE` and the
+	 * specified duration has expired.
 	 */
 	HF_VCPU_RUN_WAIT_FOR_INTERRUPT = 2,
 
 	/**
-	 * The vCPU would like `hf_vcpu_run` to be called on another vCPU,
-	 * specified by `hf_vcpu_run_return.wake_up`. The scheduler MUST
-	 * either wake the vCPU in question up if it is blocked, or preempt and
-	 * re-run it if it is already running somewhere. This gives Hafnium a
-	 * chance to update any CPU state which might have changed.
+	 * The vCPU is blocked waiting for a message. The scheduler MUST take it
+	 * off the run queue and not call `hf_vcpu_run` on the vCPU until it has
+	 * injected an interrupt, sent it a message, or received
+	 * `HF_VCPU_RUN_WAKE_UP` for it from another vCPU or
+	 * the timeout provided in `hf_vcpu_run_return.sleep` is not
+	 * `HF_SLEEP_INDEFINITE` and the specified duration has expired.
 	 */
-	HF_VCPU_RUN_WAKE_UP = 3,
+	HF_VCPU_RUN_WAIT_FOR_MESSAGE = 3,
 
 	/**
-	 * A new message is available for the scheduler VM, as specified by
-	 * `hf_vcpu_run_return.message`.
+	 * Hafnium would like `hf_vcpu_run` to be called on another vCPU,
+	 * specified by `hf_vcpu_run_return.wake_up`. The scheduler MUST either
+	 * wake the vCPU in question up if it is blocked, or preempt and re-run
+	 * it if it is already running somewhere. This gives Hafnium a chance to
+	 * update any CPU state which might have changed.
 	 */
-	HF_VCPU_RUN_MESSAGE = 4,
+	HF_VCPU_RUN_WAKE_UP = 4,
 
 	/**
-	 * Like `HF_VCPU_RUN_WAIT_FOR_INTERRUPT`, but for a limited amount of
-	 * time, specified by `hf_vcpu_run_return.sleep`. After at least that
-	 * amount of time has passed, or any of the events listed for
-	 * `HF_VCPU_RUN_WAIT_FOR_INTERRUPT` occur, the scheduler MUST call
-	 * `hf_vcpu_run` on it again.
+	 * A message has been sent by the vCPU. The scheduler MUST run a vCPU
+	 * from the recipient VM and priority SHOULD be given to those vCPUs
+	 * that are waiting for a message.
 	 */
-	HF_VCPU_RUN_SLEEP = 5,
+	HF_VCPU_RUN_MESSAGE = 5,
 
 	/**
 	 * The vCPU has made the mailbox writable and there are pending waiters.
@@ -88,6 +92,7 @@
 			uint16_t vcpu;
 		} wake_up;
 		struct {
+			uint16_t vm_id;
 			uint32_t size;
 		} message;
 		struct {
@@ -135,8 +140,10 @@
 		break;
 	case HF_VCPU_RUN_MESSAGE:
 		ret |= (uint64_t)res.message.size << 32;
+		ret |= (uint64_t)res.message.vm_id << 16;
 		break;
-	case HF_VCPU_RUN_SLEEP:
+	case HF_VCPU_RUN_WAIT_FOR_INTERRUPT:
+	case HF_VCPU_RUN_WAIT_FOR_MESSAGE:
 		ret |= res.sleep.ns << 8;
 		break;
 	default:
@@ -163,8 +170,10 @@
 		break;
 	case HF_VCPU_RUN_MESSAGE:
 		ret.message.size = res >> 32;
+		ret.message.vm_id = (res >> 16) & 0xffff;
 		break;
-	case HF_VCPU_RUN_SLEEP:
+	case HF_VCPU_RUN_WAIT_FOR_INTERRUPT:
+	case HF_VCPU_RUN_WAIT_FOR_MESSAGE:
 		ret.sleep.ns = res >> 8;
 		break;
 	default:
diff --git a/inc/vmapi/hf/call.h b/inc/vmapi/hf/call.h
index 7159379..5c8b1c6 100644
--- a/inc/vmapi/hf/call.h
+++ b/inc/vmapi/hf/call.h
@@ -115,12 +115,7 @@
  * If the recipient's receive buffer is busy, it can optionally register the
  * caller to be notified when the recipient's receive buffer becomes available.
  *
- * Returns -1 on failure, and on success either:
- *  - 0, if the caller is a secondary VM
- *  - the ID of the vCPU to run to receive the message, if the caller is the
- *    primary VM.
- *  - HF_INVALID_VCPU if the caller is the primary VM and no vCPUs on the target
- *    VM are currently waiting to receive a message.
+ * Returns -1 on failure and 0 on success.
  */
 static inline int64_t hf_mailbox_send(uint32_t vm_id, size_t size, bool notify)
 {
diff --git a/inc/vmapi/hf/types.h b/inc/vmapi/hf/types.h
index a575d4a6..5826e70 100644
--- a/inc/vmapi/hf/types.h
+++ b/inc/vmapi/hf/types.h
@@ -38,7 +38,9 @@
 
 /* Invalid values for fields to indicate absence or errors. */
 #define HF_INVALID_VM_ID 0xffffffff
-#define HF_INVALID_VCPU 0xffff
+
+/* Sleep value for an indefinite period of time. */
+#define HF_SLEEP_INDEFINITE 0xffffffffffffff
 
 /** The number of virtual interrupt IDs which are supported. */
 #define HF_NUM_INTIDS 64
diff --git a/src/abi_test.cc b/src/abi_test.cc
index 6d1265b..12dde09 100644
--- a/src/abi_test.cc
+++ b/src/abi_test.cc
@@ -95,7 +95,20 @@
 {
 	struct hf_vcpu_run_return res = dirty_vcpu_run_return();
 	res.code = HF_VCPU_RUN_WAIT_FOR_INTERRUPT;
-	EXPECT_THAT(hf_vcpu_run_return_encode(res), Eq(2));
+	res.sleep.ns = HF_SLEEP_INDEFINITE;
+	EXPECT_THAT(hf_vcpu_run_return_encode(res), Eq(0xffffffffffffff02));
+}
+
+/**
+ * Encoding wait-for-interrupt response with too large sleep duration will drop
+ * the top octet.
+ */
+TEST(abi, hf_vcpu_run_return_encode_wait_for_interrupt_sleep_too_long)
+{
+	struct hf_vcpu_run_return res = dirty_vcpu_run_return();
+	res.code = HF_VCPU_RUN_WAIT_FOR_INTERRUPT;
+	res.sleep.ns = 0xcc22888888888888;
+	EXPECT_THAT(hf_vcpu_run_return_encode(res), Eq(0x2288888888888802));
 }
 
 /**
@@ -106,6 +119,41 @@
 	struct hf_vcpu_run_return res =
 		hf_vcpu_run_return_decode(0x1234abcdbadb0102);
 	EXPECT_THAT(res.code, Eq(HF_VCPU_RUN_WAIT_FOR_INTERRUPT));
+	EXPECT_THAT(res.sleep.ns, Eq(0x1234abcdbadb01));
+}
+
+/**
+ * Encode wait-for-message response without leaking.
+ */
+TEST(abi, hf_vcpu_run_return_encode_wait_for_message)
+{
+	struct hf_vcpu_run_return res = dirty_vcpu_run_return();
+	res.code = HF_VCPU_RUN_WAIT_FOR_MESSAGE;
+	res.sleep.ns = HF_SLEEP_INDEFINITE;
+	EXPECT_THAT(hf_vcpu_run_return_encode(res), Eq(0xffffffffffffff03));
+}
+
+/**
+ * Encoding wait-for-message response with too large sleep duration will drop
+ * the top octet.
+ */
+TEST(abi, hf_vcpu_run_return_encode_wait_for_message_sleep_too_long)
+{
+	struct hf_vcpu_run_return res = dirty_vcpu_run_return();
+	res.code = HF_VCPU_RUN_WAIT_FOR_MESSAGE;
+	res.sleep.ns = 0xaa99777777777777;
+	EXPECT_THAT(hf_vcpu_run_return_encode(res), Eq(0x9977777777777703));
+}
+
+/**
+ * Decode a wait-for-message response ignoring the irrelevant bits.
+ */
+TEST(abi, hf_vcpu_run_return_decode_wait_for_message)
+{
+	struct hf_vcpu_run_return res =
+		hf_vcpu_run_return_decode(0x12347654badb0103);
+	EXPECT_THAT(res.code, Eq(HF_VCPU_RUN_WAIT_FOR_MESSAGE));
+	EXPECT_THAT(res.sleep.ns, Eq(0x12347654badb01));
 }
 
 /**
@@ -117,7 +165,7 @@
 	res.code = HF_VCPU_RUN_WAKE_UP;
 	res.wake_up.vm_id = 0x12345678;
 	res.wake_up.vcpu = 0xabcd;
-	EXPECT_THAT(hf_vcpu_run_return_encode(res), Eq(0x12345678abcd0003));
+	EXPECT_THAT(hf_vcpu_run_return_encode(res), Eq(0x12345678abcd0004));
 }
 
 /**
@@ -126,7 +174,7 @@
 TEST(abi, hf_vcpu_run_return_decode_wake_up)
 {
 	struct hf_vcpu_run_return res =
-		hf_vcpu_run_return_decode(0xbeefd00df00daf03);
+		hf_vcpu_run_return_decode(0xbeefd00df00daf04);
 	EXPECT_THAT(res.code, Eq(HF_VCPU_RUN_WAKE_UP));
 	EXPECT_THAT(res.wake_up.vm_id, Eq(0xbeefd00d));
 	EXPECT_THAT(res.wake_up.vcpu, Eq(0xf00d));
@@ -140,7 +188,8 @@
 	struct hf_vcpu_run_return res = dirty_vcpu_run_return();
 	res.code = HF_VCPU_RUN_MESSAGE;
 	res.message.size = 0xdeadbeef;
-	EXPECT_THAT(hf_vcpu_run_return_encode(res), Eq(0xdeadbeef00000004));
+	res.message.vm_id = 0xf007;
+	EXPECT_THAT(hf_vcpu_run_return_encode(res), Eq(0xdeadbeeff0070005));
 }
 
 /**
@@ -149,43 +198,10 @@
 TEST(abi, hf_vcpu_run_return_decode_message)
 {
 	struct hf_vcpu_run_return res =
-		hf_vcpu_run_return_decode(0x1123581314916204);
+		hf_vcpu_run_return_decode(0x1123581314916205);
 	EXPECT_THAT(res.code, Eq(HF_VCPU_RUN_MESSAGE));
 	EXPECT_THAT(res.message.size, Eq(0x11235813));
-}
-
-/**
- * Encode sleep response without leaking.
- */
-TEST(abi, hf_vcpu_run_return_encode_sleep)
-{
-	struct hf_vcpu_run_return res = dirty_vcpu_run_return();
-	res.code = HF_VCPU_RUN_SLEEP;
-	res.sleep.ns = 0xcafed00dfeeded;
-	EXPECT_THAT(hf_vcpu_run_return_encode(res), Eq(0xcafed00dfeeded05));
-}
-
-/**
- * Encoding a sleep response with too large a sleep duration will drop the top
- * octet.
- */
-TEST(abi, hf_vcpu_run_return_encode_sleep_too_long)
-{
-	struct hf_vcpu_run_return res = dirty_vcpu_run_return();
-	res.code = HF_VCPU_RUN_SLEEP;
-	res.sleep.ns = 0xcc88888888888888;
-	EXPECT_THAT(hf_vcpu_run_return_encode(res), Eq(0x8888888888888805));
-}
-
-/**
- * Decode a sleep response.
- */
-TEST(abi, hf_vcpu_run_return_decode_sleep)
-{
-	struct hf_vcpu_run_return res =
-		hf_vcpu_run_return_decode(0x1a2b3c4d5e6f7705);
-	EXPECT_THAT(res.code, Eq(HF_VCPU_RUN_SLEEP));
-	EXPECT_THAT(res.sleep.ns, Eq(0x1a2b3c4d5e6f77));
+	EXPECT_THAT(res.message.vm_id, Eq(0x1491));
 }
 
 /**
diff --git a/src/api.c b/src/api.c
index 31943ca..938d978 100644
--- a/src/api.c
+++ b/src/api.c
@@ -73,10 +73,18 @@
 	 * If the secondary is blocked but has a timer running, sleep until the
 	 * timer fires rather than indefinitely.
 	 */
-	if (primary_ret.code == HF_VCPU_RUN_WAIT_FOR_INTERRUPT &&
-	    arch_timer_enabled_current()) {
-		primary_ret.code = HF_VCPU_RUN_SLEEP;
-		primary_ret.sleep.ns = arch_timer_remaining_ns_current();
+	switch (primary_ret.code) {
+	case HF_VCPU_RUN_WAIT_FOR_INTERRUPT:
+	case HF_VCPU_RUN_WAIT_FOR_MESSAGE:
+		primary_ret.sleep.ns =
+			arch_timer_enabled_current()
+				? arch_timer_remaining_ns_current()
+				: HF_SLEEP_INDEFINITE;
+		break;
+
+	default:
+		/* Do nothing. */
+		break;
 	}
 
 	/* Set the return value for the primary VM's call to HF_VCPU_RUN. */
@@ -129,7 +137,7 @@
 	};
 
 	if (current->vm->id == HF_PRIMARY_VM_ID) {
-		/* Noop on the primary as it makes the scheduling decisions.  */
+		/* Noop on the primary as it makes the scheduling decisions. */
 		return NULL;
 	}
 
@@ -250,31 +258,9 @@
 {
 	uint32_t intid_index = intid / INTERRUPT_REGISTER_BITS;
 	uint32_t intid_mask = 1u << (intid % INTERRUPT_REGISTER_BITS);
-	bool need_vm_lock;
 	int64_t ret = 0;
 
 	sl_lock(&target_vcpu->lock);
-	/*
-	 * If we need the target_vm lock we need to release the target_vcpu lock
-	 * first to maintain the correct order of locks. In-between releasing
-	 * and acquiring it again the state of the vCPU could change in such a
-	 * way that we don't actually need to touch the target_vm after all, but
-	 * that's alright: we'll take the target_vm lock anyway, but it's safe,
-	 * just perhaps a little slow in this unusual case. The reverse is not
-	 * possible: if need_vm_lock is false, we don't release the target_vcpu
-	 * lock until we are done, so nothing should change in such as way that
-	 * we need the VM lock after all.
-	 */
-	need_vm_lock =
-		(target_vcpu->interrupts.interrupt_enabled[intid_index] &
-		 ~target_vcpu->interrupts.interrupt_pending[intid_index] &
-		 intid_mask) &&
-		target_vcpu->state == vcpu_state_blocked_mailbox;
-	if (need_vm_lock) {
-		sl_unlock(&target_vcpu->lock);
-		sl_lock(&target_vm->lock);
-		sl_lock(&target_vcpu->lock);
-	}
 
 	/*
 	 * We only need to change state and (maybe) trigger a virtual IRQ if it
@@ -301,43 +287,6 @@
 		goto out;
 	}
 
-	if (target_vcpu->state == vcpu_state_blocked_interrupt) {
-		target_vcpu->state = vcpu_state_ready;
-	} else if (target_vcpu->state == vcpu_state_blocked_mailbox) {
-		/*
-		 * need_vm_lock must be true if this path is taken, so if you
-		 * change the condition here or those leading up to it make sure
-		 * to update the need_vm_lock logic above to match.
-		 */
-
-		/* Take target vCPU out of mailbox recv_waiter list. */
-		/*
-		 * TODO: Consider using a doubly-linked list for the receive
-		 * waiter list to avoid the linear search here.
-		 */
-		struct vcpu **previous_next_pointer =
-			&target_vm->mailbox.recv_waiter;
-		while (*previous_next_pointer != NULL &&
-		       *previous_next_pointer != target_vcpu) {
-			/*
-			 * TODO(qwandor): Do we need to lock the vCPUs somehow
-			 * while we walk the linked list, or is the VM lock
-			 * enough?
-			 */
-			previous_next_pointer =
-				&(*previous_next_pointer)->mailbox_next;
-		}
-		if (*previous_next_pointer == NULL) {
-			dlog("Target VCPU state is vcpu_state_blocked_mailbox "
-			     "but is not in VM mailbox waiter list. This "
-			     "should never happen.\n");
-		} else {
-			*previous_next_pointer = target_vcpu->mailbox_next;
-		}
-
-		target_vcpu->state = vcpu_state_ready;
-	}
-
 	if (current->vm->id == HF_PRIMARY_VM_ID) {
 		/*
 		 * If the call came from the primary VM, let it know that it
@@ -362,9 +311,6 @@
 	target_vcpu->interrupts.interrupt_pending[intid_index] |= intid_mask;
 
 	sl_unlock(&target_vcpu->lock);
-	if (need_vm_lock) {
-		sl_unlock(&target_vm->lock);
-	}
 
 	return ret;
 }
@@ -374,12 +320,56 @@
  * value needs to be forced onto the vCPU.
  */
 static bool api_vcpu_prepare_run(const struct vcpu *current, struct vcpu *vcpu,
-				 struct retval_state *vcpu_retval,
 				 struct hf_vcpu_run_return *run_ret)
 {
+	bool need_vm_lock;
 	bool ret;
 
-	sl_lock(&vcpu->lock);
+	/*
+	 * Wait until the registers become available. All locks must be
+	 * released between iterations of this loop to avoid potential deadlocks
+	 * if, on any path, a lock needs to be taken after taking the decision
+	 * to switch context but before the registers have been saved.
+	 *
+	 * The VM lock is not needed in the common case so it must only be taken
+	 * when it is going to be needed. This ensures there are no inter-vCPU
+	 * dependencies in the common run case meaning the sensitive context
+	 * switch performance is consistent.
+	 */
+	for (;;) {
+		sl_lock(&vcpu->lock);
+
+		/* The VM needs to be locked to deliver mailbox messages. */
+		need_vm_lock = vcpu->state == vcpu_state_blocked_mailbox;
+		if (need_vm_lock) {
+			sl_unlock(&vcpu->lock);
+			sl_lock(&vcpu->vm->lock);
+			sl_lock(&vcpu->lock);
+		}
+
+		if (vcpu->regs_available) {
+			break;
+		}
+
+		if (vcpu->state == vcpu_state_running) {
+			/*
+			 * vCPU is running on another pCPU.
+			 *
+			 * It's ok to not return the sleep duration here because
+			 * the other physical CPU that is currently running this
+			 * vCPU will return the sleep duration if needed. The default
+			 * return value is HF_VCPU_RUN_WAIT_FOR_INTERRUPT, so no
+			 * need to set it explicitly.
+			 */
+			ret = false;
+			goto out;
+		}
+
+		sl_unlock(&vcpu->lock);
+		if (need_vm_lock) {
+			sl_unlock(&vcpu->vm->lock);
+		}
+	}
 
 	if (atomic_load_explicit(&vcpu->vm->aborting, memory_order_relaxed)) {
 		if (vcpu->state != vcpu_state_aborted) {
@@ -391,39 +381,37 @@
 		goto out;
 	}
 
-	/*
-	 * Wait until the registers become available. Care must be taken when
-	 * looping on this: it shouldn't be done while holding other locks to
-	 * avoid deadlocks.
-	 */
-	while (!vcpu->regs_available) {
-		if (vcpu->state == vcpu_state_running) {
-			/*
-			 * vCPU is running on another pCPU.
-			 *
-			 * It's ok to not return HF_VCPU_RUN_SLEEP here because
-			 * the other physical CPU that is currently running this
-			 * vcpu will return HF_VCPU_RUN_SLEEP if neeed. The
-			 * default return value is
-			 * HF_VCPU_RUN_WAIT_FOR_INTERRUPT, so no need to set it
-			 * explicitly.
-			 */
-			ret = false;
-			goto out;
-		}
-
-		sl_unlock(&vcpu->lock);
-		sl_lock(&vcpu->lock);
-	}
-
 	switch (vcpu->state) {
 	case vcpu_state_running:
 	case vcpu_state_off:
 	case vcpu_state_aborted:
 		ret = false;
 		goto out;
-	case vcpu_state_blocked_interrupt:
+
 	case vcpu_state_blocked_mailbox:
+		/*
+		 * A pending message allows the vCPU to run so the message can
+		 * be delivered directly.
+		 */
+		if (vcpu->vm->mailbox.state == mailbox_state_received) {
+			arch_regs_set_retval(
+				&vcpu->regs,
+				hf_mailbox_receive_return_encode((
+					struct hf_mailbox_receive_return){
+					.vm_id = vcpu->vm->mailbox.recv_from_id,
+					.size = vcpu->vm->mailbox.recv_bytes,
+				}));
+			vcpu->vm->mailbox.state = mailbox_state_read;
+			break;
+		}
+		/* Fall through. */
+	case vcpu_state_blocked_interrupt:
+		/* Allow virtual interrupts to be delivered. */
+		if (vcpu->interrupts.enabled_and_pending_count > 0) {
+			break;
+		}
+
+		/* The timer expired so allow the interrupt to be delivered. */
 		if (arch_timer_pending(&vcpu->regs)) {
 			break;
 		}
@@ -433,31 +421,25 @@
 		 * the primary which called vcpu_run.
 		 */
 		if (arch_timer_enabled(&vcpu->regs)) {
-			run_ret->code = HF_VCPU_RUN_SLEEP;
+			run_ret->code =
+				vcpu->state == vcpu_state_blocked_mailbox
+					? HF_VCPU_RUN_WAIT_FOR_MESSAGE
+					: HF_VCPU_RUN_WAIT_FOR_INTERRUPT;
 			run_ret->sleep.ns =
 				arch_timer_remaining_ns(&vcpu->regs);
 		}
 
 		ret = false;
 		goto out;
+
 	case vcpu_state_ready:
 		break;
 	}
-	/*
-	 * If we made it to here then either the state was vcpu_state_ready or
-	 * the timer is pending, so the vCPU should run to handle the timer
-	 * firing.
-	 */
 
+	/* It has been decided that the vCPU should be run. */
 	vcpu->cpu = current->cpu;
 	vcpu->state = vcpu_state_running;
 
-	/* Fetch return value to inject into vCPU if there is one. */
-	*vcpu_retval = vcpu->retval;
-	if (vcpu_retval->force) {
-		vcpu->retval.force = false;
-	}
-
 	/*
 	 * Mark the registers as unavailable now that we're about to reflect
 	 * them onto the real registers. This will also prevent another physical
@@ -469,6 +451,10 @@
 
 out:
 	sl_unlock(&vcpu->lock);
+	if (need_vm_lock) {
+		sl_unlock(&vcpu->vm->lock);
+	}
+
 	return ret;
 }
 
@@ -481,9 +467,9 @@
 {
 	struct vm *vm;
 	struct vcpu *vcpu;
-	struct retval_state vcpu_retval;
 	struct hf_vcpu_run_return ret = {
 		.code = HF_VCPU_RUN_WAIT_FOR_INTERRUPT,
+		.sleep.ns = HF_SLEEP_INDEFINITE,
 	};
 
 	/* Only the primary VM can switch vcpus. */
@@ -509,7 +495,7 @@
 
 	/* Update state if allowed. */
 	vcpu = &vm->vcpus[vcpu_idx];
-	if (!api_vcpu_prepare_run(current, vcpu, &vcpu_retval, &ret)) {
+	if (!api_vcpu_prepare_run(current, vcpu, &ret)) {
 		goto out;
 	}
 
@@ -543,11 +529,6 @@
 	 */
 	ret.code = HF_VCPU_RUN_PREEMPTED;
 
-	/* Update return value for the next vcpu if one was injected. */
-	if (vcpu_retval.force) {
-		arch_regs_set_retval(&vcpu->regs, vcpu_retval.value);
-	}
-
 out:
 	return ret;
 }
@@ -754,8 +735,10 @@
 	struct vm *from = current->vm;
 	struct vm *to;
 	const void *from_buf;
-	uint16_t vcpu;
 	int64_t ret;
+	struct hf_vcpu_run_return primary_ret = {
+		.code = HF_VCPU_RUN_MESSAGE,
+	};
 
 	/* Limit the size of transfer. */
 	if (size > HF_MAILBOX_SIZE) {
@@ -812,69 +795,24 @@
 	memcpy(to->mailbox.recv, from_buf, size);
 	to->mailbox.recv_bytes = size;
 	to->mailbox.recv_from_id = from->id;
-	to->mailbox.state = mailbox_state_read;
+	primary_ret.message.vm_id = to->id;
+	ret = 0;
 
 	/* Messages for the primary VM are delivered directly. */
 	if (to->id == HF_PRIMARY_VM_ID) {
-		struct hf_vcpu_run_return primary_ret = {
-			.code = HF_VCPU_RUN_MESSAGE,
-			.message.size = size,
-		};
-
+		primary_ret.message.size = size;
+		to->mailbox.state = mailbox_state_read;
 		*next = api_switch_to_primary(current, primary_ret,
 					      vcpu_state_ready);
-		ret = 0;
 		goto out;
 	}
 
-	/*
-	 * Try to find a vcpu to handle the message and tell the scheduler to
-	 * run it.
-	 */
-	if (to->mailbox.recv_waiter == NULL) {
-		/*
-		 * The scheduler must choose a vcpu to interrupt so it can
-		 * handle the message.
-		 */
-		to->mailbox.state = mailbox_state_received;
-		vcpu = HF_INVALID_VCPU;
-	} else {
-		struct vcpu *to_vcpu = to->mailbox.recv_waiter;
-
-		/*
-		 * Take target vcpu out of waiter list and mark it as ready to
-		 * run again.
-		 */
-		sl_lock(&to_vcpu->lock);
-		to->mailbox.recv_waiter = to_vcpu->mailbox_next;
-		to_vcpu->state = vcpu_state_ready;
-
-		/* Return from HF_MAILBOX_RECEIVE. */
-		to_vcpu->retval.force = true;
-		to_vcpu->retval.value = hf_mailbox_receive_return_encode(
-			(struct hf_mailbox_receive_return){
-				.vm_id = to->mailbox.recv_from_id,
-				.size = size,
-			});
-
-		sl_unlock(&to_vcpu->lock);
-
-		vcpu = to_vcpu - to->vcpus;
-	}
+	to->mailbox.state = mailbox_state_received;
 
 	/* Return to the primary VM directly or with a switch. */
-	if (from->id == HF_PRIMARY_VM_ID) {
-		ret = vcpu;
-	} else {
-		struct hf_vcpu_run_return primary_ret = {
-			.code = HF_VCPU_RUN_WAKE_UP,
-			.wake_up.vm_id = to->id,
-			.wake_up.vcpu = vcpu,
-		};
-
+	if (from->id != HF_PRIMARY_VM_ID) {
 		*next = api_switch_to_primary(current, primary_ret,
 					      vcpu_state_ready);
-		ret = 0;
 	}
 
 out:
@@ -925,17 +863,10 @@
 		goto out;
 	}
 
-	sl_lock(&current->lock);
-
-	/* Push vcpu into waiter list. */
-	current->mailbox_next = vm->mailbox.recv_waiter;
-	vm->mailbox.recv_waiter = current;
-	sl_unlock(&current->lock);
-
 	/* Switch back to primary vm to block. */
 	{
 		struct hf_vcpu_run_return run_return = {
-			.code = HF_VCPU_RUN_WAIT_FOR_INTERRUPT,
+			.code = HF_VCPU_RUN_WAIT_FOR_MESSAGE,
 		};
 
 		*next = api_switch_to_primary(current, run_return,
diff --git a/src/arch/aarch64/offsets.h b/src/arch/aarch64/offsets.h
index 8652315..a63d75b 100644
--- a/src/arch/aarch64/offsets.h
+++ b/src/arch/aarch64/offsets.h
@@ -19,5 +19,5 @@
 /* These are checked in offset.c. */
 #define CPU_ID 0
 #define CPU_STACK_BOTTOM 8
-#define VCPU_REGS 32
+#define VCPU_REGS 24
 #define VCPU_LAZY (VCPU_REGS + 264)
diff --git a/test/hftest/inc/hftest_impl.h b/test/hftest/inc/hftest_impl.h
index 093ecf0..5778b20 100644
--- a/test/hftest/inc/hftest_impl.h
+++ b/test/hftest/inc/hftest_impl.h
@@ -277,7 +277,8 @@
 		 * message.                                                   \
 		 */                                                           \
 		run_res = hf_vcpu_run(vm_id, 0);                              \
-		ASSERT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);      \
+		ASSERT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);        \
+		ASSERT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);             \
                                                                               \
 		/* Send the selected service to run and let it be handled. */ \
 		memcpy(send_buffer, service, strlen(service));                \
diff --git a/test/vmapi/gicv3/busy_secondary.c b/test/vmapi/gicv3/busy_secondary.c
index 3d6d176..9ea691f 100644
--- a/test/vmapi/gicv3/busy_secondary.c
+++ b/test/vmapi/gicv3/busy_secondary.c
@@ -36,15 +36,8 @@
 
 SET_UP(busy_secondary)
 {
-	struct hf_vcpu_run_return run_res;
-
 	system_setup();
-
-	/* Configure mailbox pages. */
 	EXPECT_EQ(hf_vm_configure(send_page_addr, recv_page_addr), 0);
-	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
-
 	SERVICE_SELECT(SERVICE_VM0, "busy", send_page);
 }
 
@@ -69,7 +62,8 @@
 
 	/* Let the secondary get started and wait for our message. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Check that no interrupts are active or pending to start with. */
 	EXPECT_EQ(GICD_ISPENDR(0), 0);
@@ -123,7 +117,8 @@
 
 	/* Let the secondary get started and wait for our message. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Check that no interrupts are active or pending to start with. */
 	EXPECT_EQ(GICD_ISPENDR(0), 0);
diff --git a/test/vmapi/gicv3/services/timer.c b/test/vmapi/gicv3/services/timer.c
index 16ccccb..029c729 100644
--- a/test/vmapi/gicv3/services/timer.c
+++ b/test/vmapi/gicv3/services/timer.c
@@ -59,72 +59,74 @@
 	arch_irq_enable();
 
 	for (;;) {
-		const char timer_wfi_message[] = "WFI  xxxxxxx";
-		const char timer_wfe_message[] = "WFE  xxxxxxx";
-		const char timer_receive_message[] = "RECV xxxxxxx";
+		const char timer_wfi_message[] = "**** xxxxxxx";
+		char *message = SERVICE_RECV_BUFFER();
+		bool wfi, wfe, receive;
+		bool disable_interrupts;
+		uint32_t ticks;
 		struct hf_mailbox_receive_return received_message =
 			mailbox_receive_retry();
-		if (received_message.vm_id == HF_PRIMARY_VM_ID &&
-		    received_message.size == sizeof(timer_wfi_message)) {
-			/*
-			 * Start a timer to send the message back: enable it and
-			 * set it for the requested number of ticks.
-			 */
-			char *message = SERVICE_RECV_BUFFER();
-			bool wfi = memcmp(message, timer_wfi_message, 5) == 0;
-			bool wfe = memcmp(message, timer_wfe_message, 5) == 0;
-			bool receive =
-				memcmp(message, timer_receive_message, 5) == 0;
-			int32_t ticks = (message[5] - '0') * 1000000 +
-					(message[6] - '0') * 100000 +
-					(message[7] - '0') * 10000 +
-					(message[8] - '0') * 1000 +
-					(message[9] - '0') * 100 +
-					(message[10] - '0') * 10 +
-					(message[11] - '0');
-			dlog("Starting timer for %d ticks.\n", ticks);
-			if (wfi || receive) {
-				arch_irq_disable();
-			}
-			timer_set(ticks);
-			timer_start();
-			dlog("Waiting for timer...\n");
-			if (wfi) {
-				/* WFI until the timer fires. */
-				interrupt_wait();
-				arch_irq_enable();
-			} else if (wfe) {
-				/* WFE until the timer fires. */
-				while (!timer_fired) {
-					event_wait();
-				}
-			} else if (receive) {
-				/*
-				 * Block on hf_mailbox_receive until timer
-				 * fires.
-				 */
-				struct hf_mailbox_receive_return received =
-					hf_mailbox_receive(true);
-				/*
-				 * Expect to be interrupted, not to actually
-				 * receive a message.
-				 */
-				EXPECT_EQ(received.vm_id, HF_INVALID_VM_ID);
-				EXPECT_EQ(received.size, 0);
-				arch_irq_enable();
-			} else {
-				/* Busy wait until the timer fires. */
-				while (!timer_fired) {
-				}
-			}
-			EXPECT_TRUE(timer_fired);
-			timer_fired = false;
-			dlog("Done waiting.\n");
-		} else {
-			dlog("Got unexpected message from VM %d, size %d.\n",
+
+		if (received_message.vm_id != HF_PRIMARY_VM_ID ||
+		    received_message.size != sizeof(timer_wfi_message)) {
+			FAIL("Got unexpected message from VM %d, size %d.\n",
 			     received_message.vm_id, received_message.size);
-			FAIL("Unexpected message");
 		}
+
+		/*
+		 * Start a timer to send the message back: enable it and
+		 * set it for the requested number of ticks.
+		 */
+		wfi = memcmp(message, "WFI ", 4) == 0;
+		wfe = memcmp(message, "WFE ", 4) == 0;
+		receive = memcmp(message, "RECV", 4) == 0;
+		disable_interrupts = wfi || receive;
+		ticks = (message[5] - '0') * 1000000 +
+			(message[6] - '0') * 100000 +
+			(message[7] - '0') * 10000 + (message[8] - '0') * 1000 +
+			(message[9] - '0') * 100 + (message[10] - '0') * 10 +
+			(message[11] - '0');
+
 		hf_mailbox_clear();
+
+		dlog("Starting timer for %d ticks.\n", ticks);
+
+		if (disable_interrupts) {
+			arch_irq_disable();
+		}
+
+		timer_set(ticks);
+		timer_start();
+		dlog("Waiting for timer...\n");
+
+		/* Wait for the timer interrupt. */
+		if (wfi) {
+			interrupt_wait();
+		} else if (wfe) {
+			while (!timer_fired) {
+				event_wait();
+			}
+		} else if (receive) {
+			struct hf_mailbox_receive_return received =
+				hf_mailbox_receive(true);
+			/*
+			 * Expect to be interrupted, not to actually
+			 * receive a message.
+			 */
+			EXPECT_EQ(received.vm_id, HF_INVALID_VM_ID);
+			EXPECT_EQ(received.size, 0);
+		} else {
+			/* Busy wait until the timer fires. */
+			while (!timer_fired) {
+			}
+		}
+
+		if (disable_interrupts) {
+			arch_irq_enable();
+		}
+
+		EXPECT_TRUE(timer_fired);
+		timer_fired = false;
+		dlog("Done waiting.\n");
 	}
 }
diff --git a/test/vmapi/gicv3/timer_secondary.c b/test/vmapi/gicv3/timer_secondary.c
index 9baa24e..dcb2e87 100644
--- a/test/vmapi/gicv3/timer_secondary.c
+++ b/test/vmapi/gicv3/timer_secondary.c
@@ -27,13 +27,7 @@
 {
 	system_setup();
 
-	struct hf_vcpu_run_return run_res;
-
-	/* Configure mailbox pages. */
 	EXPECT_EQ(hf_vm_configure(send_page_addr, recv_page_addr), 0);
-	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
-
 	SERVICE_SELECT(SERVICE_VM0, "timer", send_page);
 
 	interrupt_enable(VIRTUAL_TIMER_IRQ, true);
@@ -42,7 +36,7 @@
 	arch_irq_enable();
 }
 
-void timer_busywait_secondary()
+static void timer_busywait_secondary()
 {
 	const char message[] = "loop 0099999";
 	const char expected_response[] = "Got IRQ 03.";
@@ -50,7 +44,8 @@
 
 	/* Let the secondary get started and wait for our message. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Send the message for the secondary to set a timer. */
 	memcpy(send_page, message, sizeof(message));
@@ -96,7 +91,8 @@
 	timer_busywait_secondary();
 }
 
-void timer_wfi_secondary(const char message[], bool wfe)
+static void timer_secondary(const char message[],
+			    enum hf_vcpu_run_code expected_code)
 {
 	const char expected_response[] = "Got IRQ 03.";
 	size_t message_length = strlen(message) + 1;
@@ -104,7 +100,8 @@
 
 	/* Let the secondary get started and wait for our message. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Send the message for the secondary to set a timer. */
 	memcpy(send_page, message, message_length);
@@ -112,53 +109,44 @@
 
 	/*
 	 * Let the secondary handle the message and set the timer. Then there's
-	 * a race for whether it manages to WFI before the hardware timer fires,
-	 * so we need to handle both cases.
+	 * a race for whether it manages to block and switch to the primary
+	 * before the hardware timer fires, so we need to handle both cases.
 	 */
 	last_interrupt_id = 0;
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	if (run_res.code == HF_VCPU_RUN_SLEEP && !wfe) {
+	if (run_res.code == expected_code) {
 		/*
-		 * This case happens if the secondary manages to call WFI before
-		 * the timer fires. This is likely when the timer is set for a
-		 * long time.
+		 * This case happens if the secondary manages to block and
+		 * switch to the primary before the timer fires.
 		 */
 		dlog("secondary sleeping after receiving timer message\n");
 		/* Loop until the timer fires. */
-		while (run_res.code == HF_VCPU_RUN_SLEEP) {
-			dlog("Primary looping until timer fires; %d ns "
-			     "remaining\n",
-			     run_res.sleep.ns);
-			run_res = hf_vcpu_run(SERVICE_VM0, 0);
-		}
-		dlog("Primary done looping\n");
-	} else if (run_res.code == HF_VCPU_RUN_YIELD && wfe) {
-		/*
-		 * This case happens if the secondary manages to call WFE before
-		 * the timer fires. This is likely when the timer is set for a
-		 * long time.
-		 */
-		dlog("secondary yielding after receiving timer message\n");
-		/* Loop until the timer fires. */
-		while (run_res.code == HF_VCPU_RUN_YIELD) {
+		while (run_res.code == expected_code) {
 			dlog("Primary looping until timer fires\n");
+			if (expected_code == HF_VCPU_RUN_WAIT_FOR_INTERRUPT ||
+			    expected_code == HF_VCPU_RUN_WAIT_FOR_MESSAGE) {
+				EXPECT_NE(run_res.sleep.ns,
+					  HF_SLEEP_INDEFINITE);
+				dlog("%d ns remaining\n", run_res.sleep.ns);
+			}
 			run_res = hf_vcpu_run(SERVICE_VM0, 0);
 		}
 		dlog("Primary done looping\n");
 	} else if (run_res.code == HF_VCPU_RUN_PREEMPTED) {
 		/*
 		 * This case happens if the (hardware) timer fires before the
-		 * secondary calls WFI. Then we get the interrupt to the
-		 * primary, ignore it, and see a HF_VCPU_RUN_PREEMPTED code from
-		 * the hf_vcpu_run call, so we should call it again for the
-		 * timer interrupt to be injected automatically by Hafnium.
+		 * secondary blocks and switches to the primary. Then we get the
+		 * interrupt to the primary, ignore it, and see a
+		 * HF_VCPU_RUN_PREEMPTED code from the hf_vcpu_run call, so we
+		 * should call it again for the timer interrupt to be injected
+		 * automatically by Hafnium.
 		 */
 		EXPECT_EQ(last_interrupt_id, VIRTUAL_TIMER_IRQ);
-		dlog("Primary yielded, running again\n");
+		dlog("Preempted by timer interrupt, running again\n");
 		run_res = hf_vcpu_run(SERVICE_VM0, 0);
 	} else {
 		/* No other return codes should occur here, so fail. */
-		FAIL("Unexpected run result code.");
+		FAIL("Unexpected run result code (%d).", run_res.code);
 	}
 
 	/* Once we wake it up it should get the timer interrupt and respond. */
@@ -174,7 +162,8 @@
  * Send a message to the interruptible VM, which will start a timer to interrupt
  * itself to send a response back. This test is run with both long and short
  * timer lengths, to try to cover both cases of the race for whether the timer
- * fires before or after the WFI in the secondary VM.
+ * fires before or after the secondary VM blocks and switches back to the
+ * primary.
  */
 TEST(timer_secondary, wfi_short)
 {
@@ -182,8 +171,8 @@
 	 * Run the test twice in a row, to check that the state doesn't get
 	 * messed up.
 	 */
-	timer_wfi_secondary("WFI  0000001", false);
-	timer_wfi_secondary("WFI  0000001", false);
+	timer_secondary("WFI  0000001", HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	timer_secondary("WFI  0000001", HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
 }
 
 TEST(timer_secondary, wfi_long)
@@ -192,8 +181,8 @@
 	 * Run the test twice in a row, to check that the state doesn't get
 	 * messed up.
 	 */
-	timer_wfi_secondary("WFI  0099999", false);
-	timer_wfi_secondary("WFI  0099999", false);
+	timer_secondary("WFI  0099999", HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	timer_secondary("WFI  0099999", HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
 }
 
 TEST(timer_secondary, wfe_short)
@@ -202,8 +191,8 @@
 	 * Run the test twice in a row, to check that the state doesn't get
 	 * messed up.
 	 */
-	timer_wfi_secondary("WFE  0000001", true);
-	timer_wfi_secondary("WFE  0000001", true);
+	timer_secondary("WFE  0000001", HF_VCPU_RUN_YIELD);
+	timer_secondary("WFE  0000001", HF_VCPU_RUN_YIELD);
 }
 
 TEST(timer_secondary, wfe_long)
@@ -212,8 +201,8 @@
 	 * Run the test twice in a row, to check that the state doesn't get
 	 * messed up.
 	 */
-	timer_wfi_secondary("WFE  0099999", true);
-	timer_wfi_secondary("WFE  0099999", true);
+	timer_secondary("WFE  0099999", HF_VCPU_RUN_YIELD);
+	timer_secondary("WFE  0099999", HF_VCPU_RUN_YIELD);
 }
 
 TEST(timer_secondary, receive_short)
@@ -222,8 +211,8 @@
 	 * Run the test twice in a row, to check that the state doesn't get
 	 * messed up.
 	 */
-	timer_wfi_secondary("RECV 0000001", false);
-	timer_wfi_secondary("RECV 0000001", false);
+	timer_secondary("RECV 0000001", HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	timer_secondary("RECV 0000001", HF_VCPU_RUN_WAIT_FOR_MESSAGE);
 }
 
 TEST(timer_secondary, receive_long)
@@ -232,8 +221,8 @@
 	 * Run the test twice in a row, to check that the state doesn't get
 	 * messed up.
 	 */
-	timer_wfi_secondary("RECV 0099999", false);
-	timer_wfi_secondary("RECV 0099999", false);
+	timer_secondary("RECV 0099999", HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	timer_secondary("RECV 0099999", HF_VCPU_RUN_WAIT_FOR_MESSAGE);
 }
 
 /**
@@ -247,7 +236,8 @@
 
 	/* Let the secondary get started and wait for our message. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Send the message for the secondary to set a timer. */
 	memcpy(send_page, message, message_length);
@@ -259,7 +249,7 @@
 	last_interrupt_id = 0;
 	for (int i = 0; i < 20; ++i) {
 		run_res = hf_vcpu_run(SERVICE_VM0, 0);
-		EXPECT_EQ(run_res.code, HF_VCPU_RUN_SLEEP);
+		EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
 		dlog("Primary looping until timer fires; %d ns "
 		     "remaining\n",
 		     run_res.sleep.ns);
diff --git a/test/vmapi/primary_only/primary_only.c b/test/vmapi/primary_only/primary_only.c
index b5e45a4..df19d32 100644
--- a/test/vmapi/primary_only/primary_only.c
+++ b/test/vmapi/primary_only/primary_only.c
@@ -74,6 +74,7 @@
 {
 	struct hf_vcpu_run_return res = hf_vcpu_run(HF_PRIMARY_VM_ID, 0);
 	EXPECT_EQ(res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(res.sleep.ns, HF_SLEEP_INDEFINITE);
 }
 
 /**
@@ -84,6 +85,7 @@
 {
 	struct hf_vcpu_run_return res = hf_vcpu_run(1, 0);
 	EXPECT_EQ(res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(res.sleep.ns, HF_SLEEP_INDEFINITE);
 }
 
 /**
diff --git a/test/vmapi/primary_with_secondaries/interrupts.c b/test/vmapi/primary_with_secondaries/interrupts.c
index 09f8719..7791c64 100644
--- a/test/vmapi/primary_with_secondaries/interrupts.c
+++ b/test/vmapi/primary_with_secondaries/interrupts.c
@@ -37,7 +37,8 @@
 	SERVICE_SELECT(SERVICE_VM0, "interruptible", mb.send);
 
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Set the message, echo it and wait for a response. */
 	memcpy(mb.send, message, sizeof(message));
@@ -64,7 +65,8 @@
 	SERVICE_SELECT(SERVICE_VM0, "interruptible", mb.send);
 
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Inject the interrupt and wait for a message. */
 	hf_interrupt_inject(SERVICE_VM0, 0, EXTERNAL_INTERRUPT_ID_A);
@@ -99,7 +101,8 @@
 	SERVICE_SELECT(SERVICE_VM0, "interruptible", mb.send);
 
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Inject the interrupt and wait for a message. */
 	hf_interrupt_inject(SERVICE_VM0, 0, EXTERNAL_INTERRUPT_ID_A);
@@ -137,7 +140,8 @@
 	SERVICE_SELECT(SERVICE_VM0, "interruptible", mb.send);
 
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Inject the interrupt and wait for a message. */
 	hf_interrupt_inject(SERVICE_VM0, 0, EXTERNAL_INTERRUPT_ID_A);
@@ -149,7 +153,8 @@
 	EXPECT_EQ(hf_mailbox_clear(), 0);
 
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Now send a message to the secondary. */
 	memcpy(mb.send, message, sizeof(message));
@@ -180,7 +185,8 @@
 	/* Inject the interrupt and expect not to get a message. */
 	hf_interrupt_inject(SERVICE_VM0, 0, EXTERNAL_INTERRUPT_ID_C);
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/*
 	 * Now send a message to the secondary to enable the interrupt ID, and
@@ -249,3 +255,29 @@
 		  0);
 	EXPECT_EQ(hf_mailbox_clear(), 0);
 }
+
+/*
+ * Deliver an interrupt and a message to the same vCPU and check that both are
+ * delivered the next time the vCPU is run.
+ */
+TEST(interrupts, deliver_interrupt_and_message)
+{
+	const char message[] = "I\'ll see you again.";
+	struct hf_vcpu_run_return run_res;
+	struct mailbox_buffers mb = set_up_mailbox();
+
+	SERVICE_SELECT(SERVICE_VM0, "interruptible_echo", mb.send);
+
+	run_res = hf_vcpu_run(SERVICE_VM0, 0);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
+
+	memcpy(mb.send, message, sizeof(message));
+	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(message), false), 0);
+	hf_interrupt_inject(SERVICE_VM0, 0, EXTERNAL_INTERRUPT_ID_A);
+	run_res = hf_vcpu_run(SERVICE_VM0, 0);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_MESSAGE);
+	EXPECT_EQ(run_res.message.size, sizeof(message));
+	EXPECT_EQ(memcmp(mb.recv, message, sizeof(message)), 0);
+	EXPECT_EQ(hf_mailbox_clear(), 0);
+}
diff --git a/test/vmapi/primary_with_secondaries/mailbox.c b/test/vmapi/primary_with_secondaries/mailbox.c
index afb09eb..c3ee826 100644
--- a/test/vmapi/primary_with_secondaries/mailbox.c
+++ b/test/vmapi/primary_with_secondaries/mailbox.c
@@ -82,7 +82,8 @@
 	SERVICE_SELECT(SERVICE_VM0, "echo", mb.send);
 
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Set the message, echo it and check it didn't change. */
 	memcpy(mb.send, message, sizeof(message));
@@ -109,7 +110,8 @@
 	for (i = 0; i < 100; i++) {
 		/* Run secondary until it reaches the wait for messages. */
 		run_res = hf_vcpu_run(SERVICE_VM0, 0);
-		EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+		EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+		EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 		/* Set the message, echo it and check it didn't change. */
 		next_permutation(message, sizeof(message) - 1);
@@ -138,9 +140,11 @@
 	SERVICE_SELECT(SERVICE_VM1, "relay", mb.send);
 
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 	run_res = hf_vcpu_run(SERVICE_VM1, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/*
 	 * Build the message chain so the message is sent from here to
@@ -160,15 +164,16 @@
 
 	/* Let SERVICE_VM0 forward the message. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAKE_UP);
-	EXPECT_EQ(run_res.wake_up.vm_id, SERVICE_VM1);
-	EXPECT_EQ(run_res.wake_up.vcpu, 0);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_MESSAGE);
+	EXPECT_EQ(run_res.message.vm_id, SERVICE_VM1);
+	EXPECT_EQ(run_res.message.size, 0);
 
 	/* Let SERVICE_VM1 forward the message. */
 	run_res = hf_vcpu_run(SERVICE_VM1, 0);
 	EXPECT_EQ(run_res.code, HF_VCPU_RUN_MESSAGE);
 
 	/* Ensure the message is in tact. */
+	EXPECT_EQ(run_res.message.vm_id, HF_PRIMARY_VM_ID);
 	EXPECT_EQ(run_res.message.size, sizeof(message));
 	EXPECT_EQ(memcmp(mb.recv, message, sizeof(message)), 0);
 	EXPECT_EQ(hf_mailbox_clear(), 0);
@@ -187,7 +192,8 @@
 	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, 0, false), -1);
 
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, 0, false), 0);
 }
@@ -215,8 +221,8 @@
 	EXPECT_EQ(hf_mailbox_waiter_get(SERVICE_VM0), HF_PRIMARY_VM_ID);
 	EXPECT_EQ(hf_mailbox_waiter_get(SERVICE_VM0), -1);
 
-	/* Send should succeed now, though no vCPU is blocked waiting for it. */
-	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, 0, false), HF_INVALID_VCPU);
+	/* Send should now succeed. */
+	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, 0, false), 0);
 }
 
 /**
@@ -233,7 +239,8 @@
 	SERVICE_SELECT(SERVICE_VM0, "echo_with_notification", mb.send);
 
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Send a message to echo service, and get response back. */
 	memcpy(mb.send, message, sizeof(message));
@@ -245,7 +252,8 @@
 
 	/* Let secondary VM continue running so that it will wait again. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Without clearing our mailbox, send message again. */
 	reverse(message, strlen(message));
@@ -253,6 +261,7 @@
 	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(message), false), 0);
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
 	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Clear the mailbox. We expect to be told there are pending waiters. */
 	EXPECT_EQ(hf_mailbox_clear(), 1);
@@ -288,7 +297,8 @@
 	SERVICE_SELECT(SERVICE_VM0, "echo_with_notification", mb.send);
 
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 
 	/* Send a message to echo service twice. The second should fail. */
 	memcpy(mb.send, message, sizeof(message));
@@ -309,6 +319,6 @@
 	EXPECT_EQ(hf_mailbox_waiter_get(SERVICE_VM0), HF_PRIMARY_VM_ID);
 	EXPECT_EQ(hf_mailbox_waiter_get(SERVICE_VM0), -1);
 
-	/* Send should succeed now, though no vCPU is blocked waiting for it. */
-	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, 0, false), HF_INVALID_VCPU);
+	/* Send should now succeed. */
+	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, 0, false), 0);
 }
diff --git a/test/vmapi/primary_with_secondaries/memory_sharing.c b/test/vmapi/primary_with_secondaries/memory_sharing.c
index 39106ee..d1adea5 100644
--- a/test/vmapi/primary_with_secondaries/memory_sharing.c
+++ b/test/vmapi/primary_with_secondaries/memory_sharing.c
@@ -104,8 +104,7 @@
 	 *       explicitly to test the mechanism.
 	 */
 	memcpy(mb.send, &ptr, sizeof(ptr));
-	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false),
-		  HF_INVALID_VCPU);
+	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false), 0);
 
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
 	EXPECT_EQ(run_res.code, HF_VCPU_RUN_YIELD);
@@ -147,8 +146,7 @@
 	 *       explicitly to test the mechanism.
 	 */
 	memcpy(mb.send, &ptr, sizeof(ptr));
-	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false),
-		  HF_INVALID_VCPU);
+	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false), 0);
 
 	/* Let the memory be returned. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
@@ -185,8 +183,7 @@
 	 *       explicitly to test the mechanism.
 	 */
 	memcpy(mb.send, &ptr, sizeof(ptr));
-	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false),
-		  HF_INVALID_VCPU);
+	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false), 0);
 
 	/* Let the memory be returned. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
@@ -223,8 +220,7 @@
 	 *       explicitly to test the mechanism.
 	 */
 	memcpy(mb.send, &ptr, sizeof(ptr));
-	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false),
-		  HF_INVALID_VCPU);
+	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false), 0);
 
 	/* Let the memory be returned. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
@@ -260,8 +256,7 @@
 	 *       explicitly to test the mechanism.
 	 */
 	memcpy(mb.send, &ptr, sizeof(ptr));
-	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false),
-		  HF_INVALID_VCPU);
+	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false), 0);
 
 	/* Let the memory be returned. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
@@ -274,7 +269,8 @@
 
 	/* Observe the service doesn't fault when accessing the memory. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
-	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(run_res.code, HF_VCPU_RUN_WAIT_FOR_MESSAGE);
+	EXPECT_EQ(run_res.sleep.ns, HF_SLEEP_INDEFINITE);
 }
 
 /**
@@ -299,8 +295,7 @@
 	 *       explicitly to test the mechanism.
 	 */
 	memcpy(mb.send, &ptr, sizeof(ptr));
-	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false),
-		  HF_INVALID_VCPU);
+	EXPECT_EQ(hf_mailbox_send(SERVICE_VM0, sizeof(ptr), false), 0);
 
 	/* Let the memory be returned. */
 	run_res = hf_vcpu_run(SERVICE_VM0, 0);
diff --git a/test/vmapi/primary_with_secondaries/no_services.c b/test/vmapi/primary_with_secondaries/no_services.c
index a90da65..9d7f29b 100644
--- a/test/vmapi/primary_with_secondaries/no_services.c
+++ b/test/vmapi/primary_with_secondaries/no_services.c
@@ -75,6 +75,7 @@
 {
 	struct hf_vcpu_run_return res = hf_vcpu_run(HF_PRIMARY_VM_ID, 0);
 	EXPECT_EQ(res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(res.sleep.ns, HF_SLEEP_INDEFINITE);
 }
 
 /**
@@ -84,6 +85,7 @@
 {
 	struct hf_vcpu_run_return res = hf_vcpu_run(1234, 0);
 	EXPECT_EQ(res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(res.sleep.ns, HF_SLEEP_INDEFINITE);
 }
 
 /**
@@ -93,6 +95,7 @@
 {
 	struct hf_vcpu_run_return res = hf_vcpu_run(SERVICE_VM0, 1234);
 	EXPECT_EQ(res.code, HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+	EXPECT_EQ(res.sleep.ns, HF_SLEEP_INDEFINITE);
 }
 
 /**
diff --git a/test/vmapi/primary_with_secondaries/run_race.c b/test/vmapi/primary_with_secondaries/run_race.c
index 4230ba5..51c25a3 100644
--- a/test/vmapi/primary_with_secondaries/run_race.c
+++ b/test/vmapi/primary_with_secondaries/run_race.c
@@ -41,7 +41,8 @@
 		/* Run until it manages to schedule vCPU on this CPU. */
 		do {
 			run_res = hf_vcpu_run(SERVICE_VM0, 0);
-		} while (run_res.code == HF_VCPU_RUN_WAIT_FOR_INTERRUPT);
+		} while (run_res.code == HF_VCPU_RUN_WAIT_FOR_INTERRUPT &&
+			 run_res.sleep.ns == HF_SLEEP_INDEFINITE);
 
 		/* Break out if we received a message with non-zero length. */
 		if (run_res.code == HF_VCPU_RUN_MESSAGE &&
diff --git a/test/vmapi/primary_with_secondaries/services/BUILD.gn b/test/vmapi/primary_with_secondaries/services/BUILD.gn
index 75a797e..b0ef794 100644
--- a/test/vmapi/primary_with_secondaries/services/BUILD.gn
+++ b/test/vmapi/primary_with_secondaries/services/BUILD.gn
@@ -82,6 +82,7 @@
 
   sources = [
     "interruptible.c",
+    "interruptible_echo.c",
   ]
 
   deps = [
diff --git a/test/vmapi/primary_with_secondaries/services/interruptible_echo.c b/test/vmapi/primary_with_secondaries/services/interruptible_echo.c
new file mode 100644
index 0000000..e1eb643
--- /dev/null
+++ b/test/vmapi/primary_with_secondaries/services/interruptible_echo.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2019 The Hafnium Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hf/arch/cpu.h"
+#include "hf/arch/std.h"
+#include "hf/arch/vm/interrupts_gicv3.h"
+
+#include "hf/dlog.h"
+
+#include "vmapi/hf/call.h"
+
+#include "hftest.h"
+#include "primary_with_secondary.h"
+
+static void irq(void)
+{
+	/* Clear the interrupt. */
+	hf_interrupt_get();
+}
+
+TEST_SERVICE(interruptible_echo)
+{
+	exception_setup(irq);
+	hf_interrupt_enable(EXTERNAL_INTERRUPT_ID_A, true);
+	arch_irq_enable();
+
+	for (;;) {
+		struct hf_mailbox_receive_return res = hf_mailbox_receive(true);
+
+		/* Retry if interrupted but made visible with the yield. */
+		while (res.vm_id == HF_INVALID_VM_ID && res.size == 0) {
+			hf_vcpu_yield();
+			res = hf_mailbox_receive(true);
+		}
+
+		memcpy(SERVICE_SEND_BUFFER(), SERVICE_RECV_BUFFER(), res.size);
+		hf_mailbox_clear();
+		hf_mailbox_send(res.vm_id, res.size, false);
+	}
+}