/*
 * Copyright 2018 Google LLC
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/atomic.h>
#include <linux/hrtimer.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/sched/task.h>
#include <linux/slab.h>

#include <net/sock.h>

#include <hf/call.h>

/* TODO: Reusing AF_ECONET for now as it's otherwise unused. */
#define AF_HF AF_ECONET
#define PF_HF AF_HF

#define MESSAGE_INT_ID 1

#define CONFIG_HAFNIUM_MAX_VMS 16
#define CONFIG_HAFNIUM_MAX_VCPUS 32

struct hf_vcpu {
	struct hf_vm *vm;
	uint32_t vcpu_index;
	struct task_struct *task;
	atomic_t abort_sleep;
	struct hrtimer timer;
};

struct hf_vm {
	uint32_t id;
	uint32_t vcpu_count;
	struct hf_vcpu *vcpu;
};

struct hf_msg_hdr {
	uint64_t src_port;
	uint64_t dst_port;
};
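
/*
 * On the wire, hf_send_skb() copies an skb (header immediately followed by
 * payload) into the shared send page, and hf_handle_message() parses the same
 * layout back out of the receive page, so a mailbox message looks like:
 *
 *	+----------------+----------------+----------------------------+
 *	| src_port (u64) | dst_port (u64) | payload (up to the mailbox |
 *	|                |                | size minus this header)    |
 *	+----------------+----------------+----------------------------+
 */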

struct hf_sock {
	/* This needs to be the first field. */
	struct sock sk;

	/*
	 * The following fields are immutable after the socket transitions to
	 * SS_CONNECTED state.
	 */
	uint64_t local_port;
	uint64_t remote_port;
	struct hf_vm *peer_vm;
};

struct sockaddr_hf {
	sa_family_t family;
	uint32_t vm_id;
	uint64_t port;
};
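
/*
 * Minimal userspace usage sketch (illustrative only; it assumes userspace
 * passes AF_ECONET, which AF_HF aliases above, that the caller has
 * CAP_SYS_ADMIN as required by hf_sock_create(), and that secondary VM 1
 * accepts messages on port 10). The local port is assigned automatically
 * when connecting:
 *
 *	int fd = socket(AF_ECONET, SOCK_DGRAM, 0);
 *	struct sockaddr_hf addr = {
 *		.family = AF_ECONET,
 *		.vm_id = 1,
 *		.port = 10,
 *	};
 *	char buf[64] = "hello";
 *
 *	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 *	send(fd, buf, sizeof(buf), 0);
 *	recv(fd, buf, sizeof(buf), 0);
 */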

static struct proto hf_sock_proto = {
	.name = "hafnium",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct hf_sock),
};

static struct hf_vm *hf_vms;
static uint32_t hf_vm_count;
static struct page *hf_send_page;
static struct page *hf_recv_page;
static atomic64_t hf_next_port = ATOMIC64_INIT(0);
static DEFINE_SPINLOCK(hf_send_lock);
static DEFINE_HASHTABLE(hf_local_port_hash, 7);
static DEFINE_SPINLOCK(hf_local_port_hash_lock);

/**
 * Wakes up the kernel thread responsible for running the given vcpu.
 *
 * Returns 0 if the thread was already running, 1 otherwise.
 */
static int hf_vcpu_wake_up(struct hf_vcpu *vcpu)
{
	/* Set a flag indicating that the thread should not go to sleep. */
	atomic_set(&vcpu->abort_sleep, 1);

	/* Set the thread to running state. */
	return wake_up_process(vcpu->task);
}

/**
 * Puts the current thread to sleep. The current thread must be responsible for
 * running the given vcpu.
 *
 * Going to sleep will fail if hf_vcpu_wake_up() or kthread_stop() was called on
 * this vcpu/thread since the last time it [re]started running.
 */
static void hf_vcpu_sleep(struct hf_vcpu *vcpu)
{
	int abort;

	set_current_state(TASK_INTERRUPTIBLE);

	/* Check the sleep-abort flag after making the thread interruptible. */
	abort = atomic_read(&vcpu->abort_sleep);
	if (!abort && !kthread_should_stop())
		schedule();

	/* Set state back to running on the way out. */
	set_current_state(TASK_RUNNING);
}
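
/*
 * Together, hf_vcpu_wake_up() and hf_vcpu_sleep() close the classic
 * lost-wakeup window. Without the abort_sleep flag, the following
 * interleaving would leave the vcpu thread asleep despite a pending wake-up
 * (sketch):
 *
 *	vcpu thread				waker
 *	-----------				-----
 *	hf_vcpu_run() returns WFI
 *						wake_up_process()
 *						  [no-op: task still running]
 *	set_current_state(TASK_INTERRUPTIBLE)
 *	schedule()  <-- would sleep indefinitely
 *
 * With the flag, the waker sets abort_sleep before calling wake_up_process(),
 * and the sleeper re-checks the flag after becoming interruptible, so either
 * the flag is seen and schedule() is skipped, or wake_up_process() finds the
 * task already in TASK_INTERRUPTIBLE and wakes it.
 */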

/**
 * Wakes up the thread associated with the vcpu that owns the given timer. This
 * is called when the timer the thread is waiting on expires.
 */
static enum hrtimer_restart hf_vcpu_timer_expired(struct hrtimer *timer)
{
	struct hf_vcpu *vcpu = container_of(timer, struct hf_vcpu, timer);

	/* TODO: Inject interrupt. */
	hf_vcpu_wake_up(vcpu);
	return HRTIMER_NORESTART;
}

/**
 * Handles a message delivered to this VM by validating that it's well-formed
 * and then queueing it for delivery to the appropriate socket.
 */
static void hf_handle_message(struct hf_vm *sender, const void *ptr, size_t len)
{
	struct hf_sock *hsock;
	const struct hf_msg_hdr *hdr = ptr;
	struct sk_buff *skb;
	int err;

	/* Ignore messages that are too small to hold a header. */
	if (len < sizeof(struct hf_msg_hdr))
		return;

	len -= sizeof(struct hf_msg_hdr);

	/* Go through the colliding sockets. */
	rcu_read_lock();
	hash_for_each_possible_rcu(hf_local_port_hash, hsock, sk.sk_node,
				   hdr->dst_port) {
		if (hsock->peer_vm == sender &&
		    hsock->remote_port == hdr->src_port) {
			sock_hold(&hsock->sk);
			break;
		}
	}
	rcu_read_unlock();

	/* Nothing to do if we couldn't find the target. */
	if (!hsock)
		return;

	/*
	 * TODO: From this point on, there are two failure paths: when we
	 * create the skb below, and when we enqueue it to the socket. What
	 * should we do if they fail? Ideally we would have some form of flow
	 * control to prevent message loss, but how to do it efficiently?
	 *
	 * One option is to have a pre-allocated message that indicates to the
	 * sender that a message was dropped. This way we guarantee that the
	 * sender will be aware of the loss and should back off.
	 */
	/* Create the skb. */
	skb = alloc_skb(len, GFP_KERNEL);
	if (!skb)
		goto exit;

	memcpy(skb_put(skb, len), hdr + 1, len);

	/*
	 * Add the skb to the receive queue of the target socket. On success it
	 * calls sk->sk_data_ready, which is currently set to sock_def_readable,
	 * which wakes up any waiters.
	 */
	err = sock_queue_rcv_skb(&hsock->sk, skb);
	if (err)
		kfree_skb(skb);

exit:
	sock_put(&hsock->sk);
}

/**
 * This function is called when Hafnium requests that the primary VM wake up a
 * vCPU that belongs to a secondary VM.
 *
 * It wakes up the thread if it's sleeping, or kicks it if it's already running.
 *
 * If vcpu is HF_INVALID_VCPU, it injects a MESSAGE_INT_ID interrupt into a vCPU
 * belonging to the specified VM.
 */
static void hf_handle_wake_up_request(uint32_t vm_id, uint16_t vcpu)
{
	struct hf_vm *vm;

	/* Only secondary VMs, whose ids start at 1, are tracked. */
	if (vm_id < 1 || vm_id > hf_vm_count) {
		pr_warn("Request to wake up non-existent VM id: %u\n", vm_id);
		return;
	}

	vm = &hf_vms[vm_id - 1];
	if (vcpu >= vm->vcpu_count) {
		int64_t ret;

		if (vcpu != HF_INVALID_VCPU) {
			pr_warn("Request to wake up non-existent vCPU: %u.%u\n",
				vm_id, vcpu);
			return;
		}

		/*
		 * TODO: For now we're picking the first vcpu to interrupt, but
		 * we want to be smarter.
		 */
		vcpu = 0;
		ret = hf_inject_interrupt(vm_id, vcpu, MESSAGE_INT_ID);
		if (ret != 1) {
			/* We don't need to wake up the vcpu. */
			return;
		}
	}

	if (hf_vcpu_wake_up(&vm->vcpu[vcpu]) == 0) {
		/*
		 * The task was already running (presumably on a different
		 * physical CPU); interrupt it. This gives Hafnium a chance to
		 * inject any new interrupts.
		 */
		kick_process(vm->vcpu[vcpu].task);
	}
}

/**
 * This is the main loop of each vcpu.
 */
static int hf_vcpu_thread(void *data)
{
	struct hf_vcpu *vcpu = data;
	struct hf_vcpu_run_return ret;

	hrtimer_init(&vcpu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	vcpu->timer.function = &hf_vcpu_timer_expired;

	while (!kthread_should_stop()) {
		/*
		 * We're about to run the vcpu, so we can reset the abort-sleep
		 * flag.
		 */
		atomic_set(&vcpu->abort_sleep, 0);

		/* Call into Hafnium to run the vcpu. */
		ret = hf_vcpu_run(vcpu->vm->id, vcpu->vcpu_index);

		switch (ret.code) {
		/* Yield (forcibly or voluntarily). */
		case HF_VCPU_RUN_YIELD:
			break;

		/* WFI. */
		case HF_VCPU_RUN_WAIT_FOR_INTERRUPT:
			hf_vcpu_sleep(vcpu);
			break;

		/* Wake up another vcpu. */
		case HF_VCPU_RUN_WAKE_UP:
			hf_handle_wake_up_request(ret.wake_up.vm_id,
						  ret.wake_up.vcpu);
			break;

		/* A message is available for this VM. */
		case HF_VCPU_RUN_MESSAGE:
			hf_handle_message(vcpu->vm, page_address(hf_recv_page),
					  ret.message.size);
			hf_mailbox_clear();
			break;

		case HF_VCPU_RUN_SLEEP:
			hrtimer_start(&vcpu->timer, ret.sleep.ns,
				      HRTIMER_MODE_REL);
			hf_vcpu_sleep(vcpu);
			hrtimer_cancel(&vcpu->timer);
			break;
		}
	}

	return 0;
}

/**
 * Converts a pointer to a struct sock into a pointer to a struct hf_sock. It
 * relies on the fact that the first field of hf_sock is a sock.
 */
static struct hf_sock *hsock_from_sk(struct sock *sk)
{
	return (struct hf_sock *)sk;
}

/**
 * This is called when the last reference to the outer socket is released. For
 * example, if it's a user-space socket, when the last file descriptor pointing
 * to this socket is closed.
 *
 * It begins cleaning up resources, though some can only be cleaned up after all
 * references to the underlying socket are released, which is handled by
 * hf_sock_destruct().
 */
static int hf_sock_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct hf_sock *hsock = hsock_from_sk(sk);
	unsigned long flags;

	if (!sk)
		return 0;

	/* Shutdown for both send and receive. */
	lock_sock(sk);
	sk->sk_shutdown |= RCV_SHUTDOWN | SEND_SHUTDOWN;
	sk->sk_state_change(sk);
	release_sock(sk);

	/* Remove from the hash table, so lookups from now on won't find it. */
	spin_lock_irqsave(&hf_local_port_hash_lock, flags);
	hash_del_rcu(&hsock->sk.sk_node);
	spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);

	/*
	 * TODO: When we implement a tx queue, we need to clear it here so that
	 * sk_wmem_alloc will not prevent sk from being freed (sk_free).
	 */

	/*
	 * Wait for in-flight lookups to finish. We need to do this here because
	 * in-flight lookups rely on the reference to the socket we're about to
	 * release.
	 */
	synchronize_rcu();
	sock_put(sk);
	sock->sk = NULL;

	return 0;
}

/**
 * This is called when there are no more references to the socket. It frees all
 * resources that haven't been freed during release.
 */
static void hf_sock_destruct(struct sock *sk)
{
	/*
	 * Clear the receive queue now that the handler cannot add any more
	 * skbs to it.
	 */
	skb_queue_purge(&sk->sk_receive_queue);
}

/**
 * Connects the Hafnium socket to the provided VM and port. After the socket is
 * connected, it can be used to exchange datagrams with the specified peer.
 */
static int hf_sock_connect(struct socket *sock, struct sockaddr *saddr,
			   int len, int connect_flags)
{
	struct sock *sk = sock->sk;
	struct hf_sock *hsock = hsock_from_sk(sk);
	struct hf_vm *vm;
	struct sockaddr_hf *addr;
	int err;
	unsigned long flags;

	/* Basic address validation. */
	if (len < sizeof(struct sockaddr_hf) || saddr->sa_family != AF_HF)
		return -EINVAL;

	addr = (struct sockaddr_hf *)saddr;

	/* Only secondary VMs, whose ids start at 1, are addressable. */
	if (addr->vm_id < 1 || addr->vm_id > hf_vm_count)
		return -ENETUNREACH;

	vm = &hf_vms[addr->vm_id - 1];

	/*
	 * TODO: Once we implement access control in Hafnium, check that the
	 * caller is allowed to contact the specified VM. Return -ECONNREFUSED
	 * if access is denied.
	 */

	/* Take the lock to make sure state doesn't change as we connect. */
	lock_sock(sk);

	/* Only unconnected sockets are allowed to become connected. */
	if (sock->state != SS_UNCONNECTED) {
		err = -EISCONN;
		goto exit;
	}

	hsock->local_port = atomic64_inc_return(&hf_next_port);
	hsock->remote_port = addr->port;
	hsock->peer_vm = vm;

	sock->state = SS_CONNECTED;

	/* Add socket to hash table now that it's fully initialised. */
	spin_lock_irqsave(&hf_local_port_hash_lock, flags);
	hash_add_rcu(hf_local_port_hash, &sk->sk_node, hsock->local_port);
	spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);

	err = 0;
exit:
	release_sock(sk);
	return err;
}

/**
 * Sends the given skb to the appropriate VM by calling Hafnium. It will also
 * trigger the wake up of a recipient VM.
 *
 * Takes ownership of the skb on success.
 */
static int hf_send_skb(struct sk_buff *skb)
{
	unsigned long flags;
	int64_t ret;
	struct hf_sock *hsock = hsock_from_sk(skb->sk);
	struct hf_vm *vm = hsock->peer_vm;

	/*
	 * Call Hafnium under the send lock so that we serialize the use of the
	 * global send buffer.
	 */
	spin_lock_irqsave(&hf_send_lock, flags);
	memcpy(page_address(hf_send_page), skb->data, skb->len);
	ret = hf_mailbox_send(vm->id, skb->len);
	spin_unlock_irqrestore(&hf_send_lock, flags);

	if (ret < 0)
		return -EAGAIN;

	/* Wake some vcpu up to handle the new message. */
	hf_handle_wake_up_request(vm->id, ret);

	kfree_skb(skb);

	return 0;
}
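
/*
 * Note on the design above: there is a single send page shared with the
 * hypervisor for the whole primary VM, so every transmission serializes on
 * hf_send_lock, and a send that Hafnium rejects surfaces to callers as
 * -EAGAIN instead of being queued (see the TODO in hf_sock_sendmsg() about a
 * per-VM tx queue).
 */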

/**
 * Determines if the given socket is in the connected state. It acquires and
 * releases the socket lock.
 */
static bool hf_sock_is_connected(struct socket *sock)
{
	bool ret;

	lock_sock(sock->sk);
	ret = sock->state == SS_CONNECTED;
	release_sock(sock->sk);

	return ret;
}

/**
 * Sends a message to the VM and port the socket is connected to. All variants
 * of write/send/sendto/sendmsg eventually call this function.
 */
static int hf_sock_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int err;
	struct hf_msg_hdr *hdr;
	struct hf_sock *hsock = hsock_from_sk(sk);

	/* Check length. */
	if (len > HF_MAILBOX_SIZE - sizeof(struct hf_msg_hdr))
		return -EMSGSIZE;

	/* We don't allow the destination address to be specified. */
	if (m->msg_namelen > 0)
		return -EISCONN;

	/* We don't support out-of-band messages. */
	if (m->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	/*
	 * Ensure that the socket is connected. We don't need to hold the socket
	 * lock (acquired and released by hf_sock_is_connected) for the
	 * remainder of the function because the fields we care about are
	 * immutable once the state is SS_CONNECTED.
	 */
	if (!hf_sock_is_connected(sock))
		return -ENOTCONN;

	/*
	 * Allocate an skb for this write. If there isn't enough room in the
	 * socket's send buffer (sk_wmem_alloc >= sk_sndbuf), this will block
	 * (if it's a blocking call). On success, it increments sk_wmem_alloc
	 * and sets up the skb such that sk_wmem_alloc gets decremented when
	 * the skb is freed (sock_wfree gets called).
	 */
	skb = sock_alloc_send_skb(sk, len + sizeof(struct hf_msg_hdr),
				  m->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	/* Reserve room for the header and initialise it. */
	skb_reserve(skb, sizeof(struct hf_msg_hdr));
	hdr = skb_push(skb, sizeof(struct hf_msg_hdr));
	hdr->src_port = hsock->local_port;
	hdr->dst_port = hsock->remote_port;

	/* Allocate area for the contents, then copy into skb. */
	if (!copy_from_iter_full(skb_put(skb, len), len, &m->msg_iter)) {
		err = -EFAULT;
		goto err_cleanup;
	}

	/*
	 * TODO: We currently do this inline, but when we have support for
	 * readiness notification from Hafnium, we must add this to a per-VM tx
	 * queue that can make progress when the VM becomes writable. This will
	 * fix send buffering and poll readiness notification.
	 */
	err = hf_send_skb(skb);
	if (err)
		goto err_cleanup;

	return 0;

err_cleanup:
	kfree_skb(skb);
	return err;
}

/**
 * Receives a message originating from the VM and port the socket is connected
 * to. All variants of read/recv/recvfrom/recvmsg eventually call this
 * function.
 */
static int hf_sock_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
			   int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int err;
	size_t copy_len;

	if (!hf_sock_is_connected(sock))
		return -ENOTCONN;

	/* Grab the next skb from the receive queue. */
	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	/* Make sure we don't copy more than what fits in the output buffer. */
	copy_len = skb->len;
	if (copy_len > len) {
		copy_len = len;
		m->msg_flags |= MSG_TRUNC;
	}

	/* Make sure we don't overflow the return value type. */
	if (copy_len > INT_MAX) {
		copy_len = INT_MAX;
		m->msg_flags |= MSG_TRUNC;
	}

	/* Copy skb to the output iterator, then free it. */
	err = skb_copy_datagram_msg(skb, 0, m, copy_len);
	skb_free_datagram(sk, skb);
	if (err)
		return err;

	return copy_len;
}

/**
 * This function is called when a Hafnium socket is created. It initialises all
 * state such that the caller will be able to connect the socket and then send
 * and receive messages through it.
 */
static int hf_sock_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	static const struct proto_ops ops = {
		.family = PF_HF,
		.owner = THIS_MODULE,
		.release = hf_sock_release,
		.bind = sock_no_bind,
		.connect = hf_sock_connect,
		.socketpair = sock_no_socketpair,
		.accept = sock_no_accept,
		.ioctl = sock_no_ioctl,
		.listen = sock_no_listen,
		.shutdown = sock_no_shutdown,
		.setsockopt = sock_no_setsockopt,
		.getsockopt = sock_no_getsockopt,
		.sendmsg = hf_sock_sendmsg,
		.recvmsg = hf_sock_recvmsg,
		.mmap = sock_no_mmap,
		.sendpage = sock_no_sendpage,
		.poll = datagram_poll,
	};
	struct sock *sk;

	if (sock->type != SOCK_DGRAM)
		return -ESOCKTNOSUPPORT;

	if (protocol != 0)
		return -EPROTONOSUPPORT;

	/*
	 * For now we only allow callers with sys admin capability to create
	 * Hafnium sockets.
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Allocate and initialise socket. */
	sk = sk_alloc(net, PF_HF, GFP_KERNEL, &hf_sock_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	sk->sk_destruct = hf_sock_destruct;
	sock->ops = &ops;
	sock->state = SS_UNCONNECTED;

	return 0;
}

/**
 * Frees all resources, including threads, associated with the Hafnium driver.
 */
static void hf_free_resources(void)
{
	uint32_t i, j;

	/*
	 * First stop all worker threads. We need to do this before freeing
	 * resources because workers may reference each other, so it is only
	 * safe to free resources after they have all stopped.
	 */
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		for (j = 0; j < vm->vcpu_count; j++)
			kthread_stop(vm->vcpu[j].task);
	}

	/* Free resources. */
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		for (j = 0; j < vm->vcpu_count; j++)
			put_task_struct(vm->vcpu[j].task);
		kfree(vm->vcpu);
	}

	kfree(hf_vms);
}

/**
 * Initializes the Hafnium driver by creating a thread for each vCPU of each
 * virtual machine.
 */
static int __init hf_init(void)
{
	static const struct net_proto_family proto_family = {
		.family = PF_HF,
		.create = hf_sock_create,
		.owner = THIS_MODULE,
	};
	int64_t ret;
	uint32_t i, j;
	uint32_t total_vm_count;
	uint32_t total_vcpu_count;

	/* Allocate a page for send and receive buffers. */
	hf_send_page = alloc_page(GFP_KERNEL);
	if (!hf_send_page) {
		pr_err("Unable to allocate send buffer\n");
		return -ENOMEM;
	}

	hf_recv_page = alloc_page(GFP_KERNEL);
	if (!hf_recv_page) {
		__free_page(hf_send_page);
		pr_err("Unable to allocate receive buffer\n");
		return -ENOMEM;
	}

	/*
	 * Configure both addresses. Once configured, we cannot free these pages
	 * because the hypervisor will use them, even if the module is
	 * unloaded.
	 */
	ret = hf_vm_configure(page_to_phys(hf_send_page),
			      page_to_phys(hf_recv_page));
	if (ret) {
		__free_page(hf_send_page);
		__free_page(hf_recv_page);
		/*
		 * TODO: We may want to grab this information from the
		 * hypervisor and go from there.
		 */
		pr_err("Unable to configure VM\n");
		return -EIO;
	}

	/* Get the number of VMs. */
	ret = hf_vm_get_count();
	if (ret < 0) {
		pr_err("Unable to retrieve number of VMs: %lld\n", ret);
		return -EIO;
	}

	/* Confirm the maximum number of VMs looks sane. */
	BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VMS < 1);
	BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VMS > U16_MAX);

	/* Validate the number of VMs. There must at least be the primary. */
	if (ret < 1 || ret > CONFIG_HAFNIUM_MAX_VMS) {
		pr_err("Number of VMs is out of range: %lld\n", ret);
		return -EDQUOT;
	}

	/* Only track the secondary VMs. */
	total_vm_count = ret - 1;
	hf_vms = kmalloc_array(total_vm_count, sizeof(struct hf_vm),
			       GFP_KERNEL);
	if (!hf_vms)
		return -ENOMEM;

	/* Initialize each VM. */
	total_vcpu_count = 0;
	for (i = 0; i < total_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		/* Adjust the ID as only the secondaries are tracked. */
		vm->id = i + 1;

		ret = hf_vcpu_get_count(vm->id);
		if (ret < 0) {
			pr_err("HF_VCPU_GET_COUNT failed for vm=%u: %lld\n",
			       vm->id, ret);
			ret = -EIO;
			goto fail_with_cleanup;
		}

		/* Avoid overflowing the vcpu count. */
		if (ret > (U32_MAX - total_vcpu_count)) {
			pr_err("Too many vcpus: %u\n", total_vcpu_count);
			ret = -EDQUOT;
			goto fail_with_cleanup;
		}

		/* Confirm the maximum number of VCPUs looks sane. */
		BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VCPUS < 1);
		BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VCPUS > U16_MAX);

		/* Enforce the limit on vcpus. */
		total_vcpu_count += ret;
		if (total_vcpu_count > CONFIG_HAFNIUM_MAX_VCPUS) {
			pr_err("Too many vcpus: %u\n", total_vcpu_count);
			ret = -EDQUOT;
			goto fail_with_cleanup;
		}

		vm->vcpu_count = ret;
		vm->vcpu = kmalloc_array(vm->vcpu_count, sizeof(struct hf_vcpu),
					 GFP_KERNEL);
		if (!vm->vcpu) {
			pr_err("No memory for %u vcpus for vm %u\n",
			       vm->vcpu_count, vm->id);
			ret = -ENOMEM;
			goto fail_with_cleanup;
		}

		/* Update the number of initialized VMs. */
		hf_vm_count = i + 1;

		/* Create a kernel thread for each vcpu. */
		for (j = 0; j < vm->vcpu_count; j++) {
			struct hf_vcpu *vcpu = &vm->vcpu[j];

			vcpu->task = kthread_create(hf_vcpu_thread, vcpu,
						    "vcpu_thread_%u_%u",
						    vm->id, j);
			if (IS_ERR(vcpu->task)) {
				pr_err("Error creating task (vm=%u,vcpu=%u): %ld\n",
				       vm->id, j, PTR_ERR(vcpu->task));
				vm->vcpu_count = j;
				ret = PTR_ERR(vcpu->task);
				goto fail_with_cleanup;
			}

			get_task_struct(vcpu->task);
			vcpu->vm = vm;
			vcpu->vcpu_index = j;
			atomic_set(&vcpu->abort_sleep, 0);
		}
	}

	/* Register protocol and socket family. */
	ret = proto_register(&hf_sock_proto, 0);
	if (ret) {
		pr_err("Unable to register protocol: %lld\n", ret);
		goto fail_with_cleanup;
	}

	ret = sock_register(&proto_family);
	if (ret) {
		pr_err("Unable to register Hafnium's socket family: %lld\n",
		       ret);
		goto fail_unregister_proto;
	}

	/*
	 * Start running threads now that all is initialized.
	 *
	 * Any failures from this point on must also unregister the socket
	 * family with a call to sock_unregister().
	 */
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		for (j = 0; j < vm->vcpu_count; j++)
			wake_up_process(vm->vcpu[j].task);
	}

	/* Dump vm/vcpu count info. */
	pr_info("Hafnium successfully loaded with %u VMs:\n", hf_vm_count);
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		pr_info("\tVM %u: %u vCPUS\n", vm->id, vm->vcpu_count);
	}

	return 0;

fail_unregister_proto:
	proto_unregister(&hf_sock_proto);
fail_with_cleanup:
	hf_free_resources();
	return ret;
}

/**
 * Frees up all resources used by the Hafnium driver in preparation for
 * unloading it.
 */
static void __exit hf_exit(void)
{
	pr_info("Preparing to unload Hafnium\n");
	sock_unregister(PF_HF);
	proto_unregister(&hf_sock_proto);
	hf_free_resources();
	pr_info("Hafnium ready to unload\n");
}

MODULE_LICENSE("GPL v2");

module_init(hf_init);
module_exit(hf_exit);