blob: bedc3f0f419c6840ee232cac634f23ec34d0efb0 [file] [log] [blame]
Andrew Walbran13c3a0b2018-11-30 11:51:53 +00001/*
2 * Copyright 2018 Google LLC
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * version 2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 */
17
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +010018#include <linux/hrtimer.h>
Wedson Almeida Filho1ee35652018-12-24 01:36:48 +000019#include <linux/atomic.h>
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +010020#include <linux/init.h>
21#include <linux/kernel.h>
22#include <linux/kthread.h>
Wedson Almeida Filhof9e11922018-08-12 15:54:31 +010023#include <linux/mm.h>
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +010024#include <linux/module.h>
25#include <linux/sched/task.h>
26#include <linux/slab.h>
Wedson Almeida Filho1ee35652018-12-24 01:36:48 +000027#include <linux/net.h>
28#include <net/sock.h>
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +010029
Andrew Scull55704232018-08-10 17:19:54 +010030#include <hf/call.h>
31
/* TODO: Reusing AF_ECONET for now as it's otherwise unused. */
#define AF_HF AF_ECONET
#define PF_HF AF_HF

/* Driver-imposed limits on the numbers of VMs and vCPUs supported. */
#define CONFIG_HAFNIUM_MAX_VMS 16
#define CONFIG_HAFNIUM_MAX_VCPUS 32
38
/**
 * State of a single vCPU of a secondary VM, including the kernel thread that
 * runs it.
 */
struct hf_vcpu {
	struct hf_vm *vm;		/* VM this vCPU belongs to. */
	uint32_t vcpu_index;		/* Index of this vCPU within the VM. */
	struct task_struct *task;	/* Kernel thread running this vCPU. */
	/*
	 * Set to 1 by hf_vcpu_wake_up() to tell the thread not to go to
	 * sleep on its next attempt; see hf_vcpu_sleep().
	 */
	atomic_t abort_sleep;
	/* Timer used to implement the HF_VCPU_RUN_SLEEP request. */
	struct hrtimer timer;
};
46
/**
 * State of a secondary VM managed by this driver.
 */
struct hf_vm {
	uint32_t id;		/* VM id as used in hypervisor calls (>= 1). */
	uint32_t vcpu_count;	/* Number of vCPUs belonging to this VM. */
	struct hf_vcpu *vcpu;	/* Array of vcpu_count vCPU states. */
};
52
/**
 * Header prepended to every datagram exchanged over Hafnium sockets; it
 * identifies the source and destination ports of the message.
 */
struct hf_msg_hdr {
	uint64_t src_port;
	uint64_t dst_port;
};
57
/**
 * Hafnium socket state. It embeds a struct sock as its first field so that
 * pointers to the two can be converted by a cast (see hsock_from_sk()).
 */
struct hf_sock {
	/* This needs to be the first field. */
	struct sock sk;

	/*
	 * The following fields are immutable after the socket transitions to
	 * SS_CONNECTED state.
	 */
	uint64_t local_port;	/* Locally-unique port, from hf_next_port. */
	uint64_t remote_port;	/* Peer port given at connect time. */
	struct hf_vm *peer_vm;	/* VM this socket is connected to. */
};
70
/**
 * Address of a Hafnium socket endpoint: a (VM id, port) pair.
 */
struct sockaddr_hf {
	sa_family_t family;	/* Must be AF_HF. */
	uint32_t vm_id;		/* Id of the target VM. */
	uint64_t port;		/* Port within the target VM. */
};
76
/* Protocol descriptor registered with the kernel in hf_init(). */
static struct proto hf_sock_proto = {
	.name = "hafnium",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct hf_sock),
};
82
/* Array of secondary-VM states; hf_vms[i] holds the VM with id i + 1. */
static struct hf_vm *hf_vms;
/* Number of entries of hf_vms that have been initialised. */
static uint32_t hf_vm_count;
/* Pages shared with the hypervisor as send/receive mailboxes. */
static struct page *hf_send_page;
static struct page *hf_recv_page;
/* Source of locally-unique port numbers for newly connected sockets. */
static atomic64_t hf_next_port = ATOMIC64_INIT(0);
/* Serialises use of the global send buffer (hf_send_page). */
static DEFINE_SPINLOCK(hf_send_lock);
/*
 * Sockets hashed by local port. Writers take hf_local_port_hash_lock;
 * readers use RCU (see hf_handle_message()).
 */
static DEFINE_HASHTABLE(hf_local_port_hash, 7);
static DEFINE_SPINLOCK(hf_local_port_hash_lock);
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +010091
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +010092/**
Wedson Almeida Filho7fe62332018-12-15 03:09:57 +000093 * Wakes up the kernel thread responsible for running the given vcpu.
94 *
95 * Returns 0 if the thread was already running, 1 otherwise.
96 */
97static int hf_vcpu_wake_up(struct hf_vcpu *vcpu)
98{
99 /* Set a flag indicating that the thread should not go to sleep. */
100 atomic_set(&vcpu->abort_sleep, 1);
101
102 /* Set the thread to running state. */
103 return wake_up_process(vcpu->task);
104}
105
106/**
107 * Puts the current thread to sleep. The current thread must be responsible for
108 * running the given vcpu.
109 *
110 * Going to sleep will fail if hf_vcpu_wake_up() or kthread_stop() was called on
111 * this vcpu/thread since the last time it [re]started running.
112 */
113static void hf_vcpu_sleep(struct hf_vcpu *vcpu)
114{
115 int abort;
116
117 set_current_state(TASK_INTERRUPTIBLE);
118
119 /* Check the sleep-abort flag after making thread interruptible. */
120 abort = atomic_read(&vcpu->abort_sleep);
121 if (!abort && !kthread_should_stop())
122 schedule();
123
124 /* Set state back to running on the way out. */
125 set_current_state(TASK_RUNNING);
126}
127
128/**
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +0100129 * Wakes up the thread associated with the vcpu that owns the given timer. This
130 * is called when the timer the thread is waiting on expires.
131 */
132static enum hrtimer_restart hf_vcpu_timer_expired(struct hrtimer *timer)
133{
134 struct hf_vcpu *vcpu = container_of(timer, struct hf_vcpu, timer);
Wedson Almeida Filho7fe62332018-12-15 03:09:57 +0000135 /* TODO: Inject interrupt. */
136 hf_vcpu_wake_up(vcpu);
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +0100137 return HRTIMER_NORESTART;
138}
139
/**
 * Handles a message delivered to this VM by validating that it's well-formed
 * and then queueing it for delivery to the appropriate socket.
 */
static void hf_handle_message(struct hf_vm *sender, const void *ptr, size_t len)
{
	struct hf_sock *hsock;
	const struct hf_msg_hdr *hdr = ptr;
	struct sk_buff *skb;
	int err;

	/* Ignore messages that are too small to hold a header. */
	if (len < sizeof(struct hf_msg_hdr))
		return;

	/* From here on, len is the length of the payload only. */
	len -= sizeof(struct hf_msg_hdr);

	/*
	 * Go through the colliding sockets. Take a reference on the matching
	 * one so it cannot be freed while we deliver to it; if the loop
	 * finishes without a break, hsock is left NULL by the iterator.
	 */
	rcu_read_lock();
	hash_for_each_possible_rcu(hf_local_port_hash, hsock, sk.sk_node,
				   hdr->dst_port) {
		if (hsock->peer_vm == sender &&
		    hsock->remote_port == hdr->src_port) {
			sock_hold(&hsock->sk);
			break;
		}
	}
	rcu_read_unlock();

	/* Nothing to do if we couldn't find the target. */
	if (!hsock)
		return;

	/*
	 * TODO: From this point on, there are two failure paths: when we
	 * create the skb below, and when we enqueue it to the socket. What
	 * should we do if they fail? Ideally we would have some form of flow
	 * control to prevent message loss, but how to do it efficiently?
	 *
	 * One option is to have a pre-allocated message that indicates to the
	 * sender that a message was dropped. This way we guarantee that the
	 * sender will be aware of loss and should back-off.
	 */
	/* Create the skb. */
	skb = alloc_skb(len, GFP_KERNEL);
	if (!skb)
		goto exit;

	/* Copy the payload, which starts just past the header, into the skb. */
	memcpy(skb_put(skb, len), hdr + 1, len);

	/*
	 * Add the skb to the receive queue of the target socket. On success it
	 * calls sk->sk_data_ready, which is currently set to sock_def_readable,
	 * which wakes up any waiters.
	 */
	err = sock_queue_rcv_skb(&hsock->sk, skb);
	if (err)
		kfree_skb(skb);

exit:
	sock_put(&hsock->sk);
}
202
/**
 * This function is called when Hafnium requests that the primary VM wake up a
 * vCPU that belongs to a secondary VM.
 *
 * It wakes up the thread if it's sleeping, or kicks it if it's already running.
 *
 * If vCPU is HF_INVALID_VCPU, it injects an interrupt into a vCPU belonging to
 * the specified VM.
 */
static void hf_handle_wake_up_request(uint32_t vm_id, uint16_t vcpu,
				      uint64_t int_id)
{
	struct hf_vm *vm;

	/* Only secondary VMs (ids 1..hf_vm_count) are tracked by the driver. */
	if (vm_id < 1 || vm_id > hf_vm_count) {
		pr_warn("Request to wake up non-existent VM id: %u\n", vm_id);
		return;
	}

	/* hf_vms[] is indexed from 0 but VM ids start at 1. */
	vm = &hf_vms[vm_id - 1];
	if (vcpu >= vm->vcpu_count) {
		int64_t ret;

		if (vcpu != HF_INVALID_VCPU) {
			pr_warn("Request to wake up non-existent vCPU: %u.%u\n",
				vm_id, vcpu);
			return;
		}

		/*
		 * TODO: For now we're picking the first vcpu to interrupt, but
		 * we want to be smarter.
		 */
		vcpu = 0;
		ret = hf_interrupt_inject(vm_id, vcpu, int_id);
		if (ret != 1) {
			/* We don't need to wake up the vcpu. */
			return;
		}
	}

	if (hf_vcpu_wake_up(&vm->vcpu[vcpu]) == 0) {
		/*
		 * The task was already running (presumably on a different
		 * physical CPU); interrupt it. This gives Hafnium a chance to
		 * inject any new interrupts.
		 */
		kick_process(vm->vcpu[vcpu].task);
	}
}
253
254/**
Wedson Almeida Filhocd9fef92019-01-11 21:24:08 +0000255 * Notify all waiters on the given VM.
256 */
257static void hf_notify_waiters(uint32_t vm_id)
258{
259 int64_t ret;
260
261 while ((ret = hf_mailbox_waiter_get(vm_id)) != -1) {
262 if (ret == HF_PRIMARY_VM_ID) {
263 /*
264 * TODO: Use this information when implementing per-vm
265 * queues.
266 */
267 } else {
268 hf_handle_wake_up_request(ret, HF_INVALID_VCPU,
269 HF_MAILBOX_WRITABLE_INTID);
270 }
271 }
272}
273
/**
 * This is the main loop of each vcpu: it repeatedly asks the hypervisor to run
 * the vCPU and then reacts to the reason it returned.
 */
static int hf_vcpu_thread(void *data)
{
	struct hf_vcpu *vcpu = data;
	struct hf_vcpu_run_return ret;

	hrtimer_init(&vcpu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	vcpu->timer.function = &hf_vcpu_timer_expired;

	while (!kthread_should_stop()) {
		/*
		 * We're about to run the vcpu, so we can reset the abort-sleep
		 * flag.
		 */
		atomic_set(&vcpu->abort_sleep, 0);

		/* Call into Hafnium to run vcpu. */
		ret = hf_vcpu_run(vcpu->vm->id, vcpu->vcpu_index);

		switch (ret.code) {
		/* Preempted: yield the physical CPU if something else needs it. */
		case HF_VCPU_RUN_PREEMPTED:
			if (need_resched())
				schedule();
			break;

		/* Yield: give up the CPU voluntarily. */
		case HF_VCPU_RUN_YIELD:
			if (!kthread_should_stop())
				schedule();
			break;

		/* WFI: sleep until hf_vcpu_wake_up() is called on this vcpu. */
		case HF_VCPU_RUN_WAIT_FOR_INTERRUPT:
			hf_vcpu_sleep(vcpu);
			break;

		/* Wake up another vcpu. */
		case HF_VCPU_RUN_WAKE_UP:
			hf_handle_wake_up_request(ret.wake_up.vm_id,
						  ret.wake_up.vcpu,
						  HF_MAILBOX_READBLE_INTID);
			break;

		/*
		 * Response available: deliver the message in the shared
		 * receive buffer, then clear the mailbox so the sender can
		 * reuse it, notifying writers that became unblocked.
		 */
		case HF_VCPU_RUN_MESSAGE:
			hf_handle_message(vcpu->vm, page_address(hf_recv_page),
					  ret.message.size);
			if (hf_mailbox_clear() == 1)
				hf_notify_waiters(HF_PRIMARY_VM_ID);
			break;

		/* Sleep for the requested number of nanoseconds (or a wake-up). */
		case HF_VCPU_RUN_SLEEP:
			hrtimer_start(&vcpu->timer, ret.sleep.ns,
				      HRTIMER_MODE_REL);
			hf_vcpu_sleep(vcpu);
			hrtimer_cancel(&vcpu->timer);
			break;

		/* Notify all waiters. */
		case HF_VCPU_RUN_NOTIFY_WAITERS:
			hf_notify_waiters(vcpu->vm->id);
			break;
		}
	}

	return 0;
}
344
345/**
Wedson Almeida Filho1ee35652018-12-24 01:36:48 +0000346 * Converts a pointer to a struct sock into a pointer to a struct hf_sock. It
347 * relies on the fact that the first field of hf_sock is a sock.
348 */
349static struct hf_sock *hsock_from_sk(struct sock *sk)
350{
351 return (struct hf_sock *)sk;
352}
353
354/**
355 * This is called when the last reference to the outer socket is released. For
356 * example, if it's a user-space socket, when the last file descriptor pointing
357 * to this socket is closed.
358 *
359 * It begins cleaning up resources, though some can only be cleaned up after all
360 * references to the underlying socket are released, which is handled by
361 * hf_sock_destruct().
362 */
363static int hf_sock_release(struct socket *sock)
364{
365 struct sock *sk = sock->sk;
366 struct hf_sock *hsock = hsock_from_sk(sk);
367 unsigned long flags;
368
369 if (!sk)
370 return 0;
371
372 /* Shutdown for both send and receive. */
373 lock_sock(sk);
374 sk->sk_shutdown |= RCV_SHUTDOWN | SEND_SHUTDOWN;
375 sk->sk_state_change(sk);
376 release_sock(sk);
377
378 /* Remove from the hash table, so lookups from now on won't find it. */
379 spin_lock_irqsave(&hf_local_port_hash_lock, flags);
380 hash_del_rcu(&hsock->sk.sk_node);
381 spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);
382
383 /*
384 * TODO: When we implement a tx queue, we need to clear it here so that
385 * sk_wmem_alloc will not prevent sk from being freed (sk_free).
386 */
387
388 /*
389 * Wait for in-flight lookups to finish. We need to do this here because
Wedson Almeida Filho89d0e472019-01-03 19:18:39 +0000390 * in-flight lookups rely on the reference to the socket we're about to
Wedson Almeida Filho1ee35652018-12-24 01:36:48 +0000391 * release.
392 */
393 synchronize_rcu();
394 sock_put(sk);
395 sock->sk = NULL;
396
397 return 0;
398}
399
/**
 * This is called when there are no more references to the socket. It frees all
 * resources that haven't been freed during release.
 */
static void hf_sock_destruct(struct sock *sk)
{
	/*
	 * Clear the receive queue now that the handler cannot add any more
	 * skbs to it.
	 */
	skb_queue_purge(&sk->sk_receive_queue);
}
412
413/**
414 * Connects the Hafnium socket to the provided VM and port. After the socket is
415 * connected, it can be used to exchange datagrams with the specified peer.
416 */
417static int hf_sock_connect(struct socket *sock, struct sockaddr *saddr,
418 int len, int connect_flags)
419{
420 struct sock *sk = sock->sk;
421 struct hf_sock *hsock = hsock_from_sk(sk);
422 struct hf_vm *vm;
423 struct sockaddr_hf *addr;
424 int err;
425 unsigned long flags;
426
427 /* Basic address validation. */
428 if (len < sizeof(struct sockaddr_hf) || saddr->sa_family != AF_HF)
429 return -EINVAL;
430
431 addr = (struct sockaddr_hf *)saddr;
432 if (addr->vm_id > hf_vm_count)
433 return -ENETUNREACH;
434
435 vm = &hf_vms[addr->vm_id - 1];
436
437 /*
438 * TODO: Once we implement access control in Hafnium, check that the
439 * caller is allowed to contact the specified VM. Return -ECONNREFUSED
440 * if access is denied.
441 */
442
443 /* Take lock to make sure state doesn't change as we connect. */
444 lock_sock(sk);
445
446 /* Only unconnected sockets are allowed to become connected. */
447 if (sock->state != SS_UNCONNECTED) {
448 err = -EISCONN;
449 goto exit;
450 }
451
452 hsock->local_port = atomic64_inc_return(&hf_next_port);
453 hsock->remote_port = addr->port;
454 hsock->peer_vm = vm;
455
456 sock->state = SS_CONNECTED;
457
458 /* Add socket to hash table now that it's fully initialised. */
459 spin_lock_irqsave(&hf_local_port_hash_lock, flags);
460 hash_add_rcu(hf_local_port_hash, &sk->sk_node, hsock->local_port);
461 spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);
462
463 err = 0;
464exit:
465 release_sock(sk);
466 return err;
467}
468
/**
 * Sends the given skb to the appropriate VM by calling Hafnium. It will also
 * trigger the wake up of a recipient VM.
 *
 * Takes ownership of the skb on success.
 */
static int hf_send_skb(struct sk_buff *skb)
{
	unsigned long flags;
	int64_t ret;
	struct hf_sock *hsock = hsock_from_sk(skb->sk);
	struct hf_vm *vm = hsock->peer_vm;

	/*
	 * Call Hafnium under the send lock so that we serialize the use of the
	 * global send buffer. skb->len is bounded by hf_sock_sendmsg(), which
	 * rejects messages larger than HF_MAILBOX_SIZE.
	 */
	spin_lock_irqsave(&hf_send_lock, flags);
	memcpy(page_address(hf_send_page), skb->data, skb->len);
	ret = hf_mailbox_send(vm->id, skb->len, false);
	spin_unlock_irqrestore(&hf_send_lock, flags);

	/* The skb remains owned by the caller on failure. */
	if (ret < 0)
		return -EAGAIN;

	/* Wake some vcpu up to handle the new message. */
	hf_handle_wake_up_request(vm->id, ret, HF_MAILBOX_READBLE_INTID);

	kfree_skb(skb);

	return 0;
}
501
502/**
503 * Determines if the given socket is in the connected state. It acquires and
504 * releases the socket lock.
505 */
506static bool hf_sock_is_connected(struct socket *sock)
507{
508 bool ret;
509
510 lock_sock(sock->sk);
511 ret = sock->state == SS_CONNECTED;
512 release_sock(sock->sk);
513
514 return ret;
515}
516
/**
 * Sends a message to the VM & port the socket is connected to. All variants
 * of write/send/sendto/sendmsg eventually call this function.
 */
static int hf_sock_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int err;
	struct hf_msg_hdr *hdr;
	struct hf_sock *hsock = hsock_from_sk(sk);

	/* Check length: payload plus header must fit in the mailbox. */
	if (len > HF_MAILBOX_SIZE - sizeof(struct hf_msg_hdr))
		return -EMSGSIZE;

	/* We don't allow the destination address to be specified. */
	if (m->msg_namelen > 0)
		return -EISCONN;

	/* We don't support out of band messages. */
	if (m->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	/*
	 * Ensure that the socket is connected. We don't need to hold the socket
	 * lock (acquired and released by hf_sock_is_connected) for the
	 * remainder of the function because the fields we care about are
	 * immutable once the state is SS_CONNECTED.
	 */
	if (!hf_sock_is_connected(sock))
		return -ENOTCONN;

	/*
	 * Allocate an skb for this write. If there isn't enough room in the
	 * socket's send buffer (sk_wmem_alloc >= sk_sndbuf), this will block
	 * (if it's a blocking call). On success, it increments sk_wmem_alloc
	 * and sets up the skb such that sk_wmem_alloc gets decremented when
	 * the skb is freed (sock_wfree gets called).
	 */
	skb = sock_alloc_send_skb(sk, len + sizeof(struct hf_msg_hdr),
				  m->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	/* Reserve room for the header and initialise it. */
	skb_reserve(skb, sizeof(struct hf_msg_hdr));
	hdr = skb_push(skb, sizeof(struct hf_msg_hdr));
	hdr->src_port = hsock->local_port;
	hdr->dst_port = hsock->remote_port;

	/* Allocate area for the contents, then copy into skb. */
	if (!copy_from_iter_full(skb_put(skb, len), len, &m->msg_iter)) {
		err = -EFAULT;
		goto err_cleanup;
	}

	/*
	 * TODO: We currently do this inline, but when we have support for
	 * readiness notification from Hafnium, we must add this to a per-VM tx
	 * queue that can make progress when the VM becomes writable. This will
	 * fix send buffering and poll readiness notification.
	 */
	err = hf_send_skb(skb);
	if (err)
		goto err_cleanup;

	return 0;

err_cleanup:
	kfree_skb(skb);
	return err;
}
590
/**
 * Receives a message originated from the VM & port the socket is connected to.
 * All variants of read/recv/recvfrom/recvmsg eventually call this function.
 */
static int hf_sock_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
			   int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int err;
	size_t copy_len;

	if (!hf_sock_is_connected(sock))
		return -ENOTCONN;

	/* Grab the next skb from the receive queue. */
	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	/* Make sure we don't copy more than what fits in the output buffer. */
	copy_len = skb->len;
	if (copy_len > len) {
		copy_len = len;
		m->msg_flags |= MSG_TRUNC;
	}

	/* Make sure we don't overflow the return value type. */
	if (copy_len > INT_MAX) {
		copy_len = INT_MAX;
		m->msg_flags |= MSG_TRUNC;
	}

	/* Copy skb to output iterator, then free it. */
	err = skb_copy_datagram_msg(skb, 0, m, copy_len);
	skb_free_datagram(sk, skb);
	if (err)
		return err;

	/* On success, return the number of bytes actually copied. */
	return copy_len;
}
632
/**
 * This function is called when a Hafnium socket is created. It initialises all
 * state such that the caller will be able to connect the socket and then send
 * and receive messages through it.
 */
static int hf_sock_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	/* Operations table shared by all Hafnium sockets. */
	static const struct proto_ops ops = {
		.family = PF_HF,
		.owner = THIS_MODULE,
		.release = hf_sock_release,
		.bind = sock_no_bind,
		.connect = hf_sock_connect,
		.socketpair = sock_no_socketpair,
		.accept = sock_no_accept,
		.ioctl = sock_no_ioctl,
		.listen = sock_no_listen,
		.shutdown = sock_no_shutdown,
		.setsockopt = sock_no_setsockopt,
		.getsockopt = sock_no_getsockopt,
		.sendmsg = hf_sock_sendmsg,
		.recvmsg = hf_sock_recvmsg,
		.mmap = sock_no_mmap,
		.sendpage = sock_no_sendpage,
		.poll = datagram_poll,
	};
	struct sock *sk;

	/* Only connectionless datagram sockets are supported. */
	if (sock->type != SOCK_DGRAM)
		return -ESOCKTNOSUPPORT;

	if (protocol != 0)
		return -EPROTONOSUPPORT;

	/*
	 * For now we only allow callers with sys admin capability to create
	 * Hafnium sockets.
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Allocate and initialise socket. */
	sk = sk_alloc(net, PF_HF, GFP_KERNEL, &hf_sock_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	sk->sk_destruct = hf_sock_destruct;
	sock->ops = &ops;
	sock->state = SS_UNCONNECTED;

	return 0;
}
688
689/**
Andrew Scullbb7ae412018-09-28 21:07:15 +0100690 * Frees all resources, including threads, associated with the Hafnium driver.
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +0100691 */
Andrew Scull82257c42018-10-01 10:37:48 +0100692static void hf_free_resources(void)
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +0100693{
Andrew Scullbb7ae412018-09-28 21:07:15 +0100694 uint32_t i, j;
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +0100695
696 /*
697 * First stop all worker threads. We need to do this before freeing
698 * resources because workers may reference each other, so it is only
699 * safe to free resources after they have all stopped.
700 */
Andrew Scull82257c42018-10-01 10:37:48 +0100701 for (i = 0; i < hf_vm_count; i++) {
Andrew Scullb3a61b52018-09-17 14:30:34 +0100702 struct hf_vm *vm = &hf_vms[i];
Wedson Almeida Filho1ee35652018-12-24 01:36:48 +0000703
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +0100704 for (j = 0; j < vm->vcpu_count; j++)
705 kthread_stop(vm->vcpu[j].task);
706 }
707
708 /* Free resources. */
Andrew Scull82257c42018-10-01 10:37:48 +0100709 for (i = 0; i < hf_vm_count; i++) {
Andrew Scullb3a61b52018-09-17 14:30:34 +0100710 struct hf_vm *vm = &hf_vms[i];
Wedson Almeida Filho1ee35652018-12-24 01:36:48 +0000711
Wedson Almeida Filho2f62b422018-06-19 06:44:32 +0100712 for (j = 0; j < vm->vcpu_count; j++)
713 put_task_struct(vm->vcpu[j].task);
714 kfree(vm->vcpu);
715 }
716
717 kfree(hf_vms);
718}
719
/**
 * Initializes the Hafnium driver by creating a thread for each vCPU of each
 * virtual machine, then registering the Hafnium socket family.
 */
static int __init hf_init(void)
{
	static const struct net_proto_family proto_family = {
		.family = PF_HF,
		.create = hf_sock_create,
		.owner = THIS_MODULE,
	};
	int64_t ret;
	uint32_t i, j;
	uint32_t total_vm_count;
	uint32_t total_vcpu_count;

	/* Allocate a page for send and receive buffers. */
	hf_send_page = alloc_page(GFP_KERNEL);
	if (!hf_send_page) {
		pr_err("Unable to allocate send buffer\n");
		return -ENOMEM;
	}

	hf_recv_page = alloc_page(GFP_KERNEL);
	if (!hf_recv_page) {
		__free_page(hf_send_page);
		pr_err("Unable to allocate receive buffer\n");
		return -ENOMEM;
	}

	/*
	 * Configure both addresses. Once configured, we cannot free these pages
	 * because the hypervisor will use them, even if the module is
	 * unloaded.
	 */
	ret = hf_vm_configure(page_to_phys(hf_send_page),
			      page_to_phys(hf_recv_page));
	if (ret) {
		/* Configuration failed, so the pages are still safe to free. */
		__free_page(hf_send_page);
		__free_page(hf_recv_page);
		/*
		 * TODO: We may want to grab this information from hypervisor
		 * and go from there.
		 */
		pr_err("Unable to configure VM\n");
		return -EIO;
	}

	/* Get the number of VMs. */
	ret = hf_vm_get_count();
	if (ret < 0) {
		pr_err("Unable to retrieve number of VMs: %lld\n", ret);
		return -EIO;
	}

	/* Confirm the maximum number of VMs looks sane. */
	BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VMS < 1);
	BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VMS > U16_MAX);

	/* Validate the number of VMs. There must at least be the primary. */
	if (ret < 1 || ret > CONFIG_HAFNIUM_MAX_VMS) {
		pr_err("Number of VMs is out of range: %lld\n", ret);
		return -EDQUOT;
	}

	/* Only track the secondary VMs. */
	total_vm_count = ret - 1;
	hf_vms = kmalloc_array(total_vm_count, sizeof(struct hf_vm),
			       GFP_KERNEL);
	if (!hf_vms)
		return -ENOMEM;

	/* Initialize each VM. */
	total_vcpu_count = 0;
	for (i = 0; i < total_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		/* Adjust the ID as only the secondaries are tracked. */
		vm->id = i + 1;

		ret = hf_vcpu_get_count(vm->id);
		if (ret < 0) {
			pr_err("HF_VCPU_GET_COUNT failed for vm=%u: %lld",
			       vm->id, ret);
			ret = -EIO;
			goto fail_with_cleanup;
		}

		/* Avoid overflowing the vcpu count. */
		if (ret > (U32_MAX - total_vcpu_count)) {
			pr_err("Too many vcpus: %u\n", total_vcpu_count);
			ret = -EDQUOT;
			goto fail_with_cleanup;
		}

		/* Confirm the maximum number of VCPUs looks sane. */
		BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VCPUS < 1);
		BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VCPUS > U16_MAX);

		/* Enforce the limit on vcpus. */
		total_vcpu_count += ret;
		if (total_vcpu_count > CONFIG_HAFNIUM_MAX_VCPUS) {
			pr_err("Too many vcpus: %u\n", total_vcpu_count);
			ret = -EDQUOT;
			goto fail_with_cleanup;
		}

		vm->vcpu_count = ret;
		vm->vcpu = kmalloc_array(vm->vcpu_count, sizeof(struct hf_vcpu),
					 GFP_KERNEL);
		if (!vm->vcpu) {
			pr_err("No memory for %u vcpus for vm %u",
			       vm->vcpu_count, vm->id);
			ret = -ENOMEM;
			goto fail_with_cleanup;
		}

		/*
		 * Update the number of initialized VMs so the cleanup path
		 * only touches fully (or partially, see below) set-up VMs.
		 */
		hf_vm_count = i + 1;

		/* Create a kernel thread for each vcpu. */
		for (j = 0; j < vm->vcpu_count; j++) {
			struct hf_vcpu *vcpu = &vm->vcpu[j];
			vcpu->task = kthread_create(hf_vcpu_thread, vcpu,
						    "vcpu_thread_%u_%u",
						    vm->id, j);
			if (IS_ERR(vcpu->task)) {
				pr_err("Error creating task (vm=%u,vcpu=%u): %ld\n",
				       vm->id, j, PTR_ERR(vcpu->task));
				/*
				 * Shrink the count so cleanup only touches the
				 * threads that were actually created.
				 */
				vm->vcpu_count = j;
				ret = PTR_ERR(vcpu->task);
				goto fail_with_cleanup;
			}

			get_task_struct(vcpu->task);
			vcpu->vm = vm;
			vcpu->vcpu_index = j;
			atomic_set(&vcpu->abort_sleep, 0);
		}
	}

	/* Register protocol and socket family. */
	ret = proto_register(&hf_sock_proto, 0);
	if (ret) {
		pr_err("Unable to register protocol: %lld\n", ret);
		goto fail_with_cleanup;
	}

	ret = sock_register(&proto_family);
	if (ret) {
		pr_err("Unable to register Hafnium's socket family: %lld\n",
		       ret);
		goto fail_unregister_proto;
	}

	/*
	 * Start running threads now that all is initialized.
	 *
	 * Any failures from this point on must also unregister the socket
	 * family with a call to sock_unregister().
	 */
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];
		for (j = 0; j < vm->vcpu_count; j++)
			wake_up_process(vm->vcpu[j].task);
	}

	/* Dump vm/vcpu count info. */
	pr_info("Hafnium successfully loaded with %u VMs:\n", hf_vm_count);
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		pr_info("\tVM %u: %u vCPUS\n", vm->id, vm->vcpu_count);
	}

	return 0;

fail_unregister_proto:
	proto_unregister(&hf_sock_proto);
fail_with_cleanup:
	hf_free_resources();
	return ret;
}
903
/**
 * Frees up all resources used by the Hafnium driver in preparation for
 * unloading it.
 */
static void __exit hf_exit(void)
{
	pr_info("Preparing to unload Hafnium\n");
	/* Unregister in the reverse order of registration in hf_init(). */
	sock_unregister(PF_HF);
	proto_unregister(&hf_sock_proto);
	/* Stop all vCPU threads and free per-VM state. */
	hf_free_resources();
	pr_info("Hafnium ready to unload\n");
}
916
/* Module metadata and entry/exit hooks. */
MODULE_LICENSE("GPL v2");

module_init(hf_init);
module_exit(hf_exit);