/*
 * Copyright 2018 Google LLC
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/atomic.h>
#include <linux/hrtimer.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/sched/task.h>
#include <linux/slab.h>

#include <net/sock.h>

#include <hf/call.h>

/* TODO: Reusing AF_ECONET for now as it's otherwise unused. */
#define AF_HF AF_ECONET
#define PF_HF AF_HF

#define MESSAGE_INT_ID 1

#define CONFIG_HAFNIUM_MAX_VMS 16
#define CONFIG_HAFNIUM_MAX_VCPUS 32

struct hf_vcpu {
	struct hf_vm *vm;
	uint32_t vcpu_index;
	struct task_struct *task;
	atomic_t abort_sleep;
	struct hrtimer timer;
};

struct hf_vm {
	uint32_t id;
	uint32_t vcpu_count;
	struct hf_vcpu *vcpu;
};

struct hf_msg_hdr {
	uint64_t src_port;
	uint64_t dst_port;
};
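
/*
 * On the wire, hf_send_skb() copies an skb (header immediately followed by
 * payload) into the shared send page, and hf_handle_message() parses the same
 * layout back out of the receive page, so a mailbox message looks like:
 *
 *	+----------------+----------------+----------------------------+
 *	| src_port (u64) | dst_port (u64) | payload (up to the mailbox |
 *	|                |                | size minus this header)    |
 *	+----------------+----------------+----------------------------+
 */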

struct hf_sock {
	/* This needs to be the first field. */
	struct sock sk;

	/*
	 * The following fields are immutable after the socket transitions to
	 * SS_CONNECTED state.
	 */
	uint64_t local_port;
	uint64_t remote_port;
	struct hf_vm *peer_vm;
};

struct sockaddr_hf {
	sa_family_t family;
	uint32_t vm_id;
	uint64_t port;
};
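
/*
 * Minimal userspace usage sketch (illustrative only; it assumes userspace
 * passes AF_ECONET, which AF_HF aliases above, that the caller has
 * CAP_SYS_ADMIN as required by hf_sock_create(), and that secondary VM 1
 * accepts messages on port 10). The local port is assigned automatically
 * when connecting:
 *
 *	int fd = socket(AF_ECONET, SOCK_DGRAM, 0);
 *	struct sockaddr_hf addr = {
 *		.family = AF_ECONET,
 *		.vm_id = 1,
 *		.port = 10,
 *	};
 *	char buf[64] = "hello";
 *
 *	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 *	send(fd, buf, sizeof(buf), 0);
 *	recv(fd, buf, sizeof(buf), 0);
 */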

static struct proto hf_sock_proto = {
	.name = "hafnium",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct hf_sock),
};

static struct hf_vm *hf_vms;
static uint32_t hf_vm_count;
static struct page *hf_send_page;
static struct page *hf_recv_page;
static atomic64_t hf_next_port = ATOMIC64_INIT(0);
static DEFINE_SPINLOCK(hf_send_lock);
static DEFINE_HASHTABLE(hf_local_port_hash, 7);
static DEFINE_SPINLOCK(hf_local_port_hash_lock);

/**
 * Wakes up the kernel thread responsible for running the given vcpu.
 *
 * Returns 0 if the thread was already running, 1 otherwise.
 */
static int hf_vcpu_wake_up(struct hf_vcpu *vcpu)
{
	/* Set a flag indicating that the thread should not go to sleep. */
	atomic_set(&vcpu->abort_sleep, 1);

	/* Set the thread to running state. */
	return wake_up_process(vcpu->task);
}

/**
 * Puts the current thread to sleep. The current thread must be responsible for
 * running the given vcpu.
 *
 * Going to sleep will fail if hf_vcpu_wake_up() or kthread_stop() was called on
 * this vcpu/thread since the last time it [re]started running.
 */
static void hf_vcpu_sleep(struct hf_vcpu *vcpu)
{
	int abort;

	set_current_state(TASK_INTERRUPTIBLE);

	/* Check the sleep-abort flag after making the thread interruptible. */
	abort = atomic_read(&vcpu->abort_sleep);
	if (!abort && !kthread_should_stop())
		schedule();

	/* Set state back to running on the way out. */
	set_current_state(TASK_RUNNING);
}
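
/*
 * Together, hf_vcpu_wake_up() and hf_vcpu_sleep() close the classic
 * lost-wakeup window. Without the abort_sleep flag, the following
 * interleaving would leave the vcpu thread asleep despite a pending wake-up
 * (sketch):
 *
 *	vcpu thread				waker
 *	-----------				-----
 *	hf_vcpu_run() returns WFI
 *						wake_up_process()
 *						  [no-op: task still running]
 *	set_current_state(TASK_INTERRUPTIBLE)
 *	schedule()  <-- would sleep indefinitely
 *
 * With the flag, the waker sets abort_sleep before calling wake_up_process(),
 * and the sleeper re-checks the flag after becoming interruptible, so either
 * the flag is seen and schedule() is skipped, or wake_up_process() finds the
 * task already in TASK_INTERRUPTIBLE and wakes it.
 */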

/**
 * Wakes up the thread associated with the vcpu that owns the given timer. This
 * is called when the timer the thread is waiting on expires.
 */
static enum hrtimer_restart hf_vcpu_timer_expired(struct hrtimer *timer)
{
	struct hf_vcpu *vcpu = container_of(timer, struct hf_vcpu, timer);

	/* TODO: Inject interrupt. */
	hf_vcpu_wake_up(vcpu);
	return HRTIMER_NORESTART;
}

/**
 * Handles a message delivered to this VM by validating that it's well-formed
 * and then queueing it for delivery to the appropriate socket.
 */
static void hf_handle_message(struct hf_vm *sender, const void *ptr, size_t len)
{
	struct hf_sock *hsock;
	const struct hf_msg_hdr *hdr = ptr;
	struct sk_buff *skb;
	int err;

	/* Ignore messages that are too small to hold a header. */
	if (len < sizeof(struct hf_msg_hdr))
		return;

	len -= sizeof(struct hf_msg_hdr);

	/* Go through the colliding sockets. */
	rcu_read_lock();
	hash_for_each_possible_rcu(hf_local_port_hash, hsock, sk.sk_node,
				   hdr->dst_port) {
		if (hsock->peer_vm == sender &&
		    hsock->remote_port == hdr->src_port) {
			sock_hold(&hsock->sk);
			break;
		}
	}
	rcu_read_unlock();

	/* Nothing to do if we couldn't find the target. */
	if (!hsock)
		return;

	/*
	 * TODO: From this point on, there are two failure paths: when we
	 * create the skb below, and when we enqueue it to the socket. What
	 * should we do if they fail? Ideally we would have some form of flow
	 * control to prevent message loss, but how to do it efficiently?
	 *
	 * One option is to have a pre-allocated message that indicates to the
	 * sender that a message was dropped. This way we guarantee that the
	 * sender will be aware of the loss and should back off.
	 */
	/* Create the skb. */
	skb = alloc_skb(len, GFP_KERNEL);
	if (!skb)
		goto exit;

	memcpy(skb_put(skb, len), hdr + 1, len);

	/*
	 * Add the skb to the receive queue of the target socket. On success it
	 * calls sk->sk_data_ready, which is currently set to sock_def_readable,
	 * which wakes up any waiters.
	 */
	err = sock_queue_rcv_skb(&hsock->sk, skb);
	if (err)
		kfree_skb(skb);

exit:
	sock_put(&hsock->sk);
}

/**
 * This function is called when Hafnium requests that the primary VM wake up a
 * vCPU that belongs to a secondary VM.
 *
 * It wakes up the thread if it's sleeping, or kicks it if it's already running.
 *
 * If vcpu is HF_INVALID_VCPU, it injects a MESSAGE_INT_ID interrupt into a vCPU
 * belonging to the specified VM.
 */
static void hf_handle_wake_up_request(uint32_t vm_id, uint16_t vcpu)
{
	struct hf_vm *vm;

	/* Only secondary VMs, whose ids start at 1, are tracked. */
	if (vm_id < 1 || vm_id > hf_vm_count) {
		pr_warn("Request to wake up non-existent VM id: %u\n", vm_id);
		return;
	}

	vm = &hf_vms[vm_id - 1];
	if (vcpu >= vm->vcpu_count) {
		int64_t ret;

		if (vcpu != HF_INVALID_VCPU) {
			pr_warn("Request to wake up non-existent vCPU: %u.%u\n",
				vm_id, vcpu);
			return;
		}

		/*
		 * TODO: For now we're picking the first vcpu to interrupt, but
		 * we want to be smarter.
		 */
		vcpu = 0;
		ret = hf_inject_interrupt(vm_id, vcpu, MESSAGE_INT_ID);
		if (ret != 1) {
			/* We don't need to wake up the vcpu. */
			return;
		}
	}

	if (hf_vcpu_wake_up(&vm->vcpu[vcpu]) == 0) {
		/*
		 * The task was already running (presumably on a different
		 * physical CPU); interrupt it. This gives Hafnium a chance to
		 * inject any new interrupts.
		 */
		kick_process(vm->vcpu[vcpu].task);
	}
}

/**
 * This is the main loop of each vcpu.
 */
static int hf_vcpu_thread(void *data)
{
	struct hf_vcpu *vcpu = data;
	struct hf_vcpu_run_return ret;

	hrtimer_init(&vcpu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	vcpu->timer.function = &hf_vcpu_timer_expired;

	while (!kthread_should_stop()) {
		/*
		 * We're about to run the vcpu, so we can reset the abort-sleep
		 * flag.
		 */
		atomic_set(&vcpu->abort_sleep, 0);

		/* Call into Hafnium to run the vcpu. */
		ret = hf_vcpu_run(vcpu->vm->id, vcpu->vcpu_index);

		switch (ret.code) {
		/* Yield (forcibly or voluntarily). */
		case HF_VCPU_RUN_YIELD:
			break;

		/* WFI. */
		case HF_VCPU_RUN_WAIT_FOR_INTERRUPT:
			hf_vcpu_sleep(vcpu);
			break;

		/* Wake up another vcpu. */
		case HF_VCPU_RUN_WAKE_UP:
			hf_handle_wake_up_request(ret.wake_up.vm_id,
						  ret.wake_up.vcpu);
			break;

		/* A message is available for this VM. */
		case HF_VCPU_RUN_MESSAGE:
			hf_handle_message(vcpu->vm, page_address(hf_recv_page),
					  ret.message.size);
			hf_mailbox_clear();
			break;

		case HF_VCPU_RUN_SLEEP:
			hrtimer_start(&vcpu->timer, ret.sleep.ns,
				      HRTIMER_MODE_REL);
			hf_vcpu_sleep(vcpu);
			hrtimer_cancel(&vcpu->timer);
			break;
		}
	}

	return 0;
}

/**
 * Converts a pointer to a struct sock into a pointer to a struct hf_sock. It
 * relies on the fact that the first field of hf_sock is a sock.
 */
static struct hf_sock *hsock_from_sk(struct sock *sk)
{
	return (struct hf_sock *)sk;
}

/**
 * This is called when the last reference to the outer socket is released. For
 * example, if it's a user-space socket, when the last file descriptor pointing
 * to this socket is closed.
 *
 * It begins cleaning up resources, though some can only be cleaned up after all
 * references to the underlying socket are released, which is handled by
 * hf_sock_destruct().
 */
static int hf_sock_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct hf_sock *hsock = hsock_from_sk(sk);
	unsigned long flags;

	if (!sk)
		return 0;

	/* Shutdown for both send and receive. */
	lock_sock(sk);
	sk->sk_shutdown |= RCV_SHUTDOWN | SEND_SHUTDOWN;
	sk->sk_state_change(sk);
	release_sock(sk);

	/* Remove from the hash table, so lookups from now on won't find it. */
	spin_lock_irqsave(&hf_local_port_hash_lock, flags);
	hash_del_rcu(&hsock->sk.sk_node);
	spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);

	/*
	 * TODO: When we implement a tx queue, we need to clear it here so that
	 * sk_wmem_alloc will not prevent sk from being freed (sk_free).
	 */

	/*
	 * Wait for in-flight lookups to finish. We need to do this here because
	 * in-flight lookups rely on the reference to the socket we're about to
	 * release.
	 */
	synchronize_rcu();
	sock_put(sk);
	sock->sk = NULL;

	return 0;
}

/**
 * This is called when there are no more references to the socket. It frees all
 * resources that haven't been freed during release.
 */
static void hf_sock_destruct(struct sock *sk)
{
	/*
	 * Clear the receive queue now that the handler cannot add any more
	 * skbs to it.
	 */
	skb_queue_purge(&sk->sk_receive_queue);
}

/**
 * Connects the Hafnium socket to the provided VM and port. After the socket is
 * connected, it can be used to exchange datagrams with the specified peer.
 */
static int hf_sock_connect(struct socket *sock, struct sockaddr *saddr,
			   int len, int connect_flags)
{
	struct sock *sk = sock->sk;
	struct hf_sock *hsock = hsock_from_sk(sk);
	struct hf_vm *vm;
	struct sockaddr_hf *addr;
	int err;
	unsigned long flags;

	/* Basic address validation. */
	if (len < sizeof(struct sockaddr_hf) || saddr->sa_family != AF_HF)
		return -EINVAL;

	addr = (struct sockaddr_hf *)saddr;

	/* Only secondary VMs, whose ids start at 1, are addressable. */
	if (addr->vm_id < 1 || addr->vm_id > hf_vm_count)
		return -ENETUNREACH;

	vm = &hf_vms[addr->vm_id - 1];

	/*
	 * TODO: Once we implement access control in Hafnium, check that the
	 * caller is allowed to contact the specified VM. Return -ECONNREFUSED
	 * if access is denied.
	 */

	/* Take the lock to make sure state doesn't change as we connect. */
	lock_sock(sk);

	/* Only unconnected sockets are allowed to become connected. */
	if (sock->state != SS_UNCONNECTED) {
		err = -EISCONN;
		goto exit;
	}

	hsock->local_port = atomic64_inc_return(&hf_next_port);
	hsock->remote_port = addr->port;
	hsock->peer_vm = vm;

	sock->state = SS_CONNECTED;

	/* Add socket to hash table now that it's fully initialised. */
	spin_lock_irqsave(&hf_local_port_hash_lock, flags);
	hash_add_rcu(hf_local_port_hash, &sk->sk_node, hsock->local_port);
	spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);

	err = 0;
exit:
	release_sock(sk);
	return err;
}

/**
 * Sends the given skb to the appropriate VM by calling Hafnium. It will also
 * trigger the wake up of a recipient VM.
 *
 * Takes ownership of the skb on success.
 */
static int hf_send_skb(struct sk_buff *skb)
{
	unsigned long flags;
	int64_t ret;
	struct hf_sock *hsock = hsock_from_sk(skb->sk);
	struct hf_vm *vm = hsock->peer_vm;

	/*
	 * Call Hafnium under the send lock so that we serialize the use of the
	 * global send buffer.
	 */
	spin_lock_irqsave(&hf_send_lock, flags);
	memcpy(page_address(hf_send_page), skb->data, skb->len);
	ret = hf_mailbox_send(vm->id, skb->len);
	spin_unlock_irqrestore(&hf_send_lock, flags);

	if (ret < 0)
		return -EAGAIN;

	/* Wake some vcpu up to handle the new message. */
	hf_handle_wake_up_request(vm->id, ret);

	kfree_skb(skb);

	return 0;
}
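
/*
 * Note on the design above: there is a single send page shared with the
 * hypervisor for the whole primary VM, so every transmission serializes on
 * hf_send_lock, and a send that Hafnium rejects surfaces to callers as
 * -EAGAIN instead of being queued (see the TODO in hf_sock_sendmsg() about a
 * per-VM tx queue).
 */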

/**
 * Determines if the given socket is in the connected state. It acquires and
 * releases the socket lock.
 */
static bool hf_sock_is_connected(struct socket *sock)
{
	bool ret;

	lock_sock(sock->sk);
	ret = sock->state == SS_CONNECTED;
	release_sock(sock->sk);

	return ret;
}

/**
 * Sends a message to the VM and port the socket is connected to. All variants
 * of write/send/sendto/sendmsg eventually call this function.
 */
static int hf_sock_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int err;
	struct hf_msg_hdr *hdr;
	struct hf_sock *hsock = hsock_from_sk(sk);

	/* Check length. */
	if (len > HF_MAILBOX_SIZE - sizeof(struct hf_msg_hdr))
		return -EMSGSIZE;

	/* We don't allow the destination address to be specified. */
	if (m->msg_namelen > 0)
		return -EISCONN;

	/* We don't support out-of-band messages. */
	if (m->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	/*
	 * Ensure that the socket is connected. We don't need to hold the socket
	 * lock (acquired and released by hf_sock_is_connected) for the
	 * remainder of the function because the fields we care about are
	 * immutable once the state is SS_CONNECTED.
	 */
	if (!hf_sock_is_connected(sock))
		return -ENOTCONN;

	/*
	 * Allocate an skb for this write. If there isn't enough room in the
	 * socket's send buffer (sk_wmem_alloc >= sk_sndbuf), this will block
	 * (if it's a blocking call). On success, it increments sk_wmem_alloc
	 * and sets up the skb such that sk_wmem_alloc gets decremented when
	 * the skb is freed (sock_wfree gets called).
	 */
	skb = sock_alloc_send_skb(sk, len + sizeof(struct hf_msg_hdr),
				  m->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	/* Reserve room for the header and initialise it. */
	skb_reserve(skb, sizeof(struct hf_msg_hdr));
	hdr = skb_push(skb, sizeof(struct hf_msg_hdr));
	hdr->src_port = hsock->local_port;
	hdr->dst_port = hsock->remote_port;

	/* Allocate area for the contents, then copy into skb. */
	if (!copy_from_iter_full(skb_put(skb, len), len, &m->msg_iter)) {
		err = -EFAULT;
		goto err_cleanup;
	}

	/*
	 * TODO: We currently do this inline, but when we have support for
	 * readiness notification from Hafnium, we must add this to a per-VM tx
	 * queue that can make progress when the VM becomes writable. This will
	 * fix send buffering and poll readiness notification.
	 */
	err = hf_send_skb(skb);
	if (err)
		goto err_cleanup;

	return 0;

err_cleanup:
	kfree_skb(skb);
	return err;
}

/**
 * Receives a message originating from the VM and port the socket is connected
 * to. All variants of read/recv/recvfrom/recvmsg eventually call this
 * function.
 */
static int hf_sock_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
			   int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int err;
	size_t copy_len;

	if (!hf_sock_is_connected(sock))
		return -ENOTCONN;

	/* Grab the next skb from the receive queue. */
	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	/* Make sure we don't copy more than what fits in the output buffer. */
	copy_len = skb->len;
	if (copy_len > len) {
		copy_len = len;
		m->msg_flags |= MSG_TRUNC;
	}

	/* Make sure we don't overflow the return value type. */
	if (copy_len > INT_MAX) {
		copy_len = INT_MAX;
		m->msg_flags |= MSG_TRUNC;
	}

	/* Copy skb to the output iterator, then free it. */
	err = skb_copy_datagram_msg(skb, 0, m, copy_len);
	skb_free_datagram(sk, skb);
	if (err)
		return err;

	return copy_len;
}

/**
 * This function is called when a Hafnium socket is created. It initialises all
 * state such that the caller will be able to connect the socket and then send
 * and receive messages through it.
 */
static int hf_sock_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	static const struct proto_ops ops = {
		.family = PF_HF,
		.owner = THIS_MODULE,
		.release = hf_sock_release,
		.bind = sock_no_bind,
		.connect = hf_sock_connect,
		.socketpair = sock_no_socketpair,
		.accept = sock_no_accept,
		.ioctl = sock_no_ioctl,
		.listen = sock_no_listen,
		.shutdown = sock_no_shutdown,
		.setsockopt = sock_no_setsockopt,
		.getsockopt = sock_no_getsockopt,
		.sendmsg = hf_sock_sendmsg,
		.recvmsg = hf_sock_recvmsg,
		.mmap = sock_no_mmap,
		.sendpage = sock_no_sendpage,
		.poll = datagram_poll,
	};
	struct sock *sk;

	if (sock->type != SOCK_DGRAM)
		return -ESOCKTNOSUPPORT;

	if (protocol != 0)
		return -EPROTONOSUPPORT;

	/*
	 * For now we only allow callers with sys admin capability to create
	 * Hafnium sockets.
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Allocate and initialise socket. */
	sk = sk_alloc(net, PF_HF, GFP_KERNEL, &hf_sock_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	sk->sk_destruct = hf_sock_destruct;
	sock->ops = &ops;
	sock->state = SS_UNCONNECTED;

	return 0;
}

/**
 * Frees all resources, including threads, associated with the Hafnium driver.
 */
static void hf_free_resources(void)
{
	uint32_t i, j;

	/*
	 * First stop all worker threads. We need to do this before freeing
	 * resources because workers may reference each other, so it is only
	 * safe to free resources after they have all stopped.
	 */
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		for (j = 0; j < vm->vcpu_count; j++)
			kthread_stop(vm->vcpu[j].task);
	}

	/* Free resources. */
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		for (j = 0; j < vm->vcpu_count; j++)
			put_task_struct(vm->vcpu[j].task);
		kfree(vm->vcpu);
	}

	kfree(hf_vms);
}

/**
 * Initializes the Hafnium driver by creating a thread for each vCPU of each
 * virtual machine.
 */
static int __init hf_init(void)
{
	static const struct net_proto_family proto_family = {
		.family = PF_HF,
		.create = hf_sock_create,
		.owner = THIS_MODULE,
	};
	int64_t ret;
	uint32_t i, j;
	uint32_t total_vm_count;
	uint32_t total_vcpu_count;

	/* Allocate a page for send and receive buffers. */
	hf_send_page = alloc_page(GFP_KERNEL);
	if (!hf_send_page) {
		pr_err("Unable to allocate send buffer\n");
		return -ENOMEM;
	}

	hf_recv_page = alloc_page(GFP_KERNEL);
	if (!hf_recv_page) {
		__free_page(hf_send_page);
		pr_err("Unable to allocate receive buffer\n");
		return -ENOMEM;
	}

	/*
	 * Configure both addresses. Once configured, we cannot free these pages
	 * because the hypervisor will use them, even if the module is
	 * unloaded.
	 */
	ret = hf_vm_configure(page_to_phys(hf_send_page),
			      page_to_phys(hf_recv_page));
	if (ret) {
		__free_page(hf_send_page);
		__free_page(hf_recv_page);
		/*
		 * TODO: We may want to grab this information from the
		 * hypervisor and go from there.
		 */
		pr_err("Unable to configure VM\n");
		return -EIO;
	}

	/* Get the number of VMs. */
	ret = hf_vm_get_count();
	if (ret < 0) {
		pr_err("Unable to retrieve number of VMs: %lld\n", ret);
		return -EIO;
	}

	/* Confirm the maximum number of VMs looks sane. */
	BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VMS < 1);
	BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VMS > U16_MAX);

	/* Validate the number of VMs. There must at least be the primary. */
	if (ret < 1 || ret > CONFIG_HAFNIUM_MAX_VMS) {
		pr_err("Number of VMs is out of range: %lld\n", ret);
		return -EDQUOT;
	}

	/* Only track the secondary VMs. */
	total_vm_count = ret - 1;
	hf_vms = kmalloc_array(total_vm_count, sizeof(struct hf_vm),
			       GFP_KERNEL);
	if (!hf_vms)
		return -ENOMEM;

	/* Initialize each VM. */
	total_vcpu_count = 0;
	for (i = 0; i < total_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		/* Adjust the ID as only the secondaries are tracked. */
		vm->id = i + 1;

		ret = hf_vcpu_get_count(vm->id);
		if (ret < 0) {
			pr_err("HF_VCPU_GET_COUNT failed for vm=%u: %lld\n",
			       vm->id, ret);
			ret = -EIO;
			goto fail_with_cleanup;
		}

		/* Avoid overflowing the vcpu count. */
		if (ret > (U32_MAX - total_vcpu_count)) {
			pr_err("Too many vcpus: %u\n", total_vcpu_count);
			ret = -EDQUOT;
			goto fail_with_cleanup;
		}

		/* Confirm the maximum number of VCPUs looks sane. */
		BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VCPUS < 1);
		BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VCPUS > U16_MAX);

		/* Enforce the limit on vcpus. */
		total_vcpu_count += ret;
		if (total_vcpu_count > CONFIG_HAFNIUM_MAX_VCPUS) {
			pr_err("Too many vcpus: %u\n", total_vcpu_count);
			ret = -EDQUOT;
			goto fail_with_cleanup;
		}

		vm->vcpu_count = ret;
		vm->vcpu = kmalloc_array(vm->vcpu_count, sizeof(struct hf_vcpu),
					 GFP_KERNEL);
		if (!vm->vcpu) {
			pr_err("No memory for %u vcpus for vm %u\n",
			       vm->vcpu_count, vm->id);
			ret = -ENOMEM;
			goto fail_with_cleanup;
		}

		/* Update the number of initialized VMs. */
		hf_vm_count = i + 1;

		/* Create a kernel thread for each vcpu. */
		for (j = 0; j < vm->vcpu_count; j++) {
			struct hf_vcpu *vcpu = &vm->vcpu[j];

			vcpu->task = kthread_create(hf_vcpu_thread, vcpu,
						    "vcpu_thread_%u_%u",
						    vm->id, j);
			if (IS_ERR(vcpu->task)) {
				pr_err("Error creating task (vm=%u,vcpu=%u): %ld\n",
				       vm->id, j, PTR_ERR(vcpu->task));
				vm->vcpu_count = j;
				ret = PTR_ERR(vcpu->task);
				goto fail_with_cleanup;
			}

			get_task_struct(vcpu->task);
			vcpu->vm = vm;
			vcpu->vcpu_index = j;
			atomic_set(&vcpu->abort_sleep, 0);
		}
	}

	/* Register protocol and socket family. */
	ret = proto_register(&hf_sock_proto, 0);
	if (ret) {
		pr_err("Unable to register protocol: %lld\n", ret);
		goto fail_with_cleanup;
	}

	ret = sock_register(&proto_family);
	if (ret) {
		pr_err("Unable to register Hafnium's socket family: %lld\n",
		       ret);
		goto fail_unregister_proto;
	}

	/*
	 * Start running threads now that all is initialized.
	 *
	 * Any failures from this point on must also unregister the socket
	 * family with a call to sock_unregister().
	 */
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		for (j = 0; j < vm->vcpu_count; j++)
			wake_up_process(vm->vcpu[j].task);
	}

	/* Dump vm/vcpu count info. */
	pr_info("Hafnium successfully loaded with %u VMs:\n", hf_vm_count);
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		pr_info("\tVM %u: %u vCPUS\n", vm->id, vm->vcpu_count);
	}

	return 0;

fail_unregister_proto:
	proto_unregister(&hf_sock_proto);
fail_with_cleanup:
	hf_free_resources();
	return ret;
}

/**
 * Frees up all resources used by the Hafnium driver in preparation for
 * unloading it.
 */
static void __exit hf_exit(void)
{
	pr_info("Preparing to unload Hafnium\n");
	sock_unregister(PF_HF);
	proto_unregister(&hf_sock_proto);
	hf_free_resources();
	pr_info("Hafnium ready to unload\n");
}

MODULE_LICENSE("GPL v2");

module_init(hf_init);
module_exit(hf_exit);