/*
 * Copyright 2018 Google LLC
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
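
/*
 * Hafnium driver: creates a kernel thread for each vcpu of each secondary
 * VM, and exposes a datagram socket family (PF_HF) through which the
 * primary VM exchanges messages with secondary VMs via the hypervisor
 * mailbox.
 */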

#include <linux/hrtimer.h>
#include <linux/atomic.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <net/sock.h>

#include <hf/call.h>

/* TODO: Reusing AF_ECONET for now as it's otherwise unused. */
#define AF_HF AF_ECONET
#define PF_HF AF_HF

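/*
 * Interrupt injected into a secondary VM vcpu to signal that a new message is
 * waiting in its mailbox (see hf_handle_wake_up_request()).
 */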
#define MESSAGE_INT_ID 1

#define CONFIG_HAFNIUM_MAX_VMS 16
#define CONFIG_HAFNIUM_MAX_VCPUS 32

struct hf_vcpu {
	struct hf_vm *vm;
	uint32_t vcpu_index;
	struct task_struct *task;
	atomic_t abort_sleep;
	struct hrtimer timer;
};

struct hf_vm {
	uint32_t id;
	uint32_t vcpu_count;
	struct hf_vcpu *vcpu;
};

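/*
 * Header prepended to every datagram: hf_sock_sendmsg() pushes it in front of
 * the payload, and hf_handle_message() uses the ports to route the message to
 * the right socket on the receiving side.
 */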
struct hf_msg_hdr {
	uint64_t src_port;
	uint64_t dst_port;
};

struct hf_sock {
	/* This needs to be the first field. */
	struct sock sk;

	/*
	 * The following fields are immutable after the socket transitions to
	 * the SS_CONNECTED state.
	 */
	uint64_t local_port;
	uint64_t remote_port;
	struct hf_vm *peer_vm;
};

struct sockaddr_hf {
	sa_family_t family;
	uint32_t vm_id;
	uint64_t port;
};

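/*
 * Illustrative user-space sketch (not part of the driver): it assumes user
 * space mirrors struct sockaddr_hf and the AF_HF/AF_ECONET aliasing above;
 * the values used here are examples only.
 *
 *	int fd = socket(AF_HF, SOCK_DGRAM, 0);	// requires CAP_SYS_ADMIN
 *	struct sockaddr_hf addr = {
 *		.family = AF_HF,
 *		.vm_id = 1,	// first secondary VM
 *		.port = 10,	// peer port to talk to
 *	};
 *
 *	// connect() picks a fresh local port and pairs the socket with the
 *	// given VM/port; send()/recv() then exchange datagrams with it.
 *	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 *	send(fd, buf, len, 0);
 *	recv(fd, buf, sizeof(buf), 0);
 */
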
static struct proto hf_sock_proto = {
	.name = "hafnium",
	.owner = THIS_MODULE,
	.obj_size = sizeof(struct hf_sock),
};

static struct hf_vm *hf_vms;
static uint32_t hf_vm_count;
static struct page *hf_send_page;
static struct page *hf_recv_page;
static atomic64_t hf_next_port = ATOMIC64_INIT(0);
static DEFINE_SPINLOCK(hf_send_lock);
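/*
 * Connected sockets hashed by local port (7 bits, so 128 buckets);
 * hf_handle_message() resolves bucket collisions by also matching the peer VM
 * and remote port.
 */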
static DEFINE_HASHTABLE(hf_local_port_hash, 7);
static DEFINE_SPINLOCK(hf_local_port_hash_lock);

/**
 * Wakes up the kernel thread responsible for running the given vcpu.
 *
 * Returns 0 if the thread was already running, 1 otherwise.
 */
static int hf_vcpu_wake_up(struct hf_vcpu *vcpu)
{
	/* Set a flag indicating that the thread should not go to sleep. */
	atomic_set(&vcpu->abort_sleep, 1);

	/* Set the thread to running state. */
	return wake_up_process(vcpu->task);
}

/**
 * Puts the current thread to sleep. The current thread must be responsible for
 * running the given vcpu.
 *
 * Going to sleep will fail if hf_vcpu_wake_up() or kthread_stop() was called on
 * this vcpu/thread since the last time it [re]started running.
 */
static void hf_vcpu_sleep(struct hf_vcpu *vcpu)
{
	int abort;

	set_current_state(TASK_INTERRUPTIBLE);

	/* Check the sleep-abort flag after making the thread interruptible. */
	abort = atomic_read(&vcpu->abort_sleep);
	if (!abort && !kthread_should_stop())
		schedule();

	/* Set state back to running on the way out. */
	set_current_state(TASK_RUNNING);
}

/**
 * Wakes up the thread associated with the vcpu that owns the given timer. This
 * is called when the timer the thread is waiting on expires.
 */
static enum hrtimer_restart hf_vcpu_timer_expired(struct hrtimer *timer)
{
	struct hf_vcpu *vcpu = container_of(timer, struct hf_vcpu, timer);

	/* TODO: Inject interrupt. */
	hf_vcpu_wake_up(vcpu);
	return HRTIMER_NORESTART;
}

/**
 * Handles a message delivered to this VM by validating that it's well-formed
 * and then queueing it for delivery to the appropriate socket.
 */
static void hf_handle_message(struct hf_vm *sender, const void *ptr, size_t len)
{
	struct hf_sock *hsock;
	const struct hf_msg_hdr *hdr = ptr;
	struct sk_buff *skb;
	int err;

	/* Ignore messages that are too small to hold a header. */
	if (len < sizeof(struct hf_msg_hdr))
		return;

	len -= sizeof(struct hf_msg_hdr);

	/* Go through the colliding sockets. */
	rcu_read_lock();
	hash_for_each_possible_rcu(hf_local_port_hash, hsock, sk.sk_node,
				   hdr->dst_port) {
		if (hsock->peer_vm == sender &&
		    hsock->remote_port == hdr->src_port) {
			sock_hold(&hsock->sk);
			break;
		}
	}
	rcu_read_unlock();

	/* Nothing to do if we couldn't find the target. */
	if (!hsock)
		return;

	/*
	 * TODO: From this point on, there are two failure paths: when we
	 * create the skb below, and when we enqueue it to the socket. What
	 * should we do if they fail? Ideally we would have some form of flow
	 * control to prevent message loss, but how to do it efficiently?
	 *
	 * One option is to have a pre-allocated message that indicates to the
	 * sender that a message was dropped. This way we guarantee that the
	 * sender will be aware of the loss and should back off.
	 */

	/* Create the skb. */
	skb = alloc_skb(len, GFP_KERNEL);
	if (!skb)
		goto exit;

	memcpy(skb_put(skb, len), hdr + 1, len);

	/*
	 * Add the skb to the receive queue of the target socket. On success it
	 * calls sk->sk_data_ready, which is currently set to sock_def_readable,
	 * which wakes up any waiters.
	 */
	err = sock_queue_rcv_skb(&hsock->sk, skb);
	if (err)
		kfree_skb(skb);

exit:
	sock_put(&hsock->sk);
}

/**
 * This function is called when Hafnium requests that the primary VM wake up a
 * vCPU that belongs to a secondary VM.
 *
 * It wakes up the thread if it's sleeping, or kicks it if it's already running.
 *
 * If vcpu is HF_INVALID_VCPU, it injects a MESSAGE_INT_ID interrupt into a vCPU
 * belonging to the specified VM.
 */
static void hf_handle_wake_up_request(uint32_t vm_id, uint16_t vcpu)
{
	struct hf_vm *vm;

	if (vm_id > hf_vm_count) {
		pr_warn("Request to wake up non-existent VM id: %u\n", vm_id);
		return;
	}

	vm = &hf_vms[vm_id - 1];
	if (vcpu >= vm->vcpu_count) {
		int64_t ret;

		if (vcpu != HF_INVALID_VCPU) {
			pr_warn("Request to wake up non-existent vCPU: %u.%u\n",
				vm_id, vcpu);
			return;
		}

		/*
		 * TODO: For now we're picking the first vcpu to interrupt, but
		 * we want to be smarter.
		 */
		vcpu = 0;
		ret = hf_inject_interrupt(vm_id, vcpu, MESSAGE_INT_ID);
		if (ret != 1) {
			/* We don't need to wake up the vcpu. */
			return;
		}
	}

	if (hf_vcpu_wake_up(&vm->vcpu[vcpu]) == 0) {
		/*
		 * The task was already running (presumably on a different
		 * physical CPU); interrupt it. This gives Hafnium a chance to
		 * inject any new interrupts.
		 */
		kick_process(vm->vcpu[vcpu].task);
	}
}

/**
 * This is the main loop of each vcpu.
 */
static int hf_vcpu_thread(void *data)
{
	struct hf_vcpu *vcpu = data;
	struct hf_vcpu_run_return ret;

	hrtimer_init(&vcpu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	vcpu->timer.function = &hf_vcpu_timer_expired;

	while (!kthread_should_stop()) {
		/*
		 * We're about to run the vcpu, so we can reset the abort-sleep
		 * flag.
		 */
		atomic_set(&vcpu->abort_sleep, 0);

		/* Call into Hafnium to run vcpu. */
		ret = hf_vcpu_run(vcpu->vm->id, vcpu->vcpu_index);

		switch (ret.code) {
		/* Yield (forcibly or voluntarily). */
		case HF_VCPU_RUN_YIELD:
			break;

		/* WFI. */
		case HF_VCPU_RUN_WAIT_FOR_INTERRUPT:
			hf_vcpu_sleep(vcpu);
			break;

		/* Wake up another vcpu. */
		case HF_VCPU_RUN_WAKE_UP:
			hf_handle_wake_up_request(ret.wake_up.vm_id,
						  ret.wake_up.vcpu);
			break;

		/* Response available. */
		case HF_VCPU_RUN_MESSAGE:
			hf_handle_message(vcpu->vm, page_address(hf_recv_page),
					  ret.message.size);
			hf_mailbox_clear();
			break;

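		/*
		 * The vcpu asked to sleep: arm the timer (its expiry handler
		 * wakes this thread), sleep until it fires or something else
		 * wakes us, then cancel the timer on the way out.
		 */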
		case HF_VCPU_RUN_SLEEP:
			hrtimer_start(&vcpu->timer, ret.sleep.ns,
				      HRTIMER_MODE_REL);
			hf_vcpu_sleep(vcpu);
			hrtimer_cancel(&vcpu->timer);
			break;
		}
	}

	return 0;
}

/**
 * Converts a pointer to a struct sock into a pointer to a struct hf_sock. It
 * relies on the fact that the first field of hf_sock is a sock.
 */
static struct hf_sock *hsock_from_sk(struct sock *sk)
{
	return (struct hf_sock *)sk;
}

/**
 * This is called when the last reference to the outer socket is released. For
 * example, if it's a user-space socket, when the last file descriptor pointing
 * to this socket is closed.
 *
 * It begins cleaning up resources, though some can only be cleaned up after all
 * references to the underlying socket are released, which is handled by
 * hf_sock_destruct().
 */
static int hf_sock_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct hf_sock *hsock = hsock_from_sk(sk);
	unsigned long flags;

	if (!sk)
		return 0;

	/* Shutdown for both send and receive. */
	lock_sock(sk);
	sk->sk_shutdown |= RCV_SHUTDOWN | SEND_SHUTDOWN;
	sk->sk_state_change(sk);
	release_sock(sk);

	/* Remove from the hash table, so lookups from now on won't find it. */
	spin_lock_irqsave(&hf_local_port_hash_lock, flags);
	hash_del_rcu(&hsock->sk.sk_node);
	spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);

	/*
	 * TODO: When we implement a tx queue, we need to clear it here so that
	 * sk_wmem_alloc will not prevent sk from being freed (sk_free).
	 */

	/*
	 * Wait for in-flight lookups to finish. We need to do this here because
	 * in-flight lookups rely on the reference to the socket we're about to
	 * release.
	 */
	synchronize_rcu();
	sock_put(sk);
	sock->sk = NULL;

	return 0;
}

/**
 * This is called when there are no more references to the socket. It frees all
 * resources that haven't been freed during release.
 */
static void hf_sock_destruct(struct sock *sk)
{
	/*
	 * Clear the receive queue now that the handler cannot add any more
	 * skbs to it.
	 */
	skb_queue_purge(&sk->sk_receive_queue);
}

/**
 * Connects the Hafnium socket to the provided VM and port. After the socket is
 * connected, it can be used to exchange datagrams with the specified peer.
 */
static int hf_sock_connect(struct socket *sock, struct sockaddr *saddr,
			   int len, int connect_flags)
{
	struct sock *sk = sock->sk;
	struct hf_sock *hsock = hsock_from_sk(sk);
	struct hf_vm *vm;
	struct sockaddr_hf *addr;
	int err;
	unsigned long flags;

	/* Basic address validation. */
	if (len < sizeof(struct sockaddr_hf) || saddr->sa_family != AF_HF)
		return -EINVAL;

	addr = (struct sockaddr_hf *)saddr;

	/* Reject vm_id 0: only secondary VMs (IDs 1..hf_vm_count) are tracked. */
	if (addr->vm_id < 1 || addr->vm_id > hf_vm_count)
		return -ENETUNREACH;

	vm = &hf_vms[addr->vm_id - 1];

	/*
	 * TODO: Once we implement access control in Hafnium, check that the
	 * caller is allowed to contact the specified VM. Return -ECONNREFUSED
	 * if access is denied.
	 */

	/* Take lock to make sure state doesn't change as we connect. */
	lock_sock(sk);

	/* Only unconnected sockets are allowed to become connected. */
	if (sock->state != SS_UNCONNECTED) {
		err = -EISCONN;
		goto exit;
	}

	hsock->local_port = atomic64_inc_return(&hf_next_port);
	hsock->remote_port = addr->port;
	hsock->peer_vm = vm;

	sock->state = SS_CONNECTED;

	/* Add socket to hash table now that it's fully initialised. */
	spin_lock_irqsave(&hf_local_port_hash_lock, flags);
	hash_add_rcu(hf_local_port_hash, &sk->sk_node, hsock->local_port);
	spin_unlock_irqrestore(&hf_local_port_hash_lock, flags);

	err = 0;
exit:
	release_sock(sk);
	return err;
}

/**
 * Sends the given skb to the appropriate VM by calling Hafnium. It will also
 * trigger the wake up of a recipient VM.
 *
 * Takes ownership of the skb on success.
 */
static int hf_send_skb(struct sk_buff *skb)
{
	unsigned long flags;
	int64_t ret;
	struct hf_sock *hsock = hsock_from_sk(skb->sk);
	struct hf_vm *vm = hsock->peer_vm;

	/*
	 * Call Hafnium under the send lock so that we serialize the use of the
	 * global send buffer.
	 */
	spin_lock_irqsave(&hf_send_lock, flags);
	memcpy(page_address(hf_send_page), skb->data, skb->len);
	ret = hf_mailbox_send(vm->id, skb->len);
	spin_unlock_irqrestore(&hf_send_lock, flags);

	if (ret < 0)
		return -EAGAIN;

	/* Wake some vcpu up to handle the new message. */
	hf_handle_wake_up_request(vm->id, ret);

	kfree_skb(skb);

	return 0;
}

/**
 * Determines if the given socket is in the connected state. It acquires and
 * releases the socket lock.
 */
static bool hf_sock_is_connected(struct socket *sock)
{
	bool ret;

	lock_sock(sock->sk);
	ret = sock->state == SS_CONNECTED;
	release_sock(sock->sk);

	return ret;
}

/**
 * Sends a message to the VM & port the socket is connected to. All variants
 * of write/send/sendto/sendmsg eventually call this function.
 */
static int hf_sock_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int err;
	struct hf_msg_hdr *hdr;
	struct hf_sock *hsock = hsock_from_sk(sk);

	/* Check length. */
	if (len > HF_MAILBOX_SIZE - sizeof(struct hf_msg_hdr))
		return -EMSGSIZE;

	/* We don't allow the destination address to be specified. */
	if (m->msg_namelen > 0)
		return -EISCONN;

	/* We don't support out of band messages. */
	if (m->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	/*
	 * Ensure that the socket is connected. We don't need to hold the socket
	 * lock (acquired and released by hf_sock_is_connected) for the
	 * remainder of the function because the fields we care about are
	 * immutable once the state is SS_CONNECTED.
	 */
	if (!hf_sock_is_connected(sock))
		return -ENOTCONN;

	/*
	 * Allocate an skb for this write. If there isn't enough room in the
	 * socket's send buffer (sk_wmem_alloc >= sk_sndbuf), this will block
	 * (if it's a blocking call). On success, it increments sk_wmem_alloc
	 * and sets up the skb such that sk_wmem_alloc gets decremented when
	 * the skb is freed (sock_wfree gets called).
	 */
	skb = sock_alloc_send_skb(sk, len + sizeof(struct hf_msg_hdr),
				  m->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	/* Reserve room for the header and initialise it. */
	skb_reserve(skb, sizeof(struct hf_msg_hdr));
	hdr = skb_push(skb, sizeof(struct hf_msg_hdr));
	hdr->src_port = hsock->local_port;
	hdr->dst_port = hsock->remote_port;

	/* Allocate area for the contents, then copy into skb. */
	if (!copy_from_iter_full(skb_put(skb, len), len, &m->msg_iter)) {
		err = -EFAULT;
		goto err_cleanup;
	}

	/*
	 * TODO: We currently do this inline, but when we have support for
	 * readiness notification from Hafnium, we must add this to a per-VM tx
	 * queue that can make progress when the VM becomes writable. This will
	 * fix send buffering and poll readiness notification.
	 */
	err = hf_send_skb(skb);
	if (err)
		goto err_cleanup;

	return 0;

err_cleanup:
	kfree_skb(skb);
	return err;
}

/**
 * Receives a message originating from the VM & port the socket is connected
 * to. All variants of read/recv/recvfrom/recvmsg eventually call this
 * function.
 */
static int hf_sock_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
			   int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int err;
	size_t copy_len;

	if (!hf_sock_is_connected(sock))
		return -ENOTCONN;

	/* Grab the next skb from the receive queue. */
	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
	if (!skb)
		return err;

	/* Make sure we don't copy more than what fits in the output buffer. */
	copy_len = skb->len;
	if (copy_len > len) {
		copy_len = len;
		m->msg_flags |= MSG_TRUNC;
	}

	/* Make sure we don't overflow the return value type. */
	if (copy_len > INT_MAX) {
		copy_len = INT_MAX;
		m->msg_flags |= MSG_TRUNC;
	}

	/* Copy skb to output iterator, then free it. */
	err = skb_copy_datagram_msg(skb, 0, m, copy_len);
	skb_free_datagram(sk, skb);
	if (err)
		return err;

	return copy_len;
}

/**
 * This function is called when a Hafnium socket is created. It initialises all
 * state such that the caller will be able to connect the socket and then send
 * and receive messages through it.
 */
static int hf_sock_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	static const struct proto_ops ops = {
		.family = PF_HF,
		.owner = THIS_MODULE,
		.release = hf_sock_release,
		.bind = sock_no_bind,
		.connect = hf_sock_connect,
		.socketpair = sock_no_socketpair,
		.accept = sock_no_accept,
		.ioctl = sock_no_ioctl,
		.listen = sock_no_listen,
		.shutdown = sock_no_shutdown,
		.setsockopt = sock_no_setsockopt,
		.getsockopt = sock_no_getsockopt,
		.sendmsg = hf_sock_sendmsg,
		.recvmsg = hf_sock_recvmsg,
		.mmap = sock_no_mmap,
		.sendpage = sock_no_sendpage,
		.poll = datagram_poll,
	};
	struct sock *sk;

	if (sock->type != SOCK_DGRAM)
		return -ESOCKTNOSUPPORT;

	if (protocol != 0)
		return -EPROTONOSUPPORT;

	/*
	 * For now we only allow callers with sys admin capability to create
	 * Hafnium sockets.
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Allocate and initialise socket. */
	sk = sk_alloc(net, PF_HF, GFP_KERNEL, &hf_sock_proto, kern);
	if (!sk)
		return -ENOMEM;

	sock_init_data(sock, sk);

	sk->sk_destruct = hf_sock_destruct;
	sock->ops = &ops;
	sock->state = SS_UNCONNECTED;

	return 0;
}

/**
 * Frees all resources, including threads, associated with the Hafnium driver.
 */
static void hf_free_resources(void)
{
	uint32_t i, j;

	/*
	 * First stop all worker threads. We need to do this before freeing
	 * resources because workers may reference each other, so it is only
	 * safe to free resources after they have all stopped.
	 */
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		for (j = 0; j < vm->vcpu_count; j++)
			kthread_stop(vm->vcpu[j].task);
	}

	/* Free resources. */
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		for (j = 0; j < vm->vcpu_count; j++)
			put_task_struct(vm->vcpu[j].task);
		kfree(vm->vcpu);
	}

	kfree(hf_vms);
}

/**
 * Initializes the Hafnium driver by creating a thread for each vCPU of each
 * virtual machine.
 */
static int __init hf_init(void)
{
	static const struct net_proto_family proto_family = {
		.family = PF_HF,
		.create = hf_sock_create,
		.owner = THIS_MODULE,
	};
	int64_t ret;
	uint32_t i, j;
	uint32_t total_vm_count;
	uint32_t total_vcpu_count;

	/* Allocate a page for send and receive buffers. */
	hf_send_page = alloc_page(GFP_KERNEL);
	if (!hf_send_page) {
		pr_err("Unable to allocate send buffer\n");
		return -ENOMEM;
	}

	hf_recv_page = alloc_page(GFP_KERNEL);
	if (!hf_recv_page) {
		__free_page(hf_send_page);
		pr_err("Unable to allocate receive buffer\n");
		return -ENOMEM;
	}

	/*
	 * Configure both addresses. Once configured, we cannot free these pages
	 * because the hypervisor will use them, even if the module is
	 * unloaded.
	 */
	ret = hf_vm_configure(page_to_phys(hf_send_page),
			      page_to_phys(hf_recv_page));
	if (ret) {
		__free_page(hf_send_page);
		__free_page(hf_recv_page);
		/*
		 * TODO: We may want to grab this information from the
		 * hypervisor and go from there.
		 */
		pr_err("Unable to configure VM\n");
		return -EIO;
	}

	/* Get the number of VMs. */
	ret = hf_vm_get_count();
	if (ret < 0) {
		pr_err("Unable to retrieve number of VMs: %lld\n", ret);
		return -EIO;
	}

	/* Confirm the maximum number of VMs looks sane. */
	BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VMS < 1);
	BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VMS > U16_MAX);

	/* Validate the number of VMs. There must at least be the primary. */
	if (ret < 1 || ret > CONFIG_HAFNIUM_MAX_VMS) {
		pr_err("Number of VMs is out of range: %lld\n", ret);
		return -EDQUOT;
	}

	/* Only track the secondary VMs. */
	total_vm_count = ret - 1;
	hf_vms = kmalloc_array(total_vm_count, sizeof(struct hf_vm),
			       GFP_KERNEL);
	if (!hf_vms)
		return -ENOMEM;

	/* Initialize each VM. */
	total_vcpu_count = 0;
	for (i = 0; i < total_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		/* Adjust the ID as only the secondaries are tracked. */
		vm->id = i + 1;

		ret = hf_vcpu_get_count(vm->id);
		if (ret < 0) {
			pr_err("HF_VCPU_GET_COUNT failed for vm=%u: %lld\n",
			       vm->id, ret);
			ret = -EIO;
			goto fail_with_cleanup;
		}

		/* Avoid overflowing the vcpu count. */
		if (ret > (U32_MAX - total_vcpu_count)) {
			pr_err("Too many vcpus: %u\n", total_vcpu_count);
			ret = -EDQUOT;
			goto fail_with_cleanup;
		}

		/* Confirm the maximum number of VCPUs looks sane. */
		BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VCPUS < 1);
		BUILD_BUG_ON(CONFIG_HAFNIUM_MAX_VCPUS > U16_MAX);

		/* Enforce the limit on vcpus. */
		total_vcpu_count += ret;
		if (total_vcpu_count > CONFIG_HAFNIUM_MAX_VCPUS) {
			pr_err("Too many vcpus: %u\n", total_vcpu_count);
			ret = -EDQUOT;
			goto fail_with_cleanup;
		}

		vm->vcpu_count = ret;
		vm->vcpu = kmalloc_array(vm->vcpu_count, sizeof(struct hf_vcpu),
					 GFP_KERNEL);
		if (!vm->vcpu) {
			pr_err("No memory for %u vcpus for vm %u\n",
			       vm->vcpu_count, vm->id);
			ret = -ENOMEM;
			goto fail_with_cleanup;
		}

		/* Update the number of initialized VMs. */
		hf_vm_count = i + 1;

		/* Create a kernel thread for each vcpu. */
		for (j = 0; j < vm->vcpu_count; j++) {
			struct hf_vcpu *vcpu = &vm->vcpu[j];

			vcpu->task = kthread_create(hf_vcpu_thread, vcpu,
						    "vcpu_thread_%u_%u",
						    vm->id, j);
			if (IS_ERR(vcpu->task)) {
				pr_err("Error creating task (vm=%u,vcpu=%u): %ld\n",
				       vm->id, j, PTR_ERR(vcpu->task));
				vm->vcpu_count = j;
				ret = PTR_ERR(vcpu->task);
				goto fail_with_cleanup;
			}

			get_task_struct(vcpu->task);
			vcpu->vm = vm;
			vcpu->vcpu_index = j;
			atomic_set(&vcpu->abort_sleep, 0);
		}
	}

	/* Register protocol and socket family. */
	ret = proto_register(&hf_sock_proto, 0);
	if (ret) {
		pr_err("Unable to register protocol: %lld\n", ret);
		goto fail_with_cleanup;
	}

	ret = sock_register(&proto_family);
	if (ret) {
		pr_err("Unable to register Hafnium's socket family: %lld\n",
		       ret);
		goto fail_unregister_proto;
	}

	/*
	 * Start running threads now that all is initialized.
	 *
	 * Any failures from this point on must also unregister the socket
	 * family with a call to sock_unregister().
	 */
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		for (j = 0; j < vm->vcpu_count; j++)
			wake_up_process(vm->vcpu[j].task);
	}

	/* Dump vm/vcpu count info. */
	pr_info("Hafnium successfully loaded with %u VMs:\n", hf_vm_count);
	for (i = 0; i < hf_vm_count; i++) {
		struct hf_vm *vm = &hf_vms[i];

		pr_info("\tVM %u: %u vCPUS\n", vm->id, vm->vcpu_count);
	}

	return 0;

fail_unregister_proto:
	proto_unregister(&hf_sock_proto);
fail_with_cleanup:
	hf_free_resources();
	return ret;
}

/**
 * Frees up all resources used by the Hafnium driver in preparation for
 * unloading it.
 */
static void __exit hf_exit(void)
{
	pr_info("Preparing to unload Hafnium\n");
	sock_unregister(PF_HF);
	proto_unregister(&hf_sock_proto);
	hf_free_resources();
	pr_info("Hafnium ready to unload\n");
}

MODULE_LICENSE("GPL v2");

module_init(hf_init);
module_exit(hf_exit);