// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * KVM paravirt_ops implementation
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 * Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#include <linux/context_tracking.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <linux/debugfs.h>
#include <linux/nmi.h>
#include <linux/swait.h>
#include <linux/syscore_ops.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
#include <asm/tlb.h>
#include <asm/reboot.h>

static int kvmapf = 1;

static int __init parse_no_kvmapf(char *arg)
{
	kvmapf = 0;
	return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

static int steal_acc = 1;
static int __init parse_no_stealacc(char *arg)
{
	steal_acc = 0;
	return 0;
}

early_param("no-steal-acc", parse_no_stealacc);

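/*
 * Per-CPU state shared with the host. The host writes these regions (and
 * kvm_apic_eoi further down) once their physical addresses are registered
 * via MSRs, so under SEV they are placed in, and later mapped as, decrypted
 * memory; see sev_map_percpu_data().
 */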
static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
static int has_steal_clock = 0;

/*
 * No need for any "IO delay" on KVM
 */
static void kvm_io_delay(void)
{
}

#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

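/*
 * Tasks sleeping on an async page fault are tracked in a small hash table,
 * keyed by the token delivered with the 'page not present' notification.
 */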
struct kvm_task_sleep_node {
	struct hlist_node link;
	struct swait_queue_head wq;
	u32 token;
	int cpu;
	bool halted;
};

static struct kvm_task_sleep_head {
	raw_spinlock_t lock;
	struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
						  u32 token)
{
	struct hlist_node *p;

	hlist_for_each(p, &b->list) {
		struct kvm_task_sleep_node *n =
			hlist_entry(p, typeof(*n), link);
		if (n->token == token)
			return n;
	}

	return NULL;
}

/*
 * @interrupt_kernel: Is this called from a routine which interrupts the kernel
 * 		      (other than user space)?
 */
void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node n, *e;
	DECLARE_SWAITQUEUE(wait);

	rcu_irq_enter();

	raw_spin_lock(&b->lock);
	e = _find_apf_task(b, token);
	if (e) {
		/* dummy entry exist -> wake up was delivered ahead of PF */
		hlist_del(&e->link);
		kfree(e);
		raw_spin_unlock(&b->lock);

		rcu_irq_exit();
		return;
	}

	n.token = token;
	n.cpu = smp_processor_id();
	n.halted = is_idle_task(current) ||
		   (IS_ENABLED(CONFIG_PREEMPT_COUNT)
		    ? preempt_count() > 1 || rcu_preempt_depth()
		    : interrupt_kernel);
	init_swait_queue_head(&n.wq);
	hlist_add_head(&n.link, &b->list);
	raw_spin_unlock(&b->lock);

	for (;;) {
		if (!n.halted)
			prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		if (hlist_unhashed(&n.link))
			break;

		rcu_irq_exit();

		if (!n.halted) {
			local_irq_enable();
			schedule();
			local_irq_disable();
		} else {
			/*
			 * We cannot reschedule. So halt.
			 */
			native_safe_halt();
			local_irq_disable();
		}

		rcu_irq_enter();
	}
	if (!n.halted)
		finish_swait(&n.wq, &wait);

	rcu_irq_exit();
	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);

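/*
 * Wake a single async-PF waiter: kick the CPU with a reschedule IPI if the
 * waiter halted instead of sleeping, otherwise wake it via its swait queue.
 */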
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
	hlist_del_init(&n->link);
	if (n->halted)
		smp_send_reschedule(n->cpu);
	else if (swq_has_sleeper(&n->wq))
		swake_up_one(&n->wq);
}

static void apf_task_wake_all(void)
{
	int i;

	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
		struct hlist_node *p, *next;
		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
		raw_spin_lock(&b->lock);
		hlist_for_each_safe(p, next, &b->list) {
			struct kvm_task_sleep_node *n =
				hlist_entry(p, typeof(*n), link);
			if (n->cpu == smp_processor_id())
				apf_task_wake_one(n);
		}
		raw_spin_unlock(&b->lock);
	}
}

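/*
 * 'Page ready' notification for @token. A token of ~0 means "wake every
 * waiter on this CPU". If no waiter exists yet, a dummy node is queued so
 * that a later kvm_async_pf_task_wait() for the same token returns at once.
 */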
void kvm_async_pf_task_wake(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node *n;

	if (token == ~0) {
		apf_task_wake_all();
		return;
	}

again:
	raw_spin_lock(&b->lock);
	n = _find_apf_task(b, token);
	if (!n) {
		/*
		 * async PF was not yet handled.
		 * Add dummy entry for the token.
		 */
		n = kzalloc(sizeof(*n), GFP_ATOMIC);
		if (!n) {
			/*
			 * Allocation failed! Busy wait while other cpu
			 * handles async PF.
			 */
			raw_spin_unlock(&b->lock);
			cpu_relax();
			goto again;
		}
		n->token = token;
		n->cpu = smp_processor_id();
		init_swait_queue_head(&n->wq);
		hlist_add_head(&n->link, &b->list);
	} else
		apf_task_wake_one(n);
	raw_spin_unlock(&b->lock);
	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

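/* Read and clear the per-CPU async-PF reason written by the host. */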
u32 kvm_read_and_reset_pf_reason(void)
{
	u32 reason = 0;

	if (__this_cpu_read(apf_reason.enabled)) {
		reason = __this_cpu_read(apf_reason.reason);
		__this_cpu_write(apf_reason.reason, 0);
	}

	return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);

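/*
 * #PF entry point while async PF is enabled: dispatch on the reason left by
 * the host, falling back to the regular page fault handler.
 */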
dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	enum ctx_state prev_state;

	switch (kvm_read_and_reset_pf_reason()) {
	default:
		do_page_fault(regs, error_code, address);
		break;
	case KVM_PV_REASON_PAGE_NOT_PRESENT:
		/* page is swapped out by the host. */
		prev_state = exception_enter();
		kvm_async_pf_task_wait((u32)address, !user_mode(regs));
		exception_exit(prev_state);
		break;
	case KVM_PV_REASON_PAGE_READY:
		rcu_irq_enter();
		kvm_async_pf_task_wake((u32)address);
		rcu_irq_exit();
		break;
	}
}
NOKPROBE_SYMBOL(do_async_page_fault);

static void __init paravirt_ops_setup(void)
{
	pv_info.name = "KVM";

	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
		pv_ops.cpu.io_delay = kvm_io_delay;

#ifdef CONFIG_X86_IO_APIC
	no_timer_check = 1;
#endif
}

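/* Tell the host where to account steal time for this CPU. */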
static void kvm_register_steal_time(void)
{
	int cpu = smp_processor_id();
	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

	if (!has_steal_clock)
		return;

	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
	pr_info("kvm-stealtime: cpu %d, msr %llx\n",
		cpu, (unsigned long long) slow_virt_to_phys(st));
}

static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;

static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
	/*
	 * This relies on __test_and_clear_bit to modify the memory
	 * in a way that is atomic with respect to the local CPU.
	 * The hypervisor only accesses this memory from the local CPU so
	 * there's no need for lock or memory barriers.
	 * An optimization barrier is implied in apic write.
	 */
	if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
		return;
	apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
}

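/* Per-CPU enabling of async PF, PV EOI and steal time accounting. */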
static void kvm_guest_cpu_init(void)
{
	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
		u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));

#ifdef CONFIG_PREEMPTION
		pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
		pa |= KVM_ASYNC_PF_ENABLED;

		if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
			pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;

		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
		__this_cpu_write(apf_reason.enabled, 1);
		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
		       smp_processor_id());
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
		unsigned long pa;
		/* Size alignment is implied but just to make it explicit. */
		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
		__this_cpu_write(kvm_apic_eoi, 0);
		pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
			| KVM_MSR_ENABLED;
		wrmsrl(MSR_KVM_PV_EOI_EN, pa);
	}

	if (has_steal_clock)
		kvm_register_steal_time();
}

static void kvm_pv_disable_apf(void)
{
	if (!__this_cpu_read(apf_reason.enabled))
		return;

	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
	__this_cpu_write(apf_reason.enabled, 0);

	printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
	       smp_processor_id());
}

static void kvm_disable_steal_time(void)
{
	if (!has_steal_clock)
		return;

	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}

static void kvm_pv_guest_cpu_reboot(void *unused)
{
	/*
	 * We disable PV EOI before we load a new kernel by kexec,
	 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
	 * New kernel can re-enable when it boots.
	 */
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	kvm_pv_disable_apf();
	kvm_disable_steal_time();
}

static int kvm_pv_reboot_notify(struct notifier_block *nb,
				unsigned long code, void *unused)
{
	if (code == SYS_RESTART)
		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
	return NOTIFY_DONE;
}

static struct notifier_block kvm_pv_reboot_nb = {
	.notifier_call = kvm_pv_reboot_notify,
};

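/*
 * Read the accumulated steal time for @cpu. The version field is odd while
 * the host is updating the record, so retry until a consistent snapshot is
 * observed (seqcount-style).
 */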
static u64 kvm_steal_clock(int cpu)
{
	u64 steal;
	struct kvm_steal_time *src;
	int version;

	src = &per_cpu(steal_time, cpu);
	do {
		version = src->version;
		virt_rmb();
		steal = src->steal;
		virt_rmb();
	} while ((version & 1) || (version != src->version));

	return steal;
}

static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
{
	early_set_memory_decrypted((unsigned long) ptr, size);
}

/*
 * Iterate through all possible CPUs and map the memory region pointed
 * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
 *
 * Note: we iterate through all possible CPUs to ensure that CPUs
 * hotplugged will have their per-cpu variable already mapped as
 * decrypted.
 */
static void __init sev_map_percpu_data(void)
{
	int cpu;

	if (!sev_active())
		return;

	for_each_possible_cpu(cpu) {
		__set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
		__set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
		__set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
	}
}

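/*
 * Tear down the PV features on this CPU for hotplug, suspend or
 * kexec/crash (@shutdown); waking async-PF waiters is skipped on shutdown.
 */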
static void kvm_guest_cpu_offline(bool shutdown)
{
	kvm_disable_steal_time();
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	kvm_pv_disable_apf();
	if (!shutdown)
		apf_task_wake_all();
	kvmclock_disable();
}

static int kvm_cpu_online(unsigned int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	kvm_guest_cpu_init();
	local_irq_restore(flags);
	return 0;
}

#ifdef CONFIG_SMP
#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)

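/*
 * Send IPIs through the KVM_HC_SEND_IPI hypercall. Destination APIC IDs are
 * encoded as a bitmap relative to 'min'; when the IDs span more than one
 * cluster, the hypercall is issued once per cluster.
 */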
static void __send_ipi_mask(const struct cpumask *mask, int vector)
{
	unsigned long flags;
	int cpu, apic_id, icr;
	int min = 0, max = 0;
#ifdef CONFIG_X86_64
	__uint128_t ipi_bitmap = 0;
#else
	u64 ipi_bitmap = 0;
#endif
	long ret;

	if (cpumask_empty(mask))
		return;

	local_irq_save(flags);

	switch (vector) {
	default:
		icr = APIC_DM_FIXED | vector;
		break;
	case NMI_VECTOR:
		icr = APIC_DM_NMI;
		break;
	}

	for_each_cpu(cpu, mask) {
		apic_id = per_cpu(x86_cpu_to_apicid, cpu);
		if (!ipi_bitmap) {
			min = max = apic_id;
		} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
			ipi_bitmap <<= min - apic_id;
			min = apic_id;
		} else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) {
			max = apic_id < max ? max : apic_id;
		} else {
			ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
			WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
			min = max = apic_id;
			ipi_bitmap = 0;
		}
		__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
	}

	if (ipi_bitmap) {
		ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
		WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
	}

	local_irq_restore(flags);
}

static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
{
	__send_ipi_mask(mask, vector);
}

static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
{
	unsigned int this_cpu = smp_processor_id();
	struct cpumask new_mask;
	const struct cpumask *local_mask;

	cpumask_copy(&new_mask, mask);
	cpumask_clear_cpu(this_cpu, &new_mask);
	local_mask = &new_mask;
	__send_ipi_mask(local_mask, vector);
}

/*
 * Set the IPI entry points
 */
static void kvm_setup_pv_ipi(void)
{
	apic->send_IPI_mask = kvm_send_ipi_mask;
	apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
	pr_info("KVM setup pv IPIs\n");
}

static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
{
	int cpu;

	native_send_call_func_ipi(mask);

	/* Make sure other vCPUs get a chance to run if they need to. */
	for_each_cpu(cpu, mask) {
		if (vcpu_is_preempted(cpu)) {
			kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
			break;
		}
	}
}

static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
{
	native_smp_prepare_cpus(max_cpus);
	if (kvm_para_has_hint(KVM_HINTS_REALTIME))
		static_branch_disable(&virt_spin_lock_key);
}

static void __init kvm_smp_prepare_boot_cpu(void)
{
	/*
	 * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
	 * shares the guest physical address with the hypervisor.
	 */
	sev_map_percpu_data();

	kvm_guest_cpu_init();
	native_smp_prepare_boot_cpu();
	kvm_spinlock_init();
}

static int kvm_cpu_down_prepare(unsigned int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	kvm_guest_cpu_offline(false);
	local_irq_restore(flags);
	return 0;
}

#endif

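/*
 * Syscore suspend/resume hooks: these run late in suspend on the last
 * running CPU with interrupts disabled, so the PV features for that CPU are
 * torn down and re-registered directly here.
 */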
static int kvm_suspend(void)
{
	kvm_guest_cpu_offline(false);

	return 0;
}

static void kvm_resume(void)
{
	kvm_cpu_online(raw_smp_processor_id());
}

static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

/*
 * After a PV feature is registered, the host will keep writing to the
 * registered memory location. If the guest happens to shutdown, this memory
 * won't be valid. In cases like kexec, in which you install a new kernel, this
 * means a random memory location will be kept being written.
 */
#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{
	kvm_guest_cpu_offline(true);
	native_machine_crash_shutdown(regs);
}
#endif

static void __init kvm_apf_trap_init(void)
{
	update_intr_gate(X86_TRAP_PF, async_page_fault);
}

static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);

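/*
 * Paravirtualized remote TLB flush: preempted vCPUs are dropped from the
 * flush mask and flagged with KVM_VCPU_FLUSH_TLB so the host flushes their
 * TLB the next time they are scheduled in.
 */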
static void kvm_flush_tlb_others(const struct cpumask *cpumask,
			const struct flush_tlb_info *info)
{
	u8 state;
	int cpu;
	struct kvm_steal_time *src;
	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);

	cpumask_copy(flushmask, cpumask);
	/*
	 * We have to call flush only on online vCPUs. And
	 * queue flush_on_enter for pre-empted vCPUs
	 */
	for_each_cpu(cpu, flushmask) {
		src = &per_cpu(steal_time, cpu);
		state = READ_ONCE(src->preempted);
		if ((state & KVM_VCPU_PREEMPTED)) {
			if (try_cmpxchg(&src->preempted, &state,
					state | KVM_VCPU_FLUSH_TLB))
				__cpumask_clear_cpu(cpu, flushmask);
		}
	}

	native_flush_tlb_others(flushmask, info);
}

static void __init kvm_guest_init(void)
{
	int i;

	paravirt_ops_setup();
	register_reboot_notifier(&kvm_pv_reboot_nb);
	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
		raw_spin_lock_init(&async_pf_sleepers[i].lock);
	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
		x86_init.irqs.trap_init = kvm_apf_trap_init;

	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		has_steal_clock = 1;
		pv_ops.time.steal_clock = kvm_steal_clock;
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
		pv_ops.mmu.tlb_remove_table = tlb_remove_table;
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		apic_set_eoi_write(kvm_guest_apic_eoi_write);

#ifdef CONFIG_SMP
	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
	if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
		pr_info("KVM setup pv sched yield\n");
	}
	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
				      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
		pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
#else
	sev_map_percpu_data();
	kvm_guest_cpu_init();
#endif

#ifdef CONFIG_KEXEC_CORE
	machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif

	register_syscore_ops(&kvm_syscore_ops);

	/*
	 * Hard lockup detection is enabled by default. Disable it, as guests
	 * can get false positives too easily, for example if the host is
	 * overcommitted.
	 */
	hardlockup_detector_disable();
}

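/* Locate the KVM CPUID leaf by its "KVMKVMKVM" hypervisor signature. */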
static noinline uint32_t __kvm_cpuid_base(void)
{
	if (boot_cpu_data.cpuid_level < 0)
		return 0;	/* So we don't blow up on old processors */

	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
		return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);

	return 0;
}

static inline uint32_t kvm_cpuid_base(void)
{
	static int kvm_cpuid_base = -1;

	if (kvm_cpuid_base == -1)
		kvm_cpuid_base = __kvm_cpuid_base();

	return kvm_cpuid_base;
}

bool kvm_para_available(void)
{
	return kvm_cpuid_base() != 0;
}
EXPORT_SYMBOL_GPL(kvm_para_available);

unsigned int kvm_arch_para_features(void)
{
	return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}

unsigned int kvm_arch_para_hints(void)
{
	return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}
EXPORT_SYMBOL_GPL(kvm_arch_para_hints);

static uint32_t __init kvm_detect(void)
{
	return kvm_cpuid_base();
}

static void __init kvm_apic_init(void)
{
#if defined(CONFIG_SMP)
	if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI))
		kvm_setup_pv_ipi();
#endif
}

static void __init kvm_init_platform(void)
{
	kvmclock_init();
	x86_platform.apic_post_init = kvm_apic_init;
}

const __initconst struct hypervisor_x86 x86_hyper_kvm = {
	.name = "KVM",
	.detect = kvm_detect,
	.type = X86_HYPER_KVM,
	.init.guest_late_init = kvm_guest_init,
	.init.x2apic_available = kvm_para_available,
	.init.init_platform = kvm_init_platform,
};

static __init int activate_jump_labels(void)
{
	if (has_steal_clock) {
		static_key_slow_inc(&paravirt_steal_enabled);
		if (steal_acc)
			static_key_slow_inc(&paravirt_steal_rq_enabled);
	}

	return 0;
}
arch_initcall(activate_jump_labels);

static __init int kvm_setup_pv_tlb_flush(void)
{
	int cpu;

	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		for_each_possible_cpu(cpu) {
			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
				GFP_KERNEL, cpu_to_node(cpu));
		}
		pr_info("KVM setup pv remote TLB flush\n");
	}

	return 0;
}
arch_initcall(kvm_setup_pv_tlb_flush);

#ifdef CONFIG_PARAVIRT_SPINLOCKS

/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
static void kvm_kick_cpu(int cpu)
{
	int apicid;
	unsigned long flags = 0;

	apicid = per_cpu(x86_cpu_to_apicid, cpu);
	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}

#include <asm/qspinlock.h>

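/*
 * pvqspinlock wait primitive: halt this vCPU until it is kicked via
 * kvm_kick_cpu() (or an interrupt arrives) instead of spinning on the lock
 * byte, unless the wait condition has already changed.
 */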
static void kvm_wait(u8 *ptr, u8 val)
{
	unsigned long flags;

	if (in_nmi())
		return;

	local_irq_save(flags);

	if (READ_ONCE(*ptr) != val)
		goto out;

	/*
	 * halt until it's our turn and kicked. Note that we do safe halt
	 * for irq enabled case to avoid hang when lock info is overwritten
	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
	 */
	if (arch_irqs_disabled_flags(flags))
		halt();
	else
		safe_halt();

out:
	local_irq_restore(flags);
}

#ifdef CONFIG_X86_32
__visible bool __kvm_vcpu_is_preempted(long cpu)
{
	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);

	return !!(src->preempted & KVM_VCPU_PREEMPTED);
}
PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);

#else

#include <asm/asm-offsets.h>

extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);

/*
 * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
 * restoring to/from the stack.
 */
asm(
".pushsection .text;"
".global __raw_callee_save___kvm_vcpu_is_preempted;"
".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
"__raw_callee_save___kvm_vcpu_is_preempted:"
"movq __per_cpu_offset(,%rdi,8), %rax;"
"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
"setne %al;"
"ret;"
".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
".popsection");

#endif

/*
 * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
 */
void __init kvm_spinlock_init(void)
{
	/* Does host kernel support KVM_FEATURE_PV_UNHALT? */
	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
		return;

	if (kvm_para_has_hint(KVM_HINTS_REALTIME))
		return;

	/* Don't use the pvqspinlock code if there is only 1 vCPU. */
	if (num_possible_cpus() == 1)
		return;

	__pv_init_lock_hash();
	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
	pv_ops.lock.queued_spin_unlock =
		PV_CALLEE_SAVE(__pv_queued_spin_unlock);
	pv_ops.lock.wait = kvm_wait;
	pv_ops.lock.kick = kvm_kick_cpu;

	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		pv_ops.lock.vcpu_is_preempted =
			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
	}
}

#endif	/* CONFIG_PARAVIRT_SPINLOCKS */

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL

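/*
 * MSR_KVM_POLL_CONTROL toggles host-side halt polling for this vCPU:
 * writing 0 asks the host to stop polling while the guest polls itself.
 */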
static void kvm_disable_host_haltpoll(void *i)
{
	wrmsrl(MSR_KVM_POLL_CONTROL, 0);
}

static void kvm_enable_host_haltpoll(void *i)
{
	wrmsrl(MSR_KVM_POLL_CONTROL, 1);
}

void arch_haltpoll_enable(unsigned int cpu)
{
	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
		pr_err_once("kvm: host does not support poll control\n");
		pr_err_once("kvm: host upgrade recommended\n");
		return;
	}

	/* Enable guest halt poll disables host halt poll */
	smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_enable);

void arch_haltpoll_disable(unsigned int cpu)
{
	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
		return;

	/* Enable guest halt poll disables host halt poll */
	smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
#endif